From 4e4f0e5cb61dbe74f0dd95ef83116d961d233534 Mon Sep 17 00:00:00 2001
From: Casey Davenport <davenport.cas@gmail.com>
Date: Tue, 7 Apr 2026 13:17:40 -0700
Subject: [PATCH 01/12] Cherry-pick status manager readiness and crash context
 improvements

---
 pkg/controller/status/status.go      |  23 ++++-
 pkg/controller/status/status_test.go | 149 +++++++++++++++++++++++++++
 2 files changed, 170 insertions(+), 2 deletions(-)

diff --git a/pkg/controller/status/status.go b/pkg/controller/status/status.go
index 1a9e00650d..481d1789d1 100644
--- a/pkg/controller/status/status.go
+++ b/pkg/controller/status/status.go
@@ -42,6 +42,17 @@ import (
 
 var log = logf.Log.WithName("status_manager")
 
+const (
+	// terminationReasonError is the reason string the container runtime sets on
+	// ContainerStateTerminated when the container exits with a non-zero exit code.
+	terminationReasonError = "Error"
+
+	// exitCodeSIGKILL is the exit code for a container killed by SIGKILL (128 + 9).
+	// The kubelet sends SIGKILL when a liveness probe fails, but other actors (OOM
+	// killer, manual kill) can also produce this code.
+	exitCodeSIGKILL = 137
+)
+
 // StatusManager manages the status for a single controller and component, and reports the status via
 // a TigeraStatus API object. The status manager uses the following conditions/states to represent the
 // component's current status:
@@ -700,13 +711,21 @@ func (m *statusManager) containerErrorMessage(p corev1.Pod, c corev1.ContainerSt
 		// Check well-known error states here and report an appropriate mesage to the end user.
 		switch c.State.Waiting.Reason {
 		case "CrashLoopBackOff":
-			return fmt.Sprintf("Pod %s/%s has crash looping container: %s", p.Namespace, p.Name, c.Name)
+			msg := fmt.Sprintf("Pod %s/%s has crash looping container: %s", p.Namespace, p.Name, c.Name)
+			if lt := c.LastTerminationState.Terminated; lt != nil {
+				if lt.Reason == terminationReasonError && lt.ExitCode == exitCodeSIGKILL {
+					msg += " (exit code 137, possible liveness probe failure)"
+				} else {
+					msg += fmt.Sprintf(" (%s, exit code %d)", lt.Reason, lt.ExitCode)
+				}
+			}
+			return msg
 		case "ImagePullBackOff", "ErrImagePull":
 			return fmt.Sprintf("Pod %s/%s failed to pull container image for: %s", p.Namespace, p.Name, c.Name)
 		}
 	}
 	if c.State.Terminated != nil {
-		if c.State.Terminated.Reason == "Error" {
+		if c.State.Terminated.Reason == terminationReasonError {
 			return fmt.Sprintf("Pod %s/%s has terminated container: %s", p.Namespace, p.Name, c.Name)
 		}
 	}
diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go
index 5484e32e0c..d19dda0ee5 100644
--- a/pkg/controller/status/status_test.go
+++ b/pkg/controller/status/status_test.go
@@ -390,6 +390,155 @@ var _ = Describe("Status reporting tests", func() {
 			})
 		})
 
+		Context("when pod is crash looping", func() {
+			var gen int64
+
+			createCrashLoopPodAndDeployment := func(lastTermination *corev1.ContainerStateTerminated) {
+				sm.ReadyToMonitor()
+				podStatus := corev1.PodStatus{
+					Phase: corev1.PodRunning,
+					ContainerStatuses: []corev1.ContainerStatus{
+						{
+							Name: "test-container",
+							State: corev1.ContainerState{
+								Waiting: &corev1.ContainerStateWaiting{
+									Reason: "CrashLoopBackOff",
+								},
+							},
+							LastTerminationState: corev1.ContainerState{
+								Terminated: lastTermination,
+							},
+						},
+					},
+				}
+				Expect(client.Create(ctx, &corev1.Pod{
+					ObjectMeta: metav1.ObjectMeta{
+						Namespace: "NS1",
+						Name:      "DP1pod-crash",
+						Labels:    map[string]string{"dp1Key": "dp1Value"},
+					},
+					Spec:   corev1.PodSpec{},
+					Status: podStatus,
+				})).NotTo(HaveOccurred())
+
+				gen = 5
+				replicas := int32(1)
+				sm.AddDeployments([]types.NamespacedName{{Namespace: "NS1", Name: "DP1"}})
+				Expect(client.Create(ctx, &appsv1.Deployment{
+					ObjectMeta: metav1.ObjectMeta{
+						Namespace:  "NS1",
+						Name:       "DP1",
+						Generation: gen,
+					},
+					Spec: appsv1.DeploymentSpec{
+						Selector: &metav1.LabelSelector{
+							MatchLabels: map[string]string{"dp1Key": "dp1Value"},
+						},
+						Replicas: &replicas,
+					},
+					Status: appsv1.DeploymentStatus{
+						ObservedGeneration:  gen,
+						UnavailableReplicas: 1,
+						AvailableReplicas:   0,
+						ReadyReplicas:       0,
+					},
+				})).NotTo(HaveOccurred())
+			}
+
+			It("should include OOMKilled reason in degraded message", func() {
+				createCrashLoopPodAndDeployment(&corev1.ContainerStateTerminated{
+					Reason:   "OOMKilled",
+					ExitCode: 137,
+				})
+				sm.updateStatus()
+				Expect(sm.IsDegraded()).To(BeTrue())
+				Expect(sm.failing).To(ContainElement(ContainSubstring("(OOMKilled, exit code 137)")))
+			})
+
+			It("should suggest liveness probe failure for exit code 137 with Error reason", func() {
+				createCrashLoopPodAndDeployment(&corev1.ContainerStateTerminated{
+					Reason:   "Error",
+					ExitCode: 137,
+				})
+				sm.updateStatus()
+				Expect(sm.IsDegraded()).To(BeTrue())
+				Expect(sm.failing).To(ContainElement(ContainSubstring("possible liveness probe failure")))
+			})
+
+			It("should include exit code for other termination reasons", func() {
+				createCrashLoopPodAndDeployment(&corev1.ContainerStateTerminated{
+					Reason:   "Error",
+					ExitCode: 1,
+				})
+				sm.updateStatus()
+				Expect(sm.IsDegraded()).To(BeTrue())
+				Expect(sm.failing).To(ContainElement(ContainSubstring("(Error, exit code 1)")))
+			})
+
+			It("should report crash loop without detail when no last termination state", func() {
+				createCrashLoopPodAndDeployment(nil)
+				sm.updateStatus()
+				Expect(sm.IsDegraded()).To(BeTrue())
+				Expect(sm.failing).To(ContainElement(And(
+					ContainSubstring("crash looping container"),
+					Not(ContainSubstring("exit code")),
+				)))
+			})
+		})
+
+		Context("when pod is running but not ready", func() {
+			var gen int64
+			BeforeEach(func() {
+				sm.ReadyToMonitor()
+				Expect(client.Create(ctx, &corev1.Pod{
+					ObjectMeta: metav1.ObjectMeta{
+						Namespace: "NS1",
+						Name:      "DP1pod-notready",
+						Labels: map[string]string{
+							"dp1Key": "dp1Value",
+						},
+					},
+					Spec: corev1.PodSpec{},
+					Status: corev1.PodStatus{
+						Phase: corev1.PodRunning,
+						Conditions: []corev1.PodCondition{
+							{
+								Type:   corev1.ContainersReady,
+								Status: corev1.ConditionFalse,
+							},
+						},
+					},
+				})).NotTo(HaveOccurred())
+				gen = 5
+			})
+			It("should degrade when the deployment does not have the correct pod counts", func() {
+				sm.AddDeployments([]types.NamespacedName{{Namespace: "NS1", Name: "DP1"}})
+				replicas := int32(1)
+
+				Expect(client.Create(ctx, &appsv1.Deployment{
+					ObjectMeta: metav1.ObjectMeta{
+						Namespace: "NS1", Name: "DP1",
+						Generation: gen,
+					},
+					Spec: appsv1.DeploymentSpec{
+						Selector: &metav1.LabelSelector{
+							MatchLabels: map[string]string{"dp1Key": "dp1Value"},
+						},
+						Replicas: &replicas,
+					},
+					Status: appsv1.DeploymentStatus{
+						ObservedGeneration:  gen,
+						UnavailableReplicas: 1,
+						AvailableReplicas:   0,
+						ReadyReplicas:       0,
+					},
+				})).NotTo(HaveOccurred())
+				sm.updateStatus()
+				Expect(sm.IsDegraded()).To(BeTrue())
+				Expect(sm.failing).To(ContainElement(ContainSubstring("running but not ready")))
+			})
+		})
+
 		It("Should handle basic state changes", func() {
 			// We expect no state to be "True" at boot.
 			Expect(sm.IsAvailable()).To(BeFalse())

From c5b360ad2d3776c74ee476a79816559be7bb8206 Mon Sep 17 00:00:00 2001
From: Casey Davenport <davenport.cas@gmail.com>
Date: Tue, 7 Apr 2026 13:20:50 -0700
Subject: [PATCH 02/12] Add podIssue types for structured pod diagnostics

---
 pkg/controller/status/status.go      | 38 ++++++++++++++++++++++++++++
 pkg/controller/status/status_test.go | 28 ++++++++++++++++++++
 2 files changed, 66 insertions(+)

diff --git a/pkg/controller/status/status.go b/pkg/controller/status/status.go
index 481d1789d1..3cc62d3ab3 100644
--- a/pkg/controller/status/status.go
+++ b/pkg/controller/status/status.go
@@ -53,6 +53,44 @@ const (
 	exitCodeSIGKILL = 137
 )
 
+type podIssueSeverity int
+
+const (
+	severityProgressing podIssueSeverity = iota
+	severityFailing
+)
+
+// podIssueType defines the category of a pod issue. The enum order determines display
+// priority when issues compete for the 3-slot cap in summarizeIssues.
+type podIssueType int
+
+const (
+	issueCrashLoopBackOff podIssueType = iota
+	issueImagePull
+	issueTerminated
+	issuePodFailed
+	issueNotReady
+	issuePending
+)
+
+// podIssue represents a single diagnosed problem with a pod.
+type podIssue struct {
+	severity  podIssueSeverity
+	issueType podIssueType
+	message   string
+
+	// isOldRevision is true if the pod belongs to an old revision during a rollout.
+	isOldRevision bool
+
+	// terminationReason and exitCode provide dedup granularity within an issue type.
+	terminationReason string
+	exitCode          int32
+}
+
+func (p podIssue) key() string {
+	return fmt.Sprintf("%d:%s:%d", p.issueType, p.terminationReason, p.exitCode)
+}
+
 // StatusManager manages the status for a single controller and component, and reports the status via
 // a TigeraStatus API object. The status manager uses the following conditions/states to represent the
 // component's current status:
diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go
index d19dda0ee5..dec7cadd95 100644
--- a/pkg/controller/status/status_test.go
+++ b/pkg/controller/status/status_test.go
@@ -880,4 +880,32 @@ var _ = Describe("Status reporting tests", func() {
 				}, false, true),
 		)
 	})
+
+	Describe("podIssue", func() {
+		It("should produce distinct keys for different crash reasons", func() {
+			oom := podIssue{
+				issueType:         issueCrashLoopBackOff,
+				terminationReason: "OOMKilled",
+				exitCode:          137,
+			}
+			errExit1 := podIssue{
+				issueType:         issueCrashLoopBackOff,
+				terminationReason: terminationReasonError,
+				exitCode:          1,
+			}
+			Expect(oom.key()).NotTo(Equal(errExit1.key()))
+		})
+
+		It("should produce the same key for same issue type with no termination context", func() {
+			a := podIssue{issueType: issueNotReady}
+			b := podIssue{issueType: issueNotReady}
+			Expect(a.key()).To(Equal(b.key()))
+		})
+
+		It("should produce distinct keys for different issue types", func() {
+			a := podIssue{issueType: issueCrashLoopBackOff}
+			b := podIssue{issueType: issueImagePull}
+			Expect(a.key()).NotTo(Equal(b.key()))
+		})
+	})
 })

From 900420ea7414af41fb6411b604b7e25000d020e4 Mon Sep 17 00:00:00 2001
From: Casey Davenport <davenport.cas@gmail.com>
Date: Tue, 7 Apr 2026 13:22:44 -0700
Subject: [PATCH 03/12] Add summarizeIssues for dedup, prioritization, and
 capping

---
 pkg/controller/status/status.go      | 57 +++++++++++++++++++++++++
 pkg/controller/status/status_test.go | 63 ++++++++++++++++++++++++++++
 2 files changed, 120 insertions(+)

diff --git a/pkg/controller/status/status.go b/pkg/controller/status/status.go
index 3cc62d3ab3..0fa65c41de 100644
--- a/pkg/controller/status/status.go
+++ b/pkg/controller/status/status.go
@@ -91,6 +91,63 @@ func (p podIssue) key() string {
 	return fmt.Sprintf("%d:%s:%d", p.issueType, p.terminationReason, p.exitCode)
 }
 
+const maxIssuesPerWorkload = 3
+
+// summarizeIssues deduplicates, prioritizes, and caps a list of pod issues into
+// human-readable failing and progressing message slices.
+func summarizeIssues(issues []podIssue) (failing []string, progressing []string) {
+	if len(issues) == 0 {
+		return nil, nil
+	}
+
+	// Sort: new-revision first, then by issue type priority (enum order).
+	sort.SliceStable(issues, func(i, j int) bool {
+		if issues[i].isOldRevision != issues[j].isOldRevision {
+			return !issues[i].isOldRevision
+		}
+		return issues[i].issueType < issues[j].issueType
+	})
+
+	// Group by key. First occurrence provides the message; count tracks duplicates.
+	type issueGroup struct {
+		issue podIssue
+		count int
+	}
+	seen := map[string]int{}
+	var groups []issueGroup
+	for _, iss := range issues {
+		k := iss.key()
+		if idx, ok := seen[k]; ok {
+			groups[idx].count++
+			continue
+		}
+		seen[k] = len(groups)
+		groups = append(groups, issueGroup{issue: iss, count: 1})
+	}
+
+	// Cap at maxIssuesPerWorkload unique reasons.
+	if len(groups) > maxIssuesPerWorkload {
+		groups = groups[:maxIssuesPerWorkload]
+	}
+
+	// Format messages.
+	for _, g := range groups {
+		msg := g.issue.message
+		if g.count > 1 {
+			msg += fmt.Sprintf(" (%d pods affected)", g.count)
+		}
+		if g.issue.isOldRevision {
+			msg += " (old revision)"
+		}
+		if g.issue.severity == severityFailing {
+			failing = append(failing, msg)
+		} else {
+			progressing = append(progressing, msg)
+		}
+	}
+	return failing, progressing
+}
+
 // StatusManager manages the status for a single controller and component, and reports the status via
 // a TigeraStatus API object. The status manager uses the following conditions/states to represent the
 // component's current status:
diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go
index dec7cadd95..0c05ef4013 100644
--- a/pkg/controller/status/status_test.go
+++ b/pkg/controller/status/status_test.go
@@ -908,4 +908,67 @@ var _ = Describe("Status reporting tests", func() {
 			Expect(a.key()).NotTo(Equal(b.key()))
 		})
 	})
+
+	Describe("summarizeIssues", func() {
+		It("should deduplicate issues with the same key", func() {
+			issues := []podIssue{
+				{severity: severityFailing, issueType: issueCrashLoopBackOff, message: "Pod ns/pod1 has crash looping container: c1", terminationReason: "OOMKilled", exitCode: 137},
+				{severity: severityFailing, issueType: issueCrashLoopBackOff, message: "Pod ns/pod2 has crash looping container: c1", terminationReason: "OOMKilled", exitCode: 137},
+				{severity: severityFailing, issueType: issueCrashLoopBackOff, message: "Pod ns/pod3 has crash looping container: c1", terminationReason: "OOMKilled", exitCode: 137},
+			}
+			failing, progressing := summarizeIssues(issues)
+			Expect(failing).To(HaveLen(1))
+			Expect(failing[0]).To(ContainSubstring("3 pods affected"))
+			Expect(progressing).To(BeEmpty())
+		})
+
+		It("should cap at 3 unique reasons", func() {
+			issues := []podIssue{
+				{severity: severityFailing, issueType: issueCrashLoopBackOff, message: "msg1", terminationReason: "OOMKilled", exitCode: 137},
+				{severity: severityFailing, issueType: issueImagePull, message: "msg2"},
+				{severity: severityFailing, issueType: issueTerminated, message: "msg3", terminationReason: terminationReasonError, exitCode: 1},
+				{severity: severityFailing, issueType: issuePodFailed, message: "msg4"},
+			}
+			failing, _ := summarizeIssues(issues)
+			Expect(failing).To(HaveLen(3))
+		})
+
+		It("should prioritize new-revision pods over old-revision pods", func() {
+			issues := []podIssue{
+				{severity: severityFailing, issueType: issueCrashLoopBackOff, message: "old pod crash", isOldRevision: true, terminationReason: "OOMKilled", exitCode: 137},
+				{severity: severityFailing, issueType: issueCrashLoopBackOff, message: "new pod crash", isOldRevision: false, terminationReason: terminationReasonError, exitCode: 1},
+			}
+			failing, _ := summarizeIssues(issues)
+			Expect(failing).To(HaveLen(2))
+			Expect(failing[0]).To(ContainSubstring("new pod crash"))
+			Expect(failing[1]).To(ContainSubstring("old pod crash"))
+			Expect(failing[1]).To(ContainSubstring("old revision"))
+		})
+
+		It("should split failing and progressing", func() {
+			issues := []podIssue{
+				{severity: severityFailing, issueType: issueNotReady, message: "Pod ns/p1 is running but not ready"},
+				{severity: severityProgressing, issueType: issuePending, message: "Pod ns/p2 is pending: Unschedulable"},
+			}
+			failing, progressing := summarizeIssues(issues)
+			Expect(failing).To(HaveLen(1))
+			Expect(progressing).To(HaveLen(1))
+			Expect(progressing[0]).To(ContainSubstring("pending"))
+		})
+
+		It("should annotate old revision issues", func() {
+			issues := []podIssue{
+				{severity: severityFailing, issueType: issueNotReady, message: "Pod ns/p1 is running but not ready", isOldRevision: true},
+			}
+			failing, _ := summarizeIssues(issues)
+			Expect(failing).To(HaveLen(1))
+			Expect(failing[0]).To(ContainSubstring("old revision"))
+		})
+
+		It("should return empty slices for no issues", func() {
+			failing, progressing := summarizeIssues(nil)
+			Expect(failing).To(BeEmpty())
+			Expect(progressing).To(BeEmpty())
+		})
+	})
 })

From 7ed265687558c165d8e9dc831b214955bb9ca3f6 Mon Sep 17 00:00:00 2001
From: Casey Davenport <davenport.cas@gmail.com>
Date: Tue, 7 Apr 2026 13:25:41 -0700
Subject: [PATCH 04/12] Add diagnosePods to replace podsFailing with structured
 diagnosis

---
 pkg/controller/status/status.go      | 141 ++++++++++++++++++
 pkg/controller/status/status_test.go | 213 +++++++++++++++++++++++++++
 2 files changed, 354 insertions(+)

diff --git a/pkg/controller/status/status.go b/pkg/controller/status/status.go
index 0fa65c41de..04b95b1e6a 100644
--- a/pkg/controller/status/status.go
+++ b/pkg/controller/status/status.go
@@ -827,6 +827,147 @@ func (m *statusManager) containerErrorMessage(p corev1.Pod, c corev1.ContainerSt
 	return ""
 }
 
+// diagnosePods lists pods matching the selector and returns a podIssue for each
+// unhealthy pod found. If currentRevision is non-empty, pods not matching that
+// revision are marked as old-revision.
+func (m *statusManager) diagnosePods(selector *metav1.LabelSelector, namespace string, currentRevision string) []podIssue {
+	l := corev1.PodList{}
+	s, err := metav1.LabelSelectorAsMap(selector)
+	if err != nil {
+		panic(err)
+	}
+	err = m.client.List(context.TODO(), &l, client.MatchingLabels(s), client.InNamespace(namespace))
+	if err != nil {
+		log.WithValues("reason", err).Info("Failed to list pods for diagnosis")
+		return nil
+	}
+
+	var issues []podIssue
+	for _, p := range l.Items {
+		oldRevision := currentRevision != "" && !podMatchesRevision(p, currentRevision)
+
+		if p.Status.Phase == corev1.PodFailed {
+			issues = append(issues, podIssue{
+				severity:      severityFailing,
+				issueType:     issuePodFailed,
+				message:       fmt.Sprintf("Pod %s/%s has failed", p.Namespace, p.Name),
+				isOldRevision: oldRevision,
+			})
+			continue
+		}
+
+		// Check init and regular container statuses for errors.
+		if iss := diagnoseContainers(p, p.Status.InitContainerStatuses, oldRevision); len(iss) > 0 {
+			issues = append(issues, iss...)
+			continue
+		}
+		if iss := diagnoseContainers(p, p.Status.ContainerStatuses, oldRevision); len(iss) > 0 {
+			issues = append(issues, iss...)
+			continue
+		}
+
+		// Running but not passing readiness checks.
+		if p.Status.Phase == corev1.PodRunning {
+			for _, cond := range p.Status.Conditions {
+				if cond.Type == corev1.ContainersReady && cond.Status == corev1.ConditionFalse {
+					issues = append(issues, podIssue{
+						severity:      severityFailing,
+						issueType:     issueNotReady,
+						message:       fmt.Sprintf("Pod %s/%s is running but not ready", p.Namespace, p.Name),
+						isOldRevision: oldRevision,
+					})
+					break
+				}
+			}
+			continue
+		}
+
+		// Pending pod - check for scheduling failures.
+		if p.Status.Phase == corev1.PodPending {
+			msg := fmt.Sprintf("Pod %s/%s is pending", p.Namespace, p.Name)
+			for _, cond := range p.Status.Conditions {
+				if cond.Type == corev1.PodScheduled && cond.Status == corev1.ConditionFalse {
+					if cond.Message != "" {
+						msg += fmt.Sprintf(": %s", cond.Message)
+					}
+					break
+				}
+			}
+			issues = append(issues, podIssue{
+				severity:      severityProgressing,
+				issueType:     issuePending,
+				message:       msg,
+				isOldRevision: oldRevision,
+			})
+		}
+	}
+	return issues
+}
+
+// diagnoseContainers checks a list of container statuses for known error states and
+// returns a podIssue for each one found.
+func diagnoseContainers(p corev1.Pod, statuses []corev1.ContainerStatus, oldRevision bool) []podIssue {
+	var issues []podIssue
+	for _, c := range statuses {
+		if c.State.Waiting != nil {
+			switch c.State.Waiting.Reason {
+			case "CrashLoopBackOff":
+				msg := fmt.Sprintf("Pod %s/%s has crash looping container: %s", p.Namespace, p.Name, c.Name)
+				var termReason string
+				var exitCode int32
+				if lt := c.LastTerminationState.Terminated; lt != nil {
+					termReason = lt.Reason
+					exitCode = lt.ExitCode
+					if lt.Reason == terminationReasonError && lt.ExitCode == exitCodeSIGKILL {
+						msg += " (exit code 137, possible liveness probe failure)"
+					} else {
+						msg += fmt.Sprintf(" (%s, exit code %d)", lt.Reason, lt.ExitCode)
+					}
+				}
+				issues = append(issues, podIssue{
+					severity:          severityFailing,
+					issueType:         issueCrashLoopBackOff,
+					message:           msg,
+					isOldRevision:     oldRevision,
+					terminationReason: termReason,
+					exitCode:          exitCode,
+				})
+			case "ImagePullBackOff", "ErrImagePull":
+				issues = append(issues, podIssue{
+					severity:      severityFailing,
+					issueType:     issueImagePull,
+					message:       fmt.Sprintf("Pod %s/%s failed to pull container image for: %s", p.Namespace, p.Name, c.Name),
+					isOldRevision: oldRevision,
+				})
+			}
+		}
+		if c.State.Terminated != nil {
+			if c.State.Terminated.Reason == terminationReasonError {
+				issues = append(issues, podIssue{
+					severity:      severityFailing,
+					issueType:     issueTerminated,
+					message:       fmt.Sprintf("Pod %s/%s has terminated container: %s", p.Namespace, p.Name, c.Name),
+					isOldRevision: oldRevision,
+				})
+			}
+		}
+	}
+	return issues
+}
+
+// podMatchesRevision checks whether a pod belongs to the given revision.
+// Works for DaemonSets and StatefulSets (controller-revision-hash label)
+// and Deployments (pod-template-hash label).
+func podMatchesRevision(p corev1.Pod, currentRevision string) bool {
+	if hash, ok := p.Labels[appsv1.ControllerRevisionHashLabelKey]; ok {
+		return hash == currentRevision
+	}
+	if hash, ok := p.Labels[appsv1.DefaultDeploymentUniqueLabelKey]; ok {
+		return hash == currentRevision
+	}
+	return true
+}
+
 func (m *statusManager) set(retry bool, conditions ...operator.TigeraStatusCondition) {
 	if m.enabled == nil || !*m.enabled {
 		// Never set any conditions unless the status manager is enabled.
diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go
index 0c05ef4013..c125b16574 100644
--- a/pkg/controller/status/status_test.go
+++ b/pkg/controller/status/status_test.go
@@ -971,4 +971,217 @@ var _ = Describe("Status reporting tests", func() {
 			Expect(progressing).To(BeEmpty())
 		})
 	})
+
+	Describe("diagnosePods", func() {
+		var sm *statusManager
+		var cl controllerRuntimeClient.Client
+		var ctx = context.Background()
+
+		BeforeEach(func() {
+			scheme := runtime.NewScheme()
+			Expect(appsv1.AddToScheme(scheme)).NotTo(HaveOccurred())
+			Expect(corev1.AddToScheme(scheme)).NotTo(HaveOccurred())
+			err := apis.AddToScheme(scheme, false)
+			Expect(err).NotTo(HaveOccurred())
+			cl = ctrlrfake.DefaultFakeClientBuilder(scheme).Build()
+			sm = New(cl, "test", &common.VersionInfo{Major: 1, Minor: 19}).(*statusManager)
+		})
+
+		selector := &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test"}}
+		podMeta := metav1.ObjectMeta{Namespace: "ns", Name: "pod1", Labels: map[string]string{"app": "test"}}
+
+		It("should detect CrashLoopBackOff with OOMKilled context", func() {
+			Expect(cl.Create(ctx, &corev1.Pod{
+				ObjectMeta: podMeta,
+				Spec:       corev1.PodSpec{},
+				Status: corev1.PodStatus{
+					Phase: corev1.PodRunning,
+					ContainerStatuses: []corev1.ContainerStatus{
+						{
+							Name:  "c1",
+							State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}},
+							LastTerminationState: corev1.ContainerState{
+								Terminated: &corev1.ContainerStateTerminated{Reason: "OOMKilled", ExitCode: 137},
+							},
+						},
+					},
+				},
+			})).NotTo(HaveOccurred())
+			issues := sm.diagnosePods(selector, "ns", "")
+			Expect(issues).To(HaveLen(1))
+			Expect(issues[0].issueType).To(Equal(issueCrashLoopBackOff))
+			Expect(issues[0].severity).To(Equal(severityFailing))
+			Expect(issues[0].terminationReason).To(Equal("OOMKilled"))
+			Expect(issues[0].exitCode).To(BeEquivalentTo(137))
+			Expect(issues[0].message).To(ContainSubstring("OOMKilled"))
+		})
+
+		It("should detect CrashLoopBackOff with possible liveness probe failure", func() {
+			Expect(cl.Create(ctx, &corev1.Pod{
+				ObjectMeta: podMeta,
+				Spec:       corev1.PodSpec{},
+				Status: corev1.PodStatus{
+					Phase: corev1.PodRunning,
+					ContainerStatuses: []corev1.ContainerStatus{
+						{
+							Name:  "c1",
+							State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}},
+							LastTerminationState: corev1.ContainerState{
+								Terminated: &corev1.ContainerStateTerminated{Reason: terminationReasonError, ExitCode: exitCodeSIGKILL},
+							},
+						},
+					},
+				},
+			})).NotTo(HaveOccurred())
+			issues := sm.diagnosePods(selector, "ns", "")
+			Expect(issues).To(HaveLen(1))
+			Expect(issues[0].message).To(ContainSubstring("possible liveness probe failure"))
+		})
+
+		It("should detect ImagePullBackOff", func() {
+			Expect(cl.Create(ctx, &corev1.Pod{
+				ObjectMeta: podMeta,
+				Spec:       corev1.PodSpec{},
+				Status: corev1.PodStatus{
+					Phase: corev1.PodRunning,
+					ContainerStatuses: []corev1.ContainerStatus{
+						{
+							Name:  "c1",
+							State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "ImagePullBackOff"}},
+						},
+					},
+				},
+			})).NotTo(HaveOccurred())
+			issues := sm.diagnosePods(selector, "ns", "")
+			Expect(issues).To(HaveLen(1))
+			Expect(issues[0].issueType).To(Equal(issueImagePull))
+			Expect(issues[0].severity).To(Equal(severityFailing))
+		})
+
+		It("should detect terminated container with Error", func() {
+			Expect(cl.Create(ctx, &corev1.Pod{
+				ObjectMeta: podMeta,
+				Spec:       corev1.PodSpec{},
+				Status: corev1.PodStatus{
+					Phase: corev1.PodRunning,
+					ContainerStatuses: []corev1.ContainerStatus{
+						{
+							Name:  "c1",
+							State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{Reason: terminationReasonError}},
+						},
+					},
+				},
+			})).NotTo(HaveOccurred())
+			issues := sm.diagnosePods(selector, "ns", "")
+			Expect(issues).To(HaveLen(1))
+			Expect(issues[0].issueType).To(Equal(issueTerminated))
+		})
+
+		It("should detect pod in Failed phase", func() {
+			Expect(cl.Create(ctx, &corev1.Pod{
+				ObjectMeta: podMeta,
+				Spec:       corev1.PodSpec{},
+				Status:     corev1.PodStatus{Phase: corev1.PodFailed},
+			})).NotTo(HaveOccurred())
+			issues := sm.diagnosePods(selector, "ns", "")
+			Expect(issues).To(HaveLen(1))
+			Expect(issues[0].issueType).To(Equal(issuePodFailed))
+			Expect(issues[0].severity).To(Equal(severityFailing))
+		})
+
+		It("should detect running but not ready", func() {
+			Expect(cl.Create(ctx, &corev1.Pod{
+				ObjectMeta: podMeta,
+				Spec:       corev1.PodSpec{},
+				Status: corev1.PodStatus{
+					Phase: corev1.PodRunning,
+					Conditions: []corev1.PodCondition{
+						{Type: corev1.ContainersReady, Status: corev1.ConditionFalse},
+					},
+				},
+			})).NotTo(HaveOccurred())
+			issues := sm.diagnosePods(selector, "ns", "")
+			Expect(issues).To(HaveLen(1))
+			Expect(issues[0].issueType).To(Equal(issueNotReady))
+			Expect(issues[0].severity).To(Equal(severityFailing))
+		})
+
+		It("should detect pending unschedulable pod with scheduler reason", func() {
+			Expect(cl.Create(ctx, &corev1.Pod{
+				ObjectMeta: podMeta,
+				Spec:       corev1.PodSpec{},
+				Status: corev1.PodStatus{
+					Phase: corev1.PodPending,
+					Conditions: []corev1.PodCondition{
+						{
+							Type:    corev1.PodScheduled,
+							Status:  corev1.ConditionFalse,
+							Message: "0/3 nodes are available: 3 Insufficient memory",
+						},
+					},
+				},
+			})).NotTo(HaveOccurred())
+			issues := sm.diagnosePods(selector, "ns", "")
+			Expect(issues).To(HaveLen(1))
+			Expect(issues[0].issueType).To(Equal(issuePending))
+			Expect(issues[0].severity).To(Equal(severityProgressing))
+			Expect(issues[0].message).To(ContainSubstring("Insufficient memory"))
+		})
+
+		It("should detect pending pod without scheduler condition", func() {
+			Expect(cl.Create(ctx, &corev1.Pod{
+				ObjectMeta: podMeta,
+				Spec:       corev1.PodSpec{},
+				Status:     corev1.PodStatus{Phase: corev1.PodPending},
+			})).NotTo(HaveOccurred())
+			issues := sm.diagnosePods(selector, "ns", "")
+			Expect(issues).To(HaveLen(1))
+			Expect(issues[0].issueType).To(Equal(issuePending))
+			Expect(issues[0].severity).To(Equal(severityProgressing))
+		})
+
+		It("should report multiple issues from different pods", func() {
+			pod1 := podMeta.DeepCopy()
+			pod1.Name = "pod-crash"
+			Expect(cl.Create(ctx, &corev1.Pod{
+				ObjectMeta: *pod1,
+				Spec:       corev1.PodSpec{},
+				Status: corev1.PodStatus{
+					Phase: corev1.PodRunning,
+					ContainerStatuses: []corev1.ContainerStatus{
+						{
+							Name:  "c1",
+							State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}},
+						},
+					},
+				},
+			})).NotTo(HaveOccurred())
+
+			pod2 := podMeta.DeepCopy()
+			pod2.Name = "pod-pending"
+			Expect(cl.Create(ctx, &corev1.Pod{
+				ObjectMeta: *pod2,
+				Spec:       corev1.PodSpec{},
+				Status:     corev1.PodStatus{Phase: corev1.PodPending},
+			})).NotTo(HaveOccurred())
+
+			issues := sm.diagnosePods(selector, "ns", "")
+			Expect(issues).To(HaveLen(2))
+		})
+
+		It("should return no issues for healthy pods", func() {
+			Expect(cl.Create(ctx, &corev1.Pod{
+				ObjectMeta: podMeta,
+				Spec:       corev1.PodSpec{},
+				Status: corev1.PodStatus{
+					Phase: corev1.PodRunning,
+					Conditions: []corev1.PodCondition{
+						{Type: corev1.ContainersReady, Status: corev1.ConditionTrue},
+					},
+				},
+			})).NotTo(HaveOccurred())
+			issues := sm.diagnosePods(selector, "ns", "")
+			Expect(issues).To(BeEmpty())
+		})
+	})
 })

From 2b122623956e4323684f32c34755293b990b985c Mon Sep 17 00:00:00 2001
From: Casey Davenport <davenport.cas@gmail.com>
Date: Tue, 7 Apr 2026 13:28:05 -0700
Subject: [PATCH 05/12] Add revision detection helpers for Deployments and
 DaemonSets

---
 pkg/controller/status/status.go      |  60 ++++++++++++++
 pkg/controller/status/status_test.go | 114 +++++++++++++++++++++++++++
 2 files changed, 174 insertions(+)

diff --git a/pkg/controller/status/status.go b/pkg/controller/status/status.go
index 04b95b1e6a..764458e6fc 100644
--- a/pkg/controller/status/status.go
+++ b/pkg/controller/status/status.go
@@ -968,6 +968,66 @@ func podMatchesRevision(p corev1.Pod, currentRevision string) bool {
 	return true
 }
 
+// currentDeploymentRevision returns the pod-template-hash of the active ReplicaSet
+// for the given Deployment. Returns empty string if it cannot be determined.
+func (m *statusManager) currentDeploymentRevision(dep *appsv1.Deployment) string {
+	rsList := &appsv1.ReplicaSetList{}
+	s, err := metav1.LabelSelectorAsMap(dep.Spec.Selector)
+	if err != nil {
+		log.WithValues("reason", err).Info("Failed to parse deployment selector for revision lookup")
+		return ""
+	}
+	err = m.client.List(context.TODO(), rsList, client.MatchingLabels(s), client.InNamespace(dep.Namespace))
+	if err != nil {
+		log.WithValues("reason", err).Info("Failed to list ReplicaSets for revision lookup")
+		return ""
+	}
+
+	// Find the ReplicaSet that is owned by this Deployment and has active replicas.
+	for _, rs := range rsList.Items {
+		if !isOwnedBy(rs.OwnerReferences, dep.UID) {
+			continue
+		}
+		if rs.Status.Replicas > 0 {
+			return rs.Labels[appsv1.DefaultDeploymentUniqueLabelKey]
+		}
+	}
+	return ""
+}
+
+// currentDaemonSetRevision returns the controller-revision-hash of the most recent
+// ControllerRevision for the given DaemonSet. Returns empty string if it cannot be determined.
+func (m *statusManager) currentDaemonSetRevision(ds *appsv1.DaemonSet) string {
+	revList := &appsv1.ControllerRevisionList{}
+	err := m.client.List(context.TODO(), revList, client.InNamespace(ds.Namespace))
+	if err != nil {
+		log.WithValues("reason", err).Info("Failed to list ControllerRevisions for revision lookup")
+		return ""
+	}
+
+	var maxRevision int64
+	var currentHash string
+	for _, rev := range revList.Items {
+		if !isOwnedBy(rev.OwnerReferences, ds.UID) {
+			continue
+		}
+		if rev.Revision > maxRevision {
+			maxRevision = rev.Revision
+			currentHash = rev.Labels[appsv1.ControllerRevisionHashLabelKey]
+		}
+	}
+	return currentHash
+}
+
+func isOwnedBy(refs []metav1.OwnerReference, uid types.UID) bool {
+	for _, ref := range refs {
+		if ref.UID == uid && ref.Controller != nil && *ref.Controller {
+			return true
+		}
+	}
+	return false
+}
+
 func (m *statusManager) set(retry bool, conditions ...operator.TigeraStatusCondition) {
 	if m.enabled == nil || !*m.enabled {
 		// Never set any conditions unless the status manager is enabled.
diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go
index c125b16574..ada11b430c 100644
--- a/pkg/controller/status/status_test.go
+++ b/pkg/controller/status/status_test.go
@@ -71,6 +71,8 @@ var _ = Describe("Status reporting tests", func() {
 		Expect(oldVersionSm.IsAvailable()).To(BeFalse())
 	})
 
+	boolPtr := func(b bool) *bool { return &b }
+
 	Context("without CR found", func() {
 		It("status is not created", func() {
 			sm.updateStatus()
@@ -1184,4 +1186,116 @@ var _ = Describe("Status reporting tests", func() {
 			Expect(issues).To(BeEmpty())
 		})
 	})
+
+	Describe("currentRevision helpers", func() {
+		var sm *statusManager
+		var cl controllerRuntimeClient.Client
+		var ctx = context.Background()
+
+		BeforeEach(func() {
+			scheme := runtime.NewScheme()
+			Expect(appsv1.AddToScheme(scheme)).NotTo(HaveOccurred())
+			Expect(corev1.AddToScheme(scheme)).NotTo(HaveOccurred())
+			err := apis.AddToScheme(scheme, false)
+			Expect(err).NotTo(HaveOccurred())
+			cl = ctrlrfake.DefaultFakeClientBuilder(scheme).Build()
+			sm = New(cl, "test", &common.VersionInfo{Major: 1, Minor: 19}).(*statusManager)
+		})
+
+		Describe("currentDeploymentRevision", func() {
+			It("should return the pod-template-hash of the newest ReplicaSet", func() {
+				dep := &appsv1.Deployment{
+					ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "dep1", UID: "dep-uid"},
+					Spec: appsv1.DeploymentSpec{
+						Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test"}},
+					},
+				}
+				// Current ReplicaSet.
+				Expect(cl.Create(ctx, &appsv1.ReplicaSet{
+					ObjectMeta: metav1.ObjectMeta{
+						Namespace: "ns",
+						Name:      "dep1-abc123",
+						Labels:    map[string]string{"app": "test", appsv1.DefaultDeploymentUniqueLabelKey: "abc123"},
+						OwnerReferences: []metav1.OwnerReference{
+							{UID: "dep-uid", Controller: boolPtr(true)},
+						},
+					},
+					Spec: appsv1.ReplicaSetSpec{
+						Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test"}},
+					},
+					Status: appsv1.ReplicaSetStatus{Replicas: 1},
+				})).NotTo(HaveOccurred())
+				// Old ReplicaSet.
+				Expect(cl.Create(ctx, &appsv1.ReplicaSet{
+					ObjectMeta: metav1.ObjectMeta{
+						Namespace: "ns",
+						Name:      "dep1-old999",
+						Labels:    map[string]string{"app": "test", appsv1.DefaultDeploymentUniqueLabelKey: "old999"},
+						OwnerReferences: []metav1.OwnerReference{
+							{UID: "dep-uid", Controller: boolPtr(true)},
+						},
+					},
+					Spec: appsv1.ReplicaSetSpec{
+						Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test"}},
+					},
+					Status: appsv1.ReplicaSetStatus{Replicas: 0},
+				})).NotTo(HaveOccurred())
+
+				rev := sm.currentDeploymentRevision(dep)
+				Expect(rev).To(Equal("abc123"))
+			})
+
+			It("should return empty string when no ReplicaSets exist", func() {
+				dep := &appsv1.Deployment{
+					ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "dep1", UID: "dep-uid"},
+					Spec: appsv1.DeploymentSpec{
+						Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test"}},
+					},
+				}
+				rev := sm.currentDeploymentRevision(dep)
+				Expect(rev).To(BeEmpty())
+			})
+		})
+
+		Describe("currentDaemonSetRevision", func() {
+			It("should return the hash of the highest-revision ControllerRevision", func() {
+				ds := &appsv1.DaemonSet{
+					ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "ds1", UID: "ds-uid"},
+				}
+				Expect(cl.Create(ctx, &appsv1.ControllerRevision{
+					ObjectMeta: metav1.ObjectMeta{
+						Namespace: "ns",
+						Name:      "ds1-rev1",
+						Labels:    map[string]string{appsv1.ControllerRevisionHashLabelKey: "hash-old"},
+						OwnerReferences: []metav1.OwnerReference{
+							{UID: "ds-uid", Controller: boolPtr(true)},
+						},
+					},
+					Revision: 1,
+				})).NotTo(HaveOccurred())
+				Expect(cl.Create(ctx, &appsv1.ControllerRevision{
+					ObjectMeta: metav1.ObjectMeta{
+						Namespace: "ns",
+						Name:      "ds1-rev2",
+						Labels:    map[string]string{appsv1.ControllerRevisionHashLabelKey: "hash-new"},
+						OwnerReferences: []metav1.OwnerReference{
+							{UID: "ds-uid", Controller: boolPtr(true)},
+						},
+					},
+					Revision: 2,
+				})).NotTo(HaveOccurred())
+
+				rev := sm.currentDaemonSetRevision(ds)
+				Expect(rev).To(Equal("hash-new"))
+			})
+
+			It("should return empty string when no ControllerRevisions exist", func() {
+				ds := &appsv1.DaemonSet{
+					ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "ds1", UID: "ds-uid"},
+				}
+				rev := sm.currentDaemonSetRevision(ds)
+				Expect(rev).To(BeEmpty())
+			})
+		})
+	})
 })

From ad24032d3357b6a2bc5e2626083c8e1a3805db8c Mon Sep 17 00:00:00 2001
From: Casey Davenport <davenport.cas@gmail.com>
Date: Tue, 7 Apr 2026 13:31:22 -0700
Subject: [PATCH 06/12] Update syncState with not-found handling, diagnosePods,
 and summarizeIssues

Wire diagnosePods and summarizeIssues into syncState, replacing the
old podsFailing/containerErrorMessage functions. Each workload type
now reports not-found as a degraded condition instead of silently
continuing. DaemonSets and Deployments pass revision info so
diagnosePods can distinguish old-revision pods from current ones.
---
 pkg/controller/status/status.go      | 121 ++++++++-------------------
 pkg/controller/status/status_test.go |  36 ++++++++
 2 files changed, 70 insertions(+), 87 deletions(-)

diff --git a/pkg/controller/status/status.go b/pkg/controller/status/status.go
index 764458e6fc..ce0af479f4 100644
--- a/pkg/controller/status/status.go
+++ b/pkg/controller/status/status.go
@@ -593,7 +593,11 @@ func (m *statusManager) syncState() {
 		ds := &appsv1.DaemonSet{}
 		err := m.client.Get(context.TODO(), dsnn, ds)
 		if err != nil {
-			log.WithValues("reason", err).Info("Failed to query daemonset")
+			if errors.IsNotFound(err) {
+				failing = append(failing, fmt.Sprintf("DaemonSet %q not found", dsnn.String()))
+			} else {
+				log.WithValues("reason", err).Info("Failed to query daemonset")
+			}
 			continue
 		}
 		if ds.Status.UpdatedNumberScheduled < ds.Status.DesiredNumberScheduled {
@@ -616,22 +620,22 @@ func (m *statusManager) syncState() {
 			continue
 		}
 
-		// Check if any pods within the daemonset are failing.
-		if f, err := m.podsFailing(ds.Spec.Selector, ds.Namespace); err == nil {
-			if f != "" {
-				failing = append(failing, f)
-			}
-		} else {
-			log.WithValues("reason", err, "daemonset", dsnn).Info("Failed to check for failing pods")
-			continue
-		}
+		revision := m.currentDaemonSetRevision(ds)
+		issues := m.diagnosePods(ds.Spec.Selector, ds.Namespace, revision)
+		f, p := summarizeIssues(issues)
+		failing = append(failing, f...)
+		progressing = append(progressing, p...)
 	}
 
 	for _, depnn := range m.deployments {
 		dep := &appsv1.Deployment{}
 		err := m.client.Get(context.TODO(), depnn, dep)
 		if err != nil {
-			log.WithValues("reason", err).Info("Failed to query deployment")
+			if errors.IsNotFound(err) {
+				failing = append(failing, fmt.Sprintf("Deployment %q not found", depnn.String()))
+			} else {
+				log.WithValues("reason", err).Info("Failed to query deployment")
+			}
 			continue
 		}
 		if dep.Status.UnavailableReplicas > 0 {
@@ -656,22 +660,22 @@ func (m *statusManager) syncState() {
 			continue
 		}
 
-		// Check if any pods within the deployment are failing.
-		if f, err := m.podsFailing(dep.Spec.Selector, dep.Namespace); err == nil {
-			if f != "" {
-				failing = append(failing, f)
-			}
-		} else {
-			log.WithValues("reason", err, "deployment", depnn).Info("Failed to check for failing pods")
-			continue
-		}
+		revision := m.currentDeploymentRevision(dep)
+		issues := m.diagnosePods(dep.Spec.Selector, dep.Namespace, revision)
+		f, p := summarizeIssues(issues)
+		failing = append(failing, f...)
+		progressing = append(progressing, p...)
 	}
 
 	for _, depnn := range m.statefulsets {
 		ss := &appsv1.StatefulSet{}
 		err := m.client.Get(context.TODO(), depnn, ss)
 		if err != nil {
-			log.WithValues("reason", err).Info("Failed to query statefulset")
+			if errors.IsNotFound(err) {
+				failing = append(failing, fmt.Sprintf("StatefulSet %q not found", depnn.String()))
+			} else {
+				log.WithValues("reason", err).Info("Failed to query statefulset")
+			}
 			continue
 		}
 		if *ss.Spec.Replicas != ss.Status.CurrentReplicas {
@@ -694,21 +698,20 @@ func (m *statusManager) syncState() {
 			continue
 		}
 
-		// Check if any pods within the deployment are failing.
-		if f, err := m.podsFailing(ss.Spec.Selector, ss.Namespace); err == nil {
-			if f != "" {
-				failing = append(failing, f)
-			}
-		} else {
-			log.WithValues("reason", err, "statefuleset", depnn).Info("Failed to check for failing pods")
-			continue
-		}
+		issues := m.diagnosePods(ss.Spec.Selector, ss.Namespace, ss.Status.UpdateRevision)
+		f, p := summarizeIssues(issues)
+		failing = append(failing, f...)
+		progressing = append(progressing, p...)
 	}
 
 	for _, depnn := range m.cronjobs {
 		cj := &batchv1.CronJob{}
 		if err := m.client.Get(context.TODO(), depnn, cj); err != nil {
-			log.WithValues("reason", err).Info("Failed to query cronjobs")
+			if errors.IsNotFound(err) {
+				failing = append(failing, fmt.Sprintf("CronJob %q not found", depnn.String()))
+			} else {
+				log.WithValues("reason", err).Info("Failed to query cronjobs")
+			}
 			continue
 		}
 
@@ -771,62 +774,6 @@ func (m *statusManager) removeTigeraStatus() {
 	}
 }
 
-// podsFailing takes a selector and returns if any of the pods that match it are failing. Failing pods are defined
-// to be in CrashLoopBackOff state.
-func (m *statusManager) podsFailing(selector *metav1.LabelSelector, namespace string) (string, error) {
-	l := corev1.PodList{}
-	s, err := metav1.LabelSelectorAsMap(selector)
-	if err != nil {
-		panic(err)
-	}
-	err = m.client.List(context.TODO(), &l, client.MatchingLabels(s), client.InNamespace(namespace))
-	if err != nil {
-		return "", err
-	}
-	for _, p := range l.Items {
-		if p.Status.Phase == corev1.PodFailed {
-			return fmt.Sprintf("Pod %s/%s has failed", p.Namespace, p.Name), nil
-		}
-		for _, c := range p.Status.InitContainerStatuses {
-			if msg := m.containerErrorMessage(p, c); msg != "" {
-				return msg, nil
-			}
-		}
-		for _, c := range p.Status.ContainerStatuses {
-			if msg := m.containerErrorMessage(p, c); msg != "" {
-				return msg, nil
-			}
-		}
-	}
-	return "", nil
-}
-
-func (m *statusManager) containerErrorMessage(p corev1.Pod, c corev1.ContainerStatus) string {
-	if c.State.Waiting != nil {
-		// Check well-known error states here and report an appropriate mesage to the end user.
-		switch c.State.Waiting.Reason {
-		case "CrashLoopBackOff":
-			msg := fmt.Sprintf("Pod %s/%s has crash looping container: %s", p.Namespace, p.Name, c.Name)
-			if lt := c.LastTerminationState.Terminated; lt != nil {
-				if lt.Reason == terminationReasonError && lt.ExitCode == exitCodeSIGKILL {
-					msg += " (exit code 137, possible liveness probe failure)"
-				} else {
-					msg += fmt.Sprintf(" (%s, exit code %d)", lt.Reason, lt.ExitCode)
-				}
-			}
-			return msg
-		case "ImagePullBackOff", "ErrImagePull":
-			return fmt.Sprintf("Pod %s/%s failed to pull container image for: %s", p.Namespace, p.Name, c.Name)
-		}
-	}
-	if c.State.Terminated != nil {
-		if c.State.Terminated.Reason == terminationReasonError {
-			return fmt.Sprintf("Pod %s/%s has terminated container: %s", p.Namespace, p.Name, c.Name)
-		}
-	}
-	return ""
-}
-
 // diagnosePods lists pods matching the selector and returns a podIssue for each
 // unhealthy pod found. If currentRevision is non-empty, pods not matching that
 // revision are marked as old-revision.
diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go
index ada11b430c..b77d8ac2c6 100644
--- a/pkg/controller/status/status_test.go
+++ b/pkg/controller/status/status_test.go
@@ -21,6 +21,7 @@ import (
 	. "github.com/onsi/gomega"
 
 	appsv1 "k8s.io/api/apps/v1"
+	batchv1 "k8s.io/api/batch/v1"
 	certV1 "k8s.io/api/certificates/v1"
 	certV1beta1 "k8s.io/api/certificates/v1beta1"
 	corev1 "k8s.io/api/core/v1"
@@ -55,6 +56,7 @@ var _ = Describe("Status reporting tests", func() {
 		err := apis.AddToScheme(scheme, false)
 		Expect(err).NotTo(HaveOccurred())
 		Expect(appsv1.AddToScheme(scheme)).NotTo(HaveOccurred())
+		Expect(batchv1.AddToScheme(scheme)).NotTo(HaveOccurred())
 		Expect(corev1.AddToScheme(scheme)).NotTo(HaveOccurred())
 		client = ctrlrfake.DefaultFakeClientBuilder(scheme).Build()
 
@@ -541,6 +543,40 @@ var _ = Describe("Status reporting tests", func() {
 			})
 		})
 
+		Context("when a monitored workload is not found", func() {
+			BeforeEach(func() {
+				sm.ReadyToMonitor()
+			})
+
+			It("should report degraded when a DaemonSet is not found", func() {
+				sm.AddDaemonsets([]types.NamespacedName{{Namespace: "NS1", Name: "missing-ds"}})
+				sm.updateStatus()
+				Expect(sm.IsDegraded()).To(BeTrue())
+				Expect(sm.failing).To(ContainElement(ContainSubstring(`DaemonSet "NS1/missing-ds" not found`)))
+			})
+
+			It("should report degraded when a Deployment is not found", func() {
+				sm.AddDeployments([]types.NamespacedName{{Namespace: "NS1", Name: "missing-dep"}})
+				sm.updateStatus()
+				Expect(sm.IsDegraded()).To(BeTrue())
+				Expect(sm.failing).To(ContainElement(ContainSubstring(`Deployment "NS1/missing-dep" not found`)))
+			})
+
+			It("should report degraded when a StatefulSet is not found", func() {
+				sm.AddStatefulSets([]types.NamespacedName{{Namespace: "NS1", Name: "missing-ss"}})
+				sm.updateStatus()
+				Expect(sm.IsDegraded()).To(BeTrue())
+				Expect(sm.failing).To(ContainElement(ContainSubstring(`StatefulSet "NS1/missing-ss" not found`)))
+			})
+
+			It("should report degraded when a CronJob is not found", func() {
+				sm.AddCronJobs([]types.NamespacedName{{Namespace: "NS1", Name: "missing-cj"}})
+				sm.updateStatus()
+				Expect(sm.IsDegraded()).To(BeTrue())
+				Expect(sm.failing).To(ContainElement(ContainSubstring(`CronJob "NS1/missing-cj" not found`)))
+			})
+		})
+
 		It("Should handle basic state changes", func() {
 			// We expect no state to be "True" at boot.
 			Expect(sm.IsAvailable()).To(BeFalse())

From 528915da7cb2fe4aadb96b2b4c40c779268db89e Mon Sep 17 00:00:00 2001
From: Casey Davenport <davenport.cas@gmail.com>
Date: Tue, 7 Apr 2026 13:33:11 -0700
Subject: [PATCH 07/12] Add integration test for rollout-aware status reporting

---
 pkg/controller/status/status_test.go | 112 +++++++++++++++++++++++++++
 1 file changed, 112 insertions(+)

diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go
index b77d8ac2c6..55da840170 100644
--- a/pkg/controller/status/status_test.go
+++ b/pkg/controller/status/status_test.go
@@ -577,6 +577,118 @@ var _ = Describe("Status reporting tests", func() {
 			})
 		})
 
+		Context("during a rollout with mixed old and new revision pods", func() {
+			BeforeEach(func() {
+				sm.ReadyToMonitor()
+			})
+
+			It("should prioritize new-revision pod failures over old-revision failures", func() {
+				sm.AddDeployments([]types.NamespacedName{{Namespace: "NS1", Name: "DP1"}})
+				replicas := int32(2)
+				gen := int64(5)
+
+				// Create the Deployment.
+				Expect(client.Create(ctx, &appsv1.Deployment{
+					ObjectMeta: metav1.ObjectMeta{
+						Namespace:  "NS1",
+						Name:       "DP1",
+						UID:        "dp1-uid",
+						Generation: gen,
+					},
+					Spec: appsv1.DeploymentSpec{
+						Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "dp1"}},
+						Replicas: &replicas,
+					},
+					Status: appsv1.DeploymentStatus{
+						ObservedGeneration:  gen,
+						UnavailableReplicas: 2,
+						AvailableReplicas:   0,
+						ReadyReplicas:       0,
+					},
+				})).NotTo(HaveOccurred())
+
+				// Current ReplicaSet.
+				Expect(client.Create(ctx, &appsv1.ReplicaSet{
+					ObjectMeta: metav1.ObjectMeta{
+						Namespace: "NS1",
+						Name:      "DP1-new",
+						Labels:    map[string]string{"app": "dp1", appsv1.DefaultDeploymentUniqueLabelKey: "new-hash"},
+						OwnerReferences: []metav1.OwnerReference{
+							{UID: "dp1-uid", Controller: boolPtr(true)},
+						},
+					},
+					Spec: appsv1.ReplicaSetSpec{
+						Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "dp1"}},
+					},
+					Status: appsv1.ReplicaSetStatus{Replicas: 1},
+				})).NotTo(HaveOccurred())
+
+				// Old ReplicaSet.
+				Expect(client.Create(ctx, &appsv1.ReplicaSet{
+					ObjectMeta: metav1.ObjectMeta{
+						Namespace: "NS1",
+						Name:      "DP1-old",
+						Labels:    map[string]string{"app": "dp1", appsv1.DefaultDeploymentUniqueLabelKey: "old-hash"},
+						OwnerReferences: []metav1.OwnerReference{
+							{UID: "dp1-uid", Controller: boolPtr(true)},
+						},
+					},
+					Spec: appsv1.ReplicaSetSpec{
+						Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "dp1"}},
+					},
+					Status: appsv1.ReplicaSetStatus{Replicas: 0},
+				})).NotTo(HaveOccurred())
+
+				// New-revision pod: crash looping.
+				Expect(client.Create(ctx, &corev1.Pod{
+					ObjectMeta: metav1.ObjectMeta{
+						Namespace: "NS1",
+						Name:      "dp1-new-pod",
+						Labels:    map[string]string{"app": "dp1", appsv1.DefaultDeploymentUniqueLabelKey: "new-hash"},
+					},
+					Spec: corev1.PodSpec{},
+					Status: corev1.PodStatus{
+						Phase: corev1.PodRunning,
+						ContainerStatuses: []corev1.ContainerStatus{
+							{
+								Name:  "c1",
+								State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}},
+							},
+						},
+					},
+				})).NotTo(HaveOccurred())
+
+				// Old-revision pod: also crash looping but with different reason.
+				Expect(client.Create(ctx, &corev1.Pod{
+					ObjectMeta: metav1.ObjectMeta{
+						Namespace: "NS1",
+						Name:      "dp1-old-pod",
+						Labels:    map[string]string{"app": "dp1", appsv1.DefaultDeploymentUniqueLabelKey: "old-hash"},
+					},
+					Spec: corev1.PodSpec{},
+					Status: corev1.PodStatus{
+						Phase: corev1.PodRunning,
+						ContainerStatuses: []corev1.ContainerStatus{
+							{
+								Name: "c1",
+								State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}},
+								LastTerminationState: corev1.ContainerState{
+									Terminated: &corev1.ContainerStateTerminated{Reason: "OOMKilled", ExitCode: 137},
+								},
+							},
+						},
+					},
+				})).NotTo(HaveOccurred())
+
+				sm.updateStatus()
+				Expect(sm.IsDegraded()).To(BeTrue())
+				Expect(sm.failing).To(HaveLen(2))
+				// New-revision issue should appear first.
+				Expect(sm.failing[0]).NotTo(ContainSubstring("old revision"))
+				Expect(sm.failing[1]).To(ContainSubstring("old revision"))
+			})
+		})
+
 		It("Should handle basic state changes", func() {
 			// We expect no state to be "True" at boot.
 			Expect(sm.IsAvailable()).To(BeFalse())

From 90f17e2b9882357ea7aabfc073eaa9452568877b Mon Sep 17 00:00:00 2001
From: Casey Davenport <davenport.cas@gmail.com>
Date: Tue, 7 Apr 2026 13:34:10 -0700
Subject: [PATCH 08/12] Fix formatting

---
 pkg/controller/status/status_test.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go
index 55da840170..d832c4f026 100644
--- a/pkg/controller/status/status_test.go
+++ b/pkg/controller/status/status_test.go
@@ -670,7 +670,7 @@ var _ = Describe("Status reporting tests", func() {
 						Phase: corev1.PodRunning,
 						ContainerStatuses: []corev1.ContainerStatus{
 							{
-								Name: "c1",
+								Name:  "c1",
 								State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}},
 								LastTerminationState: corev1.ContainerState{
 									Terminated: &corev1.ContainerStateTerminated{Reason: "OOMKilled", ExitCode: 137},

From 9243429d2073fc373c2b927118f3df2d0378036b Mon Sep 17 00:00:00 2001
From: Casey Davenport <davenport.cas@gmail.com>
Date: Tue, 7 Apr 2026 14:01:48 -0700
Subject: [PATCH 09/12] Add TigeraStatus warning for unsupported ignore
 annotation

When an object has the unsupported.operator.tigera.io/ignore annotation,
surface a warning through TigeraStatus so users know the operator is not
managing the resource. Clear the warning if the annotation is later removed.
---
 pkg/controller/utils/component.go      | 31 +++++++++++++-
 pkg/controller/utils/component_test.go | 56 ++++++++++++++++++++++++++
 2 files changed, 85 insertions(+), 2 deletions(-)

diff --git a/pkg/controller/utils/component.go b/pkg/controller/utils/component.go
index 10ffc062c8..36dd9e5ff9 100644
--- a/pkg/controller/utils/component.go
+++ b/pkg/controller/utils/component.go
@@ -16,6 +16,7 @@ package utils
 
 import (
 	"context"
+	stderrors "errors"
 	"fmt"
 	"maps"
 	"reflect"
@@ -49,6 +50,10 @@ import (
 	rmeta "github.com/tigera/operator/pkg/render/common/meta"
 )
 
+// errObjectIgnored is returned by createOrUpdateObject when an existing object has the
+// unsupported.operator.tigera.io/ignore annotation.
+var errObjectIgnored = fmt.Errorf("object has unsupported ignore annotation")
+
 const TLS_CIPHERS_ENV_VAR_NAME = "TLS_CIPHER_SUITES"
 
 // dCache is a global deduplication cache that is used to avoid unnecessary updates to objects. It is shared
@@ -313,7 +318,7 @@ func (c *componentHandler) createOrUpdateObject(ctx context.Context, obj client.
 	// The object exists. Update it, unless the user has marked it as "ignored".
 	if IgnoreObject(cur) {
 		logCtx.Info("Ignoring annotated object")
-		return nil
+		return errObjectIgnored
 	}
 	logCtx.V(2).Info("Resource already exists, update it")
 
@@ -464,7 +469,19 @@ func (c *componentHandler) CreateOrUpdateOrDelete(ctx context.Context, component
 	conflictRetry:
 		err := c.createOrUpdateObject(ctx, obj.DeepCopyObject().(client.Object), osType)
 		if err != nil {
-			if errors.IsAlreadyExists(err) {
+			if stderrors.Is(err, errObjectIgnored) {
+				if status != nil {
+					kind := obj.GetObjectKind().GroupVersionKind().Kind
+					if kind == "" {
+						kind = reflect.TypeOf(obj).Elem().Name()
+					}
+					warningKey := fmt.Sprintf("ignore-%s-%s", kind, key)
+					status.SetWarning(warningKey, fmt.Sprintf(
+						"%s %q has the unsupported ignore annotation; the operator is not managing this resource",
+						kind, key,
+					))
+				}
+			} else if errors.IsAlreadyExists(err) {
 				// Remember that we've had an "already exists" error, but otherwise
 				// carry on.
 				alreadyExistsErr = err
@@ -491,6 +508,16 @@ func (c *componentHandler) CreateOrUpdateOrDelete(ctx context.Context, component
 			cronJobs = append(cronJobs, key)
 		}
 
+		// Object updated normally - clear any stale ignore warning.
+		if err == nil && status != nil {
+			kind := obj.GetObjectKind().GroupVersionKind().Kind
+			if kind == "" {
+				kind = reflect.TypeOf(obj).Elem().Name()
+			}
+			warningKey := fmt.Sprintf("ignore-%s-%s", kind, key)
+			status.ClearWarning(warningKey)
+		}
+
 		continue
 	}
 
diff --git a/pkg/controller/utils/component_test.go b/pkg/controller/utils/component_test.go
index 6716fa9221..b555049f14 100644
--- a/pkg/controller/utils/component_test.go
+++ b/pkg/controller/utils/component_test.go
@@ -16,6 +16,7 @@ package utils
 
 import (
 	"context"
+	stderrors "errors"
 	"fmt"
 
 	rbacv1 "k8s.io/api/rbac/v1"
@@ -2017,6 +2018,61 @@ var _ = Describe("Component handler tests", func() {
 			))
 		})
 	})
+
+	Context("unsupported ignore annotation", func() {
+		It("should return errObjectIgnored from createOrUpdateObject when object has ignore annotation", func() {
+			ds := &apps.DaemonSet{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-ignored-ds",
+					Namespace: "default",
+					Annotations: map[string]string{
+						"unsupported.operator.tigera.io/ignore": "true",
+					},
+				},
+				Spec: apps.DaemonSetSpec{
+					Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test"}},
+					Template: corev1.PodTemplateSpec{
+						ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"app": "test"}},
+						Spec:       corev1.PodSpec{Containers: []corev1.Container{{Name: "c", Image: "img"}}},
+					},
+				},
+			}
+			Expect(c.Create(ctx, ds)).NotTo(HaveOccurred())
+
+			rendered := ds.DeepCopy()
+			rendered.Annotations = nil
+			rendered.SetGroupVersionKind(apps.SchemeGroupVersion.WithKind("DaemonSet"))
+			err := handler.(*componentHandler).createOrUpdateObject(ctx, rendered, rmeta.OSTypeLinux)
+			Expect(stderrors.Is(err, errObjectIgnored)).To(BeTrue())
+		})
+
+		It("should not return an error from CreateOrUpdateOrDelete when object has ignore annotation", func() {
+			cm := &corev1.ConfigMap{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-ignored-cm",
+					Namespace: "default",
+					Annotations: map[string]string{
+						"unsupported.operator.tigera.io/ignore": "true",
+					},
+				},
+			}
+			Expect(c.Create(ctx, cm)).NotTo(HaveOccurred())
+
+			rendered := &corev1.ConfigMap{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-ignored-cm",
+					Namespace: "default",
+				},
+				Data: map[string]string{"key": "value"},
+			}
+			fc := &fakeComponent{
+				objs:            []client.Object{rendered},
+				supportedOSType: rmeta.OSTypeLinux,
+			}
+			err := handler.CreateOrUpdateOrDelete(ctx, fc, sm)
+			Expect(err).NotTo(HaveOccurred())
+		})
+	})
 })
 
 var _ = Describe("Mocked client Component handler tests", func() {

From 3013cc4f646fe9b62723594f9587d717df184a83 Mon Sep 17 00:00:00 2001
From: Casey Davenport <davenport.cas@gmail.com>
Date: Tue, 7 Apr 2026 14:09:58 -0700
Subject: [PATCH 10/12] Set custom-overrides annotation when probe or resource
 overrides are applied

When the render package applies probe timing or resource overrides to a
workload, set an operator.tigera.io/custom-overrides annotation with a
comma-separated list of which override types were applied. This will be
used by diagnosePods to correlate pod failures with user overrides.
---
 pkg/render/common/components/components.go    | 26 ++++++
 .../common/components/components_test.go      | 91 +++++++++++++++++++
 2 files changed, 117 insertions(+)

diff --git a/pkg/render/common/components/components.go b/pkg/render/common/components/components.go
index e45b4c53fd..a64072f4b4 100644
--- a/pkg/render/common/components/components.go
+++ b/pkg/render/common/components/components.go
@@ -36,6 +36,10 @@ import (
 
 var log = logf.Log.WithName("components")
 
+// CustomOverridesAnnotation is set on workloads when the render package applies user-specified
+// probe timing or resource overrides. The value is a comma-separated list of override types.
+const CustomOverridesAnnotation = "operator.tigera.io/custom-overrides"
+
 // containerNameAliases maps deprecated container names to their current names.
 // When a user provides an override using a deprecated name, it is transparently
 // resolved to the current name before matching against rendered containers.
@@ -396,6 +400,28 @@ func applyReplicatedPodResourceOverrides(r *replicatedPodResource, overrides any
 	// probe timing overrides are applied to the corresponding container.
 	if cos := GetContainerOverrides(overrides); cos != nil {
 		mergeContainerOverrides(r.podTemplateSpec.Spec.Containers, cos)
+
+		// Track which override types were applied for status correlation.
+		seen := map[string]bool{}
+		var overrideTypes []string
+		for _, co := range cos {
+			if co.ReadinessProbe != nil && !seen["readinessProbe"] {
+				seen["readinessProbe"] = true
+				overrideTypes = append(overrideTypes, "readinessProbe")
+			}
+			if co.LivenessProbe != nil && !seen["livenessProbe"] {
+				seen["livenessProbe"] = true
+				overrideTypes = append(overrideTypes, "livenessProbe")
+			}
+			if co.Resources != nil && !seen["resources"] {
+				seen["resources"] = true
+				overrideTypes = append(overrideTypes, "resources")
+			}
+		}
+		if len(overrideTypes) > 0 {
+			r.annotations = common.MapExistsOrInitialize(r.annotations)
+			r.annotations[CustomOverridesAnnotation] = strings.Join(overrideTypes, ",")
+		}
 	}
 
 	// If `overrides` has a Spec.Template.Spec.Affinity field, and it's non-nil, it sets
diff --git a/pkg/render/common/components/components_test.go b/pkg/render/common/components/components_test.go
index 08adc81231..0d5939ee7b 100644
--- a/pkg/render/common/components/components_test.go
+++ b/pkg/render/common/components/components_test.go
@@ -602,6 +602,7 @@ var _ = Describe("Common components render tests", func() {
 				expected := defaultedDaemonSet()
 				Expect(expected.Spec.Template.Spec.Containers).To(HaveLen(2))
 				expected.Spec.Template.Spec.Containers[0].Resources = resources1
+				expected.Annotations[CustomOverridesAnnotation] = "resources"
 				Expect(result.Spec.Template.Spec.Containers).To(ContainElements(expected.Spec.Template.Spec.Containers))
 				Expect(result).To(Equal(expected))
 			}),
@@ -1069,6 +1070,7 @@ var _ = Describe("Common components render tests", func() {
 				expected := defaultedDeployment()
 				Expect(expected.Spec.Template.Spec.Containers).To(HaveLen(2))
 				expected.Spec.Template.Spec.Containers[0].Resources = resources1
+				expected.Annotations[CustomOverridesAnnotation] = "resources"
 				Expect(result.Spec.Template.Spec.Containers).To(ContainElements(expected.Spec.Template.Spec.Containers))
 				Expect(result).To(Equal(expected))
 			}),
@@ -1260,6 +1262,95 @@ var _ = Describe("Common components render tests", func() {
 			Expect(c.Resources).To(Equal(overrideResources), "container %q should have overridden resources", c.Name)
 		}
 	})
+
+	Describe("custom-overrides annotation", func() {
+		It("should set annotation when readiness probe override is applied", func() {
+			d := appsv1.Deployment{}
+			d.Spec.Template.Spec.Containers = []corev1.Container{
+				{
+					Name:  "compliance-server",
+					Image: "test-image",
+					ReadinessProbe: &corev1.Probe{
+						ProbeHandler: corev1.ProbeHandler{HTTPGet: &corev1.HTTPGetAction{Path: "/health"}},
+					},
+				},
+			}
+
+			period := int32(30)
+			overrides := &v1.ComplianceServerDeployment{
+				Spec: &v1.ComplianceServerDeploymentSpec{
+					Template: &v1.ComplianceServerDeploymentPodTemplateSpec{
+						Spec: &v1.ComplianceServerDeploymentPodSpec{
+							Containers: []v1.ComplianceServerDeploymentContainer{
+								{
+									Name:           "compliance-server",
+									ReadinessProbe: &v1.ProbeOverride{PeriodSeconds: &period},
+								},
+							},
+						},
+					},
+				},
+			}
+
+			ApplyDeploymentOverrides(&d, overrides)
+			Expect(d.Annotations).To(HaveKeyWithValue(CustomOverridesAnnotation, "readinessProbe"))
+		})
+
+		It("should set annotation with multiple override types", func() {
+			d := appsv1.Deployment{}
+			d.Spec.Template.Spec.Containers = []corev1.Container{
+				{
+					Name:  "compliance-server",
+					Image: "test-image",
+					ReadinessProbe: &corev1.Probe{
+						ProbeHandler: corev1.ProbeHandler{HTTPGet: &corev1.HTTPGetAction{Path: "/health"}},
+					},
+					LivenessProbe: &corev1.Probe{
+						ProbeHandler: corev1.ProbeHandler{HTTPGet: &corev1.HTTPGetAction{Path: "/health"}},
+					},
+				},
+			}
+
+			period := int32(30)
+			overrideResources := corev1.ResourceRequirements{
+				Limits: corev1.ResourceList{corev1.ResourceCPU: resource.MustParse("500m")},
+			}
+			overrides := &v1.ComplianceServerDeployment{
+				Spec: &v1.ComplianceServerDeploymentSpec{
+					Template: &v1.ComplianceServerDeploymentPodTemplateSpec{
+						Spec: &v1.ComplianceServerDeploymentPodSpec{
+							Containers: []v1.ComplianceServerDeploymentContainer{
+								{
+									Name:           "compliance-server",
+									ReadinessProbe: &v1.ProbeOverride{PeriodSeconds: &period},
+									LivenessProbe:  &v1.ProbeOverride{PeriodSeconds: &period},
+									Resources:      &overrideResources,
+								},
+							},
+						},
+					},
+				},
+			}
+
+			ApplyDeploymentOverrides(&d, overrides)
+			ann := d.Annotations[CustomOverridesAnnotation]
+			Expect(ann).To(ContainSubstring("readinessProbe"))
+			Expect(ann).To(ContainSubstring("livenessProbe"))
+			Expect(ann).To(ContainSubstring("resources"))
+		})
+
+		It("should not set annotation when no probe or resource overrides", func() {
+			d := appsv1.Deployment{}
+			d.Spec.Template.Spec.Containers = []corev1.Container{
+				{Name: "compliance-server", Image: "test-image"},
+			}
+			overrides := &v1.ComplianceServerDeployment{
+				Spec: &v1.ComplianceServerDeploymentSpec{},
+			}
+			ApplyDeploymentOverrides(&d, overrides)
+			Expect(d.Annotations).NotTo(HaveKey(CustomOverridesAnnotation))
+		})
+	})
 })
 
 func addContainer(cs []corev1.Container) []corev1.Container {

From 2e1de6ed81a6a010d600eca2461a8680cb8d239f Mon Sep 17 00:00:00 2001
From: Casey Davenport <davenport.cas@gmail.com>
Date: Tue, 7 Apr 2026 14:13:19 -0700
Subject: [PATCH 11/12] Add override correlation hints to pod diagnostic
 messages

---
 pkg/controller/status/status.go      | 42 +++++++++---
 pkg/controller/status/status_test.go | 99 +++++++++++++++++++++++++---
 2 files changed, 123 insertions(+), 18 deletions(-)

diff --git a/pkg/controller/status/status.go b/pkg/controller/status/status.go
index ce0af479f4..94d55bbe04 100644
--- a/pkg/controller/status/status.go
+++ b/pkg/controller/status/status.go
@@ -51,8 +51,24 @@ const (
 	// The kubelet sends SIGKILL when a liveness probe fails, but other actors (OOM
 	// killer, manual kill) can also produce this code.
 	exitCodeSIGKILL = 137
+
+	customOverridesAnnotation = "operator.tigera.io/custom-overrides"
 )
 
+// hasOverride checks if the workload has a specific override type configured.
+func hasOverride(annotations map[string]string, overrideType string) bool {
+	val, ok := annotations[customOverridesAnnotation]
+	if !ok {
+		return false
+	}
+	for _, t := range strings.Split(val, ",") {
+		if t == overrideType {
+			return true
+		}
+	}
+	return false
+}
+
 type podIssueSeverity int
 
 const (
@@ -621,7 +637,7 @@ func (m *statusManager) syncState() {
 		}
 
 		revision := m.currentDaemonSetRevision(ds)
-		issues := m.diagnosePods(ds.Spec.Selector, ds.Namespace, revision)
+		issues := m.diagnosePods(ds.Spec.Selector, ds.Namespace, revision, ds.Annotations)
 		f, p := summarizeIssues(issues)
 		failing = append(failing, f...)
 		progressing = append(progressing, p...)
@@ -661,7 +677,7 @@ func (m *statusManager) syncState() {
 		}
 
 		revision := m.currentDeploymentRevision(dep)
-		issues := m.diagnosePods(dep.Spec.Selector, dep.Namespace, revision)
+		issues := m.diagnosePods(dep.Spec.Selector, dep.Namespace, revision, dep.Annotations)
 		f, p := summarizeIssues(issues)
 		failing = append(failing, f...)
 		progressing = append(progressing, p...)
@@ -698,7 +714,7 @@ func (m *statusManager) syncState() {
 			continue
 		}
 
-		issues := m.diagnosePods(ss.Spec.Selector, ss.Namespace, ss.Status.UpdateRevision)
+		issues := m.diagnosePods(ss.Spec.Selector, ss.Namespace, ss.Status.UpdateRevision, ss.Annotations)
 		f, p := summarizeIssues(issues)
 		failing = append(failing, f...)
 		progressing = append(progressing, p...)
@@ -777,7 +793,7 @@ func (m *statusManager) removeTigeraStatus() {
 // diagnosePods lists pods matching the selector and returns a podIssue for each
 // unhealthy pod found. If currentRevision is non-empty, pods not matching that
 // revision are marked as old-revision.
-func (m *statusManager) diagnosePods(selector *metav1.LabelSelector, namespace string, currentRevision string) []podIssue {
+func (m *statusManager) diagnosePods(selector *metav1.LabelSelector, namespace string, currentRevision string, workloadAnnotations map[string]string) []podIssue {
 	l := corev1.PodList{}
 	s, err := metav1.LabelSelectorAsMap(selector)
 	if err != nil {
@@ -804,11 +820,11 @@ func (m *statusManager) diagnosePods(selector *metav1.LabelSelector, namespace s
 		}
 
 		// Check init and regular container statuses for errors.
-		if iss := diagnoseContainers(p, p.Status.InitContainerStatuses, oldRevision); len(iss) > 0 {
+		if iss := diagnoseContainers(p, p.Status.InitContainerStatuses, oldRevision, workloadAnnotations); len(iss) > 0 {
 			issues = append(issues, iss...)
 			continue
 		}
-		if iss := diagnoseContainers(p, p.Status.ContainerStatuses, oldRevision); len(iss) > 0 {
+		if iss := diagnoseContainers(p, p.Status.ContainerStatuses, oldRevision, workloadAnnotations); len(iss) > 0 {
 			issues = append(issues, iss...)
 			continue
 		}
@@ -817,10 +833,14 @@ func (m *statusManager) diagnosePods(selector *metav1.LabelSelector, namespace s
 		if p.Status.Phase == corev1.PodRunning {
 			for _, cond := range p.Status.Conditions {
 				if cond.Type == corev1.ContainersReady && cond.Status == corev1.ConditionFalse {
+					msg := fmt.Sprintf("Pod %s/%s is running but not ready", p.Namespace, p.Name)
+					if hasOverride(workloadAnnotations, "readinessProbe") {
+						msg += "; custom readiness probe configuration is in effect"
+					}
 					issues = append(issues, podIssue{
 						severity:      severityFailing,
 						issueType:     issueNotReady,
-						message:       fmt.Sprintf("Pod %s/%s is running but not ready", p.Namespace, p.Name),
+						message:       msg,
 						isOldRevision: oldRevision,
 					})
 					break
@@ -853,7 +873,7 @@ func (m *statusManager) diagnosePods(selector *metav1.LabelSelector, namespace s
 
 // diagnoseContainers checks a list of container statuses for known error states and
 // returns a podIssue for each one found.
-func diagnoseContainers(p corev1.Pod, statuses []corev1.ContainerStatus, oldRevision bool) []podIssue {
+func diagnoseContainers(p corev1.Pod, statuses []corev1.ContainerStatus, oldRevision bool, workloadAnnotations map[string]string) []podIssue {
 	var issues []podIssue
 	for _, c := range statuses {
 		if c.State.Waiting != nil {
@@ -867,8 +887,14 @@ func diagnoseContainers(p corev1.Pod, statuses []corev1.ContainerStatus, oldRevi
 					exitCode = lt.ExitCode
 					if lt.Reason == terminationReasonError && lt.ExitCode == exitCodeSIGKILL {
 						msg += " (exit code 137, possible liveness probe failure)"
+						if hasOverride(workloadAnnotations, "livenessProbe") {
+							msg += "; custom liveness probe configuration is in effect"
+						}
 					} else {
 						msg += fmt.Sprintf(" (%s, exit code %d)", lt.Reason, lt.ExitCode)
+						if lt.Reason == "OOMKilled" && hasOverride(workloadAnnotations, "resources") {
+							msg += "; custom resource limits are in effect"
+						}
 					}
 				}
 				issues = append(issues, podIssue{
diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go
index d832c4f026..d8702bf906 100644
--- a/pkg/controller/status/status_test.go
+++ b/pkg/controller/status/status_test.go
@@ -1157,7 +1157,7 @@ var _ = Describe("Status reporting tests", func() {
 					},
 				},
 			})).NotTo(HaveOccurred())
-			issues := sm.diagnosePods(selector, "ns", "")
+			issues := sm.diagnosePods(selector, "ns", "", nil)
 			Expect(issues).To(HaveLen(1))
 			Expect(issues[0].issueType).To(Equal(issueCrashLoopBackOff))
 			Expect(issues[0].severity).To(Equal(severityFailing))
@@ -1183,7 +1183,7 @@ var _ = Describe("Status reporting tests", func() {
 					},
 				},
 			})).NotTo(HaveOccurred())
-			issues := sm.diagnosePods(selector, "ns", "")
+			issues := sm.diagnosePods(selector, "ns", "", nil)
 			Expect(issues).To(HaveLen(1))
 			Expect(issues[0].message).To(ContainSubstring("possible liveness probe failure"))
 		})
@@ -1202,7 +1202,7 @@ var _ = Describe("Status reporting tests", func() {
 					},
 				},
 			})).NotTo(HaveOccurred())
-			issues := sm.diagnosePods(selector, "ns", "")
+			issues := sm.diagnosePods(selector, "ns", "", nil)
 			Expect(issues).To(HaveLen(1))
 			Expect(issues[0].issueType).To(Equal(issueImagePull))
 			Expect(issues[0].severity).To(Equal(severityFailing))
@@ -1222,7 +1222,7 @@ var _ = Describe("Status reporting tests", func() {
 					},
 				},
 			})).NotTo(HaveOccurred())
-			issues := sm.diagnosePods(selector, "ns", "")
+			issues := sm.diagnosePods(selector, "ns", "", nil)
 			Expect(issues).To(HaveLen(1))
 			Expect(issues[0].issueType).To(Equal(issueTerminated))
 		})
@@ -1233,7 +1233,7 @@ var _ = Describe("Status reporting tests", func() {
 				Spec:       corev1.PodSpec{},
 				Status:     corev1.PodStatus{Phase: corev1.PodFailed},
 			})).NotTo(HaveOccurred())
-			issues := sm.diagnosePods(selector, "ns", "")
+			issues := sm.diagnosePods(selector, "ns", "", nil)
 			Expect(issues).To(HaveLen(1))
 			Expect(issues[0].issueType).To(Equal(issuePodFailed))
 			Expect(issues[0].severity).To(Equal(severityFailing))
@@ -1250,7 +1250,7 @@ var _ = Describe("Status reporting tests", func() {
 					},
 				},
 			})).NotTo(HaveOccurred())
-			issues := sm.diagnosePods(selector, "ns", "")
+			issues := sm.diagnosePods(selector, "ns", "", nil)
 			Expect(issues).To(HaveLen(1))
 			Expect(issues[0].issueType).To(Equal(issueNotReady))
 			Expect(issues[0].severity).To(Equal(severityFailing))
@@ -1271,7 +1271,7 @@ var _ = Describe("Status reporting tests", func() {
 					},
 				},
 			})).NotTo(HaveOccurred())
-			issues := sm.diagnosePods(selector, "ns", "")
+			issues := sm.diagnosePods(selector, "ns", "", nil)
 			Expect(issues).To(HaveLen(1))
 			Expect(issues[0].issueType).To(Equal(issuePending))
 			Expect(issues[0].severity).To(Equal(severityProgressing))
@@ -1284,7 +1284,7 @@ var _ = Describe("Status reporting tests", func() {
 				Spec:       corev1.PodSpec{},
 				Status:     corev1.PodStatus{Phase: corev1.PodPending},
 			})).NotTo(HaveOccurred())
-			issues := sm.diagnosePods(selector, "ns", "")
+			issues := sm.diagnosePods(selector, "ns", "", nil)
 			Expect(issues).To(HaveLen(1))
 			Expect(issues[0].issueType).To(Equal(issuePending))
 			Expect(issues[0].severity).To(Equal(severityProgressing))
@@ -1315,7 +1315,7 @@ var _ = Describe("Status reporting tests", func() {
 				Status:     corev1.PodStatus{Phase: corev1.PodPending},
 			})).NotTo(HaveOccurred())
 
-			issues := sm.diagnosePods(selector, "ns", "")
+			issues := sm.diagnosePods(selector, "ns", "", nil)
 			Expect(issues).To(HaveLen(2))
 		})
 
@@ -1330,9 +1330,88 @@ var _ = Describe("Status reporting tests", func() {
 					},
 				},
 			})).NotTo(HaveOccurred())
-			issues := sm.diagnosePods(selector, "ns", "")
+			issues := sm.diagnosePods(selector, "ns", "", nil)
 			Expect(issues).To(BeEmpty())
 		})
+
+		It("should add readiness probe hint when pod is not ready and readinessProbe override is configured", func() {
+			Expect(cl.Create(ctx, &corev1.Pod{
+				ObjectMeta: podMeta,
+				Spec:       corev1.PodSpec{},
+				Status: corev1.PodStatus{
+					Phase: corev1.PodRunning,
+					Conditions: []corev1.PodCondition{
+						{Type: corev1.ContainersReady, Status: corev1.ConditionFalse},
+					},
+				},
+			})).NotTo(HaveOccurred())
+			overrides := map[string]string{"operator.tigera.io/custom-overrides": "readinessProbe,resources"}
+			issues := sm.diagnosePods(selector, "ns", "", overrides)
+			Expect(issues).To(HaveLen(1))
+			Expect(issues[0].message).To(ContainSubstring("custom readiness probe configuration is in effect"))
+		})
+
+		It("should add liveness probe hint when pod has possible liveness failure and livenessProbe override is configured", func() {
+			Expect(cl.Create(ctx, &corev1.Pod{
+				ObjectMeta: podMeta,
+				Spec:       corev1.PodSpec{},
+				Status: corev1.PodStatus{
+					Phase: corev1.PodRunning,
+					ContainerStatuses: []corev1.ContainerStatus{
+						{
+							Name:  "c1",
+							State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}},
+							LastTerminationState: corev1.ContainerState{
+								Terminated: &corev1.ContainerStateTerminated{Reason: terminationReasonError, ExitCode: exitCodeSIGKILL},
+							},
+						},
+					},
+				},
+			})).NotTo(HaveOccurred())
+			overrides := map[string]string{"operator.tigera.io/custom-overrides": "livenessProbe"}
+			issues := sm.diagnosePods(selector, "ns", "", overrides)
+			Expect(issues).To(HaveLen(1))
+			Expect(issues[0].message).To(ContainSubstring("custom liveness probe configuration is in effect"))
+		})
+
+		It("should add resources hint when pod is OOMKilled and resources override is configured", func() {
+			Expect(cl.Create(ctx, &corev1.Pod{
+				ObjectMeta: podMeta,
+				Spec:       corev1.PodSpec{},
+				Status: corev1.PodStatus{
+					Phase: corev1.PodRunning,
+					ContainerStatuses: []corev1.ContainerStatus{
+						{
+							Name:  "c1",
+							State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}},
+							LastTerminationState: corev1.ContainerState{
+								Terminated: &corev1.ContainerStateTerminated{Reason: "OOMKilled", ExitCode: 137},
+							},
+						},
+					},
+				},
+			})).NotTo(HaveOccurred())
+			overrides := map[string]string{"operator.tigera.io/custom-overrides": "resources"}
+			issues := sm.diagnosePods(selector, "ns", "", overrides)
+			Expect(issues).To(HaveLen(1))
+			Expect(issues[0].message).To(ContainSubstring("custom resource limits are in effect"))
+		})
+
+		It("should not add hints when no override annotation is present", func() {
+			Expect(cl.Create(ctx, &corev1.Pod{
+				ObjectMeta: podMeta,
+				Spec:       corev1.PodSpec{},
+				Status: corev1.PodStatus{
+					Phase: corev1.PodRunning,
+					Conditions: []corev1.PodCondition{
+						{Type: corev1.ContainersReady, Status: corev1.ConditionFalse},
+					},
+				},
+			})).NotTo(HaveOccurred())
+			issues := sm.diagnosePods(selector, "ns", "", nil)
+			Expect(issues).To(HaveLen(1))
+			Expect(issues[0].message).NotTo(ContainSubstring("custom"))
+		})
 	})
 
 	Describe("currentRevision helpers", func() {

From 54f87dc2fc24510b4368495ba4d56b272e1513cb Mon Sep 17 00:00:00 2001
From: Casey Davenport <davenport.cas@gmail.com>
Date: Tue, 7 Apr 2026 15:15:33 -0700
Subject: [PATCH 12/12] Add top-level Message field to TigeraStatus for better
 kubectl output

---
 api/v1/tigerastatus_types.go                  |  8 ++++-
 pkg/controller/status/status.go               | 30 ++++++++++++++++++-
 .../operator.tigera.io_tigerastatuses.yaml    | 10 +++++--
 3 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/api/v1/tigerastatus_types.go b/api/v1/tigerastatus_types.go
index 9206a9bac9..46e47e047d 100644
--- a/api/v1/tigerastatus_types.go
+++ b/api/v1/tigerastatus_types.go
@@ -33,6 +33,12 @@ type TigeraStatusStatus struct {
 	// Conditions represents the latest observed set of conditions for this component. A component may be one or more of
 	// Available, Progressing, or Degraded.
 	Conditions []TigeraStatusCondition `json:"conditions"`
+
+	// Message is a human-readable summary of the component's current state. It shows the most
+	// actionable information: the degraded reason if degraded, the progressing reason if progressing,
+	// or warnings if available with caveats. Empty when fully healthy with no warnings.
+	// +optional
+	Message string `json:"message,omitempty"`
 }
 
 // +kubebuilder:object:root=true
@@ -45,7 +51,7 @@ type TigeraStatusStatus struct {
 // +kubebuilder:printcolumn:name="Progressing",type="string",JSONPath=".status.conditions[?(@.type=='Progressing')].status",description="Whether the component is processing changes."
 // +kubebuilder:printcolumn:name="Degraded",type="string",JSONPath=".status.conditions[?(@.type=='Degraded')].status",description="Whether the component is degraded."
 // +kubebuilder:printcolumn:name="Since",type="date",JSONPath=".status.conditions[?(@.type=='Available')].lastTransitionTime",description="The time the component's Available status last changed."
-// +kubebuilder:printcolumn:name="Message",type="string",JSONPath=".status.conditions[?(@.type=='Degraded')].message",description="Error message when the component is degraded.",priority=0
+// +kubebuilder:printcolumn:name="Message",type="string",JSONPath=".status.message",description="Human-readable summary of the component's current state."
 type TigeraStatus struct {
 	metav1.TypeMeta   `json:",inline"`
 	metav1.ObjectMeta `json:"metadata,omitempty"`
diff --git a/pkg/controller/status/status.go b/pkg/controller/status/status.go
index 94d55bbe04..982d264b47 100644
--- a/pkg/controller/status/status.go
+++ b/pkg/controller/status/status.go
@@ -527,6 +527,12 @@ func (m *statusManager) ClearWarning(key string) {
 func (m *statusManager) warningMessage() string {
 	m.lock.Lock()
 	defer m.lock.Unlock()
+	return m.warningMessageLocked()
+}
+
+// warningMessageLocked returns the warning message without acquiring the lock.
+// The caller must hold m.lock.
+func (m *statusManager) warningMessageLocked() string {
 	if len(m.warnings) == 0 {
 		return ""
 	}
@@ -1049,8 +1055,11 @@ func (m *statusManager) set(retry bool, conditions ...operator.TigeraStatusCondi
 		}
 	}
 
+	// Set the top-level Message field based on the most actionable condition.
+	ts.Status.Message = m.statusMessage(ts.Status.Conditions)
+
 	// If nothing has changed, we don't need to update in the API.
-	if reflect.DeepEqual(ts.Status.Conditions, old.Status.Conditions) {
+	if reflect.DeepEqual(ts.Status.Conditions, old.Status.Conditions) && ts.Status.Message == old.Status.Message {
 		return
 	}
 
@@ -1158,6 +1167,25 @@ func (m *statusManager) degradedMessage() string {
 	return strings.Join(msgs, "\n")
 }
 
+// statusMessage returns the most actionable message for the top-level Message field,
+// based on the current conditions. Priority: degraded > progressing > warnings > empty.
+func (m *statusManager) statusMessage(conditions []operator.TigeraStatusCondition) string {
+	for _, c := range conditions {
+		if c.Type == operator.ComponentDegraded && c.Status == operator.ConditionTrue {
+			return c.Message
+		}
+	}
+	for _, c := range conditions {
+		if c.Type == operator.ComponentProgressing && c.Status == operator.ConditionTrue {
+			return c.Message
+		}
+	}
+	if w := m.warningMessageLocked(); w != "" {
+		return w
+	}
+	return ""
+}
+
 // This function should only be called if we are in a degraded state.
 // Every path should return a non-empty string that can be used in
 // the Condition Reason.
diff --git a/pkg/imports/crds/operator/operator.tigera.io_tigerastatuses.yaml b/pkg/imports/crds/operator/operator.tigera.io_tigerastatuses.yaml
index 838626ba16..b00cb4760c 100644
--- a/pkg/imports/crds/operator/operator.tigera.io_tigerastatuses.yaml
+++ b/pkg/imports/crds/operator/operator.tigera.io_tigerastatuses.yaml
@@ -30,8 +30,8 @@ spec:
           jsonPath: .status.conditions[?(@.type=='Available')].lastTransitionTime
           name: Since
           type: date
-        - description: Error message when the component is degraded.
-          jsonPath: .status.conditions[?(@.type=='Degraded')].message
+        - description: Human-readable summary of the component's current state.
+          jsonPath: .status.message
           name: Message
           type: string
       name: v1
@@ -110,6 +110,12 @@ spec:
                       - type
                     type: object
                   type: array
+                message:
+                  description: |-
+                    Message is a human-readable summary of the component's current state. It shows the most
+                    actionable information: the degraded reason if degraded, the progressing reason if progressing,
+                    or warnings if available with caveats. Empty when fully healthy with no warnings.
+                  type: string
               required:
                 - conditions
               type: object