diff --git a/pkg/controller/status/status.go b/pkg/controller/status/status.go index e7270a20a7..534d09868c 100644 --- a/pkg/controller/status/status.go +++ b/pkg/controller/status/status.go @@ -51,8 +51,130 @@ const ( // The kubelet sends SIGKILL when a liveness probe fails, but other actors (OOM // killer, manual kill) can also produce this code. exitCodeSIGKILL = 137 + + // Container waiting reason strings. These are set by the container runtime and + // kubelet; Kubernetes does not define constants for them. + waitingCrashLoopBackOff = "CrashLoopBackOff" + waitingImagePullBackOff = "ImagePullBackOff" + waitingErrImagePull = "ErrImagePull" + + customOverridesAnnotation = "operator.tigera.io/custom-overrides" +) + +// hasOverride checks if the workload has a specific override type configured. +func hasOverride(annotations map[string]string, overrideType string) bool { + val, ok := annotations[customOverridesAnnotation] + if !ok { + return false + } + for _, t := range strings.Split(val, ",") { + if t == overrideType { + return true + } + } + return false +} + +type podIssueSeverity int + +const ( + severityProgressing podIssueSeverity = iota + severityFailing +) + +// podIssueType defines the category of a pod issue. The enum order determines display +// priority when issues compete for the 3-slot cap in summarizeIssues. +type podIssueType int + +const ( + issueCrashLoopBackOff podIssueType = iota + issueImagePull + issueTerminated + issuePodFailed + issueNotReady + issuePending ) +// podIssue represents a single diagnosed problem with a pod. +type podIssue struct { + severity podIssueSeverity + issueType podIssueType + message string + + // isOldRevision is true if the pod belongs to an old revision during a rollout. + isOldRevision bool + + // terminationReason and exitCode provide dedup granularity within an issue type. + terminationReason string + exitCode int32 +} + +// key returns a deduplication key for this issue. Issues with the same key are grouped +// together in summarizeIssues, so that e.g. 10 pods all OOMKilled produce one message +// with "(10 pods affected)" rather than 10 separate messages. +func (p podIssue) key() string { + return fmt.Sprintf("%d:%s:%d", p.issueType, p.terminationReason, p.exitCode) +} + +// maxIssuesPerWorkload is the maximum number of distinct issue types reported per workload. +// Beyond this cap, additional issues are dropped to keep messages concise. +const maxIssuesPerWorkload = 3 + +// summarizeIssues deduplicates, prioritizes, and caps a list of pod issues into +// human-readable failing and progressing message slices. +func summarizeIssues(issues []podIssue) (failing []string, progressing []string) { + if len(issues) == 0 { + return nil, nil + } + + // Sort: new-revision first, then by issue type priority (enum order). + sort.SliceStable(issues, func(i, j int) bool { + if issues[i].isOldRevision != issues[j].isOldRevision { + return !issues[i].isOldRevision + } + return issues[i].issueType < issues[j].issueType + }) + + // Group by key. First occurrence provides the message; count tracks duplicates. + type issueGroup struct { + issue podIssue + count int + } + seen := map[string]int{} + var groups []issueGroup + for _, iss := range issues { + k := iss.key() + if idx, ok := seen[k]; ok { + groups[idx].count++ + continue + } + seen[k] = len(groups) + groups = append(groups, issueGroup{issue: iss, count: 1}) + } + + // Cap at maxIssuesPerWorkload unique reasons. + if len(groups) > maxIssuesPerWorkload { + groups = groups[:maxIssuesPerWorkload] + } + + // Format messages. + for _, g := range groups { + msg := g.issue.message + if g.count > 1 { + msg += fmt.Sprintf(" (%d pods affected)", g.count) + } + if g.issue.isOldRevision { + msg += " (old revision)" + } + if g.issue.severity == severityFailing { + failing = append(failing, msg) + } else { + progressing = append(progressing, msg) + } + } + return failing, progressing +} + // StatusManager manages the status for a single controller and component, and reports the status via // a TigeraStatus API object. The status manager uses the following conditions/states to represent the // component's current status: @@ -498,7 +620,11 @@ func (m *statusManager) syncState() { ds := &appsv1.DaemonSet{} err := m.client.Get(context.TODO(), dsnn, ds) if err != nil { - log.WithValues("reason", err).Info("Failed to query daemonset") + if errors.IsNotFound(err) { + failing = append(failing, fmt.Sprintf("DaemonSet %q not found", dsnn.String())) + } else { + log.WithValues("reason", err).Info("Failed to query daemonset") + } continue } if ds.Status.UpdatedNumberScheduled < ds.Status.DesiredNumberScheduled { @@ -521,22 +647,22 @@ func (m *statusManager) syncState() { continue } - // Check if any pods within the daemonset are failing. - if f, err := m.podsFailing(ds.Spec.Selector, ds.Namespace); err == nil { - if f != "" { - failing = append(failing, f) - } - } else { - log.WithValues("reason", err, "daemonset", dsnn).Info("Failed to check for failing pods") - continue - } + revision := m.currentDaemonSetRevision(ds) + issues := m.diagnosePods(ds.Spec.Selector, ds.Namespace, revision, ds.Annotations) + f, p := summarizeIssues(issues) + failing = append(failing, f...) + progressing = append(progressing, p...) } for _, depnn := range m.deployments { dep := &appsv1.Deployment{} err := m.client.Get(context.TODO(), depnn, dep) if err != nil { - log.WithValues("reason", err).Info("Failed to query deployment") + if errors.IsNotFound(err) { + failing = append(failing, fmt.Sprintf("Deployment %q not found", depnn.String())) + } else { + log.WithValues("reason", err).Info("Failed to query deployment") + } continue } if dep.Status.UnavailableReplicas > 0 { @@ -561,22 +687,22 @@ func (m *statusManager) syncState() { continue } - // Check if any pods within the deployment are failing. - if f, err := m.podsFailing(dep.Spec.Selector, dep.Namespace); err == nil { - if f != "" { - failing = append(failing, f) - } - } else { - log.WithValues("reason", err, "deployment", depnn).Info("Failed to check for failing pods") - continue - } + revision := m.currentDeploymentRevision(dep) + issues := m.diagnosePods(dep.Spec.Selector, dep.Namespace, revision, dep.Annotations) + f, p := summarizeIssues(issues) + failing = append(failing, f...) + progressing = append(progressing, p...) } for _, depnn := range m.statefulsets { ss := &appsv1.StatefulSet{} err := m.client.Get(context.TODO(), depnn, ss) if err != nil { - log.WithValues("reason", err).Info("Failed to query statefulset") + if errors.IsNotFound(err) { + failing = append(failing, fmt.Sprintf("StatefulSet %q not found", depnn.String())) + } else { + log.WithValues("reason", err).Info("Failed to query statefulset") + } continue } if *ss.Spec.Replicas != ss.Status.CurrentReplicas { @@ -599,21 +725,20 @@ func (m *statusManager) syncState() { continue } - // Check if any pods within the deployment are failing. - if f, err := m.podsFailing(ss.Spec.Selector, ss.Namespace); err == nil { - if f != "" { - failing = append(failing, f) - } - } else { - log.WithValues("reason", err, "statefuleset", depnn).Info("Failed to check for failing pods") - continue - } + issues := m.diagnosePods(ss.Spec.Selector, ss.Namespace, ss.Status.UpdateRevision, ss.Annotations) + f, p := summarizeIssues(issues) + failing = append(failing, f...) + progressing = append(progressing, p...) } for _, depnn := range m.cronjobs { cj := &batchv1.CronJob{} if err := m.client.Get(context.TODO(), depnn, cj); err != nil { - log.WithValues("reason", err).Info("Failed to query cronjobs") + if errors.IsNotFound(err) { + failing = append(failing, fmt.Sprintf("CronJob %q not found", depnn.String())) + } else { + log.WithValues("reason", err).Info("Failed to query cronjobs") + } continue } @@ -676,9 +801,10 @@ func (m *statusManager) removeTigeraStatus() { } } -// podsFailing takes a selector and returns if any of the pods that match it are failing. Failing pods are defined -// to be in CrashLoopBackOff state. -func (m *statusManager) podsFailing(selector *metav1.LabelSelector, namespace string) (string, error) { +// diagnosePods lists pods matching the selector and returns a podIssue for each +// unhealthy pod found. If currentRevision is non-empty, pods not matching that +// revision are marked as old-revision. +func (m *statusManager) diagnosePods(selector *metav1.LabelSelector, namespace string, currentRevision string, workloadAnnotations map[string]string) []podIssue { l := corev1.PodList{} s, err := metav1.LabelSelectorAsMap(selector) if err != nil { @@ -686,62 +812,201 @@ func (m *statusManager) podsFailing(selector *metav1.LabelSelector, namespace st } err = m.client.List(context.TODO(), &l, client.MatchingLabels(s), client.InNamespace(namespace)) if err != nil { - return "", err + log.WithValues("reason", err).Info("Failed to list pods for diagnosis") + return nil } + + var issues []podIssue for _, p := range l.Items { + oldRevision := currentRevision != "" && !podMatchesRevision(p, currentRevision) + if p.Status.Phase == corev1.PodFailed { - return fmt.Sprintf("Pod %s/%s has failed", p.Namespace, p.Name), nil + issues = append(issues, podIssue{ + severity: severityFailing, + issueType: issuePodFailed, + message: fmt.Sprintf("Pod %s/%s has failed", p.Namespace, p.Name), + isOldRevision: oldRevision, + }) + continue } - for _, c := range p.Status.InitContainerStatuses { - if msg := m.containerErrorMessage(p, c); msg != "" { - return msg, nil - } + + // Check init and regular container statuses for errors. + if iss := diagnoseContainers(p, p.Status.InitContainerStatuses, oldRevision, workloadAnnotations); len(iss) > 0 { + issues = append(issues, iss...) + continue } - for _, c := range p.Status.ContainerStatuses { - if msg := m.containerErrorMessage(p, c); msg != "" { - return msg, nil - } + if iss := diagnoseContainers(p, p.Status.ContainerStatuses, oldRevision, workloadAnnotations); len(iss) > 0 { + issues = append(issues, iss...) + continue } - // If none of the container-level checks matched, check if the pod is running but - // not passing readiness checks. + // Running but not passing readiness checks. if p.Status.Phase == corev1.PodRunning { for _, cond := range p.Status.Conditions { if cond.Type == corev1.ContainersReady && cond.Status == corev1.ConditionFalse { - return fmt.Sprintf("Pod %s/%s is running but not ready", p.Namespace, p.Name), nil + msg := fmt.Sprintf("Pod %s/%s is running but not ready", p.Namespace, p.Name) + if hasOverride(workloadAnnotations, "readinessProbe") { + msg += "; custom readiness probe configuration is in effect" + } + issues = append(issues, podIssue{ + severity: severityFailing, + issueType: issueNotReady, + message: msg, + isOldRevision: oldRevision, + }) + break } } + continue + } + + // Pending pod - check for scheduling failures. + if p.Status.Phase == corev1.PodPending { + msg := fmt.Sprintf("Pod %s/%s is pending", p.Namespace, p.Name) + for _, cond := range p.Status.Conditions { + if cond.Type == corev1.PodScheduled && cond.Status == corev1.ConditionFalse { + if cond.Message != "" { + msg += fmt.Sprintf(": %s", cond.Message) + } + break + } + } + issues = append(issues, podIssue{ + severity: severityProgressing, + issueType: issuePending, + message: msg, + isOldRevision: oldRevision, + }) } } - return "", nil + return issues } -func (m *statusManager) containerErrorMessage(p corev1.Pod, c corev1.ContainerStatus) string { - if c.State.Waiting != nil { - // Check well-known error states here and report an appropriate mesage to the end user. - switch c.State.Waiting.Reason { - case "CrashLoopBackOff": - msg := fmt.Sprintf("Pod %s/%s has crash looping container: %s", p.Namespace, p.Name, c.Name) - if lt := c.LastTerminationState.Terminated; lt != nil { - if lt.Reason == terminationReasonError && lt.ExitCode == exitCodeSIGKILL { - msg += " (exit code 137, possible liveness probe failure)" - } else { - msg += fmt.Sprintf(" (%s, exit code %d)", lt.Reason, lt.ExitCode) +// diagnoseContainers checks a list of container statuses for known error states and +// returns a podIssue for each one found. +func diagnoseContainers(p corev1.Pod, statuses []corev1.ContainerStatus, oldRevision bool, workloadAnnotations map[string]string) []podIssue { + var issues []podIssue + for _, c := range statuses { + if c.State.Waiting != nil { + switch c.State.Waiting.Reason { + case waitingCrashLoopBackOff: + msg := fmt.Sprintf("Pod %s/%s has crash looping container: %s", p.Namespace, p.Name, c.Name) + var termReason string + var exitCode int32 + if lt := c.LastTerminationState.Terminated; lt != nil { + termReason = lt.Reason + exitCode = lt.ExitCode + if lt.Reason == terminationReasonError && lt.ExitCode == exitCodeSIGKILL { + msg += " (exit code 137, possible liveness probe failure)" + if hasOverride(workloadAnnotations, "livenessProbe") { + msg += "; custom liveness probe configuration is in effect" + } + } else { + msg += fmt.Sprintf(" (%s, exit code %d)", lt.Reason, lt.ExitCode) + if lt.Reason == "OOMKilled" && hasOverride(workloadAnnotations, "resources") { + msg += "; custom resource limits are in effect" + } + } } + issues = append(issues, podIssue{ + severity: severityFailing, + issueType: issueCrashLoopBackOff, + message: msg, + isOldRevision: oldRevision, + terminationReason: termReason, + exitCode: exitCode, + }) + case waitingImagePullBackOff, waitingErrImagePull: + issues = append(issues, podIssue{ + severity: severityFailing, + issueType: issueImagePull, + message: fmt.Sprintf("Pod %s/%s failed to pull container image for: %s", p.Namespace, p.Name, c.Name), + isOldRevision: oldRevision, + }) + } + } + if c.State.Terminated != nil { + if c.State.Terminated.Reason == terminationReasonError { + issues = append(issues, podIssue{ + severity: severityFailing, + issueType: issueTerminated, + message: fmt.Sprintf("Pod %s/%s has terminated container: %s", p.Namespace, p.Name, c.Name), + isOldRevision: oldRevision, + }) } - return msg - case "ImagePullBackOff", "ErrImagePull": - return fmt.Sprintf("Pod %s/%s failed to pull container image for: %s", p.Namespace, p.Name, c.Name) } } - if c.State.Terminated != nil { - if c.State.Terminated.Reason == terminationReasonError { - return fmt.Sprintf("Pod %s/%s has terminated container: %s", p.Namespace, p.Name, c.Name) + return issues +} + +// podMatchesRevision checks whether a pod belongs to the given revision. +// Works for DaemonSets and StatefulSets (controller-revision-hash label) +// and Deployments (pod-template-hash label). +func podMatchesRevision(p corev1.Pod, currentRevision string) bool { + if hash, ok := p.Labels[appsv1.ControllerRevisionHashLabelKey]; ok { + return hash == currentRevision + } + if hash, ok := p.Labels[appsv1.DefaultDeploymentUniqueLabelKey]; ok { + return hash == currentRevision + } + return true +} + +// currentDeploymentRevision returns the pod-template-hash of the active ReplicaSet +// for the given Deployment. Returns empty string if it cannot be determined. +// Only called when the Deployment is unhealthy. The List call goes through +// the controller-runtime informer cache, not directly to the API server. +func (m *statusManager) currentDeploymentRevision(dep *appsv1.Deployment) string { + rsList := &appsv1.ReplicaSetList{} + s, err := metav1.LabelSelectorAsMap(dep.Spec.Selector) + if err != nil { + log.WithValues("reason", err).Info("Failed to parse deployment selector for revision lookup") + return "" + } + err = m.client.List(context.TODO(), rsList, client.MatchingLabels(s), client.InNamespace(dep.Namespace)) + if err != nil { + log.WithValues("reason", err).Info("Failed to list ReplicaSets for revision lookup") + return "" + } + + for _, rs := range rsList.Items { + if ref := metav1.GetControllerOf(&rs); ref == nil || ref.UID != dep.UID { + continue + } + if rs.Status.Replicas > 0 { + return rs.Labels[appsv1.DefaultDeploymentUniqueLabelKey] } } return "" } +// currentDaemonSetRevision returns the controller-revision-hash of the most recent +// ControllerRevision for the given DaemonSet. Returns empty string if it cannot be determined. +// Only called when the DaemonSet is unhealthy. The List call goes through +// the controller-runtime informer cache, not directly to the API server. +func (m *statusManager) currentDaemonSetRevision(ds *appsv1.DaemonSet) string { + revList := &appsv1.ControllerRevisionList{} + err := m.client.List(context.TODO(), revList, client.InNamespace(ds.Namespace)) + if err != nil { + log.WithValues("reason", err).Info("Failed to list ControllerRevisions for revision lookup") + return "" + } + + var maxRevision int64 + var currentHash string + for _, rev := range revList.Items { + if ref := metav1.GetControllerOf(&rev); ref == nil || ref.UID != ds.UID { + continue + } + if rev.Revision > maxRevision { + maxRevision = rev.Revision + currentHash = rev.Labels[appsv1.ControllerRevisionHashLabelKey] + } + } + return currentHash +} + + func (m *statusManager) set(retry bool, conditions ...operator.TigeraStatusCondition) { if m.enabled == nil || !*m.enabled { // Never set any conditions unless the status manager is enabled. diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go index d19dda0ee5..19ca8ee03f 100644 --- a/pkg/controller/status/status_test.go +++ b/pkg/controller/status/status_test.go @@ -16,11 +16,14 @@ package status import ( "context" + "fmt" + "strings" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" appsv1 "k8s.io/api/apps/v1" + batchv1 "k8s.io/api/batch/v1" certV1 "k8s.io/api/certificates/v1" certV1beta1 "k8s.io/api/certificates/v1beta1" corev1 "k8s.io/api/core/v1" @@ -29,6 +32,7 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" + "k8s.io/utils/ptr" controllerRuntimeClient "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" @@ -55,6 +59,7 @@ var _ = Describe("Status reporting tests", func() { err := apis.AddToScheme(scheme, false) Expect(err).NotTo(HaveOccurred()) Expect(appsv1.AddToScheme(scheme)).NotTo(HaveOccurred()) + Expect(batchv1.AddToScheme(scheme)).NotTo(HaveOccurred()) Expect(corev1.AddToScheme(scheme)).NotTo(HaveOccurred()) client = ctrlrfake.DefaultFakeClientBuilder(scheme).Build() @@ -71,6 +76,8 @@ var _ = Describe("Status reporting tests", func() { Expect(oldVersionSm.IsAvailable()).To(BeFalse()) }) + + Context("without CR found", func() { It("status is not created", func() { sm.updateStatus() @@ -539,6 +546,152 @@ var _ = Describe("Status reporting tests", func() { }) }) + Context("when a monitored workload is not found", func() { + BeforeEach(func() { + sm.ReadyToMonitor() + }) + + It("should report degraded when a DaemonSet is not found", func() { + sm.AddDaemonsets([]types.NamespacedName{{Namespace: "NS1", Name: "missing-ds"}}) + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(ContainElement(ContainSubstring(`DaemonSet "NS1/missing-ds" not found`))) + }) + + It("should report degraded when a Deployment is not found", func() { + sm.AddDeployments([]types.NamespacedName{{Namespace: "NS1", Name: "missing-dep"}}) + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(ContainElement(ContainSubstring(`Deployment "NS1/missing-dep" not found`))) + }) + + It("should report degraded when a StatefulSet is not found", func() { + sm.AddStatefulSets([]types.NamespacedName{{Namespace: "NS1", Name: "missing-ss"}}) + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(ContainElement(ContainSubstring(`StatefulSet "NS1/missing-ss" not found`))) + }) + + It("should report degraded when a CronJob is not found", func() { + sm.AddCronJobs([]types.NamespacedName{{Namespace: "NS1", Name: "missing-cj"}}) + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(ContainElement(ContainSubstring(`CronJob "NS1/missing-cj" not found`))) + }) + }) + + Context("during a rollout with mixed old and new revision pods", func() { + BeforeEach(func() { + sm.ReadyToMonitor() + }) + + It("should prioritize new-revision pod failures over old-revision failures", func() { + sm.AddDeployments([]types.NamespacedName{{Namespace: "NS1", Name: "DP1"}}) + replicas := int32(2) + gen := int64(5) + + // Create the Deployment. + Expect(client.Create(ctx, &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "NS1", + Name: "DP1", + UID: "dp1-uid", + Generation: gen, + }, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "dp1"}}, + Replicas: &replicas, + }, + Status: appsv1.DeploymentStatus{ + ObservedGeneration: gen, + UnavailableReplicas: 2, + AvailableReplicas: 0, + ReadyReplicas: 0, + }, + })).NotTo(HaveOccurred()) + + // Current ReplicaSet. + Expect(client.Create(ctx, &appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "NS1", + Name: "DP1-new", + Labels: map[string]string{"app": "dp1", appsv1.DefaultDeploymentUniqueLabelKey: "new-hash"}, + OwnerReferences: []metav1.OwnerReference{ + {UID: "dp1-uid", Controller: ptr.To(true)}, + }, + }, + Spec: appsv1.ReplicaSetSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "dp1"}}, + }, + Status: appsv1.ReplicaSetStatus{Replicas: 1}, + })).NotTo(HaveOccurred()) + + // Old ReplicaSet. + Expect(client.Create(ctx, &appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "NS1", + Name: "DP1-old", + Labels: map[string]string{"app": "dp1", appsv1.DefaultDeploymentUniqueLabelKey: "old-hash"}, + OwnerReferences: []metav1.OwnerReference{ + {UID: "dp1-uid", Controller: ptr.To(true)}, + }, + }, + Spec: appsv1.ReplicaSetSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "dp1"}}, + }, + Status: appsv1.ReplicaSetStatus{Replicas: 0}, + })).NotTo(HaveOccurred()) + + // New-revision pod: crash looping. + Expect(client.Create(ctx, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "NS1", + Name: "dp1-new-pod", + Labels: map[string]string{"app": "dp1", appsv1.DefaultDeploymentUniqueLabelKey: "new-hash"}, + }, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + }, + }, + }, + })).NotTo(HaveOccurred()) + + // Old-revision pod: also crash looping but with different reason. + Expect(client.Create(ctx, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "NS1", + Name: "dp1-old-pod", + Labels: map[string]string{"app": "dp1", appsv1.DefaultDeploymentUniqueLabelKey: "old-hash"}, + }, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + LastTerminationState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{Reason: "OOMKilled", ExitCode: 137}, + }, + }, + }, + }, + })).NotTo(HaveOccurred()) + + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(HaveLen(2)) + // New-revision issue should appear first. + Expect(sm.failing[0]).NotTo(ContainSubstring("old revision")) + Expect(sm.failing[1]).To(ContainSubstring("old revision")) + }) + }) + It("Should handle basic state changes", func() { // We expect no state to be "True" at boot. Expect(sm.IsAvailable()).To(BeFalse()) @@ -880,4 +1033,861 @@ var _ = Describe("Status reporting tests", func() { }, false, true), ) }) + + Describe("podIssue", func() { + It("should produce distinct keys for different crash reasons", func() { + oom := podIssue{ + issueType: issueCrashLoopBackOff, + terminationReason: "OOMKilled", + exitCode: 137, + } + errExit1 := podIssue{ + issueType: issueCrashLoopBackOff, + terminationReason: terminationReasonError, + exitCode: 1, + } + Expect(oom.key()).NotTo(Equal(errExit1.key())) + }) + + It("should produce the same key for same issue type with no termination context", func() { + a := podIssue{issueType: issueNotReady} + b := podIssue{issueType: issueNotReady} + Expect(a.key()).To(Equal(b.key())) + }) + + It("should produce distinct keys for different issue types", func() { + a := podIssue{issueType: issueCrashLoopBackOff} + b := podIssue{issueType: issueImagePull} + Expect(a.key()).NotTo(Equal(b.key())) + }) + }) + + Describe("summarizeIssues", func() { + It("should deduplicate issues with the same key", func() { + issues := []podIssue{ + {severity: severityFailing, issueType: issueCrashLoopBackOff, message: "Pod ns/pod1 has crash looping container: c1", terminationReason: "OOMKilled", exitCode: 137}, + {severity: severityFailing, issueType: issueCrashLoopBackOff, message: "Pod ns/pod2 has crash looping container: c1", terminationReason: "OOMKilled", exitCode: 137}, + {severity: severityFailing, issueType: issueCrashLoopBackOff, message: "Pod ns/pod3 has crash looping container: c1", terminationReason: "OOMKilled", exitCode: 137}, + } + failing, progressing := summarizeIssues(issues) + Expect(failing).To(HaveLen(1)) + Expect(failing[0]).To(ContainSubstring("3 pods affected")) + Expect(progressing).To(BeEmpty()) + }) + + It("should cap at 3 unique reasons", func() { + issues := []podIssue{ + {severity: severityFailing, issueType: issueCrashLoopBackOff, message: "msg1", terminationReason: "OOMKilled", exitCode: 137}, + {severity: severityFailing, issueType: issueImagePull, message: "msg2"}, + {severity: severityFailing, issueType: issueTerminated, message: "msg3", terminationReason: terminationReasonError, exitCode: 1}, + {severity: severityFailing, issueType: issuePodFailed, message: "msg4"}, + } + failing, _ := summarizeIssues(issues) + Expect(failing).To(HaveLen(3)) + }) + + It("should prioritize new-revision pods over old-revision pods", func() { + issues := []podIssue{ + {severity: severityFailing, issueType: issueCrashLoopBackOff, message: "old pod crash", isOldRevision: true, terminationReason: "OOMKilled", exitCode: 137}, + {severity: severityFailing, issueType: issueCrashLoopBackOff, message: "new pod crash", isOldRevision: false, terminationReason: terminationReasonError, exitCode: 1}, + } + failing, _ := summarizeIssues(issues) + Expect(failing).To(HaveLen(2)) + Expect(failing[0]).To(ContainSubstring("new pod crash")) + Expect(failing[1]).To(ContainSubstring("old pod crash")) + Expect(failing[1]).To(ContainSubstring("old revision")) + }) + + It("should split failing and progressing", func() { + issues := []podIssue{ + {severity: severityFailing, issueType: issueNotReady, message: "Pod ns/p1 is running but not ready"}, + {severity: severityProgressing, issueType: issuePending, message: "Pod ns/p2 is pending: Unschedulable"}, + } + failing, progressing := summarizeIssues(issues) + Expect(failing).To(HaveLen(1)) + Expect(progressing).To(HaveLen(1)) + Expect(progressing[0]).To(ContainSubstring("pending")) + }) + + It("should annotate old revision issues", func() { + issues := []podIssue{ + {severity: severityFailing, issueType: issueNotReady, message: "Pod ns/p1 is running but not ready", isOldRevision: true}, + } + failing, _ := summarizeIssues(issues) + Expect(failing).To(HaveLen(1)) + Expect(failing[0]).To(ContainSubstring("old revision")) + }) + + It("should return empty slices for no issues", func() { + failing, progressing := summarizeIssues(nil) + Expect(failing).To(BeEmpty()) + Expect(progressing).To(BeEmpty()) + }) + }) + + Describe("diagnosePods", func() { + var sm *statusManager + var cl controllerRuntimeClient.Client + var ctx = context.Background() + + BeforeEach(func() { + scheme := runtime.NewScheme() + Expect(appsv1.AddToScheme(scheme)).NotTo(HaveOccurred()) + Expect(corev1.AddToScheme(scheme)).NotTo(HaveOccurred()) + err := apis.AddToScheme(scheme, false) + Expect(err).NotTo(HaveOccurred()) + cl = ctrlrfake.DefaultFakeClientBuilder(scheme).Build() + sm = New(cl, "test", &common.VersionInfo{Major: 1, Minor: 19}).(*statusManager) + }) + + selector := &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test"}} + podMeta := metav1.ObjectMeta{Namespace: "ns", Name: "pod1", Labels: map[string]string{"app": "test"}} + + It("should detect CrashLoopBackOff with OOMKilled context", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + LastTerminationState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{Reason: "OOMKilled", ExitCode: 137}, + }, + }, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "", nil) + Expect(issues).To(HaveLen(1)) + Expect(issues[0].issueType).To(Equal(issueCrashLoopBackOff)) + Expect(issues[0].severity).To(Equal(severityFailing)) + Expect(issues[0].terminationReason).To(Equal("OOMKilled")) + Expect(issues[0].exitCode).To(BeEquivalentTo(137)) + Expect(issues[0].message).To(ContainSubstring("OOMKilled")) + }) + + It("should detect CrashLoopBackOff with possible liveness probe failure", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + LastTerminationState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{Reason: terminationReasonError, ExitCode: exitCodeSIGKILL}, + }, + }, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "", nil) + Expect(issues).To(HaveLen(1)) + Expect(issues[0].message).To(ContainSubstring("possible liveness probe failure")) + }) + + It("should detect ImagePullBackOff", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "ImagePullBackOff"}}, + }, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "", nil) + Expect(issues).To(HaveLen(1)) + Expect(issues[0].issueType).To(Equal(issueImagePull)) + Expect(issues[0].severity).To(Equal(severityFailing)) + }) + + It("should detect terminated container with Error", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{Reason: terminationReasonError}}, + }, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "", nil) + Expect(issues).To(HaveLen(1)) + Expect(issues[0].issueType).To(Equal(issueTerminated)) + }) + + It("should detect pod in Failed phase", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{Phase: corev1.PodFailed}, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "", nil) + Expect(issues).To(HaveLen(1)) + Expect(issues[0].issueType).To(Equal(issuePodFailed)) + Expect(issues[0].severity).To(Equal(severityFailing)) + }) + + It("should detect running but not ready", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{ + {Type: corev1.ContainersReady, Status: corev1.ConditionFalse}, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "", nil) + Expect(issues).To(HaveLen(1)) + Expect(issues[0].issueType).To(Equal(issueNotReady)) + Expect(issues[0].severity).To(Equal(severityFailing)) + }) + + It("should detect pending unschedulable pod with scheduler reason", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodPending, + Conditions: []corev1.PodCondition{ + { + Type: corev1.PodScheduled, + Status: corev1.ConditionFalse, + Message: "0/3 nodes are available: 3 Insufficient memory", + }, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "", nil) + Expect(issues).To(HaveLen(1)) + Expect(issues[0].issueType).To(Equal(issuePending)) + Expect(issues[0].severity).To(Equal(severityProgressing)) + Expect(issues[0].message).To(ContainSubstring("Insufficient memory")) + }) + + It("should detect pending pod without scheduler condition", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{Phase: corev1.PodPending}, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "", nil) + Expect(issues).To(HaveLen(1)) + Expect(issues[0].issueType).To(Equal(issuePending)) + Expect(issues[0].severity).To(Equal(severityProgressing)) + }) + + It("should report multiple issues from different pods", func() { + pod1 := podMeta.DeepCopy() + pod1.Name = "pod-crash" + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: *pod1, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + }, + }, + }, + })).NotTo(HaveOccurred()) + + pod2 := podMeta.DeepCopy() + pod2.Name = "pod-pending" + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: *pod2, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{Phase: corev1.PodPending}, + })).NotTo(HaveOccurred()) + + issues := sm.diagnosePods(selector, "ns", "", nil) + Expect(issues).To(HaveLen(2)) + }) + + It("should return no issues for healthy pods", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{ + {Type: corev1.ContainersReady, Status: corev1.ConditionTrue}, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "", nil) + Expect(issues).To(BeEmpty()) + }) + + It("should add readiness probe hint when pod is not ready and readinessProbe override is configured", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{ + {Type: corev1.ContainersReady, Status: corev1.ConditionFalse}, + }, + }, + })).NotTo(HaveOccurred()) + overrides := map[string]string{"operator.tigera.io/custom-overrides": "readinessProbe,resources"} + issues := sm.diagnosePods(selector, "ns", "", overrides) + Expect(issues).To(HaveLen(1)) + Expect(issues[0].message).To(ContainSubstring("custom readiness probe configuration is in effect")) + }) + + It("should add liveness probe hint when pod has possible liveness failure and livenessProbe override is configured", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + LastTerminationState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{Reason: terminationReasonError, ExitCode: exitCodeSIGKILL}, + }, + }, + }, + }, + })).NotTo(HaveOccurred()) + overrides := map[string]string{"operator.tigera.io/custom-overrides": "livenessProbe"} + issues := sm.diagnosePods(selector, "ns", "", overrides) + Expect(issues).To(HaveLen(1)) + Expect(issues[0].message).To(ContainSubstring("custom liveness probe configuration is in effect")) + }) + + It("should add resources hint when pod is OOMKilled and resources override is configured", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + LastTerminationState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{Reason: "OOMKilled", ExitCode: 137}, + }, + }, + }, + }, + })).NotTo(HaveOccurred()) + overrides := map[string]string{"operator.tigera.io/custom-overrides": "resources"} + issues := sm.diagnosePods(selector, "ns", "", overrides) + Expect(issues).To(HaveLen(1)) + Expect(issues[0].message).To(ContainSubstring("custom resource limits are in effect")) + }) + + It("should not add hints when no override annotation is present", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{ + {Type: corev1.ContainersReady, Status: corev1.ConditionFalse}, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "", nil) + Expect(issues).To(HaveLen(1)) + Expect(issues[0].message).NotTo(ContainSubstring("custom")) + }) + }) + + Describe("currentRevision helpers", func() { + var sm *statusManager + var cl controllerRuntimeClient.Client + var ctx = context.Background() + + BeforeEach(func() { + scheme := runtime.NewScheme() + Expect(appsv1.AddToScheme(scheme)).NotTo(HaveOccurred()) + Expect(corev1.AddToScheme(scheme)).NotTo(HaveOccurred()) + err := apis.AddToScheme(scheme, false) + Expect(err).NotTo(HaveOccurred()) + cl = ctrlrfake.DefaultFakeClientBuilder(scheme).Build() + sm = New(cl, "test", &common.VersionInfo{Major: 1, Minor: 19}).(*statusManager) + }) + + Describe("currentDeploymentRevision", func() { + It("should return the pod-template-hash of the newest ReplicaSet", func() { + dep := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "dep1", UID: "dep-uid"}, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test"}}, + }, + } + // Current ReplicaSet. + Expect(cl.Create(ctx, &appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "ns", + Name: "dep1-abc123", + Labels: map[string]string{"app": "test", appsv1.DefaultDeploymentUniqueLabelKey: "abc123"}, + OwnerReferences: []metav1.OwnerReference{ + {UID: "dep-uid", Controller: ptr.To(true)}, + }, + }, + Spec: appsv1.ReplicaSetSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test"}}, + }, + Status: appsv1.ReplicaSetStatus{Replicas: 1}, + })).NotTo(HaveOccurred()) + // Old ReplicaSet. + Expect(cl.Create(ctx, &appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "ns", + Name: "dep1-old999", + Labels: map[string]string{"app": "test", appsv1.DefaultDeploymentUniqueLabelKey: "old999"}, + OwnerReferences: []metav1.OwnerReference{ + {UID: "dep-uid", Controller: ptr.To(true)}, + }, + }, + Spec: appsv1.ReplicaSetSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test"}}, + }, + Status: appsv1.ReplicaSetStatus{Replicas: 0}, + })).NotTo(HaveOccurred()) + + rev := sm.currentDeploymentRevision(dep) + Expect(rev).To(Equal("abc123")) + }) + + It("should return empty string when no ReplicaSets exist", func() { + dep := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "dep1", UID: "dep-uid"}, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test"}}, + }, + } + rev := sm.currentDeploymentRevision(dep) + Expect(rev).To(BeEmpty()) + }) + }) + + Describe("currentDaemonSetRevision", func() { + It("should return the hash of the highest-revision ControllerRevision", func() { + ds := &appsv1.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "ds1", UID: "ds-uid"}, + } + Expect(cl.Create(ctx, &appsv1.ControllerRevision{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "ns", + Name: "ds1-rev1", + Labels: map[string]string{appsv1.ControllerRevisionHashLabelKey: "hash-old"}, + OwnerReferences: []metav1.OwnerReference{ + {UID: "ds-uid", Controller: ptr.To(true)}, + }, + }, + Revision: 1, + })).NotTo(HaveOccurred()) + Expect(cl.Create(ctx, &appsv1.ControllerRevision{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "ns", + Name: "ds1-rev2", + Labels: map[string]string{appsv1.ControllerRevisionHashLabelKey: "hash-new"}, + OwnerReferences: []metav1.OwnerReference{ + {UID: "ds-uid", Controller: ptr.To(true)}, + }, + }, + Revision: 2, + })).NotTo(HaveOccurred()) + + rev := sm.currentDaemonSetRevision(ds) + Expect(rev).To(Equal("hash-new")) + }) + + It("should return empty string when no ControllerRevisions exist", func() { + ds := &appsv1.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "ds1", UID: "ds-uid"}, + } + rev := sm.currentDaemonSetRevision(ds) + Expect(rev).To(BeEmpty()) + }) + }) + }) +}) + +// getTigeraStatusCondition returns the condition of the given type from the TigeraStatus, or nil if not found. +func getTigeraStatusCondition(ts *operator.TigeraStatus, condType operator.StatusConditionType) *operator.TigeraStatusCondition { + for _, c := range ts.Status.Conditions { + if c.Type == condType { + return &c + } + } + return nil +} + +var _ = Describe("Status manager integration tests", Ordered, func() { + var cl controllerRuntimeClient.Client + var ctx = context.Background() + + BeforeAll(func() { + scheme := runtime.NewScheme() + Expect(appsv1.AddToScheme(scheme)).NotTo(HaveOccurred()) + Expect(corev1.AddToScheme(scheme)).NotTo(HaveOccurred()) + Expect(batchv1.AddToScheme(scheme)).NotTo(HaveOccurred()) + err := apis.AddToScheme(scheme, false) + Expect(err).NotTo(HaveOccurred()) + cl = ctrlrfake.DefaultFakeClientBuilder(scheme).Build() + }) + + Describe("degraded message content", func() { + It("should include CrashLoopBackOff with termination context in the TigeraStatus", func() { + sm := New(cl, "crash-test", &common.VersionInfo{Major: 1, Minor: 19}).(*statusManager) + sm.OnCRFound() + sm.ReadyToMonitor() + + gen := int64(1) + replicas := int32(1) + sm.AddDeployments([]types.NamespacedName{{Namespace: "ns", Name: "dep1"}}) + + Expect(cl.Create(ctx, &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "dep1", Generation: gen}, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "crash"}}, + Replicas: &replicas, + }, + Status: appsv1.DeploymentStatus{ + ObservedGeneration: gen, + UnavailableReplicas: 1, + AvailableReplicas: 0, + ReadyReplicas: 0, + }, + })).NotTo(HaveOccurred()) + + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "crash-pod", Labels: map[string]string{"app": "crash"}}, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "main", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + LastTerminationState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{Reason: "OOMKilled", ExitCode: 137}, + }, + }, + }, + }, + })).NotTo(HaveOccurred()) + + sm.updateStatus() + + ts := &operator.TigeraStatus{} + Expect(cl.Get(ctx, types.NamespacedName{Name: "crash-test"}, ts)).NotTo(HaveOccurred()) + + degraded := getTigeraStatusCondition(ts, operator.ComponentDegraded) + Expect(degraded).NotTo(BeNil()) + Expect(degraded.Status).To(Equal(operator.ConditionTrue)) + Expect(degraded.Message).To(ContainSubstring("crash looping container")) + Expect(degraded.Message).To(ContainSubstring("OOMKilled, exit code 137")) + }) + + It("should include 'running but not ready' in the TigeraStatus", func() { + sm := New(cl, "notready-test", &common.VersionInfo{Major: 1, Minor: 19}).(*statusManager) + sm.OnCRFound() + sm.ReadyToMonitor() + + gen := int64(1) + replicas := int32(1) + sm.AddDeployments([]types.NamespacedName{{Namespace: "ns", Name: "dep2"}}) + + Expect(cl.Create(ctx, &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "dep2", Generation: gen}, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "notready"}}, + Replicas: &replicas, + }, + Status: appsv1.DeploymentStatus{ + ObservedGeneration: gen, + UnavailableReplicas: 1, + AvailableReplicas: 0, + ReadyReplicas: 0, + }, + })).NotTo(HaveOccurred()) + + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "notready-pod", Labels: map[string]string{"app": "notready"}}, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{ + {Type: corev1.ContainersReady, Status: corev1.ConditionFalse}, + }, + }, + })).NotTo(HaveOccurred()) + + sm.updateStatus() + + ts := &operator.TigeraStatus{} + Expect(cl.Get(ctx, types.NamespacedName{Name: "notready-test"}, ts)).NotTo(HaveOccurred()) + + degraded := getTigeraStatusCondition(ts, operator.ComponentDegraded) + Expect(degraded).NotTo(BeNil()) + Expect(degraded.Status).To(Equal(operator.ConditionTrue)) + Expect(degraded.Message).To(ContainSubstring("running but not ready")) + }) + + It("should include pending pod scheduler reason in the TigeraStatus progressing message", func() { + sm := New(cl, "pending-test", &common.VersionInfo{Major: 1, Minor: 19}).(*statusManager) + sm.OnCRFound() + sm.ReadyToMonitor() + + gen := int64(1) + replicas := int32(1) + sm.AddDeployments([]types.NamespacedName{{Namespace: "ns", Name: "dep3"}}) + + Expect(cl.Create(ctx, &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "dep3", Generation: gen}, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "pending"}}, + Replicas: &replicas, + }, + Status: appsv1.DeploymentStatus{ + ObservedGeneration: gen, + UnavailableReplicas: 1, + AvailableReplicas: 0, + ReadyReplicas: 0, + }, + })).NotTo(HaveOccurred()) + + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "pending-pod", Labels: map[string]string{"app": "pending"}}, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodPending, + Conditions: []corev1.PodCondition{ + { + Type: corev1.PodScheduled, + Status: corev1.ConditionFalse, + Message: "0/3 nodes are available: 3 Insufficient memory", + }, + }, + }, + })).NotTo(HaveOccurred()) + + sm.updateStatus() + + ts := &operator.TigeraStatus{} + Expect(cl.Get(ctx, types.NamespacedName{Name: "pending-test"}, ts)).NotTo(HaveOccurred()) + + progressing := getTigeraStatusCondition(ts, operator.ComponentProgressing) + Expect(progressing).NotTo(BeNil()) + Expect(progressing.Status).To(Equal(operator.ConditionTrue)) + Expect(progressing.Message).To(ContainSubstring("Insufficient memory")) + }) + + It("should report not-found workloads as degraded in TigeraStatus", func() { + sm := New(cl, "notfound-test", &common.VersionInfo{Major: 1, Minor: 19}).(*statusManager) + sm.OnCRFound() + sm.ReadyToMonitor() + + sm.AddDeployments([]types.NamespacedName{{Namespace: "ns", Name: "missing-dep"}}) + sm.updateStatus() + + ts := &operator.TigeraStatus{} + Expect(cl.Get(ctx, types.NamespacedName{Name: "notfound-test"}, ts)).NotTo(HaveOccurred()) + + degraded := getTigeraStatusCondition(ts, operator.ComponentDegraded) + Expect(degraded).NotTo(BeNil()) + Expect(degraded.Status).To(Equal(operator.ConditionTrue)) + Expect(degraded.Message).To(ContainSubstring(`Deployment "ns/missing-dep" not found`)) + }) + }) + + Describe("deduplication and capping in TigeraStatus", func() { + It("should deduplicate identical pod failures and show count", func() { + sm := New(cl, "dedup-test", &common.VersionInfo{Major: 1, Minor: 19}).(*statusManager) + sm.OnCRFound() + sm.ReadyToMonitor() + + gen := int64(1) + replicas := int32(3) + sm.AddDeployments([]types.NamespacedName{{Namespace: "ns", Name: "dep-dedup"}}) + + Expect(cl.Create(ctx, &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "dep-dedup", Generation: gen}, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "dedup"}}, + Replicas: &replicas, + }, + Status: appsv1.DeploymentStatus{ + ObservedGeneration: gen, + UnavailableReplicas: 3, + AvailableReplicas: 0, + ReadyReplicas: 0, + }, + })).NotTo(HaveOccurred()) + + // Create 3 pods all OOMKilled - should be deduplicated into one message. + for i := 0; i < 3; i++ { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "ns", + Name: fmt.Sprintf("dedup-pod-%d", i), + Labels: map[string]string{"app": "dedup"}, + }, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "main", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + LastTerminationState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{Reason: "OOMKilled", ExitCode: 137}, + }, + }, + }, + }, + })).NotTo(HaveOccurred()) + } + + sm.updateStatus() + + ts := &operator.TigeraStatus{} + Expect(cl.Get(ctx, types.NamespacedName{Name: "dedup-test"}, ts)).NotTo(HaveOccurred()) + + degraded := getTigeraStatusCondition(ts, operator.ComponentDegraded) + Expect(degraded).NotTo(BeNil()) + Expect(degraded.Message).To(ContainSubstring("3 pods affected")) + }) + }) + + Describe("rollout revision awareness in TigeraStatus", func() { + It("should annotate old-revision failures and prioritize new-revision failures", func() { + sm := New(cl, "rollout-test", &common.VersionInfo{Major: 1, Minor: 19}).(*statusManager) + sm.OnCRFound() + sm.ReadyToMonitor() + + gen := int64(1) + replicas := int32(2) + sm.AddDeployments([]types.NamespacedName{{Namespace: "ns", Name: "dep-rollout"}}) + + Expect(cl.Create(ctx, &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "dep-rollout", UID: "rollout-uid", Generation: gen}, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "rollout"}}, + Replicas: &replicas, + }, + Status: appsv1.DeploymentStatus{ + ObservedGeneration: gen, + UnavailableReplicas: 2, + AvailableReplicas: 0, + ReadyReplicas: 0, + }, + })).NotTo(HaveOccurred()) + + // Current ReplicaSet. + Expect(cl.Create(ctx, &appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "ns", + Name: "dep-rollout-new", + Labels: map[string]string{"app": "rollout", appsv1.DefaultDeploymentUniqueLabelKey: "new-hash"}, + OwnerReferences: []metav1.OwnerReference{ + {UID: "rollout-uid", Controller: ptr.To(true)}, + }, + }, + Spec: appsv1.ReplicaSetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "rollout"}}}, + Status: appsv1.ReplicaSetStatus{Replicas: 1}, + })).NotTo(HaveOccurred()) + + // Old ReplicaSet. + Expect(cl.Create(ctx, &appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "ns", + Name: "dep-rollout-old", + Labels: map[string]string{"app": "rollout", appsv1.DefaultDeploymentUniqueLabelKey: "old-hash"}, + OwnerReferences: []metav1.OwnerReference{ + {UID: "rollout-uid", Controller: ptr.To(true)}, + }, + }, + Spec: appsv1.ReplicaSetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "rollout"}}}, + Status: appsv1.ReplicaSetStatus{Replicas: 0}, + })).NotTo(HaveOccurred()) + + // New-revision pod crash looping. + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "ns", + Name: "rollout-new-pod", + Labels: map[string]string{"app": "rollout", appsv1.DefaultDeploymentUniqueLabelKey: "new-hash"}, + }, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "main", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + LastTerminationState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{Reason: terminationReasonError, ExitCode: 1}, + }, + }, + }, + }, + })).NotTo(HaveOccurred()) + + // Old-revision pod crash looping with different reason. + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "ns", + Name: "rollout-old-pod", + Labels: map[string]string{"app": "rollout", appsv1.DefaultDeploymentUniqueLabelKey: "old-hash"}, + }, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "main", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + LastTerminationState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{Reason: "OOMKilled", ExitCode: 137}, + }, + }, + }, + }, + })).NotTo(HaveOccurred()) + + sm.updateStatus() + + ts := &operator.TigeraStatus{} + Expect(cl.Get(ctx, types.NamespacedName{Name: "rollout-test"}, ts)).NotTo(HaveOccurred()) + + degraded := getTigeraStatusCondition(ts, operator.ComponentDegraded) + Expect(degraded).NotTo(BeNil()) + + // The message should contain both failures, with the new-revision first. + // Messages are newline-joined by degradedMessage(). + lines := strings.Split(degraded.Message, "\n") + Expect(lines).To(HaveLen(2)) + Expect(lines[0]).NotTo(ContainSubstring("old revision")) + Expect(lines[1]).To(ContainSubstring("old revision")) + }) + }) }) diff --git a/pkg/controller/utils/component.go b/pkg/controller/utils/component.go index 10ffc062c8..36dd9e5ff9 100644 --- a/pkg/controller/utils/component.go +++ b/pkg/controller/utils/component.go @@ -16,6 +16,7 @@ package utils import ( "context" + stderrors "errors" "fmt" "maps" "reflect" @@ -49,6 +50,10 @@ import ( rmeta "github.com/tigera/operator/pkg/render/common/meta" ) +// errObjectIgnored is returned by createOrUpdateObject when an existing object has the +// unsupported.operator.tigera.io/ignore annotation. +var errObjectIgnored = fmt.Errorf("object has unsupported ignore annotation") + const TLS_CIPHERS_ENV_VAR_NAME = "TLS_CIPHER_SUITES" // dCache is a global deduplication cache that is used to avoid unnecessary updates to objects. It is shared @@ -313,7 +318,7 @@ func (c *componentHandler) createOrUpdateObject(ctx context.Context, obj client. // The object exists. Update it, unless the user has marked it as "ignored". if IgnoreObject(cur) { logCtx.Info("Ignoring annotated object") - return nil + return errObjectIgnored } logCtx.V(2).Info("Resource already exists, update it") @@ -464,7 +469,19 @@ func (c *componentHandler) CreateOrUpdateOrDelete(ctx context.Context, component conflictRetry: err := c.createOrUpdateObject(ctx, obj.DeepCopyObject().(client.Object), osType) if err != nil { - if errors.IsAlreadyExists(err) { + if stderrors.Is(err, errObjectIgnored) { + if status != nil { + kind := obj.GetObjectKind().GroupVersionKind().Kind + if kind == "" { + kind = reflect.TypeOf(obj).Elem().Name() + } + warningKey := fmt.Sprintf("ignore-%s-%s", kind, key) + status.SetWarning(warningKey, fmt.Sprintf( + "%s %q has the unsupported ignore annotation; the operator is not managing this resource", + kind, key, + )) + } + } else if errors.IsAlreadyExists(err) { // Remember that we've had an "already exists" error, but otherwise // carry on. alreadyExistsErr = err @@ -491,6 +508,16 @@ func (c *componentHandler) CreateOrUpdateOrDelete(ctx context.Context, component cronJobs = append(cronJobs, key) } + // Object updated normally - clear any stale ignore warning. + if err == nil && status != nil { + kind := obj.GetObjectKind().GroupVersionKind().Kind + if kind == "" { + kind = reflect.TypeOf(obj).Elem().Name() + } + warningKey := fmt.Sprintf("ignore-%s-%s", kind, key) + status.ClearWarning(warningKey) + } + continue } diff --git a/pkg/controller/utils/component_test.go b/pkg/controller/utils/component_test.go index 6716fa9221..b555049f14 100644 --- a/pkg/controller/utils/component_test.go +++ b/pkg/controller/utils/component_test.go @@ -16,6 +16,7 @@ package utils import ( "context" + stderrors "errors" "fmt" rbacv1 "k8s.io/api/rbac/v1" @@ -2017,6 +2018,61 @@ var _ = Describe("Component handler tests", func() { )) }) }) + + Context("unsupported ignore annotation", func() { + It("should return errObjectIgnored from createOrUpdateObject when object has ignore annotation", func() { + ds := &apps.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-ignored-ds", + Namespace: "default", + Annotations: map[string]string{ + "unsupported.operator.tigera.io/ignore": "true", + }, + }, + Spec: apps.DaemonSetSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test"}}, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"app": "test"}}, + Spec: corev1.PodSpec{Containers: []corev1.Container{{Name: "c", Image: "img"}}}, + }, + }, + } + Expect(c.Create(ctx, ds)).NotTo(HaveOccurred()) + + rendered := ds.DeepCopy() + rendered.Annotations = nil + rendered.SetGroupVersionKind(apps.SchemeGroupVersion.WithKind("DaemonSet")) + err := handler.(*componentHandler).createOrUpdateObject(ctx, rendered, rmeta.OSTypeLinux) + Expect(stderrors.Is(err, errObjectIgnored)).To(BeTrue()) + }) + + It("should not return an error from CreateOrUpdateOrDelete when object has ignore annotation", func() { + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-ignored-cm", + Namespace: "default", + Annotations: map[string]string{ + "unsupported.operator.tigera.io/ignore": "true", + }, + }, + } + Expect(c.Create(ctx, cm)).NotTo(HaveOccurred()) + + rendered := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-ignored-cm", + Namespace: "default", + }, + Data: map[string]string{"key": "value"}, + } + fc := &fakeComponent{ + objs: []client.Object{rendered}, + supportedOSType: rmeta.OSTypeLinux, + } + err := handler.CreateOrUpdateOrDelete(ctx, fc, sm) + Expect(err).NotTo(HaveOccurred()) + }) + }) }) var _ = Describe("Mocked client Component handler tests", func() { diff --git a/pkg/render/common/components/components.go b/pkg/render/common/components/components.go index e45b4c53fd..a64072f4b4 100644 --- a/pkg/render/common/components/components.go +++ b/pkg/render/common/components/components.go @@ -36,6 +36,10 @@ import ( var log = logf.Log.WithName("components") +// CustomOverridesAnnotation is set on workloads when the render package applies user-specified +// probe timing or resource overrides. The value is a comma-separated list of override types. +const CustomOverridesAnnotation = "operator.tigera.io/custom-overrides" + // containerNameAliases maps deprecated container names to their current names. // When a user provides an override using a deprecated name, it is transparently // resolved to the current name before matching against rendered containers. @@ -396,6 +400,28 @@ func applyReplicatedPodResourceOverrides(r *replicatedPodResource, overrides any // probe timing overrides are applied to the corresponding container. if cos := GetContainerOverrides(overrides); cos != nil { mergeContainerOverrides(r.podTemplateSpec.Spec.Containers, cos) + + // Track which override types were applied for status correlation. + seen := map[string]bool{} + var overrideTypes []string + for _, co := range cos { + if co.ReadinessProbe != nil && !seen["readinessProbe"] { + seen["readinessProbe"] = true + overrideTypes = append(overrideTypes, "readinessProbe") + } + if co.LivenessProbe != nil && !seen["livenessProbe"] { + seen["livenessProbe"] = true + overrideTypes = append(overrideTypes, "livenessProbe") + } + if co.Resources != nil && !seen["resources"] { + seen["resources"] = true + overrideTypes = append(overrideTypes, "resources") + } + } + if len(overrideTypes) > 0 { + r.annotations = common.MapExistsOrInitialize(r.annotations) + r.annotations[CustomOverridesAnnotation] = strings.Join(overrideTypes, ",") + } } // If `overrides` has a Spec.Template.Spec.Affinity field, and it's non-nil, it sets diff --git a/pkg/render/common/components/components_test.go b/pkg/render/common/components/components_test.go index 08adc81231..0d5939ee7b 100644 --- a/pkg/render/common/components/components_test.go +++ b/pkg/render/common/components/components_test.go @@ -602,6 +602,7 @@ var _ = Describe("Common components render tests", func() { expected := defaultedDaemonSet() Expect(expected.Spec.Template.Spec.Containers).To(HaveLen(2)) expected.Spec.Template.Spec.Containers[0].Resources = resources1 + expected.Annotations[CustomOverridesAnnotation] = "resources" Expect(result.Spec.Template.Spec.Containers).To(ContainElements(expected.Spec.Template.Spec.Containers)) Expect(result).To(Equal(expected)) }), @@ -1069,6 +1070,7 @@ var _ = Describe("Common components render tests", func() { expected := defaultedDeployment() Expect(expected.Spec.Template.Spec.Containers).To(HaveLen(2)) expected.Spec.Template.Spec.Containers[0].Resources = resources1 + expected.Annotations[CustomOverridesAnnotation] = "resources" Expect(result.Spec.Template.Spec.Containers).To(ContainElements(expected.Spec.Template.Spec.Containers)) Expect(result).To(Equal(expected)) }), @@ -1260,6 +1262,95 @@ var _ = Describe("Common components render tests", func() { Expect(c.Resources).To(Equal(overrideResources), "container %q should have overridden resources", c.Name) } }) + + Describe("custom-overrides annotation", func() { + It("should set annotation when readiness probe override is applied", func() { + d := appsv1.Deployment{} + d.Spec.Template.Spec.Containers = []corev1.Container{ + { + Name: "compliance-server", + Image: "test-image", + ReadinessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{HTTPGet: &corev1.HTTPGetAction{Path: "/health"}}, + }, + }, + } + + period := int32(30) + overrides := &v1.ComplianceServerDeployment{ + Spec: &v1.ComplianceServerDeploymentSpec{ + Template: &v1.ComplianceServerDeploymentPodTemplateSpec{ + Spec: &v1.ComplianceServerDeploymentPodSpec{ + Containers: []v1.ComplianceServerDeploymentContainer{ + { + Name: "compliance-server", + ReadinessProbe: &v1.ProbeOverride{PeriodSeconds: &period}, + }, + }, + }, + }, + }, + } + + ApplyDeploymentOverrides(&d, overrides) + Expect(d.Annotations).To(HaveKeyWithValue(CustomOverridesAnnotation, "readinessProbe")) + }) + + It("should set annotation with multiple override types", func() { + d := appsv1.Deployment{} + d.Spec.Template.Spec.Containers = []corev1.Container{ + { + Name: "compliance-server", + Image: "test-image", + ReadinessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{HTTPGet: &corev1.HTTPGetAction{Path: "/health"}}, + }, + LivenessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{HTTPGet: &corev1.HTTPGetAction{Path: "/health"}}, + }, + }, + } + + period := int32(30) + overrideResources := corev1.ResourceRequirements{ + Limits: corev1.ResourceList{corev1.ResourceCPU: resource.MustParse("500m")}, + } + overrides := &v1.ComplianceServerDeployment{ + Spec: &v1.ComplianceServerDeploymentSpec{ + Template: &v1.ComplianceServerDeploymentPodTemplateSpec{ + Spec: &v1.ComplianceServerDeploymentPodSpec{ + Containers: []v1.ComplianceServerDeploymentContainer{ + { + Name: "compliance-server", + ReadinessProbe: &v1.ProbeOverride{PeriodSeconds: &period}, + LivenessProbe: &v1.ProbeOverride{PeriodSeconds: &period}, + Resources: &overrideResources, + }, + }, + }, + }, + }, + } + + ApplyDeploymentOverrides(&d, overrides) + ann := d.Annotations[CustomOverridesAnnotation] + Expect(ann).To(ContainSubstring("readinessProbe")) + Expect(ann).To(ContainSubstring("livenessProbe")) + Expect(ann).To(ContainSubstring("resources")) + }) + + It("should not set annotation when no probe or resource overrides", func() { + d := appsv1.Deployment{} + d.Spec.Template.Spec.Containers = []corev1.Container{ + {Name: "compliance-server", Image: "test-image"}, + } + overrides := &v1.ComplianceServerDeployment{ + Spec: &v1.ComplianceServerDeploymentSpec{}, + } + ApplyDeploymentOverrides(&d, overrides) + Expect(d.Annotations).NotTo(HaveKey(CustomOverridesAnnotation)) + }) + }) }) func addContainer(cs []corev1.Container) []corev1.Container {