From e11bbf2267b7efb1fe72c4024f69c46e716471af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86gir=20M=C3=A1ni=20Hauksson?= <54936225+sourcehawk@users.noreply.github.com> Date: Wed, 22 Apr 2026 18:58:37 +0100 Subject: [PATCH 1/2] update owner status outside of component reconciliation --- README.md | 27 ++- docs/component.md | 71 +++++- docs/guidelines.md | 41 +++- e2e/framework/cluster_reconciler.go | 12 +- e2e/framework/reconciler.go | 12 +- .../component-prerequisites/app/controller.go | 13 +- examples/custom-resource/app/controller.go | 23 +- .../extraction-and-guards/app/controller.go | 23 +- .../grace-inconsistency/app/controller.go | 23 +- .../mutations-and-gating/app/controller.go | 23 +- pkg/component/component.go | 137 ++++++++---- pkg/component/component_test.go | 6 + pkg/component/conditions.go | 35 +-- pkg/component/conditions_test.go | 208 ++++++++++++++---- 14 files changed, 484 insertions(+), 170 deletions(-) diff --git a/README.md b/README.md index 5cb7b340..ee15dc77 100644 --- a/README.md +++ b/README.md @@ -238,27 +238,38 @@ func NewWebInterfaceComponent(owner *MyOperatorCR) (*component.Component, error) The controller builds the component and hands it to the framework. ```go -func (r *MyReconciler) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { +func (r *MyReconciler) Reconcile(ctx context.Context, req reconcile.Request) (_ reconcile.Result, err error) { owner := &MyOperatorCR{} if err := r.Get(ctx, req.NamespacedName, owner); err != nil { return reconcile.Result{}, client.IgnoreNotFound(err) } - comp, err := NewWebInterfaceComponent(owner) - if err != nil { - return reconcile.Result{}, err - } - - return reconcile.Result{}, comp.Reconcile(ctx, component.ReconcileContext{ + recCtx := component.ReconcileContext{ Client: r.Client, Scheme: r.Scheme, Recorder: r.Recorder, Metrics: r.Metrics, Owner: owner, - }) + } + defer func() { + if flushErr := component.FlushStatus(ctx, recCtx); flushErr != nil && err == nil { + err = flushErr + } + }() + + comp, err := NewWebInterfaceComponent(owner) + if err != nil { + return reconcile.Result{}, err + } + + return reconcile.Result{}, comp.Reconcile(ctx, recCtx) } ``` +Components stage their conditions on `owner` in memory; a single deferred `component.FlushStatus` at the end of the +reconcile loop persists every condition with one `Status().Update` call. This keeps controllers with multiple components +free of self-induced 409 conflicts. + ## Beyond the Basics The Quick Start shows the common path. The sections below highlight capabilities that matter once your operator grows diff --git a/docs/component.md b/docs/component.md index d3b923bc..90ed7149 100644 --- a/docs/component.md +++ b/docs/component.md @@ -32,6 +32,7 @@ reports their aggregate health through one condition on the owner CRD. - [Grace Period](#grace-period) - [Suspension Lifecycle](#suspension-lifecycle) - [ReconcileContext](#reconcilecontext) +- [Persisting Status with FlushStatus](#persisting-status-with-flushstatus) - [Guards](#guards) - [Registering a Guard](#registering-a-guard) - [Guard Behavior](#guard-behavior) @@ -262,7 +263,9 @@ This means a read-only resource registered before a managed resource can extract resource's guard or mutations. **Phase 5: Status aggregation and condition update.** The health of each resource is collected, the grace period is -consulted, and a single aggregate condition is written to the owner object's status. +consulted, and a single aggregate condition is written to the owner object's conditions **in memory**. `Reconcile` never +calls the Kubernetes API to persist status; the controller does that in a single write at the end of its reconcile loop. +See [Persisting Status with FlushStatus](#persisting-status-with-flushstatus). **Phase 6: Resource deletion.** Resources registered for deletion are removed from the cluster. @@ -428,7 +431,7 @@ recCtx := component.ReconcileContext{ Client: r.Client, // sigs.k8s.io/controller-runtime/pkg/client Scheme: r.Scheme, // *runtime.Scheme Recorder: r.Recorder, // record.EventRecorder - Metrics: r.Metrics, // component.Recorder (condition metrics) + Metrics: r.Metrics, // component.Recorder (condition metrics), optional Owner: owner, // the CRD that owns this component } @@ -437,9 +440,57 @@ err = comp.Reconcile(ctx, recCtx) Dependencies are passed explicitly so components remain testable and decoupled from global state. -The `Metrics` field is required. The framework records Prometheus metrics for every condition state transition during -reconciliation. The recorder implementation is provided by -[go-crd-condition-metrics](https://github.com/sourcehawk/go-crd-condition-metrics). +The `Metrics` field is optional. When set, the framework records Prometheus metrics for every condition reported during +a reconcile. The recorder implementation is provided by +[go-crd-condition-metrics](https://github.com/sourcehawk/go-crd-condition-metrics). Leave the field `nil` to opt out of +metric recording. + +## Persisting Status with FlushStatus + +`Component.Reconcile` only mutates the owner's status conditions in memory. The controller is responsible for writing +those conditions to the Kubernetes API by calling `component.FlushStatus` once per reconcile, typically from a deferred +call so that conditions set on error paths are still persisted: + +```go +func (r *MyReconciler) Reconcile(ctx context.Context, req reconcile.Request) (_ reconcile.Result, err error) { + owner := &v1alpha1.MyApp{} + if err := r.Get(ctx, req.NamespacedName, owner); err != nil { + return reconcile.Result{}, client.IgnoreNotFound(err) + } + + recCtx := component.ReconcileContext{ + Client: r.Client, + Scheme: r.Scheme, + Recorder: r.Recorder, + Metrics: r.Metrics, + Owner: owner, + } + defer func() { + if flushErr := component.FlushStatus(ctx, recCtx); flushErr != nil && err == nil { + err = flushErr + } + }() + + comp, err := buildMyComponent(owner) + if err != nil { + return reconcile.Result{}, err + } + return reconcile.Result{}, comp.Reconcile(ctx, recCtx) +} +``` + +`FlushStatus` performs one `Status().Update` call that writes every condition currently on the owner in memory, wrapped +in `retry.RetryOnConflict`. If another writer updated the owner between the controller's initial `Get` and this call, +`FlushStatus` refetches, reapplies the conditions staged during the reconcile, and retries. Conditions managed by other +writers on the same owner are preserved because `meta.SetStatusCondition` merges by condition type. + +After the update succeeds, `FlushStatus` records metrics for every condition on the owner. If `rec.Metrics` is nil, +metric recording is skipped. + +This split is what allows a controller with several components (see [Keep Controllers Thin](./guidelines.md) and +[One Component Per Logical Condition](./guidelines.md)) to stage several conditions during one reconcile and persist +them all in a single write. Persisting after every component would race the components' writes against each other and +produce 409 conflicts. ## Guards @@ -460,8 +511,14 @@ The following example shows the complete pattern. A cloud provider role resource bucket resource uses that ARN in its spec and guards against being applied before the ARN is available: ```go -func (r *MyReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - // ...fetch owner... +func (r *MyReconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, err error) { + // ...fetch owner and build recCtx... + + defer func() { + if flushErr := component.FlushStatus(ctx, recCtx); flushErr != nil && err == nil { + err = flushErr + } + }() // roleARN is scoped to this reconcile call. The role resource's data extractor // populates it after the role is applied. Because extraction runs per-resource diff --git a/docs/guidelines.md b/docs/guidelines.md index 1c229749..8b52f679 100644 --- a/docs/guidelines.md +++ b/docs/guidelines.md @@ -301,34 +301,55 @@ clearer than splitting them into `DeploymentReady` and `ServiceReady`. ## Keep Controllers Thin -Controllers should fetch the owner, decide which components to build, and call `Reconcile()`. Business logic, resource -construction, and feature decisions belong in components and their resource builders. +Controllers should fetch the owner, decide which components to build, call `Reconcile()`, and defer a single +`component.FlushStatus` to persist status. Business logic, resource construction, and feature decisions belong in +components and their resource builders. ```go -func (r *MyReconciler) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { +func (r *MyReconciler) Reconcile(ctx context.Context, req reconcile.Request) (_ reconcile.Result, err error) { owner := &v1alpha1.MyApp{} if err := r.Get(ctx, req.NamespacedName, owner); err != nil { return reconcile.Result{}, client.IgnoreNotFound(err) } - comp, err := buildWebComponent(owner) - if err != nil { - return reconcile.Result{}, err - } - - return reconcile.Result{}, comp.Reconcile(ctx, component.ReconcileContext{ + recCtx := component.ReconcileContext{ Client: r.Client, Scheme: r.Scheme, Recorder: r.Recorder, Metrics: r.Metrics, Owner: owner, - }) + } + defer func() { + if flushErr := component.FlushStatus(ctx, recCtx); flushErr != nil && err == nil { + err = flushErr + } + }() + + comp, err := buildWebComponent(owner) + if err != nil { + return reconcile.Result{}, err + } + + return reconcile.Result{}, comp.Reconcile(ctx, recCtx) } ``` This keeps controller logic trivial to test (there is almost nothing to test) and makes component construction functions independently testable as pure functions: owner in, component out, no cluster required. +### Flushing status is the controller's job + +`Component.Reconcile` only mutates the owner's conditions in memory. Persisting them is explicitly the controller's +responsibility, via one `component.FlushStatus` call per reconcile, typically deferred so that conditions set by error +paths (for example, `fail()` in the framework) are still written when `Reconcile` returns an error. + +Do not call `FlushStatus` in between component reconciles. With several components per controller the point of the split +is to stage all their conditions in memory first and write them once at the end. Flushing between components brings back +the exact 409 conflict pattern the split was introduced to eliminate. + +If you do not want to emit condition metrics, leave `ReconcileContext.Metrics` as `nil`. `FlushStatus` tolerates a nil +recorder and simply skips metric emission. + ## Resource Registration Order Is Execution Order Resources are reconciled in the exact order they are registered with `WithResource()`. This is deliberate: guards and diff --git a/e2e/framework/cluster_reconciler.go b/e2e/framework/cluster_reconciler.go index 8f634787..9c113fb8 100644 --- a/e2e/framework/cluster_reconciler.go +++ b/e2e/framework/cluster_reconciler.go @@ -82,7 +82,7 @@ func (r *ClusterE2EReconciler) Unregister(name string) { // Reconcile implements reconcile.Reconciler. It fetches the ClusterTestApp, looks up // the registered factory, builds the component, and reconciles it. -func (r *ClusterE2EReconciler) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { +func (r *ClusterE2EReconciler) Reconcile(ctx context.Context, req reconcile.Request) (_ reconcile.Result, err error) { logger := log.FromContext(ctx).WithValues("clustertestapp", req.NamespacedName) owner := &ClusterTestApp{} @@ -98,20 +98,19 @@ func (r *ClusterE2EReconciler) Reconcile(ctx context.Context, req reconcile.Requ r.mu.RUnlock() var comp *component.Component - var err error switch { case hasComp: comp, err = compFactory(owner) case hasRes: - res, buildErr := resFactory(owner) + resource, buildErr := resFactory(owner) if buildErr != nil { return reconcile.Result{}, buildErr } comp, err = component.NewComponentBuilder(). WithName("e2e-test"). WithConditionType("E2EReady"). - WithResource(res, component.ResourceOptions{}). + WithResource(resource, component.ResourceOptions{}). Suspend(owner.Spec.Suspended). Build() default: @@ -130,6 +129,11 @@ func (r *ClusterE2EReconciler) Reconcile(ctx context.Context, req reconcile.Requ Metrics: r.Metrics, Owner: owner, } + defer func() { + if flushErr := component.FlushStatus(ctx, recCtx); flushErr != nil && err == nil { + err = flushErr + } + }() if err := comp.Reconcile(ctx, recCtx); err != nil { return reconcile.Result{}, err diff --git a/e2e/framework/reconciler.go b/e2e/framework/reconciler.go index 2bfd629c..c366f19a 100644 --- a/e2e/framework/reconciler.go +++ b/e2e/framework/reconciler.go @@ -82,7 +82,7 @@ func (r *E2EReconciler) Unregister(key types.NamespacedName) { // Reconcile implements reconcile.Reconciler. It fetches the TestApp, looks up // the registered factory, builds the component, and reconciles it. -func (r *E2EReconciler) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { +func (r *E2EReconciler) Reconcile(ctx context.Context, req reconcile.Request) (_ reconcile.Result, err error) { logger := log.FromContext(ctx).WithValues("testapp", req.NamespacedName) owner := &TestApp{} @@ -98,20 +98,19 @@ func (r *E2EReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r r.mu.RUnlock() var comp *component.Component - var err error switch { case hasComp: comp, err = compFactory(owner) case hasRes: - res, buildErr := resFactory(owner) + resource, buildErr := resFactory(owner) if buildErr != nil { return reconcile.Result{}, buildErr } comp, err = component.NewComponentBuilder(). WithName("e2e-test"). WithConditionType("E2EReady"). - WithResource(res, component.ResourceOptions{}). + WithResource(resource, component.ResourceOptions{}). Suspend(owner.Spec.Suspended). Build() default: @@ -130,6 +129,11 @@ func (r *E2EReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r Metrics: r.Metrics, Owner: owner, } + defer func() { + if flushErr := component.FlushStatus(ctx, recCtx); flushErr != nil && err == nil { + err = flushErr + } + }() if err := comp.Reconcile(ctx, recCtx); err != nil { return reconcile.Result{}, err diff --git a/examples/component-prerequisites/app/controller.go b/examples/component-prerequisites/app/controller.go index 91f04406..6f1cefd4 100644 --- a/examples/component-prerequisites/app/controller.go +++ b/examples/component-prerequisites/app/controller.go @@ -27,7 +27,13 @@ type Controller struct { } // Reconcile builds and reconciles the infra and app components in order. -func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) error { +// +// Both components share the same ReconcileContext and stage their conditions +// on the owner in memory; a single deferred FlushStatus at the end of +// reconciliation persists both conditions in one API call. That is what +// prevents the sequential components from racing two separate status updates +// against the same owner and hitting conflicts. +func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) (err error) { recCtx := component.ReconcileContext{ Client: r.Client, Scheme: r.Scheme, @@ -35,6 +41,11 @@ func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) error { Metrics: r.Metrics, Owner: owner, } + defer func() { + if flushErr := component.FlushStatus(ctx, recCtx); flushErr != nil && err == nil { + err = flushErr + } + }() // --- Infra component: no prerequisites --- cmResource, err := r.NewConfigMapResource(owner) diff --git a/examples/custom-resource/app/controller.go b/examples/custom-resource/app/controller.go index dfd16dd9..30ed308a 100644 --- a/examples/custom-resource/app/controller.go +++ b/examples/custom-resource/app/controller.go @@ -23,7 +23,20 @@ type Controller struct { } // Reconcile builds and reconciles a single component managing the certificate. -func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) error { +func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) (err error) { + recCtx := component.ReconcileContext{ + Client: r.Client, + Scheme: r.Scheme, + Recorder: r.Recorder, + Metrics: r.Metrics, + Owner: owner, + } + defer func() { + if flushErr := component.FlushStatus(ctx, recCtx); flushErr != nil && err == nil { + err = flushErr + } + }() + certResource, err := r.NewCertificateResource(owner) if err != nil { return err @@ -38,11 +51,5 @@ func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) error { return err } - return comp.Reconcile(ctx, component.ReconcileContext{ - Client: r.Client, - Scheme: r.Scheme, - Recorder: r.Recorder, - Metrics: r.Metrics, - Owner: owner, - }) + return comp.Reconcile(ctx, recCtx) } diff --git a/examples/extraction-and-guards/app/controller.go b/examples/extraction-and-guards/app/controller.go index 8f00c9ec..464d7816 100644 --- a/examples/extraction-and-guards/app/controller.go +++ b/examples/extraction-and-guards/app/controller.go @@ -30,7 +30,20 @@ type Controller struct { // Reconcile builds and reconciles a component where the ConfigMap is registered // before the Secret. Registration order matters: the guard on the Secret can // only read data extracted by a preceding resource. -func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) error { +func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) (err error) { + recCtx := component.ReconcileContext{ + Client: r.Client, + Scheme: r.Scheme, + Recorder: r.Recorder, + Metrics: r.Metrics, + Owner: owner, + } + defer func() { + if flushErr := component.FlushStatus(ctx, recCtx); flushErr != nil && err == nil { + err = flushErr + } + }() + // Shared state: the ConfigMap extractor writes here, the Secret guard reads it. var dbHost string @@ -54,11 +67,5 @@ func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) error { return err } - return comp.Reconcile(ctx, component.ReconcileContext{ - Client: r.Client, - Scheme: r.Scheme, - Recorder: r.Recorder, - Metrics: r.Metrics, - Owner: owner, - }) + return comp.Reconcile(ctx, recCtx) } diff --git a/examples/grace-inconsistency/app/controller.go b/examples/grace-inconsistency/app/controller.go index 42c8ebb7..297205c0 100644 --- a/examples/grace-inconsistency/app/controller.go +++ b/examples/grace-inconsistency/app/controller.go @@ -25,7 +25,20 @@ type Controller struct { // Reconcile builds and reconciles a component with grace period and // inconsistency suppression. -func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) error { +func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) (err error) { + recCtx := component.ReconcileContext{ + Client: r.Client, + Scheme: r.Scheme, + Recorder: r.Recorder, + Metrics: r.Metrics, + Owner: owner, + } + defer func() { + if flushErr := component.FlushStatus(ctx, recCtx); flushErr != nil && err == nil { + err = flushErr + } + }() + deployResource, err := r.NewDeploymentResource(owner) if err != nil { return err @@ -52,11 +65,5 @@ func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) error { return err } - return comp.Reconcile(ctx, component.ReconcileContext{ - Client: r.Client, - Scheme: r.Scheme, - Recorder: r.Recorder, - Metrics: r.Metrics, - Owner: owner, - }) + return comp.Reconcile(ctx, recCtx) } diff --git a/examples/mutations-and-gating/app/controller.go b/examples/mutations-and-gating/app/controller.go index 9e0f15b0..d2fc5305 100644 --- a/examples/mutations-and-gating/app/controller.go +++ b/examples/mutations-and-gating/app/controller.go @@ -24,7 +24,20 @@ type Controller struct { } // Reconcile builds and reconciles a single component containing both resources. -func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) error { +func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) (err error) { + recCtx := component.ReconcileContext{ + Client: r.Client, + Scheme: r.Scheme, + Recorder: r.Recorder, + Metrics: r.Metrics, + Owner: owner, + } + defer func() { + if flushErr := component.FlushStatus(ctx, recCtx); flushErr != nil && err == nil { + err = flushErr + } + }() + deployResource, err := r.NewDeploymentResource(owner) if err != nil { return err @@ -55,11 +68,5 @@ func (r *Controller) Reconcile(ctx context.Context, owner *ExampleApp) error { return err } - return comp.Reconcile(ctx, component.ReconcileContext{ - Client: r.Client, - Scheme: r.Scheme, - Recorder: r.Recorder, - Metrics: r.Metrics, - Owner: owner, - }) + return comp.Reconcile(ctx, recCtx) } diff --git a/pkg/component/component.go b/pkg/component/component.go index c6c1ec45..28cd24ae 100644 --- a/pkg/component/component.go +++ b/pkg/component/component.go @@ -5,10 +5,12 @@ import ( "fmt" "time" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/record" + "k8s.io/client-go/util/retry" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" @@ -30,6 +32,8 @@ type OperatorCRD interface { } // Recorder is an interface for recording status condition changes as metrics. +// It is optional: a [ReconcileContext] may leave [ReconcileContext.Metrics] +// nil, in which case [FlushStatus] skips metric emission. type Recorder interface { // RecordConditionFor records a condition change for a specific object and kind. RecordConditionFor( @@ -47,7 +51,8 @@ type ReconcileContext struct { Scheme *runtime.Scheme // Recorder is the event recorder for publishing Kubernetes events. Recorder record.EventRecorder - // Metrics is the recorder for status condition metrics. + // Metrics is the recorder for status condition metrics. It is optional; if + // nil, [FlushStatus] will skip metric emission. Metrics Recorder // Owner is the custom resource that owns and is updated by the components. Owner OperatorCRD @@ -134,8 +139,12 @@ func (c *Component) GetCondition(owner OperatorCRD) Condition { // Reconcile converges the component to the desired state. // -// A component manages its own condition on the parent and updates it accordingly -// to represent currently observable facts about the component status. +// A component manages its own condition on the parent and updates it in-memory +// to represent currently observable facts about the component status. Reconcile +// never writes to the Kubernetes API status subresource; the controller is +// responsible for persisting the final status by calling [FlushStatus] exactly +// once per reconciliation, typically via defer so that conditions set on error +// paths are still written. // // Reconciliation follows these steps: // @@ -170,7 +179,9 @@ func (c *Component) GetCondition(owner OperatorCRD) Condition { // // 6. Condition Update: Derives a new component condition using a stateful // progression model that considers the aggregate resource status, the -// previous condition, and the configured grace period to avoid churn. +// previous condition, and the configured grace period to avoid churn. The +// new condition is written to the owner in memory only; call [FlushStatus] +// to persist. // // 7. Resource Deletion: Finally, it deletes any resources registered for deletion. func (c *Component) Reconcile(ctx context.Context, rec ReconcileContext) error { @@ -184,7 +195,7 @@ func (c *Component) Reconcile(ctx context.Context, rec ReconcileContext) error { mapper := rec.Client.RESTMapper() if mapper == nil { return fail( - ctx, rec, c.conditionType, fmt.Errorf( + rec, c.conditionType, fmt.Errorf( "ReconcileContext.Client.RESTMapper() returned nil; a valid RESTMapper is required for reconciliation", ), ) @@ -195,17 +206,18 @@ func (c *Component) Reconcile(ctx context.Context, rec ReconcileContext) error { enabled, err := c.featureGate.Enabled() if err != nil { cond := conditionFeatureGateError(c.conditionType, err, rec.Owner.GetGeneration()) - _ = setStatusCondition(ctx, rec, cond) + applyStatusCondition(rec, cond) return err } if !enabled { if err := deleteResources(ctx, rec, c.allManagedResources(), withDeletionReason("disabled feature gate")); err != nil { - return fail(ctx, rec, c.conditionType, err) + return fail(rec, c.conditionType, err) } cond := conditionDisabled(c.conditionType, rec.Owner.GetGeneration()) - return setStatusCondition(ctx, rec, cond) + applyStatusCondition(rec, cond) + return nil } } @@ -219,13 +231,14 @@ func (c *Component) Reconcile(ctx context.Context, rec ReconcileContext) error { result, err := c.evaluatePrerequisites(rec) if err != nil { cond := conditionPrerequisiteNotMet(c.conditionType, err.Error(), rec.Owner.GetGeneration()) - _ = setStatusCondition(ctx, rec, cond) + applyStatusCondition(rec, cond) return err } if result.Status == PrerequisiteStatusNotMet { cond := conditionPrerequisiteNotMet(c.conditionType, result.Reason, rec.Owner.GetGeneration()) - return setStatusCondition(ctx, rec, cond) + applyStatusCondition(rec, cond) + return nil } } } @@ -236,7 +249,7 @@ func (c *Component) Reconcile(ctx context.Context, rec ReconcileContext) error { managed := c.managedResources() results, err := suspendResources(ctx, rec, managed, c.name, mapper) if err != nil { - return fail(ctx, rec, c.conditionType, err) + return fail(rec, c.conditionType, err) } cond := suspendingCondition( @@ -244,12 +257,10 @@ func (c *Component) Reconcile(ctx context.Context, rec ReconcileContext) error { suspensionResults(results).summary(), rec.Owner.GetGeneration(), ) - if err := setStatusCondition(ctx, rec, cond); err != nil { - return err - } + applyStatusCondition(rec, cond) if err := deleteResources(ctx, rec, c.deleteResources); err != nil { - return fail(ctx, rec, c.conditionType, err) + return fail(rec, c.conditionType, err) } return nil @@ -261,7 +272,7 @@ func (c *Component) Reconcile(ctx context.Context, rec ReconcileContext) error { // is available to subsequent resources' guards and mutations. results, err := reconcileResources(ctx, rec, c.reconcileResources, c.name, mapper) if err != nil { - return fail(ctx, rec, c.conditionType, err) + return fail(rec, c.conditionType, err) } // Determine new condition for component @@ -272,12 +283,10 @@ func (c *Component) Reconcile(ctx context.Context, rec ReconcileContext) error { c.gracePeriod, c.GetCondition(rec.Owner), ) - if err := setStatusCondition(ctx, rec, cond); err != nil { - return err - } + applyStatusCondition(rec, cond) if err := deleteResources(ctx, rec, c.deleteResources); err != nil { - return fail(ctx, rec, c.conditionType, err) + return fail(rec, c.conditionType, err) } return nil @@ -348,23 +357,77 @@ func (c *Component) managedResources() []Resource { return managed } -// fail sets the component's error status condition on the owner and returns the -// provided error. -// -// This helper centralizes the common reconciliation pattern where a failure -// should both: -// 1. Update the component condition on the owner to reflect the error. -// 2. Propagate the error to stop further reconciliation. -// -// The error from setting the status condition is intentionally ignored because -// the original reconciliation error is considered the primary failure. -func fail( - ctx context.Context, - rec ReconcileContext, - conditionType ConditionType, - err error, -) error { +// fail writes an error condition for the component to the owner in memory and +// returns the provided error. Persistence of the error condition happens when +// the controller calls [FlushStatus], typically from a deferred call that +// still runs even when Reconcile returns an error. +func fail(rec ReconcileContext, conditionType ConditionType, err error) error { cond := conditionError(conditionType, err, rec.Owner.GetGeneration()) - _ = setStatusCondition(ctx, rec, cond) + applyStatusCondition(rec, cond) return err } + +// FlushStatus persists the owner's current status conditions to the Kubernetes +// API and records condition metrics for every condition on the owner. +// +// Controllers must call FlushStatus exactly once per reconciliation, typically +// via defer so that conditions set on error paths are still persisted: +// +// func (r *MyReconciler) Reconcile(ctx context.Context, req reconcile.Request) (res reconcile.Result, err error) { +// owner := &v1alpha1.MyApp{} +// if err := r.Get(ctx, req.NamespacedName, owner); err != nil { +// return reconcile.Result{}, client.IgnoreNotFound(err) +// } +// rec := component.ReconcileContext{ /* ... */ Owner: owner} +// defer func() { +// if flushErr := component.FlushStatus(ctx, rec); flushErr != nil && err == nil { +// err = flushErr +// } +// }() +// return reconcile.Result{}, comp.Reconcile(ctx, rec) +// } +// +// On a 409 Conflict (for example if an external writer updated the owner +// between the controller fetching it and this call) FlushStatus refetches the +// owner, re-applies the conditions staged during reconciliation using +// meta.SetStatusCondition, and retries. Conditions managed by other writers on +// the owner are preserved because meta.SetStatusCondition merges by condition +// type. +// +// If rec.Metrics is nil, metric recording is skipped. All other fields of rec +// must be populated. +func FlushStatus(ctx context.Context, rec ReconcileContext) error { + desired := append([]metav1.Condition(nil), *rec.Owner.GetStatusConditions()...) + + err := retry.RetryOnConflict(retry.DefaultRetry, func() error { + updateErr := rec.Client.Status().Update(ctx, rec.Owner) + if updateErr == nil { + return nil + } + if !apierrors.IsConflict(updateErr) { + return updateErr + } + key := client.ObjectKeyFromObject(rec.Owner) + if getErr := rec.Client.Get(ctx, key, rec.Owner); getErr != nil { + return getErr + } + for _, cond := range desired { + meta.SetStatusCondition(rec.Owner.GetStatusConditions(), cond) + } + return updateErr + }) + if err != nil { + return err + } + + if rec.Metrics == nil { + return nil + } + for _, cond := range *rec.Owner.GetStatusConditions() { + rec.Metrics.RecordConditionFor( + rec.Owner.GetKind(), rec.Owner, cond.Type, string(cond.Status), + cond.Reason, cond.LastTransitionTime.Time, + ) + } + return nil +} diff --git a/pkg/component/component_test.go b/pkg/component/component_test.go index 05b6d536..7aa1ebf9 100644 --- a/pkg/component/component_test.go +++ b/pkg/component/component_test.go @@ -42,7 +42,13 @@ var _ = Describe("Component Reconciler", func() { } }) + // getOwnerCondition mirrors what a controller does in production: after + // Reconcile mutates the owner's status conditions in memory, a deferred + // FlushStatus writes them to the API. The helper flushes then refetches so + // that the returned condition reflects the persisted state, including + // error conditions set via fail(). getOwnerCondition := func() Condition { + Expect(FlushStatus(ctx, recCtx)).To(Succeed()) updatedOwner := &MockOperatorCRD{} Expect(k8sClient.Get(ctx, client.ObjectKey{Name: owner.Name, Namespace: namespace}, updatedOwner)).To(Succeed()) return comp.GetCondition(updatedOwner) diff --git a/pkg/component/conditions.go b/pkg/component/conditions.go index d1e3fcc8..8d79b188 100644 --- a/pkg/component/conditions.go +++ b/pkg/component/conditions.go @@ -1,8 +1,6 @@ package component import ( - "context" - "github.com/sourcehawk/operator-component-framework/pkg/component/concepts" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -153,29 +151,12 @@ func conditionUnknown(component ConditionType, observedGeneration int64) Conditi } } -// setStatusCondition updates the component condition on the owner CRD's status. -// It performs three key actions: -// 1. Updates the condition in the owner's status condition slice. -// 2. Records the condition change in the metrics recorder. -// 3. If the condition has changed, it persists the update to the Kubernetes API. -func setStatusCondition( - ctx context.Context, rec ReconcileContext, cond Condition, -) error { - changed := meta.SetStatusCondition(rec.Owner.GetStatusConditions(), metav1.Condition(cond)) - updated := meta.FindStatusCondition(*rec.Owner.GetStatusConditions(), cond.Type) - - if updated != nil { - rec.Metrics.RecordConditionFor( - rec.Owner.GetKind(), rec.Owner, updated.Type, string(updated.Status), - updated.Reason, updated.LastTransitionTime.Time, - ) - } - - if changed { - if err := rec.Client.Status().Update(ctx, rec.Owner); err != nil { - return err - } - } - - return nil +// applyStatusCondition updates the component condition on the owner's in-memory +// status conditions. It does not call the Kubernetes API and does not record +// metrics; persistence and metrics recording are performed once per reconcile +// by [FlushStatus]. Keeping this function purely in-memory is what allows a +// controller with several components to share a single status write at the end +// of reconciliation instead of racing multiple writes against the same owner. +func applyStatusCondition(rec ReconcileContext, cond Condition) { + meta.SetStatusCondition(rec.Owner.GetStatusConditions(), metav1.Condition(cond)) } diff --git a/pkg/component/conditions_test.go b/pkg/component/conditions_test.go index 35cbc939..4a72ef6e 100644 --- a/pkg/component/conditions_test.go +++ b/pkg/component/conditions_test.go @@ -3,6 +3,7 @@ package component import ( "context" "errors" + "fmt" "testing" "time" @@ -10,8 +11,10 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" @@ -191,10 +194,7 @@ func TestConditionMethods(t *testing.T) { }) } -func TestSetStatusCondition(t *testing.T) { - scheme := runtime.NewScheme() - _ = AddToScheme(scheme) - +func TestApplyStatusCondition(t *testing.T) { owner := &MockOperatorCRD{ ObjectMeta: metav1.ObjectMeta{ Name: "test-owner", @@ -203,7 +203,12 @@ func TestSetStatusCondition(t *testing.T) { }, } - ctx := context.Background() + rec := ReconcileContext{ + Client: failingClient{}, + Metrics: metricsThatPanic{}, + Owner: owner, + } + cond := Condition{ Type: "TestComponent", Status: metav1.ConditionTrue, @@ -212,56 +217,123 @@ func TestSetStatusCondition(t *testing.T) { ObservedGeneration: 1, } - t.Run("should update status and record metrics", func(t *testing.T) { - k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithStatusSubresource(owner).WithObjects(owner).Build() - metrics := &MockMetrics{} + // Pure in-memory mutation: no client call, no metrics call. Using a client + // that would fail on any API access and a metrics recorder that would + // panic on any invocation proves applyStatusCondition never reaches either. + applyStatusCondition(rec, cond) - rec := ReconcileContext{ - Client: k8sClient, - Metrics: metrics, - Owner: owner, + conditions := owner.GetStatusConditions() + require.Len(t, *conditions, 1) + assert.Equal(t, cond.Type, (*conditions)[0].Type) + assert.Equal(t, metav1.ConditionTrue, (*conditions)[0].Status) + assert.Equal(t, cond.Reason, (*conditions)[0].Reason) +} + +func TestFlushStatus(t *testing.T) { + ctx := context.Background() + scheme := runtime.NewScheme() + require.NoError(t, AddToScheme(scheme)) + + newOwner := func() *MockOperatorCRD { + return &MockOperatorCRD{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-owner", + Namespace: "default", + Generation: 1, + }, } + } + + cond := func(ctype, reason string, status metav1.ConditionStatus) Condition { + return Condition{Type: ctype, Status: status, Reason: reason, ObservedGeneration: 1} + } - metrics.On("RecordConditionFor", - owner.GetKind(), owner, cond.Type, string(cond.Status), - cond.Reason, mock.AnythingOfType("time.Time"), mock.Anything, - ).Return() + t.Run("persists every condition on the owner and records a metric per condition", func(t *testing.T) { + owner := newOwner() + applyStatusCondition(ReconcileContext{Owner: owner}, cond("InfraReady", "Ready", metav1.ConditionTrue)) + applyStatusCondition(ReconcileContext{Owner: owner}, cond("AppReady", "Ready", metav1.ConditionTrue)) - err := setStatusCondition(ctx, rec, cond) - require.NoError(t, err) + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithStatusSubresource(owner).WithObjects(owner).Build() + metrics := &MockMetrics{} + metrics.On("RecordConditionFor", owner.GetKind(), owner, "InfraReady", + string(metav1.ConditionTrue), "Ready", mock.Anything, mock.Anything).Return().Once() + metrics.On("RecordConditionFor", owner.GetKind(), owner, "AppReady", + string(metav1.ConditionTrue), "Ready", mock.Anything, mock.Anything).Return().Once() - // Verify condition set in owner - updated := owner.GetStatusConditions() - assert.Len(t, *updated, 1) - assert.Equal(t, cond.Type, (*updated)[0].Type) + require.NoError(t, FlushStatus(ctx, ReconcileContext{Client: k8sClient, Metrics: metrics, Owner: owner})) + persisted := &MockOperatorCRD{} + require.NoError(t, k8sClient.Get(ctx, client.ObjectKeyFromObject(owner), persisted)) + assert.Len(t, persisted.Status.Conditions, 2) metrics.AssertExpectations(t) }) - t.Run("should return error if update fails", func(t *testing.T) { - // Use a manual mock client to simulate error - innerClient := fake.NewClientBuilder().WithScheme(scheme).WithStatusSubresource(owner).WithObjects(owner).Build() - k8sClient := &errorMockClient{Client: innerClient} - metrics := &MockMetrics{} + t.Run("is a no-op when metrics recorder is nil", func(t *testing.T) { + owner := newOwner() + applyStatusCondition(ReconcileContext{Owner: owner}, cond("InfraReady", "Ready", metav1.ConditionTrue)) - rec := ReconcileContext{ - Client: k8sClient, - Metrics: metrics, - Owner: owner, - } + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithStatusSubresource(owner).WithObjects(owner).Build() - metrics.On("RecordConditionFor", - mock.Anything, mock.Anything, mock.Anything, mock.Anything, - mock.Anything, mock.Anything, mock.Anything, - ).Return() + require.NoError(t, FlushStatus(ctx, ReconcileContext{Client: k8sClient, Owner: owner})) - // New condition to ensure changed=true - newCond := cond - newCond.Message = "Something changed" + persisted := &MockOperatorCRD{} + require.NoError(t, k8sClient.Get(ctx, client.ObjectKeyFromObject(owner), persisted)) + assert.Len(t, persisted.Status.Conditions, 1) + }) - err := setStatusCondition(ctx, rec, newCond) + t.Run("surfaces non-conflict update errors without recording metrics", func(t *testing.T) { + owner := newOwner() + applyStatusCondition(ReconcileContext{Owner: owner}, cond("InfraReady", "Ready", metav1.ConditionTrue)) + + inner := fake.NewClientBuilder().WithScheme(scheme).WithStatusSubresource(owner).WithObjects(owner).Build() + k8sClient := &errorMockClient{Client: inner} + metrics := &MockMetrics{} + + err := FlushStatus(ctx, ReconcileContext{Client: k8sClient, Metrics: metrics, Owner: owner}) require.Error(t, err) assert.Equal(t, "update failed", err.Error()) + + metrics.AssertNotCalled(t, "RecordConditionFor") + }) + + t.Run("retries on conflict by refetching the owner and reapplying staged conditions", func(t *testing.T) { + owner := newOwner() + + // External writer got there first with a condition the framework does not manage. + serverSide := newOwner() + serverSide.Status.Conditions = []metav1.Condition{{ + Type: "ExternalReady", + Status: metav1.ConditionTrue, + Reason: "ExternalReason", + LastTransitionTime: metav1.Now(), + }} + inner := fake.NewClientBuilder().WithScheme(scheme).WithStatusSubresource(owner).WithObjects(serverSide).Build() + // The in-memory owner has a stale ResourceVersion (empty), so the first + // Update must conflict. The conflict wrapper refetches, reapplies our + // staged condition, and lets retry.RetryOnConflict succeed on the retry. + k8sClient := &conflictOnceClient{Client: inner} + + applyStatusCondition(ReconcileContext{Owner: owner}, cond("InfraReady", "Ready", metav1.ConditionTrue)) + + metrics := &MockMetrics{} + metrics.On("RecordConditionFor", owner.GetKind(), owner, "ExternalReady", + string(metav1.ConditionTrue), "ExternalReason", mock.Anything, mock.Anything).Return().Once() + metrics.On("RecordConditionFor", owner.GetKind(), owner, "InfraReady", + string(metav1.ConditionTrue), "Ready", mock.Anything, mock.Anything).Return().Once() + + require.NoError(t, FlushStatus(ctx, ReconcileContext{Client: k8sClient, Metrics: metrics, Owner: owner})) + + assert.GreaterOrEqual(t, k8sClient.gets, 1, "expected at least one Get for conflict refetch") + persisted := &MockOperatorCRD{} + require.NoError(t, k8sClient.Get(ctx, client.ObjectKeyFromObject(owner), persisted)) + // Both the externally written condition and the framework-managed + // condition must be present on the persisted owner. + types := make([]string, 0, len(persisted.Status.Conditions)) + for _, c := range persisted.Status.Conditions { + types = append(types, c.Type) + } + assert.ElementsMatch(t, []string{"ExternalReady", "InfraReady"}, types) + metrics.AssertExpectations(t) }) } @@ -280,3 +352,59 @@ type errorStatusWriter struct { func (e *errorStatusWriter) Update(_ context.Context, _ client.Object, _ ...client.SubResourceUpdateOption) error { return errors.New("update failed") } + +// conflictOnceClient wraps a real client and causes the first status Update to +// return a Conflict error, forcing FlushStatus to refetch and retry. Subsequent +// Updates fall through to the underlying client. +type conflictOnceClient struct { + client.Client + conflicts int + gets int +} + +func (c *conflictOnceClient) Status() client.SubResourceWriter { + return &conflictOnceStatusWriter{SubResourceWriter: c.Client.Status(), parent: c} +} + +func (c *conflictOnceClient) Get(ctx context.Context, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error { + c.gets++ + return c.Client.Get(ctx, key, obj, opts...) +} + +type conflictOnceStatusWriter struct { + client.SubResourceWriter + parent *conflictOnceClient +} + +func (w *conflictOnceStatusWriter) Update(ctx context.Context, obj client.Object, opts ...client.SubResourceUpdateOption) error { + if w.parent.conflicts == 0 { + w.parent.conflicts++ + return apierrors.NewConflict( + schema.GroupResource{Group: "example.io", Resource: "mockoperatorcrds"}, + obj.GetName(), + fmt.Errorf("the object has been modified; please apply your changes to the latest version and try again"), + ) + } + return w.SubResourceWriter.Update(ctx, obj, opts...) +} + +// failingClient is a client whose every method returns an error. Used to prove +// that applyStatusCondition never calls the API. +type failingClient struct { + client.Client +} + +func (failingClient) Status() client.SubResourceWriter { + panic("applyStatusCondition must not reach the client") +} +func (failingClient) Get(context.Context, client.ObjectKey, client.Object, ...client.GetOption) error { + panic("applyStatusCondition must not reach the client") +} + +// metricsThatPanic is a Recorder whose every method panics. Used to prove that +// applyStatusCondition never records metrics. +type metricsThatPanic struct{} + +func (metricsThatPanic) RecordConditionFor(string, ocm.ObjectLike, string, string, string, time.Time, ...string) { + panic("applyStatusCondition must not record metrics") +} From 06c3d6591975a7f0b5befe387132075ba64ade9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86gir=20M=C3=A1ni=20Hauksson?= <54936225+sourcehawk@users.noreply.github.com> Date: Wed, 13 May 2026 12:33:48 +0200 Subject: [PATCH 2/2] address copilot review comments Tighten FlushStatus GoDoc to state only rec.Client and rec.Owner are required (rec.Metrics is documented as optional below it). Fix the failingClient test helper comment which incorrectly described the methods as returning errors when they actually panic. Co-Authored-By: Claude Opus 4.7 (1M context) --- pkg/component/component.go | 4 ++-- pkg/component/conditions_test.go | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/component/component.go b/pkg/component/component.go index 28cd24ae..33afb076 100644 --- a/pkg/component/component.go +++ b/pkg/component/component.go @@ -394,8 +394,8 @@ func fail(rec ReconcileContext, conditionType ConditionType, err error) error { // the owner are preserved because meta.SetStatusCondition merges by condition // type. // -// If rec.Metrics is nil, metric recording is skipped. All other fields of rec -// must be populated. +// rec.Client and rec.Owner must be populated. If rec.Metrics is nil, metric +// recording is skipped. func FlushStatus(ctx context.Context, rec ReconcileContext) error { desired := append([]metav1.Condition(nil), *rec.Owner.GetStatusConditions()...) diff --git a/pkg/component/conditions_test.go b/pkg/component/conditions_test.go index 4a72ef6e..f59b5c30 100644 --- a/pkg/component/conditions_test.go +++ b/pkg/component/conditions_test.go @@ -388,8 +388,8 @@ func (w *conflictOnceStatusWriter) Update(ctx context.Context, obj client.Object return w.SubResourceWriter.Update(ctx, obj, opts...) } -// failingClient is a client whose every method returns an error. Used to prove -// that applyStatusCondition never calls the API. +// failingClient is a client whose every method panics. Used to prove that +// applyStatusCondition never calls the API. type failingClient struct { client.Client }