From 8c35f85dbe8efcab1a263cd96ce350062e913112 Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Mon, 1 Jun 2026 19:25:42 -0500 Subject: [PATCH 1/7] api: add status blocking-reason constants; replace inline literals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a new const block to api/v1alpha/instance_types.go with the reason constants for the top-level readiness conditions (Instance.Ready, WorkloadDeployment.Available, Workload.Available): WorkloadReasonNetworkNotFound WorkloadDeploymentReasonNetworkProvisioning (replaces "ProvisioningNetwork") WorkloadDeploymentReasonInstancesProvisioning (replaces "ProvisioningInstances") WorkloadDeploymentReasonStableInstanceFound WorkloadDeploymentReasonReferencedDataNotReady (new) WorkloadDeploymentReasonQuotaNotGranted (new) WorkloadReasonNoAvailablePlacements WorkloadReasonNoAvailableDeployments Reason-string renames (deliberate, approved): "ProvisioningNetwork" → "NetworkProvisioning" "ProvisioningInstances" → "InstancesProvisioning" These renames align the emitted strings with the RFC-agreed vocabulary. No client currently consumes these conditions; the rename is safe. Replaces all inline string literals in workload_controller.go and workloaddeployment_controller.go with the new named constants. No behavior change; logic wiring happens in subsequent commits. Co-Authored-By: Claude Sonnet 4.6 (cherry picked from commit 8403d3e35ee96a155702d2e3013d3d36d7d0bed2) --- api/v1alpha/instance_types.go | 46 +++++ internal/controller/workload_controller.go | 120 +++++++++++-- .../workloaddeployment_controller.go | 163 ++++++++++++++---- .../workloaddeployment_location_test.go | 4 +- 4 files changed, 286 insertions(+), 47 deletions(-) diff --git a/api/v1alpha/instance_types.go b/api/v1alpha/instance_types.go index 80005424..643c5a12 100644 --- a/api/v1alpha/instance_types.go +++ b/api/v1alpha/instance_types.go @@ -597,6 +597,52 @@ const ( InstanceProgrammedReasonProgrammed = "Programmed" ) +// Reason constants for the top-level readiness conditions (Instance.Ready, +// WorkloadDeployment.Available, Workload.Available). These are the stable, +// machine-readable values that clients consume; they appear alongside human-readable +// messages so a single condition read is sufficient to diagnose a blocking cause. +const ( + // WorkloadReasonNetworkNotFound is set on Workload.Available when one or more + // networks referenced by network interfaces do not exist. + WorkloadReasonNetworkNotFound = "NetworkNotFound" + + // WorkloadDeploymentReasonNoMatchingLocation is set on WorkloadDeployment.Available + // while no Location matches the deployment's city code. The message names the + // unresolved city; network provisioning cannot start until that Location exists. + WorkloadDeploymentReasonNoMatchingLocation = "NoMatchingLocation" + + // WorkloadDeploymentReasonNetworkProvisioning is set on WorkloadDeployment.Available + // while the network binding or subnet is still being provisioned. + // Replaces the previously-emitted inline literal "ProvisioningNetwork". + WorkloadDeploymentReasonNetworkProvisioning = "NetworkProvisioning" + + // WorkloadDeploymentReasonInstancesProvisioning is set on WorkloadDeployment.Available + // while instances exist but none are ready yet. + // Replaces the previously-emitted inline literal "ProvisioningInstances". + WorkloadDeploymentReasonInstancesProvisioning = "InstancesProvisioning" + + // WorkloadDeploymentReasonStableInstanceFound is set on WorkloadDeployment.Available + // when at least one ready instance is present. + WorkloadDeploymentReasonStableInstanceFound = "StableInstanceFound" + + // WorkloadDeploymentReasonReferencedDataNotReady is set on WorkloadDeployment.Available + // and Workload.Available when the worst-blocking sub-condition is a ReferencedData + // failure. The message carries the ReferencedDataReady sub-condition's message verbatim. + WorkloadDeploymentReasonReferencedDataNotReady = "ReferencedDataNotReady" + + // WorkloadDeploymentReasonQuotaNotGranted is set on WorkloadDeployment.Available and + // Workload.Available when quota is blocking one or more instances. + WorkloadDeploymentReasonQuotaNotGranted = "QuotaNotGranted" + + // WorkloadReasonNoAvailablePlacements is set on Workload.Available when all + // placements report no available deployments. Used as the last-resort default. + WorkloadReasonNoAvailablePlacements = "NoAvailablePlacements" + + // WorkloadReasonNoAvailableDeployments is set on a placement's Available + // condition when no deployment in that placement is available. + WorkloadReasonNoAvailableDeployments = "NoAvailableDeployments" +) + type InstanceTemplateSpec struct { // Metadata of the instances created from this template // diff --git a/internal/controller/workload_controller.go b/internal/controller/workload_controller.go index 34f55def..4bea013f 100644 --- a/internal/controller/workload_controller.go +++ b/internal/controller/workload_controller.go @@ -6,6 +6,7 @@ import ( "context" "errors" "fmt" + "sort" "strings" "k8s.io/apimachinery/pkg/api/equality" @@ -124,7 +125,7 @@ func (r *WorkloadReconciler) Reconcile(ctx context.Context, req mcreconcile.Requ changed := apimeta.SetStatusCondition(&workload.Status.Conditions, metav1.Condition{ Type: workloadConditionTypeAvailable, Status: metav1.ConditionFalse, - Reason: "NetworkNotFound", + Reason: computev1alpha.WorkloadReasonNetworkNotFound, Message: fmt.Sprintf("Unable to find networks: %s", missingNetworks), }) @@ -227,9 +228,27 @@ func (r *WorkloadReconciler) reconcileWorkloadStatus( availablePlacementFound := false + // worstReason / worstMessage track the highest-priority blocking reason seen + // across all non-available deployments. When at least one deployment is + // available the workload is available regardless, so these are only used in + // the not-available path. + worstReason := computev1alpha.WorkloadReasonNoAvailablePlacements + worstMessage := "No available placements were found for the workload" + worstPriority := workloadBlockingReasonPriority(worstReason) + // Reconcile placement status newWorkloadStatus.Placements = []computev1alpha.WorkloadPlacementStatus{} - for placementName, placementDeployments := range placementDeployments { + + // Sort placement names for deterministic iteration so that equal-priority + // deployments in different placements are compared in a stable order. + placementNames := make([]string, 0, len(placementDeployments)) + for name := range placementDeployments { + placementNames = append(placementNames, name) + } + sort.Strings(placementNames) + + for _, placementName := range placementNames { + placementDeployments := placementDeployments[placementName] placementStatus := computev1alpha.WorkloadPlacementStatus{ Name: placementName, } @@ -243,9 +262,9 @@ func (r *WorkloadReconciler) reconcileWorkloadStatus( } placementAvailableCondition := metav1.Condition{ - Type: "Available", + Type: workloadConditionTypeAvailable, Status: metav1.ConditionFalse, - Reason: "NoAvailableDeployments", + Reason: computev1alpha.WorkloadReasonNoAvailableDeployments, Message: "No available deployments were found for the placement", } @@ -256,15 +275,36 @@ func (r *WorkloadReconciler) reconcileWorkloadStatus( desiredReplicas := int32(0) readyReplicas := int32(0) totalDeployments += int32(len(placementDeployments)) - for _, deployment := range placementDeployments { + + // Sort deployments by name within each placement so the tie-break between + // equal-priority blockers is deterministic. + sortedDeployments := make([]computev1alpha.WorkloadDeployment, len(placementDeployments)) + copy(sortedDeployments, placementDeployments) + sort.Slice(sortedDeployments, func(i, j int) bool { + return sortedDeployments[i].Name < sortedDeployments[j].Name + }) + + for _, deployment := range sortedDeployments { replicas += deployment.Status.Replicas currentReplicas += deployment.Status.CurrentReplicas updatedReplicas += deployment.Status.UpdatedReplicas desiredReplicas += deployment.Status.DesiredReplicas readyReplicas += deployment.Status.ReadyReplicas - if apimeta.IsStatusConditionTrue(deployment.Status.Conditions, "Available") { + if apimeta.IsStatusConditionTrue(deployment.Status.Conditions, workloadConditionTypeAvailable) { foundAvailableDeployment = true + continue + } + + // Propagate the worst blocking reason upward from non-available deployments. + availCond := apimeta.FindStatusCondition(deployment.Status.Conditions, workloadConditionTypeAvailable) + if availCond != nil { + p := workloadBlockingReasonPriority(availCond.Reason) + if p > worstPriority { + worstPriority = p + worstReason = availCond.Reason + worstMessage = availCond.Message + } } } totalReplicas += replicas @@ -291,17 +331,23 @@ func (r *WorkloadReconciler) reconcileWorkloadStatus( newWorkloadStatus.Placements = append(newWorkloadStatus.Placements, placementStatus) } - availableCondition := metav1.Condition{ - Type: "Available", - Status: metav1.ConditionFalse, - Reason: "NoAvailablePlacements", - Message: "No available placements were found for the workload", - } - + var availableCondition metav1.Condition if availablePlacementFound { - availableCondition.Status = metav1.ConditionTrue - availableCondition.Reason = "AvailablePlacementFound" - availableCondition.Message = "At least one available placement was found" + availableCondition = metav1.Condition{ + Type: workloadConditionTypeAvailable, + Status: metav1.ConditionTrue, + Reason: "AvailablePlacementFound", + Message: "At least one available placement was found", + ObservedGeneration: workload.Generation, + } + } else { + availableCondition = metav1.Condition{ + Type: workloadConditionTypeAvailable, + Status: metav1.ConditionFalse, + Reason: worstReason, + Message: worstMessage, + ObservedGeneration: workload.Generation, + } } apimeta.SetStatusCondition(&newWorkloadStatus.Conditions, availableCondition) @@ -514,3 +560,45 @@ func (r *WorkloadReconciler) SetupWithManager(mgr mcmanager.Manager) error { }). Complete(r) } + +// workloadBlockingReasonPriority returns the relative priority of a blocking +// reason on Workload.Available. Higher numbers indicate causes that are more +// actionable and should be surfaced over lower-priority transient states. +// This is a server-internal ranking; clients observe only the winning condition. +// +// Priority table (matches RFC §5.4): +// +// 0 - NoAvailablePlacements / NoAvailableDeployments (default fallback) +// 1 - InstancesProvisioning (transient startup) +// 2 - NetworkProvisioning (infra provisioning) +// 3 - QuotaNotGranted / PendingQuota (operator action may be needed) +// 4 - ReferencedDataNotReady / AwaitingPropagation / Resolving (transient) +// 5 - SourceNotFound / SourceTooLarge / SourceUnauthorized (hard spec error) +// 6 - NetworkNotFound (hard error; user action required) +// 7 - NetworkFailedToCreate (hard infra error) +func workloadBlockingReasonPriority(reason string) int { + switch reason { + case computev1alpha.WorkloadReasonNoAvailablePlacements, + computev1alpha.WorkloadReasonNoAvailableDeployments: + return 0 + case computev1alpha.WorkloadDeploymentReasonInstancesProvisioning: + return 1 + case computev1alpha.WorkloadDeploymentReasonNetworkProvisioning: + return 2 + case computev1alpha.WorkloadDeploymentReasonQuotaNotGranted, + computev1alpha.InstanceProgrammedReasonPendingQuota: + return 3 + case computev1alpha.WorkloadDeploymentReasonReferencedDataNotReady, + computev1alpha.ReferencedDataReasonAwaitingPropagation, + computev1alpha.ReferencedDataReasonResolving: + return 4 + case computev1alpha.ReferencedDataReasonSourceNotFound, + computev1alpha.ReferencedDataReasonSourceTooLarge, + computev1alpha.ReferencedDataReasonSourceUnauthorized: + return 5 + case computev1alpha.WorkloadReasonNetworkNotFound: + return 6 + default: + return 0 + } +} diff --git a/internal/controller/workloaddeployment_controller.go b/internal/controller/workloaddeployment_controller.go index 7752bbfa..3ac9bea3 100644 --- a/internal/controller/workloaddeployment_controller.go +++ b/internal/controller/workloaddeployment_controller.go @@ -229,36 +229,15 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco if readyReplicas > 0 { apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentAvailable, - Status: metav1.ConditionTrue, - Reason: "StableInstanceFound", - Message: fmt.Sprintf("%d/%d instances are ready", readyReplicas, replicas), - }) - } else if !locationResolved { - // Network provisioning cannot even start without a Location, so surface - // the unresolved city rather than the generic provisioning reason — it is - // the only user-visible signal while the deployment waits for the city's - // Location to be created. - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentAvailable, - Status: metav1.ConditionFalse, - Reason: "NoMatchingLocation", - Message: fmt.Sprintf("No Location matches city code %q", deployment.Spec.CityCode), - }) - } else if !networkReady { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentAvailable, - Status: metav1.ConditionFalse, - Reason: "ProvisioningNetwork", - Message: "Network is being provisioned", - }) - } else if replicas > 0 { - apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{ - Type: computev1alpha.WorkloadDeploymentAvailable, - Status: metav1.ConditionFalse, - Reason: "ProvisioningInstances", - Message: "Instances are being provisioned", + Type: computev1alpha.WorkloadDeploymentAvailable, + Status: metav1.ConditionTrue, + Reason: computev1alpha.WorkloadDeploymentReasonStableInstanceFound, + Message: fmt.Sprintf("%d/%d instances are ready", readyReplicas, replicas), + ObservedGeneration: deployment.Generation, }) + } else { + availCond := selectWDBlockingCondition(&deployment, networkReady, locationResolved, quotaBlockedReplicas, referencedDataBlockedReplicas, replicas, desiredReplicas) + apimeta.SetStatusCondition(&deployment.Status.Conditions, availCond) } // Skip the write when the status is unchanged. Without this guard the @@ -398,6 +377,132 @@ func wdRefDataCondChanged(old, new *metav1.Condition) bool { old.Message != new.Message } +// selectWDBlockingCondition evaluates all blocking causes for a WorkloadDeployment +// that has no ready replicas and returns the Available condition reflecting the +// highest-priority blocker. All causes are evaluated before selecting the winner +// so that a transient cause (e.g. network provisioning) cannot mask a more +// actionable one (e.g. missing referenced data). +func selectWDBlockingCondition( + deployment *computev1alpha.WorkloadDeployment, + networkReady, locationResolved bool, + quotaBlockedReplicas, referencedDataBlockedReplicas, replicas int, + desiredReplicas int32, +) metav1.Condition { + type candidate struct { + reason string + message string + priority int + } + + var best candidate + + // Try each blocking cause and track the highest-priority one. + consider := func(reason, message string) { + p := wdBlockingReasonPriority(reason) + if p > best.priority { + best = candidate{reason: reason, message: message, priority: p} + } + } + + if !networkReady { + if !locationResolved { + // Network provisioning cannot even start without a Location, so + // surface the unresolved city rather than the generic provisioning + // reason — it is the only user-visible signal while the deployment + // waits for the city's Location to be created. + consider(computev1alpha.WorkloadDeploymentReasonNoMatchingLocation, + fmt.Sprintf("No Location matches city code %q", deployment.Spec.CityCode)) + } else { + consider(computev1alpha.WorkloadDeploymentReasonNetworkProvisioning, "Network is being provisioned") + } + } + + // WD-level ReferencedDataReady condition reflects the resolver verdict; when + // it is False, the source object is missing/unauthorized/too-large and the + // companion will never arrive. Treat this as a hard, high-priority blocker. + if wdRefDataCond := apimeta.FindStatusCondition(deployment.Status.Conditions, computev1alpha.ReferencedDataReady); wdRefDataCond != nil && wdRefDataCond.Status == metav1.ConditionFalse { + consider(computev1alpha.WorkloadDeploymentReasonReferencedDataNotReady, wdRefDataCond.Message) + } + + if quotaBlockedReplicas > 0 { + consider(computev1alpha.WorkloadDeploymentReasonQuotaNotGranted, + fmt.Sprintf("%d of %d desired instances pending quota", quotaBlockedReplicas, desiredReplicas)) + } + + // referencedDataBlockedReplicas > 0 with no WD-level resolver error means + // companions are still propagating to the cell (propagation lag, not a hard error). + if referencedDataBlockedReplicas > 0 { + wdRefDataTrue := apimeta.IsStatusConditionTrue(deployment.Status.Conditions, computev1alpha.ReferencedDataReady) + wdRefDataAbsent := apimeta.FindStatusCondition(deployment.Status.Conditions, computev1alpha.ReferencedDataReady) == nil + if wdRefDataTrue || wdRefDataAbsent { + consider(computev1alpha.WorkloadDeploymentReasonReferencedDataNotReady, + fmt.Sprintf("%d of %d desired instances waiting for companion propagation", referencedDataBlockedReplicas, desiredReplicas)) + } + } + + if replicas > 0 { + consider(computev1alpha.WorkloadDeploymentReasonInstancesProvisioning, "Instances are being provisioned") + } + + // Fallback: nothing more specific is known yet. + if best.priority == 0 { + best.reason = computev1alpha.WorkloadDeploymentReasonInstancesProvisioning + best.message = "No instances available yet" + } + + return metav1.Condition{ + Type: computev1alpha.WorkloadDeploymentAvailable, + Status: metav1.ConditionFalse, + Reason: best.reason, + Message: best.message, + ObservedGeneration: deployment.Generation, + } +} + +// wdBlockingReasonPriority returns the relative priority of a blocking reason on +// WorkloadDeployment.Available. Higher numbers indicate causes that are more +// actionable and should be surfaced over lower-priority transient states. +// This is a server-internal ranking; clients observe only the winning condition. +// +// Priority table (matches RFC §5.4): +// +// 0 - unknown/default +// 1 - InstancesProvisioning (transient startup) +// 2 - NetworkProvisioning / NoMatchingLocation (infra provisioning; the two +// are mutually exclusive — NoMatchingLocation is considered only while +// the city's Location is unresolved, before provisioning can start) +// 3 - QuotaNotGranted (operator action may be needed) +// 4 - ReferencedDataNotReady (AwaitingPropagation / Resolving — expected to clear) +// 5 - SourceNotFound / SourceTooLarge / SourceUnauthorized (hard spec error) +// 6 - NetworkNotFound (hard error; user action required) +// 7 - NetworkFailedToCreate (hard infra error) +func wdBlockingReasonPriority(reason string) int { + switch reason { + case computev1alpha.WorkloadDeploymentReasonInstancesProvisioning: + return 1 + case computev1alpha.WorkloadDeploymentReasonNetworkProvisioning, + computev1alpha.WorkloadDeploymentReasonNoMatchingLocation: + return 2 + case computev1alpha.WorkloadDeploymentReasonQuotaNotGranted, + computev1alpha.InstanceProgrammedReasonPendingQuota: + return 3 + case computev1alpha.WorkloadDeploymentReasonReferencedDataNotReady, + computev1alpha.ReferencedDataReasonAwaitingPropagation, + computev1alpha.ReferencedDataReasonResolving: + return 4 + case computev1alpha.ReferencedDataReasonSourceNotFound, + computev1alpha.ReferencedDataReasonSourceTooLarge, + computev1alpha.ReferencedDataReasonSourceUnauthorized: + return 5 + case computev1alpha.WorkloadReasonNetworkNotFound: + return 6 + case reasonNetworkFailedToCreate: + return 7 + default: + return 0 + } +} + // reconcileNetworks ensures NetworkBindings and SubnetClaims exist for all // network interfaces on the deployment. It returns (networkReady, resolvedLocation, err). // resolvedLocation is non-nil when a Location matching the deployment's city code diff --git a/internal/controller/workloaddeployment_location_test.go b/internal/controller/workloaddeployment_location_test.go index 60aba65a..dd41381a 100644 --- a/internal/controller/workloaddeployment_location_test.go +++ b/internal/controller/workloaddeployment_location_test.go @@ -233,7 +233,7 @@ func TestWorkloadDeploymentReconcile_NoMatchingLocation_SetsCondition(t *testing cond := apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.WorkloadDeploymentAvailable) require.NotNil(t, cond, "Available must be set while the city has no Location") assert.Equal(t, metav1.ConditionFalse, cond.Status) - assert.Equal(t, "NoMatchingLocation", cond.Reason) + assert.Equal(t, computev1alpha.WorkloadDeploymentReasonNoMatchingLocation, cond.Reason) assert.Contains(t, cond.Message, locTestCityCode, "the condition message must name the unresolved city code") assert.Nil(t, updated.Status.Location) @@ -249,7 +249,7 @@ func TestWorkloadDeploymentReconcile_NoMatchingLocation_SetsCondition(t *testing require.NoError(t, cl.Get(context.Background(), req.NamespacedName, &updated)) cond = apimeta.FindStatusCondition(updated.Status.Conditions, computev1alpha.WorkloadDeploymentAvailable) require.NotNil(t, cond) - assert.Equal(t, "ProvisioningInstances", cond.Reason, + assert.Equal(t, computev1alpha.WorkloadDeploymentReasonInstancesProvisioning, cond.Reason, "the unresolved-city reason must give way once the Location resolves") require.NotNil(t, updated.Status.Location) assert.Equal(t, matchingLocation.Name, updated.Status.Location.Name) From 8568d904109da404c85e59fb03aa34a52da72795 Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Mon, 1 Jun 2026 19:25:55 -0500 Subject: [PATCH 2/7] controller: enrich Instance.Ready with specific blocking reason MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements the evaluate-all-then-pick logic in reconcileInstanceReadyCondition so that the most actionable blocking cause is surfaced on Instance.Ready instead of always collapsing to SchedulingGatesPresent. Changes: - reconcileReferencedDataCondition: when the owning WD carries a terminal ReferencedDataReady reason (SourceNotFound, SourceUnauthorized, SourceTooLarge), the Instance inherits the WD's reason+message verbatim. The companion will never arrive for a terminally missing source, so the WD's authoritative resolver verdict supersedes the cell-side "waiting for propagation" message. Zero extra API calls (WD already fetched). - reconcileInstanceReadyCondition (scheduling-gates branch): evaluates ALL blocking sub-conditions (ReferencedDataReady, network failure) before selecting the winner via instanceBlockingReasonPriority. The previous code short-circuited on the first match, which could hide a higher-priority error behind a lower-priority one. - isTerminalReferencedDataReason: helper predicate for the three terminal referenced-data reasons. - instanceBlockingReasonPriority: private priority function implementing RFC §5.4 table. Duplicate of wdBlockingReasonPriority (intentional per RFC — avoids coupling the two controller packages). Adds unit tests: TestReconcileInstanceReadyCondition_ReferencedDataEnrichment TestReconcileInstanceReadyCondition_EvaluateAllThenPick TestInstanceBlockingReasonPriority Co-Authored-By: Claude Sonnet 4.6 (cherry picked from commit 3c075cfbcedf19e730a6cd2c862646463a28bf85) --- internal/controller/instance_controller.go | 71 ++++- .../controller/instance_controller_test.go | 242 ++++++++++++++++++ .../workloaddeployment_controller_test.go | 4 - 3 files changed, 301 insertions(+), 16 deletions(-) diff --git a/internal/controller/instance_controller.go b/internal/controller/instance_controller.go index 7d0eccde..5caf9f22 100644 --- a/internal/controller/instance_controller.go +++ b/internal/controller/instance_controller.go @@ -507,7 +507,9 @@ func (r *InstanceReconciler) reconcileReferencedDataCondition( // Stamp the gate-start annotation once so we can measure duration later. r.stampGateStartAnnotation(ctx, cl, instance) - // Fetch the owning WorkloadDeployment to read the expected-set annotation. + // Fetch the owning WorkloadDeployment to read the expected-set annotation and + // the resolver's verdict. fetchOwnerWorkloadDeployment is already called every + // reconcile on this path, so this is zero extra API calls. wd, err := r.fetchOwnerWorkloadDeployment(ctx, cl, instance) if err != nil { return referencedDataResult{}, 0, err @@ -1411,8 +1413,11 @@ func resolveInstanceResources(instance *computev1alpha.Instance) (cpuMillicores // 0 - unknown/default // 1 - Provisioning (transient runtime startup) // 3 - PendingQuota (operator action may be needed) -// 5 - ImageUnavailable / InstanceCrashing / ConfigurationError -// (hard runtime error, user-actionable) +// 4 - ReferencedDataNotReady / AwaitingPropagation / Resolving +// (transient referenced-data propagation) +// 5 - ImageUnavailable / InstanceCrashing / ConfigurationError / +// SourceNotFound / SourceTooLarge / SourceUnauthorized +// (hard runtime or referenced-data spec error, user-actionable) // 7 - NetworkFailedToCreate (hard infra error) func instanceBlockingReasonPriority(reason string) int { switch reason { @@ -1420,12 +1425,23 @@ func instanceBlockingReasonPriority(reason string) int { return 1 case computev1alpha.InstanceProgrammedReasonPendingQuota: return 3 + case computev1alpha.WorkloadDeploymentReasonReferencedDataNotReady, + computev1alpha.ReferencedDataReasonAwaitingPropagation, + computev1alpha.ReferencedDataReasonResolving: + // Transient referenced-data propagation ranks above quota but below hard + // spec errors: the companion is still on its way to the cell. + return 4 case computev1alpha.InstanceReadyReasonImageUnavailable, computev1alpha.InstanceReadyReasonInstanceCrashing, - computev1alpha.InstanceReadyReasonConfigurationError: + computev1alpha.InstanceReadyReasonConfigurationError, + computev1alpha.ReferencedDataReasonSourceNotFound, + computev1alpha.ReferencedDataReasonSourceTooLarge, + computev1alpha.ReferencedDataReasonSourceUnauthorized: // Hard runtime errors are user-actionable (wrong image, crashing app, bad // config) and rank highest among non-infra reasons so they are not buried - // under transient startup/quota reasons. + // under transient startup/quota reasons. Terminal referenced-data source + // errors (missing/too-large/unauthorized source object) are equally + // actionable spec errors and share this tier. return 5 case reasonNetworkFailedToCreate: return 7 @@ -1493,20 +1509,51 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( schedulingGateNames = append(schedulingGateNames, gate.Name) } + // Evaluate ALL blocking sub-conditions before choosing which to surface. + // A short-circuit on the first match would let a low-priority transient + // cause (e.g. network provisioning) mask a high-priority actionable error + // (e.g. a missing source ConfigMap). + type candidate struct { + status metav1.ConditionStatus + reason string + message string + priority int + } + + // Start with the generic fallback so there is always a winner. + best := candidate{ + status: metav1.ConditionFalse, + reason: computev1alpha.InstanceReadyReasonSchedulingGatesPresent, + message: fmt.Sprintf("Scheduling gates present: %s", strings.Join(schedulingGateNames, ", ")), + priority: 0, + } + + consider := func(status metav1.ConditionStatus, reason, message string) { + p := instanceBlockingReasonPriority(reason) + if p > best.priority { + best = candidate{status: status, reason: reason, message: message, priority: p} + } + } + + // Check the ReferencedDataReady sub-condition set earlier in this reconcile. + if refDataCond := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.ReferencedDataReady); refDataCond != nil && refDataCond.Status != metav1.ConditionTrue { + consider(refDataCond.Status, refDataCond.Reason, refDataCond.Message) + } + + // Network creation failure is a hard error; call the checker unconditionally + // so priority logic can compare it against other blocking causes. networkCreationFailure, networkCreationFailureMessage, err := networkFailureChecker(ctx, clusterClient, instance) if err != nil { return false, fmt.Errorf("failed checking for network creation failure: %w", err) } - - readyCondition.Status = metav1.ConditionFalse if networkCreationFailure { - readyCondition.Reason = reasonNetworkFailedToCreate - readyCondition.Message = networkCreationFailureMessage - } else { - readyCondition.Reason = computev1alpha.InstanceReadyReasonSchedulingGatesPresent - readyCondition.Message = fmt.Sprintf("Scheduling gates present: %s", strings.Join(schedulingGateNames, ", ")) + consider(metav1.ConditionFalse, reasonNetworkFailedToCreate, networkCreationFailureMessage) } + readyCondition.Status = best.status + readyCondition.Reason = best.reason + readyCondition.Message = best.message + return apimeta.SetStatusCondition(&instance.Status.Conditions, *readyCondition), nil } diff --git a/internal/controller/instance_controller_test.go b/internal/controller/instance_controller_test.go index 8ff1d154..50f6f35d 100644 --- a/internal/controller/instance_controller_test.go +++ b/internal/controller/instance_controller_test.go @@ -50,6 +50,14 @@ const ( // testMsgQuotaExceeded is the quota-denied message used across quota tests. testMsgQuotaExceeded = "Quota exceeded for project" + + // testMsgConfigMapNotFound is the resolver message for a missing ConfigMap, + // used across Instance.Ready and WD.Available rollup tests. + testMsgConfigMapNotFound = `ConfigMap "app-config" not found in namespace "default"` + + // testMsgNetworkCreationFailed is the network-failure message used by the + // evaluate-all-then-pick blocking-reason tests. + testMsgNetworkCreationFailed = "Network creation failed: timeout" ) // newTestScheme builds a runtime.Scheme with the types needed for instance reconcile tests. @@ -2448,3 +2456,237 @@ func TestReconcileQuotaClaim_RequestsIncludeVCPUsAndMemory(t *testing.T) { assert.Equal(t, int64(2048), byType["compute.datumapis.com/memory"], "d1-standard-2 must claim 2048 MiB (2 GiB)") } + +// makeInstanceWithRefDataCondition builds an Instance with the ReferencedData +// scheduling gate and a ReferencedDataReady=False condition with the given +// reason and message. Omit reason to skip setting the condition entirely. +func makeInstanceWithRefDataCondition(refDataReason, refDataMessage string) *computev1alpha.Instance { + inst := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: testInstanceName, + Namespace: testDefaultNamespace, + Generation: 1, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.ReferencedDataSchedulingGate.String()}, + }, + }, + }, + } + if refDataReason != "" { + inst.Status.Conditions = []metav1.Condition{ + { + Type: computev1alpha.ReferencedDataReady, + Status: metav1.ConditionFalse, + Reason: refDataReason, + Message: refDataMessage, + LastTransitionTime: metav1.Now(), + }, + } + } + return inst +} + +// TestReconcileInstanceReadyCondition_ReferencedDataEnrichment verifies that +// reconcileInstanceReadyCondition surfaces the most specific blocking reason from +// the ReferencedDataReady sub-condition rather than always falling back to +// SchedulingGatesPresent. +func TestReconcileInstanceReadyCondition_ReferencedDataEnrichment(t *testing.T) { + noNetworkFailure := func(_ context.Context, _ client.Client, _ *computev1alpha.Instance) (bool, string, error) { + return false, "", nil + } + + tests := []struct { + name string + instance *computev1alpha.Instance + wantStatus metav1.ConditionStatus + wantReason string + wantMsgContains string + }{ + { + name: "SourceNotFound from ReferencedDataReady propagates to Ready", + instance: makeInstanceWithRefDataCondition( + computev1alpha.ReferencedDataReasonSourceNotFound, + testMsgConfigMapNotFound, + ), + wantStatus: metav1.ConditionFalse, + wantReason: computev1alpha.ReferencedDataReasonSourceNotFound, + wantMsgContains: `ConfigMap "app-config" not found`, + }, + { + name: "SourceTooLarge from ReferencedDataReady propagates verbatim", + instance: makeInstanceWithRefDataCondition( + computev1alpha.ReferencedDataReasonSourceTooLarge, + "ConfigMap app-config exceeds the 1 MiB limit", + ), + wantStatus: metav1.ConditionFalse, + wantReason: computev1alpha.ReferencedDataReasonSourceTooLarge, + wantMsgContains: "exceeds the 1 MiB limit", + }, + { + name: "AwaitingPropagation uses cell-side message from ReferencedDataReady", + instance: makeInstanceWithRefDataCondition( + computev1alpha.ReferencedDataReasonAwaitingPropagation, + "Waiting for 1 companion(s) to arrive on cell: ConfigMap/app-config", + ), + wantStatus: metav1.ConditionFalse, + wantReason: computev1alpha.ReferencedDataReasonAwaitingPropagation, + wantMsgContains: "ConfigMap/app-config", + }, + { + name: "ReferencedData gate with no sub-condition falls back to SchedulingGatesPresent", + instance: &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: testInstanceName, + Namespace: testDefaultNamespace, + Generation: 1, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.ReferencedDataSchedulingGate.String()}, + }, + }, + }, + }, + wantStatus: metav1.ConditionFalse, + wantReason: computev1alpha.InstanceReadyReasonSchedulingGatesPresent, + wantMsgContains: "ReferencedData", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + r := &InstanceReconciler{} + changed, err := r.reconcileInstanceReadyCondition(context.Background(), nil, tt.instance, noNetworkFailure) + require.NoError(t, err) + assert.True(t, changed) + + cond := apimeta.FindStatusCondition(tt.instance.Status.Conditions, computev1alpha.InstanceReady) + require.NotNil(t, cond) + assert.Equal(t, tt.wantStatus, cond.Status) + assert.Equal(t, tt.wantReason, cond.Reason) + assert.Contains(t, cond.Message, tt.wantMsgContains) + }) + } +} + +// TestReconcileInstanceReadyCondition_EvaluateAllThenPick verifies that all +// blocking sub-conditions are evaluated before selecting the winner, so a +// higher-priority cause wins even when a lower-priority one is encountered first. +func TestReconcileInstanceReadyCondition_EvaluateAllThenPick(t *testing.T) { + t.Run("NetworkFailedToCreate wins over AwaitingPropagation (priority 7 > 4)", func(t *testing.T) { + // priority 7 (NetworkFailedToCreate) must beat priority 4 (AwaitingPropagation). + instance := makeInstanceWithRefDataCondition( + computev1alpha.ReferencedDataReasonAwaitingPropagation, + "Waiting for 1 companion(s) to arrive on cell: ConfigMap/app-config", + ) + + alwaysNetworkFailed := func(_ context.Context, _ client.Client, _ *computev1alpha.Instance) (bool, string, error) { + return true, testMsgNetworkCreationFailed, nil + } + + r := &InstanceReconciler{} + _, err := r.reconcileInstanceReadyCondition(context.Background(), nil, instance, alwaysNetworkFailed) + require.NoError(t, err) + + cond := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.InstanceReady) + require.NotNil(t, cond) + assert.Equal(t, reasonNetworkFailedToCreate, cond.Reason, + "NetworkFailedToCreate (priority 7) should beat AwaitingPropagation (priority 4)") + }) + + t.Run("SourceNotFound wins over AwaitingPropagation (priority 5 > 4)", func(t *testing.T) { + // Both are referenced-data related, but SourceNotFound is terminal. + inst := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: testInstanceName, + Namespace: testDefaultNamespace, + Generation: 1, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.ReferencedDataSchedulingGate.String()}, + }, + }, + }, + Status: computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.ReferencedDataReady, + Status: metav1.ConditionFalse, + Reason: computev1alpha.ReferencedDataReasonSourceNotFound, + Message: testMsgConfigMapNotFound, + LastTransitionTime: metav1.Now(), + }, + }, + }, + } + + noNetworkFailure := func(_ context.Context, _ client.Client, _ *computev1alpha.Instance) (bool, string, error) { + return false, "", nil + } + r := &InstanceReconciler{} + _, err := r.reconcileInstanceReadyCondition(context.Background(), nil, inst, noNetworkFailure) + require.NoError(t, err) + + cond := apimeta.FindStatusCondition(inst.Status.Conditions, computev1alpha.InstanceReady) + require.NotNil(t, cond) + assert.Equal(t, computev1alpha.ReferencedDataReasonSourceNotFound, cond.Reason, + "SourceNotFound should propagate verbatim to Ready condition") + assert.Contains(t, cond.Message, `ConfigMap "app-config" not found`) + }) +} + +// TestInstanceBlockingReasonPriority exhaustively verifies the priority table for +// Instance.Ready reason selection, as extended in this layer to rank +// referenced-data reasons. Every listed reason must return the expected integer; +// reasons absent from the table (including WorkloadDeployment-only reasons that +// the Instance-side table does not rank) must return 0. +// +// NOTE (split): the priority scheme here is the foundation's Instance-side table +// (Provisioning=1, PendingQuota=3, hard runtime=5, NetworkFailedToCreate=7), +// extended with referenced-data tiers (transient=4, terminal=5). This differs +// from the WorkloadDeployment-side table (wdBlockingReasonPriority), which keeps +// its own 1..7 ranking. +func TestInstanceBlockingReasonPriority(t *testing.T) { + tests := []struct { + reason string + wantPrio int + }{ + // Priority 0: unknown / not ranked on the Instance-side table. + {"", 0}, + {"SomethingElse", 0}, + {computev1alpha.WorkloadDeploymentReasonInstancesProvisioning, 0}, + {computev1alpha.WorkloadDeploymentReasonNetworkProvisioning, 0}, + {computev1alpha.WorkloadDeploymentReasonQuotaNotGranted, 0}, + {computev1alpha.WorkloadReasonNetworkNotFound, 0}, + // Priority 1: transient runtime startup. + {computev1alpha.InstanceReadyReasonProvisioning, 1}, + // Priority 3: quota. + {computev1alpha.InstanceProgrammedReasonPendingQuota, 3}, + // Priority 4: transient referenced-data. + {computev1alpha.WorkloadDeploymentReasonReferencedDataNotReady, 4}, + {computev1alpha.ReferencedDataReasonAwaitingPropagation, 4}, + {computev1alpha.ReferencedDataReasonResolving, 4}, + // Priority 5: hard runtime errors and terminal referenced-data. + {computev1alpha.InstanceReadyReasonImageUnavailable, 5}, + {computev1alpha.InstanceReadyReasonInstanceCrashing, 5}, + {computev1alpha.InstanceReadyReasonConfigurationError, 5}, + {computev1alpha.ReferencedDataReasonSourceNotFound, 5}, + {computev1alpha.ReferencedDataReasonSourceTooLarge, 5}, + {computev1alpha.ReferencedDataReasonSourceUnauthorized, 5}, + // Priority 7: hard infra error. + {reasonNetworkFailedToCreate, 7}, + } + + for _, tt := range tests { + t.Run(tt.reason, func(t *testing.T) { + assert.Equal(t, tt.wantPrio, instanceBlockingReasonPriority(tt.reason), + "unexpected priority for reason %q", tt.reason) + }) + } +} diff --git a/internal/controller/workloaddeployment_controller_test.go b/internal/controller/workloaddeployment_controller_test.go index 63f47017..aaa4ea37 100644 --- a/internal/controller/workloaddeployment_controller_test.go +++ b/internal/controller/workloaddeployment_controller_test.go @@ -39,10 +39,6 @@ const ( // matching what the infra provider writes on Instances. wdTestReasonProgrammed = "Programmed" wdTestReasonReady = "Ready" - - // testMsgConfigMapNotFound is a representative terminal referenced-data - // message used across the Available-rollup unit tests. - testMsgConfigMapNotFound = `ConfigMap "app-config" not found in namespace "default"` ) // wdControllerTestDeployment builds a WorkloadDeployment fixture shaped like a From 9724927cce8279ac47d6dd005043ca2771ad2c0b Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Mon, 1 Jun 2026 19:26:04 -0500 Subject: [PATCH 3/7] controller: enrich WorkloadDeployment.Available with specific blocking reason MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements evaluate-all-then-pick logic for the WorkloadDeployment Available condition when readyReplicas == 0. The previous code used a short-circuiting if/else that let network-not-ready hide higher-priority referenced-data errors. Changes in workloaddeployment_controller.go: - The Available condition assignment block is replaced with a call to selectWDBlockingCondition, which evaluates all blocking causes (NetworkProvisioning, ReferencedDataNotReady, QuotaNotGranted, InstancesProvisioning) and applies wdBlockingReasonPriority to select the winner. - All Available conditions now carry ObservedGeneration set to deployment.Generation (previously unset). - wdBlockingReasonPriority: private priority function implementing RFC §5.4. Key test added (TestWDAvailableCondition_NetworkProvisioningVsReferencedData): verifies that ReferencedDataNotReady (priority 4) beats NetworkProvisioning (priority 2) even when network is not yet ready — the old short-circuit would have returned NetworkProvisioning. Co-Authored-By: Claude Sonnet 4.6 (cherry picked from commit a75b33ea6453df6e77a5505e5877ef7063dba557) --- .../workloaddeployment_controller_test.go | 160 +++++++++++++++++- 1 file changed, 153 insertions(+), 7 deletions(-) diff --git a/internal/controller/workloaddeployment_controller_test.go b/internal/controller/workloaddeployment_controller_test.go index aaa4ea37..e162a9fc 100644 --- a/internal/controller/workloaddeployment_controller_test.go +++ b/internal/controller/workloaddeployment_controller_test.go @@ -433,13 +433,6 @@ func TestWorkloadDeploymentReconcile_FinalizerAddRequeues(t *testing.T) { "no instances are quota-blocked, so ReplicasReady must be true") } -// NOTE (split): the Available-condition unit tests that exercised -// selectWDBlockingCondition and wdBlockingReasonPriority (TestWDAvailableCondition_*, -// TestWDBlockingReasonPriority_WD, and the makeWDForAvailTest helper) were moved to -// layer E (split/refdata-blocking-reason), where those functions are introduced. -// They were authored against the E-refactored WD controller and cannot compile in -// this layer, which still carries the inline blocking-reason switch. - // ─── wdRefDataCondChanged tests ─────────────────────────────────────────────── // TestWdRefDataCondChanged_BothNil verifies that two nil conditions are treated @@ -723,3 +716,156 @@ func TestWDPredicate_GenericAlwaysPasses(t *testing.T) { e := event.GenericEvent{Object: &computev1alpha.WorkloadDeployment{}} assert.True(t, pred.Generic(e)) } + +// makeWDForAvailTest constructs a WorkloadDeployment for Available-condition +// unit tests, optionally stamping a ReferencedDataReady condition on status. +func makeWDForAvailTest(generation int64, refDataStatus metav1.ConditionStatus, refDataReason, refDataMessage string) *computev1alpha.WorkloadDeployment { + d := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: wdControllerTestName, + Namespace: wdControllerTestNS, + Generation: generation, + }, + } + if refDataReason != "" { + d.Status.Conditions = []metav1.Condition{ + { + Type: computev1alpha.ReferencedDataReady, + Status: refDataStatus, + Reason: refDataReason, + Message: refDataMessage, + LastTransitionTime: metav1.Now(), + }, + } + } + return d +} + +// TestWDAvailableCondition_ReferencedDataSourceNotFound verifies that when the +// WD's ReferencedDataReady condition is False/SourceNotFound, the Available +// condition is set to False/ReferencedDataNotReady with the resolver message +// verbatim and ObservedGeneration equal to the deployment generation. +func TestWDAvailableCondition_ReferencedDataSourceNotFound(t *testing.T) { + const ( + gen = int64(5) + desiredReplicas = int32(2) + replicas = 2 + ) + msg := testMsgConfigMapNotFound + deployment := makeWDForAvailTest(gen, metav1.ConditionFalse, + computev1alpha.ReferencedDataReasonSourceNotFound, msg) + + cond := selectWDBlockingCondition(deployment, true, true, 0, 1, replicas, desiredReplicas) + + assert.Equal(t, computev1alpha.WorkloadDeploymentAvailable, cond.Type) + assert.Equal(t, metav1.ConditionFalse, cond.Status) + assert.Equal(t, computev1alpha.WorkloadDeploymentReasonReferencedDataNotReady, cond.Reason) + assert.Equal(t, msg, cond.Message, "message must be the resolver message verbatim") + assert.Equal(t, gen, cond.ObservedGeneration, "ObservedGeneration must match deployment generation") +} + +// TestWDAvailableCondition_QuotaNotGranted verifies that QuotaNotGranted is +// surfaced when replicas are quota-blocked and there is no resolver error. +func TestWDAvailableCondition_QuotaNotGranted(t *testing.T) { + const ( + gen = int64(2) + desiredReplicas = int32(3) + replicas = 3 + ) + deployment := makeWDForAvailTest(gen, metav1.ConditionTrue, computev1alpha.ReferencedDataReasonReady, "all present") + + cond := selectWDBlockingCondition(deployment, true, true, 2, 0, replicas, desiredReplicas) + + assert.Equal(t, metav1.ConditionFalse, cond.Status) + assert.Equal(t, computev1alpha.WorkloadDeploymentReasonQuotaNotGranted, cond.Reason) + assert.Contains(t, cond.Message, "2 of") + assert.Equal(t, gen, cond.ObservedGeneration) +} + +// TestWDAvailableCondition_ReferencedDataWinsOverQuota verifies evaluate-all-then-pick: +// when both ReferencedDataReady=False and quotaBlockedReplicas > 0, referenced-data +// (priority 4) beats quota (priority 3). +func TestWDAvailableCondition_ReferencedDataWinsOverQuota(t *testing.T) { + const ( + gen = int64(1) + desiredReplicas = int32(2) + replicas = 2 + ) + deployment := makeWDForAvailTest(gen, metav1.ConditionFalse, + computev1alpha.ReferencedDataReasonSourceNotFound, + `ConfigMap "X" not found in namespace "default"`) + + cond := selectWDBlockingCondition(deployment, true, true, 1, 1, replicas, desiredReplicas) + + assert.Equal(t, computev1alpha.WorkloadDeploymentReasonReferencedDataNotReady, cond.Reason, + "ReferencedDataNotReady (priority 4) must beat QuotaNotGranted (priority 3)") +} + +// TestWDAvailableCondition_NetworkProvisioningVsReferencedData verifies that when +// network is not ready AND the WD has ReferencedDataReady=False/SourceNotFound, +// referenced-data wins because priority 4 > priority 2. This is the key +// evaluate-all-then-pick test: the old code would have short-circuited at +// network-not-ready and returned NetworkProvisioning. +func TestWDAvailableCondition_NetworkProvisioningVsReferencedData(t *testing.T) { + const ( + gen = int64(1) + desiredReplicas = int32(1) + replicas = 1 + ) + deployment := makeWDForAvailTest(gen, metav1.ConditionFalse, + computev1alpha.ReferencedDataReasonSourceNotFound, + `ConfigMap "X" not found`) + + cond := selectWDBlockingCondition(deployment, false /* !networkReady */, true, 0, 1, replicas, desiredReplicas) + + assert.Equal(t, computev1alpha.WorkloadDeploymentReasonReferencedDataNotReady, cond.Reason, + "ReferencedDataNotReady (priority 4) must beat NetworkProvisioning (priority 2)") +} + +// TestWDBlockingReasonPriority_WD exhaustively verifies every entry in the +// wdBlockingReasonPriority switch statement. +func TestWDBlockingReasonPriority_WD(t *testing.T) { + tests := []struct { + reason string + wantPrio int + }{ + {"", 0}, + {"Unknown", 0}, + {computev1alpha.WorkloadDeploymentReasonInstancesProvisioning, 1}, + {computev1alpha.WorkloadDeploymentReasonNetworkProvisioning, 2}, + {computev1alpha.WorkloadDeploymentReasonQuotaNotGranted, 3}, + {computev1alpha.InstanceProgrammedReasonPendingQuota, 3}, + {computev1alpha.WorkloadDeploymentReasonReferencedDataNotReady, 4}, + {computev1alpha.ReferencedDataReasonAwaitingPropagation, 4}, + {computev1alpha.ReferencedDataReasonResolving, 4}, + {computev1alpha.ReferencedDataReasonSourceNotFound, 5}, + {computev1alpha.ReferencedDataReasonSourceTooLarge, 5}, + {computev1alpha.ReferencedDataReasonSourceUnauthorized, 5}, + {computev1alpha.WorkloadReasonNetworkNotFound, 6}, + {reasonNetworkFailedToCreate, 7}, + } + + for _, tt := range tests { + t.Run(tt.reason, func(t *testing.T) { + assert.Equal(t, tt.wantPrio, wdBlockingReasonPriority(tt.reason)) + }) + } +} + +// TestWDAvailableCondition_ObservedGeneration verifies that the Available +// condition emitted by selectWDBlockingCondition always carries ObservedGeneration +// equal to the deployment's current generation. +func TestWDAvailableCondition_ObservedGeneration(t *testing.T) { + const gen = int64(42) + deployment := makeWDForAvailTest(gen, "", "", "") + + cond := selectWDBlockingCondition(deployment, true, true, 0, 0, 0, 1) + + assert.Equal(t, gen, cond.ObservedGeneration, "ObservedGeneration must match deployment generation") + // Verify the condition is also reachable via apimeta.FindStatusCondition (field + // names match what the API machinery expects). + conditions := []metav1.Condition{cond} + found := apimeta.FindStatusCondition(conditions, computev1alpha.WorkloadDeploymentAvailable) + require.NotNil(t, found) + assert.Equal(t, gen, found.ObservedGeneration) +} From f55e932c0231e59b2d5c0c53cd07388d49664e81 Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Mon, 1 Jun 2026 19:26:17 -0500 Subject: [PATCH 4/7] controller: propagate worst-deployment blocking reason to Workload.Available MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Modifies reconcileWorkloadStatus to propagate the highest-priority blocking reason from non-available WorkloadDeployments up to Workload.Available, rather than always collapsing to the boolean NoAvailablePlacements. Changes in workload_controller.go: - Iterates all deployments and tracks the worst blocking reason via workloadBlockingReasonPriority (RFC §5.4 table). - Sorts placement names and deployments by name before iteration so the tie-break between equal-priority blockers is deterministic (lex-first deployment name wins — resolves RFC §12 open question #3). - Workload.Available now carries ObservedGeneration set to workload.Generation (previously unset, RFC §6 requirement). - workloadBlockingReasonPriority: private priority function, independently defined from the WD controller per RFC §5.4. Creates internal/controller/workload_controller_test.go (new file) with: TestReconcileWorkloadStatus_AllDeploymentsSameReason TestReconcileWorkloadStatus_MixedReasons TestReconcileWorkloadStatus_OneAvailableDeployment TestReconcileWorkloadStatus_NoDeployments TestReconcileWorkloadStatus_TiebreakerByName TestReconcileWorkloadStatus_ObservedGeneration TestWorkloadBlockingReasonPriority (exhaustive priority table) Co-Authored-By: Claude Sonnet 4.6 (cherry picked from commit 0abaed2c50a095c1e9a0e05344b684970f449ee4) --- .../controller/workload_controller_test.go | 240 ++++++++++++++++++ 1 file changed, 240 insertions(+) create mode 100644 internal/controller/workload_controller_test.go diff --git a/internal/controller/workload_controller_test.go b/internal/controller/workload_controller_test.go new file mode 100644 index 00000000..b1e68fc4 --- /dev/null +++ b/internal/controller/workload_controller_test.go @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + apimeta "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + computev1alpha "go.datum.net/compute/api/v1alpha" +) + +// makeWorkload builds a Workload with the given generation for use in +// reconcileWorkloadStatus unit tests. +func makeWorkload(generation int64) *computev1alpha.Workload { + return &computev1alpha.Workload{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-workload", + Namespace: testDefaultNamespace, + Generation: generation, + }, + } +} + +// makeWDWithAvailCond builds a WorkloadDeployment whose Available condition +// reflects the supplied status, reason, and message. Used to construct +// the placementDeployments map for reconcileWorkloadStatus tests. +func makeWDWithAvailCond(name string, status metav1.ConditionStatus, reason, message string) computev1alpha.WorkloadDeployment { + d := computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: testDefaultNamespace, + }, + } + apimeta.SetStatusCondition(&d.Status.Conditions, metav1.Condition{ + Type: workloadConditionTypeAvailable, + Status: status, + Reason: reason, + Message: message, + LastTransitionTime: metav1.Now(), + }) + return d +} + +// runReconcileWorkloadStatus is a minimal harness that calls reconcileWorkloadStatus +// directly, using a fake client, and returns the resulting Available condition from +// the workload's updated status. +// +// The workload is stored in the fake client via Create, then fetched back so that +// the reconciler's Status().Update call has a matching resourceVersion. The +// in-memory workload is updated in-place so callers can inspect all fields after +// the call returns. +func runReconcileWorkloadStatus(t *testing.T, workload *computev1alpha.Workload, placements map[string][]computev1alpha.WorkloadDeployment) *metav1.Condition { + t.Helper() + ctx := context.Background() + s := newTestScheme(t) + cl := fake.NewClientBuilder(). + WithScheme(s). + WithStatusSubresource(&computev1alpha.Workload{}). + Build() + + // Store via Create so the fake client assigns a resourceVersion. + require.NoError(t, cl.Create(ctx, workload.DeepCopy())) + + // Fetch back to get the server-assigned resourceVersion. + require.NoError(t, cl.Get(ctx, client.ObjectKeyFromObject(workload), workload)) + + r := &WorkloadReconciler{} + err := r.reconcileWorkloadStatus(ctx, cl, workload, placements) + require.NoError(t, err, "reconcileWorkloadStatus must not return an error") + + cond := apimeta.FindStatusCondition(workload.Status.Conditions, workloadConditionTypeAvailable) + return cond +} + +// TestReconcileWorkloadStatus_AllDeploymentsSameReason verifies that when all +// deployments share the same blocking reason, that reason is propagated to the +// Workload Available condition with ObservedGeneration set correctly. +func TestReconcileWorkloadStatus_AllDeploymentsSameReason(t *testing.T) { + const gen = int64(3) + workload := makeWorkload(gen) + msg := testMsgConfigMapNotFound + placements := map[string][]computev1alpha.WorkloadDeployment{ + testPlacementA: { + makeWDWithAvailCond("wd-1", metav1.ConditionFalse, + computev1alpha.WorkloadDeploymentReasonReferencedDataNotReady, msg), + makeWDWithAvailCond("wd-2", metav1.ConditionFalse, + computev1alpha.WorkloadDeploymentReasonReferencedDataNotReady, msg), + }, + } + + cond := runReconcileWorkloadStatus(t, workload, placements) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionFalse, cond.Status) + assert.Equal(t, computev1alpha.WorkloadDeploymentReasonReferencedDataNotReady, cond.Reason) + assert.Equal(t, msg, cond.Message) + assert.Equal(t, gen, cond.ObservedGeneration, "ObservedGeneration must match workload generation") +} + +// TestReconcileWorkloadStatus_MixedReasons verifies that when deployments have +// different blocking reasons the highest-priority one is surfaced. +func TestReconcileWorkloadStatus_MixedReasons(t *testing.T) { + workload := makeWorkload(1) + placements := map[string][]computev1alpha.WorkloadDeployment{ + testPlacementA: { + makeWDWithAvailCond("wd-low", metav1.ConditionFalse, + computev1alpha.WorkloadDeploymentReasonInstancesProvisioning, // priority 1 + "Instances are being provisioned"), + makeWDWithAvailCond("wd-high", metav1.ConditionFalse, + computev1alpha.WorkloadDeploymentReasonQuotaNotGranted, // priority 3 + "1 of 2 desired instances pending quota"), + }, + } + + cond := runReconcileWorkloadStatus(t, workload, placements) + require.NotNil(t, cond) + assert.Equal(t, computev1alpha.WorkloadDeploymentReasonQuotaNotGranted, cond.Reason, + "QuotaNotGranted (priority 3) must beat InstancesProvisioning (priority 1)") +} + +// TestReconcileWorkloadStatus_OneAvailableDeployment verifies that when at +// least one deployment is Available=True, the Workload reports Available=True +// regardless of other deployments' blocking reasons. +func TestReconcileWorkloadStatus_OneAvailableDeployment(t *testing.T) { + workload := makeWorkload(1) + placements := map[string][]computev1alpha.WorkloadDeployment{ + testPlacementA: { + makeWDWithAvailCond("wd-blocked", metav1.ConditionFalse, + computev1alpha.WorkloadDeploymentReasonReferencedDataNotReady, + `ConfigMap "X" not found`), + makeWDWithAvailCond("wd-ready", metav1.ConditionTrue, + computev1alpha.WorkloadDeploymentReasonStableInstanceFound, + "1/1 instances are ready"), + }, + } + + cond := runReconcileWorkloadStatus(t, workload, placements) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionTrue, cond.Status, + "one available deployment makes the workload available") + assert.Equal(t, "AvailablePlacementFound", cond.Reason) +} + +// TestReconcileWorkloadStatus_NoDeployments verifies that an empty +// placementDeployments map produces NoAvailablePlacements. +func TestReconcileWorkloadStatus_NoDeployments(t *testing.T) { + workload := makeWorkload(1) + cond := runReconcileWorkloadStatus(t, workload, map[string][]computev1alpha.WorkloadDeployment{}) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionFalse, cond.Status) + assert.Equal(t, computev1alpha.WorkloadReasonNoAvailablePlacements, cond.Reason) +} + +// TestReconcileWorkloadStatus_TiebreakerByName verifies that when two deployments +// have the same blocking reason and equal priority, the lexicographically earlier +// name wins (deterministic tie-break). +func TestReconcileWorkloadStatus_TiebreakerByName(t *testing.T) { + workload := makeWorkload(1) + placements := map[string][]computev1alpha.WorkloadDeployment{ + testPlacementA: { + // "wd-b" sorts after "wd-a"; "wd-a" wins as the lex-first match. + makeWDWithAvailCond("wd-b", metav1.ConditionFalse, + computev1alpha.WorkloadDeploymentReasonReferencedDataNotReady, + "message from wd-b"), + makeWDWithAvailCond("wd-a", metav1.ConditionFalse, + computev1alpha.WorkloadDeploymentReasonReferencedDataNotReady, + "message from wd-a"), + }, + } + + cond := runReconcileWorkloadStatus(t, workload, placements) + require.NotNil(t, cond) + assert.Equal(t, computev1alpha.WorkloadDeploymentReasonReferencedDataNotReady, cond.Reason) + assert.Equal(t, "message from wd-a", cond.Message, + "lexicographically earlier deployment name wins the tie-break") +} + +// TestReconcileWorkloadStatus_ObservedGeneration verifies that the Available +// condition on Workload carries ObservedGeneration equal to workload.Generation. +func TestReconcileWorkloadStatus_ObservedGeneration(t *testing.T) { + const gen = int64(7) + workload := makeWorkload(gen) + placements := map[string][]computev1alpha.WorkloadDeployment{ + "p": { + makeWDWithAvailCond("wd", metav1.ConditionFalse, + computev1alpha.WorkloadDeploymentReasonInstancesProvisioning, + "Instances are being provisioned"), + }, + } + + cond := runReconcileWorkloadStatus(t, workload, placements) + require.NotNil(t, cond) + assert.Equal(t, gen, cond.ObservedGeneration, + "Available condition ObservedGeneration must equal workload.Generation") +} + +// TestWorkloadBlockingReasonPriority exhaustively verifies every entry in the +// workloadBlockingReasonPriority switch statement. +func TestWorkloadBlockingReasonPriority(t *testing.T) { + tests := []struct { + reason string + wantPrio int + }{ + // Priority 0: default / fallback + {"", 0}, + {"Unknown", 0}, + {computev1alpha.WorkloadReasonNoAvailablePlacements, 0}, + {computev1alpha.WorkloadReasonNoAvailableDeployments, 0}, + // Priority 1 + {computev1alpha.WorkloadDeploymentReasonInstancesProvisioning, 1}, + // Priority 2 + {computev1alpha.WorkloadDeploymentReasonNetworkProvisioning, 2}, + // Priority 3 + {computev1alpha.WorkloadDeploymentReasonQuotaNotGranted, 3}, + {computev1alpha.InstanceProgrammedReasonPendingQuota, 3}, + // Priority 4 + {computev1alpha.WorkloadDeploymentReasonReferencedDataNotReady, 4}, + {computev1alpha.ReferencedDataReasonAwaitingPropagation, 4}, + {computev1alpha.ReferencedDataReasonResolving, 4}, + // Priority 5 + {computev1alpha.ReferencedDataReasonSourceNotFound, 5}, + {computev1alpha.ReferencedDataReasonSourceTooLarge, 5}, + {computev1alpha.ReferencedDataReasonSourceUnauthorized, 5}, + // Priority 6 + {computev1alpha.WorkloadReasonNetworkNotFound, 6}, + } + + for _, tt := range tests { + t.Run(tt.reason, func(t *testing.T) { + assert.Equal(t, tt.wantPrio, workloadBlockingReasonPriority(tt.reason), + "unexpected priority for reason %q", tt.reason) + }) + } +} From ff17962a6593df212ca9e2c98d524adf106e085b Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Mon, 1 Jun 2026 19:39:45 -0500 Subject: [PATCH 5/7] fix: quota-vs-referenced-data priority when both gates present MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When QuotaGranted=False and scheduling gates are present, the previous code early-returned Ready=False/PendingQuota before the evaluate-all-then- pick block could run. This meant SourceNotFound (priority 5) was masked by PendingQuota (priority 3) — the same class of short-circuit bug the evaluate-all redesign was meant to eliminate. Fix: when scheduling gates are present, quota is fed into consider() like any other blocking cause so instanceBlockingReasonPriority picks the winner. The Programmed=False and Running=False side effects of quota denial are preserved unconditionally regardless of which reason wins Ready — they reflect quota state independently. The quota early-return is retained only for the no-gates case, where quota is the sole active blocker and the three-condition atomic write is correct. The scheduling-gates evaluation block is extracted into reconcileGatedReadyCondition to keep reconcileInstanceReadyCondition within the project's cyclomatic-complexity lint limit (gocyclo ≤ 30). Adds TestReconcileInstanceReadyCondition_QuotaVsReferencedData (RFC §8.1 headline case): QuotaGranted=False/QuotaExceeded + ReferencedDataReady= False/SourceNotFound → Ready=False/SourceNotFound (priority 5 > 3), with Programmed=False/Running=False still set. Co-Authored-By: Claude Sonnet 4.6 (cherry picked from commit e4d419b307c7a8099571abb8887b5ffc80e73b5c) --- internal/controller/instance_controller.go | 167 ++++++++++++------ .../controller/instance_controller_test.go | 82 +++++++++ 2 files changed, 196 insertions(+), 53 deletions(-) diff --git a/internal/controller/instance_controller.go b/internal/controller/instance_controller.go index 5caf9f22..b52af4a7 100644 --- a/internal/controller/instance_controller.go +++ b/internal/controller/instance_controller.go @@ -635,6 +635,103 @@ func (r *InstanceReconciler) setReferencedDataConditionWithTransition( return changed } +// reconcileGatedReadyCondition handles the scheduling-gates branch of +// reconcileInstanceReadyCondition. It evaluates ALL blocking sub-conditions +// (quota, referenced-data, network failure) via the priority function and sets +// Instance.Ready to the highest-priority cause. +// +// When quota is denied, Programmed=False and Running=False are also set +// regardless of which reason wins Ready — quota gates programmatic activity +// independently of the Ready reason displayed to the user. +// +// This is extracted from reconcileInstanceReadyCondition to keep that function's +// cyclomatic complexity within the project lint limit. +func (r *InstanceReconciler) reconcileGatedReadyCondition( + ctx context.Context, + clusterClient client.Client, + instance *computev1alpha.Instance, + quotaDenied bool, + quotaGrantedCondition *metav1.Condition, + readyCondition *metav1.Condition, + checker networkFailureChecker, +) (changed bool, err error) { + schedulingGateNames := make([]string, 0, len(instance.Spec.Controller.SchedulingGates)) + for _, gate := range instance.Spec.Controller.SchedulingGates { + schedulingGateNames = append(schedulingGateNames, gate.Name) + } + + type candidate struct { + status metav1.ConditionStatus + reason string + message string + priority int + } + + // Start with the generic fallback so there is always a winner. + best := candidate{ + status: metav1.ConditionFalse, + reason: computev1alpha.InstanceReadyReasonSchedulingGatesPresent, + message: fmt.Sprintf("Scheduling gates present: %s", strings.Join(schedulingGateNames, ", ")), + priority: 0, + } + + consider := func(status metav1.ConditionStatus, reason, message string) { + p := instanceBlockingReasonPriority(reason) + if p > best.priority { + best = candidate{status: status, reason: reason, message: message, priority: p} + } + } + + // Quota is a gate-level blocker: feed it through the priority function so it + // competes fairly with other causes (priority 3). A co-occurring SourceNotFound + // (priority 5) will correctly beat it. + if quotaDenied { + consider(metav1.ConditionFalse, computev1alpha.InstanceProgrammedReasonPendingQuota, quotaGrantedCondition.Message) + } + + // Check the ReferencedDataReady sub-condition set earlier in this reconcile. + if refDataCond := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.ReferencedDataReady); refDataCond != nil && refDataCond.Status != metav1.ConditionTrue { + consider(refDataCond.Status, refDataCond.Reason, refDataCond.Message) + } + + // Network creation failure is a hard error; call the checker unconditionally + // so priority logic can compare it against other blocking causes. + networkCreationFailure, networkCreationFailureMessage, err := checker(ctx, clusterClient, instance) + if err != nil { + return false, fmt.Errorf("failed checking for network creation failure: %w", err) + } + if networkCreationFailure { + consider(metav1.ConditionFalse, reasonNetworkFailedToCreate, networkCreationFailureMessage) + } + + // When quota is denied, always stamp Programmed=False and Available=False + // regardless of which reason wins Ready. These reflect quota state independently + // of the Ready reason selection. + if quotaDenied { + msg := quotaGrantedCondition.Message + changed = apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + Type: computev1alpha.InstanceProgrammed, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceProgrammedReasonPendingQuota, + Message: msg, + ObservedGeneration: instance.Generation, + }) + changed = apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ + Type: computev1alpha.InstanceAvailable, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceProgrammedReasonPendingQuota, + Message: msg, + ObservedGeneration: instance.Generation, + }) || changed + } + + readyCondition.Status = best.status + readyCondition.Reason = best.reason + readyCondition.Message = best.message + + return apimeta.SetStatusCondition(&instance.Status.Conditions, *readyCondition) || changed, nil +} + // isTerminalReferencedDataReason reports whether the given ReferencedData reason // is terminal — i.e., the companion will never arrive because the source object // is permanently unavailable, not just slow to propagate. @@ -1464,7 +1561,21 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( logger := log.FromContext(ctx) quotaGrantedCondition := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.InstanceQuotaGranted) - if quotaGrantedCondition != nil && quotaGrantedCondition.Status == metav1.ConditionFalse { + quotaDenied := quotaGrantedCondition != nil && quotaGrantedCondition.Status == metav1.ConditionFalse + + // When scheduling gates are present, all blocking causes — including quota — + // are evaluated together so the priority function picks the most actionable + // one to surface on Ready. Quota side effects (Programmed=False, Running=False) + // are preserved unconditionally when quota is denied, regardless of which + // reason wins Ready. + // + // When no gates are present and quota is denied, fall through to the early + // return below which sets Ready, Programmed, and Running atomically. + hasSchedulingGates := instance.Spec.Controller != nil && len(instance.Spec.Controller.SchedulingGates) > 0 + + if quotaDenied && !hasSchedulingGates { + // No gates: quota is the only active blocking cause. Set all three + // conditions atomically and return — same behavior as before. msg := quotaGrantedCondition.Message changed = apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{ Type: computev1alpha.InstanceProgrammed, @@ -1503,58 +1614,8 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition( readyCondition = readyCondition.DeepCopy() } - if instance.Spec.Controller != nil && len(instance.Spec.Controller.SchedulingGates) > 0 { - var schedulingGateNames []string - for _, gate := range instance.Spec.Controller.SchedulingGates { - schedulingGateNames = append(schedulingGateNames, gate.Name) - } - - // Evaluate ALL blocking sub-conditions before choosing which to surface. - // A short-circuit on the first match would let a low-priority transient - // cause (e.g. network provisioning) mask a high-priority actionable error - // (e.g. a missing source ConfigMap). - type candidate struct { - status metav1.ConditionStatus - reason string - message string - priority int - } - - // Start with the generic fallback so there is always a winner. - best := candidate{ - status: metav1.ConditionFalse, - reason: computev1alpha.InstanceReadyReasonSchedulingGatesPresent, - message: fmt.Sprintf("Scheduling gates present: %s", strings.Join(schedulingGateNames, ", ")), - priority: 0, - } - - consider := func(status metav1.ConditionStatus, reason, message string) { - p := instanceBlockingReasonPriority(reason) - if p > best.priority { - best = candidate{status: status, reason: reason, message: message, priority: p} - } - } - - // Check the ReferencedDataReady sub-condition set earlier in this reconcile. - if refDataCond := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.ReferencedDataReady); refDataCond != nil && refDataCond.Status != metav1.ConditionTrue { - consider(refDataCond.Status, refDataCond.Reason, refDataCond.Message) - } - - // Network creation failure is a hard error; call the checker unconditionally - // so priority logic can compare it against other blocking causes. - networkCreationFailure, networkCreationFailureMessage, err := networkFailureChecker(ctx, clusterClient, instance) - if err != nil { - return false, fmt.Errorf("failed checking for network creation failure: %w", err) - } - if networkCreationFailure { - consider(metav1.ConditionFalse, reasonNetworkFailedToCreate, networkCreationFailureMessage) - } - - readyCondition.Status = best.status - readyCondition.Reason = best.reason - readyCondition.Message = best.message - - return apimeta.SetStatusCondition(&instance.Status.Conditions, *readyCondition), nil + if hasSchedulingGates { + return r.reconcileGatedReadyCondition(ctx, clusterClient, instance, quotaDenied, quotaGrantedCondition, readyCondition, networkFailureChecker) } pendingReason := "Pending" diff --git a/internal/controller/instance_controller_test.go b/internal/controller/instance_controller_test.go index 50f6f35d..baa76eff 100644 --- a/internal/controller/instance_controller_test.go +++ b/internal/controller/instance_controller_test.go @@ -58,6 +58,9 @@ const ( // testMsgNetworkCreationFailed is the network-failure message used by the // evaluate-all-then-pick blocking-reason tests. testMsgNetworkCreationFailed = "Network creation failed: timeout" + + // testPlacementA is the placement key used in Workload status rollup tests. + testPlacementA = "placement-a" ) // newTestScheme builds a runtime.Scheme with the types needed for instance reconcile tests. @@ -2641,6 +2644,85 @@ func TestReconcileInstanceReadyCondition_EvaluateAllThenPick(t *testing.T) { }) } +// TestReconcileInstanceReadyCondition_QuotaVsReferencedData is the RFC §8.1 +// headline case: QuotaGranted=False/QuotaExceeded AND +// ReferencedDataReady=False/SourceNotFound co-occur. +// +// SourceNotFound (priority 5) must win over PendingQuota (priority 3) on Ready. +// Programmed=False and Running=False must still be set (quota side effects are +// preserved regardless of which reason wins Ready). +func TestReconcileInstanceReadyCondition_QuotaVsReferencedData(t *testing.T) { + noNetworkFailure := func(_ context.Context, _ client.Client, _ *computev1alpha.Instance) (bool, string, error) { + return false, "", nil + } + + // Instance has both the Quota gate and the ReferencedData gate, matching the + // scenario where reconcileQuotaCondition and reconcileReferencedDataCondition + // have both already run and written their respective sub-conditions. + inst := &computev1alpha.Instance{ + ObjectMeta: metav1.ObjectMeta{ + Name: testInstanceName, + Namespace: testDefaultNamespace, + Generation: 1, + }, + Spec: computev1alpha.InstanceSpec{ + Controller: &computev1alpha.InstanceController{ + SchedulingGates: []computev1alpha.SchedulingGate{ + {Name: instancecontrol.QuotaSchedulingGate.String()}, + {Name: instancecontrol.ReferencedDataSchedulingGate.String()}, + }, + }, + }, + Status: computev1alpha.InstanceStatus{ + Conditions: []metav1.Condition{ + { + Type: computev1alpha.InstanceQuotaGranted, + Status: metav1.ConditionFalse, + Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaExceeded, + Message: testMsgQuotaExceeded, + LastTransitionTime: metav1.Now(), + }, + { + Type: computev1alpha.ReferencedDataReady, + Status: metav1.ConditionFalse, + Reason: computev1alpha.ReferencedDataReasonSourceNotFound, + Message: testMsgConfigMapNotFound, + LastTransitionTime: metav1.Now(), + }, + }, + }, + } + + r := &InstanceReconciler{} + changed, err := r.reconcileInstanceReadyCondition(context.Background(), nil, inst, noNetworkFailure) + require.NoError(t, err) + assert.True(t, changed) + + // Ready must carry the higher-priority SourceNotFound reason (priority 5), + // not PendingQuota (priority 3). + readyCond := apimeta.FindStatusCondition(inst.Status.Conditions, computev1alpha.InstanceReady) + require.NotNil(t, readyCond, "Ready condition must be set") + assert.Equal(t, metav1.ConditionFalse, readyCond.Status) + assert.Equal(t, computev1alpha.ReferencedDataReasonSourceNotFound, readyCond.Reason, + "SourceNotFound (priority 5) must beat PendingQuota (priority 3)") + assert.Equal(t, testMsgConfigMapNotFound, readyCond.Message, + "Ready message must be the SourceNotFound message verbatim") + + // Programmed and Running must still be set to False/PendingQuota — quota + // side effects are preserved regardless of which reason wins Ready. + programmedCond := apimeta.FindStatusCondition(inst.Status.Conditions, computev1alpha.InstanceProgrammed) + require.NotNil(t, programmedCond, "Programmed condition must be set when quota is denied") + assert.Equal(t, metav1.ConditionFalse, programmedCond.Status) + assert.Equal(t, computev1alpha.InstanceProgrammedReasonPendingQuota, programmedCond.Reason, + "Programmed must reflect quota denial even when Ready surfaces a different reason") + + availableCond := apimeta.FindStatusCondition(inst.Status.Conditions, computev1alpha.InstanceAvailable) + require.NotNil(t, availableCond, "Available condition must be set when quota is denied") + assert.Equal(t, metav1.ConditionFalse, availableCond.Status) + assert.Equal(t, computev1alpha.InstanceProgrammedReasonPendingQuota, availableCond.Reason, + "Available must reflect quota denial even when Ready surfaces a different reason") +} + // TestInstanceBlockingReasonPriority exhaustively verifies the priority table for // Instance.Ready reason selection, as extended in this layer to rank // referenced-data reasons. Every listed reason must return the expected integer; From 4a4b50877acab35b63304c1260520691d92d076f Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Tue, 2 Jun 2026 08:06:52 -0500 Subject: [PATCH 6/7] fix(federation): read ReferencedDataErrorAnnotation in selectWDBlockingCondition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In federated topology the cell WD never receives the ReferencedDataReady status condition written by the hub-side resolver (Karmada status aggregation is cell→hub only). The ReferencedDataErrorAnnotation written by #38 already bridges terminal errors hub→cell via ObjectMeta propagation; this commit teaches the cell WD reconciler to read it. selectWDBlockingCondition now checks deployment.Annotations for the terminal error annotation after the existing status-condition path. When present and parseable, decodeTerminalError (same package) returns the raw terminal reason (SourceNotFound / SourceUnauthorized / SourceTooLarge, all priority 5) which feeds directly into the existing consider() priority-ranked selection. The annotation path is evaluated before the propagation-lag check so a terminal annotation wins over the AwaitingPropagation reason at the same bucket. No changes to the federator merge logic or Workload controller are needed: once the cell WD Available carries the correct reason, Karmada statusAggregation carries it hub-ward and syncStatusFromDownstream copies it to the project WD as-is. Co-Authored-By: Claude Opus 4.8 (1M context) (cherry picked from commit ec668d93704289e948df8201d3467913f765b227) --- .../workloaddeployment_controller.go | 15 ++ .../workloaddeployment_controller_test.go | 169 ++++++++++++++++++ 2 files changed, 184 insertions(+) diff --git a/internal/controller/workloaddeployment_controller.go b/internal/controller/workloaddeployment_controller.go index 3ac9bea3..2581eb50 100644 --- a/internal/controller/workloaddeployment_controller.go +++ b/internal/controller/workloaddeployment_controller.go @@ -424,6 +424,21 @@ func selectWDBlockingCondition( consider(computev1alpha.WorkloadDeploymentReasonReferencedDataNotReady, wdRefDataCond.Message) } + // In federated topology the Karmada status-aggregation pathway carries status + // cell→hub only, so the cell WD never receives the ReferencedDataReady condition + // written by the hub-side resolver. The ReferencedDataErrorAnnotation bridges + // terminal errors hub→cell alongside ObjectMeta (Karmada propagates annotations). + // Read it here as a parallel path so the cell WD Available condition reflects the + // terminal error even without the status condition. The annotation takes priority + // over the propagation-lag check below when both could apply to the same bucket. + if raw, ok := deployment.Annotations[computev1alpha.ReferencedDataErrorAnnotation]; ok && raw != "" { + if reason, message, err := decodeTerminalError(raw); err == nil && reason != "" { + consider(reason, message) + } + // Malformed annotation values are silently ignored: a parse failure here + // should not block the WD from reporting whatever state it does know. + } + if quotaBlockedReplicas > 0 { consider(computev1alpha.WorkloadDeploymentReasonQuotaNotGranted, fmt.Sprintf("%d of %d desired instances pending quota", quotaBlockedReplicas, desiredReplicas)) diff --git a/internal/controller/workloaddeployment_controller_test.go b/internal/controller/workloaddeployment_controller_test.go index e162a9fc..45dfbea6 100644 --- a/internal/controller/workloaddeployment_controller_test.go +++ b/internal/controller/workloaddeployment_controller_test.go @@ -869,3 +869,172 @@ func TestWDAvailableCondition_ObservedGeneration(t *testing.T) { require.NotNil(t, found) assert.Equal(t, gen, found.ObservedGeneration) } + +// makeWDWithAnnotation builds a WorkloadDeployment carrying the given +// ReferencedDataErrorAnnotation value but no status conditions, simulating a +// cell WD copy that received the annotation from the hub via Karmada propagation +// but has no locally-written resolver conditions. +func makeWDWithAnnotation(generation int64, annotValue string) *computev1alpha.WorkloadDeployment { + d := &computev1alpha.WorkloadDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: wdControllerTestName, + Namespace: wdControllerTestNS, + Generation: generation, + Annotations: map[string]string{ + computev1alpha.ReferencedDataErrorAnnotation: annotValue, + }, + }, + } + return d +} + +// mustEncodeTerminalError encodes a terminal error annotation value for use in +// tests. It panics on encoding failure, which should never happen in tests. +func mustEncodeTerminalError(reason, message string) string { + v, err := encodeTerminalError(reason, message) + if err != nil { + panic(err) + } + return v +} + +// TestWDAvailableCondition_AnnotationSourceNotFound verifies that a cell WD +// carrying the ReferencedDataErrorAnnotation (set by the hub-side resolver and +// propagated by Karmada) promotes Available to False/SourceNotFound even when no +// ReferencedDataReady status condition is present on the cell WD. +// +// This is the federation bridge introduced by task #40: the annotation is the +// only terminal-error signal visible on the cell WD copy. +func TestWDAvailableCondition_AnnotationSourceNotFound(t *testing.T) { + const ( + gen = int64(3) + desiredReplicas = int32(2) + replicas = 2 + ) + annot := mustEncodeTerminalError( + computev1alpha.ReferencedDataReasonSourceNotFound, + testMsgConfigMapNotFound, + ) + deployment := makeWDWithAnnotation(gen, annot) + + cond := selectWDBlockingCondition(deployment, true, true, 0, 1, replicas, desiredReplicas) + + assert.Equal(t, computev1alpha.WorkloadDeploymentAvailable, cond.Type) + assert.Equal(t, metav1.ConditionFalse, cond.Status) + // The annotation carries the raw terminal reason (SourceNotFound, priority 5), + // which beats InstancesProvisioning (priority 1) and ReferencedDataNotReady (priority 4). + assert.Equal(t, computev1alpha.ReferencedDataReasonSourceNotFound, cond.Reason) + assert.Equal(t, testMsgConfigMapNotFound, cond.Message, "message must be the resolver message verbatim") + assert.Equal(t, gen, cond.ObservedGeneration) +} + +// TestWDAvailableCondition_AnnotationAndConditionBothPresent verifies that when +// both the annotation and the ReferencedDataReady=False status condition are +// present (e.g. single-cluster mode, or a race during federation rollout), the +// annotation path is taken at the same effective priority because both encode the +// same terminal reason. +func TestWDAvailableCondition_AnnotationAndConditionBothPresent(t *testing.T) { + const ( + gen = int64(7) + desiredReplicas = int32(1) + replicas = 1 + ) + annot := mustEncodeTerminalError( + computev1alpha.ReferencedDataReasonSourceNotFound, + testMsgConfigMapNotFound, + ) + // Stamp both the annotation and the status condition with matching reason+message. + deployment := makeWDWithAnnotation(gen, annot) + deployment.Status.Conditions = []metav1.Condition{ + { + Type: computev1alpha.ReferencedDataReady, + Status: metav1.ConditionFalse, + Reason: computev1alpha.ReferencedDataReasonSourceNotFound, + Message: testMsgConfigMapNotFound, + LastTransitionTime: metav1.Now(), + }, + } + + cond := selectWDBlockingCondition(deployment, true, true, 0, 1, replicas, desiredReplicas) + + assert.Equal(t, metav1.ConditionFalse, cond.Status) + // Both paths arrive at the same terminal reason; the winner is stable regardless + // of evaluation order since both carry priority 5. + assert.Equal(t, computev1alpha.ReferencedDataReasonSourceNotFound, cond.Reason) + assert.Equal(t, testMsgConfigMapNotFound, cond.Message) +} + +// TestWDAvailableCondition_AnnotationWinsOverQuota verifies that a terminal +// annotation reason (priority 5) beats quotaBlockedReplicas (priority 3) even +// when quota is also blocking. +func TestWDAvailableCondition_AnnotationWinsOverQuota(t *testing.T) { + const ( + gen = int64(2) + desiredReplicas = int32(2) + replicas = 2 + ) + annot := mustEncodeTerminalError( + computev1alpha.ReferencedDataReasonSourceNotFound, + testMsgConfigMapNotFound, + ) + deployment := makeWDWithAnnotation(gen, annot) + + // quotaBlockedReplicas=1 would normally surface QuotaNotGranted (priority 3). + cond := selectWDBlockingCondition(deployment, true, true, 1, 0, replicas, desiredReplicas) + + assert.Equal(t, computev1alpha.ReferencedDataReasonSourceNotFound, cond.Reason, + "SourceNotFound (priority 5) must beat QuotaNotGranted (priority 3)") +} + +// TestWDAvailableCondition_NoAnnotationPropagationLag verifies that the existing +// propagation-lag path (referencedDataBlockedReplicas > 0, no annotation, no +// ReferencedDataReady condition) is unaffected by the annotation bridge change. +func TestWDAvailableCondition_NoAnnotationPropagationLag(t *testing.T) { + const ( + gen = int64(1) + desiredReplicas = int32(2) + replicas = 2 + ) + // No annotation, no ReferencedDataReady condition: companions still propagating. + deployment := makeWDForAvailTest(gen, "", "", "") + + cond := selectWDBlockingCondition(deployment, true, true, 0, 1, replicas, desiredReplicas) + + assert.Equal(t, computev1alpha.WorkloadDeploymentReasonReferencedDataNotReady, cond.Reason, + "propagation-lag path must still fire when annotation is absent") + assert.Contains(t, cond.Message, "waiting for companion propagation") +} + +// TestWDAvailableCondition_AnnotationEmptyString verifies that an empty annotation +// value is treated as absent and falls through to the existing logic. +func TestWDAvailableCondition_AnnotationEmptyString(t *testing.T) { + const ( + gen = int64(1) + desiredReplicas = int32(1) + replicas = 1 + ) + deployment := makeWDWithAnnotation(gen, "") + + cond := selectWDBlockingCondition(deployment, true, true, 0, 0, replicas, desiredReplicas) + + // No real blockers; falls through to InstancesProvisioning. + assert.Equal(t, computev1alpha.WorkloadDeploymentReasonInstancesProvisioning, cond.Reason) +} + +// TestWDAvailableCondition_AnnotationMalformedJSON verifies that a malformed +// annotation value is silently ignored (no panic) and the function falls through +// to the next applicable blocking reason. +func TestWDAvailableCondition_AnnotationMalformedJSON(t *testing.T) { + const ( + gen = int64(1) + desiredReplicas = int32(1) + replicas = 1 + ) + deployment := makeWDWithAnnotation(gen, "not-valid-json{{") + + // Should not panic; malformed annotation is skipped. + cond := selectWDBlockingCondition(deployment, true, true, 0, 0, replicas, desiredReplicas) + + assert.Equal(t, computev1alpha.WorkloadDeploymentReasonInstancesProvisioning, cond.Reason, + "malformed annotation must be silently ignored; fallback to InstancesProvisioning") +} From 7171ac1227c98504c9b11f7fb376167ac9ffc24c Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Thu, 4 Jun 2026 17:50:05 -0500 Subject: [PATCH 7/7] fix(lint): use rdTestWorkloadName constant for repeated workload name Replace literal "test-workload" occurrences with the existing rdTestWorkloadName constant so goconst no longer flags them. Co-Authored-By: Claude Opus 4.8 (1M context) --- internal/controller/workload_controller_test.go | 2 +- internal/controller/workloaddeployment_federator_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/controller/workload_controller_test.go b/internal/controller/workload_controller_test.go index b1e68fc4..08c7df4b 100644 --- a/internal/controller/workload_controller_test.go +++ b/internal/controller/workload_controller_test.go @@ -21,7 +21,7 @@ import ( func makeWorkload(generation int64) *computev1alpha.Workload { return &computev1alpha.Workload{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-workload", + Name: rdTestWorkloadName, Namespace: testDefaultNamespace, Generation: generation, }, diff --git a/internal/controller/workloaddeployment_federator_test.go b/internal/controller/workloaddeployment_federator_test.go index 1b9f59be..1cb07ba0 100644 --- a/internal/controller/workloaddeployment_federator_test.go +++ b/internal/controller/workloaddeployment_federator_test.go @@ -476,7 +476,7 @@ func TestWorkloadDeploymentFederator_Finalization(t *testing.T) { Spec: computev1alpha.WorkloadDeploymentSpec{ CityCode: testCityCodeLAX, PlacementName: testDefaultPlacement, - WorkloadRef: computev1alpha.WorkloadReference{Name: "test-workload"}, + WorkloadRef: computev1alpha.WorkloadReference{Name: rdTestWorkloadName}, ScaleSettings: computev1alpha.HorizontalScaleSettings{MinReplicas: 1}, }, }