Skip to content
46 changes: 46 additions & 0 deletions api/v1alpha/instance_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -597,6 +597,52 @@ const (
InstanceProgrammedReasonProgrammed = "Programmed"
)

// Stable, machine-readable Reason values for the top-level readiness
// conditions (Instance.Ready, WorkloadDeployment.Available,
// Workload.Available). Each reason is published together with a
// human-readable message, so reading one condition is enough for a client to
// identify what is blocking readiness.
const (
	// ---- Reasons surfaced on WorkloadDeployment.Available ----

	// WorkloadDeploymentReasonStableInstanceFound reports that the deployment
	// has at least one ready instance.
	WorkloadDeploymentReasonStableInstanceFound = "StableInstanceFound"

	// WorkloadDeploymentReasonInstancesProvisioning reports that instances
	// exist but none of them is ready yet. It supersedes the inline literal
	// "ProvisioningInstances" that was emitted previously.
	WorkloadDeploymentReasonInstancesProvisioning = "InstancesProvisioning"

	// WorkloadDeploymentReasonNetworkProvisioning reports that the network
	// binding or subnet is still being provisioned. It supersedes the inline
	// literal "ProvisioningNetwork" that was emitted previously.
	WorkloadDeploymentReasonNetworkProvisioning = "NetworkProvisioning"

	// WorkloadDeploymentReasonNoMatchingLocation reports that no Location
	// matches the deployment's city code. The accompanying message names the
	// unresolved city; network provisioning cannot begin until such a Location
	// exists.
	WorkloadDeploymentReasonNoMatchingLocation = "NoMatchingLocation"

	// ---- Reasons surfaced on both WorkloadDeployment.Available and
	// Workload.Available ----

	// WorkloadDeploymentReasonQuotaNotGranted reports that quota is blocking
	// one or more instances.
	WorkloadDeploymentReasonQuotaNotGranted = "QuotaNotGranted"

	// WorkloadDeploymentReasonReferencedDataNotReady reports that the
	// worst-blocking sub-condition is a ReferencedData failure. The message is
	// the ReferencedDataReady sub-condition's message, carried verbatim.
	WorkloadDeploymentReasonReferencedDataNotReady = "ReferencedDataNotReady"

	// ---- Reasons surfaced at the Workload level ----

	// WorkloadReasonNetworkNotFound is used on Workload.Available when one or
	// more networks referenced by network interfaces do not exist.
	WorkloadReasonNetworkNotFound = "NetworkNotFound"

	// WorkloadReasonNoAvailableDeployments is used on a placement's Available
	// condition when no deployment in that placement is available.
	WorkloadReasonNoAvailableDeployments = "NoAvailableDeployments"

	// WorkloadReasonNoAvailablePlacements is used on Workload.Available when
	// all placements report no available deployments. It is the last-resort
	// default reason.
	WorkloadReasonNoAvailablePlacements = "NoAvailablePlacements"
)

type InstanceTemplateSpec struct {
// Metadata of the instances created from this template
//
Expand Down
162 changes: 135 additions & 27 deletions internal/controller/instance_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -507,7 +507,9 @@ func (r *InstanceReconciler) reconcileReferencedDataCondition(
// Stamp the gate-start annotation once so we can measure duration later.
r.stampGateStartAnnotation(ctx, cl, instance)

// Fetch the owning WorkloadDeployment to read the expected-set annotation.
// Fetch the owning WorkloadDeployment to read the expected-set annotation and
// the resolver's verdict. fetchOwnerWorkloadDeployment is already called every
// reconcile on this path, so this is zero extra API calls.
wd, err := r.fetchOwnerWorkloadDeployment(ctx, cl, instance)
if err != nil {
return referencedDataResult{}, 0, err
Expand Down Expand Up @@ -633,6 +635,103 @@ func (r *InstanceReconciler) setReferencedDataConditionWithTransition(
return changed
}

// reconcileGatedReadyCondition handles the scheduling-gates branch of
// reconcileInstanceReadyCondition. It collects every blocking cause it can
// observe (denied quota, a non-True ReferencedDataReady sub-condition, and a
// network creation failure reported by checker), ranks them with
// instanceBlockingReasonPriority, and writes the highest-ranked cause onto
// Instance.Ready. If no cause outranks the fallback, Ready is set to False
// with reason SchedulingGatesPresent and a message listing the gate names.
//
// When quotaDenied is true, the InstanceProgrammed and InstanceAvailable
// conditions are additionally set to False with reason PendingQuota, no matter
// which reason wins Ready: quota state is reported independently of the Ready
// reason shown to the user.
//
// NOTE(review): earlier wording of this comment said "Running=False", but the
// condition type stamped here is InstanceAvailable — confirm which type is
// intended.
//
// Parameters:
//   - quotaDenied: whether the caller found the quota condition to be False.
//   - quotaGrantedCondition: source of the quota message; a nil value is
//     tolerated and yields an empty message rather than a panic.
//   - readyCondition: the Ready condition to fill in; it is mutated in place
//     and then stored on the instance.
//   - checker: reports whether network creation failed, plus a message.
//
// It returns changed=true when any stored condition was modified. If checker
// fails, the error is wrapped and returned before any condition is touched.
//
// This logic lives outside reconcileInstanceReadyCondition to keep that
// function's cyclomatic complexity within the project lint limit.
func (r *InstanceReconciler) reconcileGatedReadyCondition(
	ctx context.Context,
	clusterClient client.Client,
	instance *computev1alpha.Instance,
	quotaDenied bool,
	quotaGrantedCondition *metav1.Condition,
	readyCondition *metav1.Condition,
	checker networkFailureChecker,
) (changed bool, err error) {
	// The caller only routes here when gates exist, but guard the pointer so a
	// misuse degrades to an empty gate list instead of a nil dereference.
	var schedulingGateNames []string
	if controller := instance.Spec.Controller; controller != nil {
		schedulingGateNames = make([]string, 0, len(controller.SchedulingGates))
		for _, gate := range controller.SchedulingGates {
			schedulingGateNames = append(schedulingGateNames, gate.Name)
		}
	}

	// Read the quota message once; tolerate a nil condition even when the
	// caller claims quota is denied.
	var quotaMessage string
	if quotaGrantedCondition != nil {
		quotaMessage = quotaGrantedCondition.Message
	}

	type candidate struct {
		status   metav1.ConditionStatus
		reason   string
		message  string
		priority int
	}

	// Start with the generic fallback so there is always a winner.
	best := candidate{
		status:   metav1.ConditionFalse,
		reason:   computev1alpha.InstanceReadyReasonSchedulingGatesPresent,
		message:  fmt.Sprintf("Scheduling gates present: %s", strings.Join(schedulingGateNames, ", ")),
		priority: 0,
	}

	// consider replaces the current winner only when the new cause ranks
	// strictly higher, so on a tie the earlier-evaluated cause is kept.
	consider := func(status metav1.ConditionStatus, reason, message string) {
		if p := instanceBlockingReasonPriority(reason); p > best.priority {
			best = candidate{status: status, reason: reason, message: message, priority: p}
		}
	}

	// Quota is a gate-level blocker: it goes through the same priority ranking
	// as every other cause, so a higher-ranked co-occurring cause wins Ready.
	if quotaDenied {
		consider(metav1.ConditionFalse, computev1alpha.InstanceProgrammedReasonPendingQuota, quotaMessage)
	}

	// Check the ReferencedDataReady sub-condition currently stored on the
	// instance; its own status (False or Unknown) is propagated if it wins.
	if refDataCond := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.ReferencedDataReady); refDataCond != nil && refDataCond.Status != metav1.ConditionTrue {
		consider(refDataCond.Status, refDataCond.Reason, refDataCond.Message)
	}

	// Network creation failure is a hard error; call the checker unconditionally
	// so priority logic can compare it against other blocking causes. This runs
	// before any condition is written, so an error leaves the instance untouched.
	networkCreationFailure, networkCreationFailureMessage, err := checker(ctx, clusterClient, instance)
	if err != nil {
		return false, fmt.Errorf("failed checking for network creation failure: %w", err)
	}
	if networkCreationFailure {
		consider(metav1.ConditionFalse, reasonNetworkFailedToCreate, networkCreationFailureMessage)
	}

	// When quota is denied, always stamp Programmed=False and Available=False
	// regardless of which reason wins Ready.
	if quotaDenied {
		programmedChanged := apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{
			Type:               computev1alpha.InstanceProgrammed,
			Status:             metav1.ConditionFalse,
			Reason:             computev1alpha.InstanceProgrammedReasonPendingQuota,
			Message:            quotaMessage,
			ObservedGeneration: instance.Generation,
		})
		availableChanged := apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{
			Type:               computev1alpha.InstanceAvailable,
			Status:             metav1.ConditionFalse,
			Reason:             computev1alpha.InstanceProgrammedReasonPendingQuota,
			Message:            quotaMessage,
			ObservedGeneration: instance.Generation,
		})
		changed = programmedChanged || availableChanged
	}

	readyCondition.Status = best.status
	readyCondition.Reason = best.reason
	readyCondition.Message = best.message

	// Always store Ready (no short-circuit), then fold in the quota result.
	readyChanged := apimeta.SetStatusCondition(&instance.Status.Conditions, *readyCondition)
	return readyChanged || changed, nil
}

// isTerminalReferencedDataReason reports whether the given ReferencedData reason
// is terminal — i.e., the companion will never arrive because the source object
// is permanently unavailable, not just slow to propagate.
Expand Down Expand Up @@ -1411,21 +1510,35 @@ func resolveInstanceResources(instance *computev1alpha.Instance) (cpuMillicores
// 0 - unknown/default
// 1 - Provisioning (transient runtime startup)
// 3 - PendingQuota (operator action may be needed)
// 5 - ImageUnavailable / InstanceCrashing / ConfigurationError
// (hard runtime error, user-actionable)
// 4 - ReferencedDataNotReady / AwaitingPropagation / Resolving
// (transient referenced-data propagation)
// 5 - ImageUnavailable / InstanceCrashing / ConfigurationError /
// SourceNotFound / SourceTooLarge / SourceUnauthorized
// (hard runtime or referenced-data spec error, user-actionable)
// 7 - NetworkFailedToCreate (hard infra error)
func instanceBlockingReasonPriority(reason string) int {
switch reason {
case computev1alpha.InstanceReadyReasonProvisioning:
return 1
case computev1alpha.InstanceProgrammedReasonPendingQuota:
return 3
case computev1alpha.WorkloadDeploymentReasonReferencedDataNotReady,
computev1alpha.ReferencedDataReasonAwaitingPropagation,
computev1alpha.ReferencedDataReasonResolving:
// Transient referenced-data propagation ranks above quota but below hard
// spec errors: the companion is still on its way to the cell.
return 4
case computev1alpha.InstanceReadyReasonImageUnavailable,
computev1alpha.InstanceReadyReasonInstanceCrashing,
computev1alpha.InstanceReadyReasonConfigurationError:
computev1alpha.InstanceReadyReasonConfigurationError,
computev1alpha.ReferencedDataReasonSourceNotFound,
computev1alpha.ReferencedDataReasonSourceTooLarge,
computev1alpha.ReferencedDataReasonSourceUnauthorized:
// Hard runtime errors are user-actionable (wrong image, crashing app, bad
// config) and rank highest among non-infra reasons so they are not buried
// under transient startup/quota reasons.
// under transient startup/quota reasons. Terminal referenced-data source
// errors (missing/too-large/unauthorized source object) are equally
// actionable spec errors and share this tier.
return 5
case reasonNetworkFailedToCreate:
return 7
Expand All @@ -1448,7 +1561,21 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition(
logger := log.FromContext(ctx)

quotaGrantedCondition := apimeta.FindStatusCondition(instance.Status.Conditions, computev1alpha.InstanceQuotaGranted)
if quotaGrantedCondition != nil && quotaGrantedCondition.Status == metav1.ConditionFalse {
quotaDenied := quotaGrantedCondition != nil && quotaGrantedCondition.Status == metav1.ConditionFalse

// When scheduling gates are present, all blocking causes — including quota —
// are evaluated together so the priority function picks the most actionable
// one to surface on Ready. Quota side effects (Programmed=False, Running=False)
// are preserved unconditionally when quota is denied, regardless of which
// reason wins Ready.
//
// When no gates are present and quota is denied, fall through to the early
// return below which sets Ready, Programmed, and Running atomically.
hasSchedulingGates := instance.Spec.Controller != nil && len(instance.Spec.Controller.SchedulingGates) > 0

if quotaDenied && !hasSchedulingGates {
// No gates: quota is the only active blocking cause. Set all three
// conditions atomically and return — same behavior as before.
msg := quotaGrantedCondition.Message
changed = apimeta.SetStatusCondition(&instance.Status.Conditions, metav1.Condition{
Type: computev1alpha.InstanceProgrammed,
Expand Down Expand Up @@ -1487,27 +1614,8 @@ func (r *InstanceReconciler) reconcileInstanceReadyCondition(
readyCondition = readyCondition.DeepCopy()
}

if instance.Spec.Controller != nil && len(instance.Spec.Controller.SchedulingGates) > 0 {
var schedulingGateNames []string
for _, gate := range instance.Spec.Controller.SchedulingGates {
schedulingGateNames = append(schedulingGateNames, gate.Name)
}

networkCreationFailure, networkCreationFailureMessage, err := networkFailureChecker(ctx, clusterClient, instance)
if err != nil {
return false, fmt.Errorf("failed checking for network creation failure: %w", err)
}

readyCondition.Status = metav1.ConditionFalse
if networkCreationFailure {
readyCondition.Reason = reasonNetworkFailedToCreate
readyCondition.Message = networkCreationFailureMessage
} else {
readyCondition.Reason = computev1alpha.InstanceReadyReasonSchedulingGatesPresent
readyCondition.Message = fmt.Sprintf("Scheduling gates present: %s", strings.Join(schedulingGateNames, ", "))
}

return apimeta.SetStatusCondition(&instance.Status.Conditions, *readyCondition), nil
if hasSchedulingGates {
return r.reconcileGatedReadyCondition(ctx, clusterClient, instance, quotaDenied, quotaGrantedCondition, readyCondition, networkFailureChecker)
}

pendingReason := "Pending"
Expand Down
Loading
Loading