Skip to content

Commit 82ab142

Browse files
controller: always determine MCP machine counts based on node properties
1 parent ecf42ae commit 82ab142

1 file changed

Lines changed: 12 additions & 118 deletions

File tree

pkg/controller/node/status.go

Lines changed: 12 additions & 118 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ func (ctrl *Controller) syncStatusOnly(pool *mcfgv1.MachineConfigPool) error {
7575
}
7676

7777
//nolint:gocyclo,gosec
78-
func (ctrl *Controller) calculateStatus(mcs []*mcfgv1.MachineConfigNode, cconfig *mcfgv1.ControllerConfig, pool *mcfgv1.MachineConfigPool, nodes []*corev1.Node, mosc *mcfgv1.MachineOSConfig, mosb *mcfgv1.MachineOSBuild) mcfgv1.MachineConfigPoolStatus {
78+
func (ctrl *Controller) calculateStatus(mcns []*mcfgv1.MachineConfigNode, cconfig *mcfgv1.ControllerConfig, pool *mcfgv1.MachineConfigPool, nodes []*corev1.Node, mosc *mcfgv1.MachineOSConfig, mosb *mcfgv1.MachineOSBuild) mcfgv1.MachineConfigPoolStatus {
7979
certExpirys := []mcfgv1.CertExpiry{}
8080
if cconfig != nil {
8181
for _, cert := range cconfig.Status.ControllerCertificates {
@@ -94,134 +94,28 @@ func (ctrl *Controller) calculateStatus(mcs []*mcfgv1.MachineConfigNode, cconfig
9494

9595
isLayeredPool := ctrl.isLayeredPool(mosc, mosb)
9696

97-
var degradedMachines, readyMachines, updatedMachines, unavailableMachines, updatingMachines []*corev1.Node
9897
degradedReasons := []string{}
9998
pisIsEnabled := ctrl.fgHandler.Enabled(features.FeatureGatePinnedImages)
10099
pinnedImageSetsDegraded := false
101100

102-
// if we represent updating properly here, we will also represent updating properly in the CO
103-
// so this solves the cordoning RFE and the upgradeable RFE
104-
// updating == updatePrepared, updateExecuted, updatedComplete, postAction, cordoning, draining
105-
// updated == nodeResumed, updated
106-
// ready == nodeResumed, updated
107-
// unavailable == draining, cordoned
108-
// degraded == if the condition.Reason == error
109-
// this ensures that a MCP only enters Upgradeable==False if the node actually needs to upgrade to the new MC
110-
for _, state := range mcs {
111-
var ourNode *corev1.Node
112-
for _, n := range nodes {
113-
if state.Name == n.Name {
114-
ourNode = n
115-
break
116-
}
117-
}
118-
if ourNode == nil {
119-
klog.Errorf("Could not find specified node %s", state.Name)
120-
}
121-
if len(state.Status.Conditions) == 0 {
122-
// not ready yet
123-
break
124-
}
101+
// Set the PIS pool synchronizer to updated based on the MCN
102+
for _, mcn := range mcns {
125103
if pisIsEnabled {
126-
if isPinnedImageSetsUpdated(state) {
104+
if isPinnedImageSetsUpdated(mcn) {
127105
poolSynchronizer.SetUpdated(mcfgv1.PinnedImageSets)
128106
}
129107
}
130-
for _, cond := range state.Status.Conditions {
131-
// populate the degradedReasons from the MachineConfigNodeNodeDegraded condition
132-
if mcfgv1.StateProgress(cond.Type) == mcfgv1.MachineConfigNodeNodeDegraded && cond.Status == metav1.ConditionTrue {
133-
degradedMachines = append(degradedMachines, ourNode)
134-
// Degraded nodes are also unavailable since they are not in a "Done" state
135-
// and cannot be used for further updates (see IsUnavailableForUpdate)
136-
unavailableMachines = append(unavailableMachines, ourNode)
137-
degradedReasons = append(degradedReasons, fmt.Sprintf("Node %s is reporting: %q", ourNode.Name, cond.Message))
138-
break
139-
}
140-
/*
141-
// TODO (ijanssen): This section of code should be implemented as part of OCPBUGS-57177 after OCPBUGS-32745 is addressed.
142-
// In the current state of the code, the `MachineConfigNodePinnedImageSetsDegraded` condition is wrongly being set to True`
143-
// even when no unintended functionality is occurring, such as many images taking more than 2 minutes to fetch. Thus, it is
144-
// not wise to degrade an MCP on a PIS degrade while the PIS degrade is not acting as intended. OCPBUGS-57177 has
145-
// been marked as blocked by OCPBUGS-32745 in Jira, but once the degrade condition is stablilized, this code block should
146-
// cover the fix for OCPBUGS-57177.
147-
// populate the degradedReasons from the MachineConfigNodePinnedImageSetsDegraded condition
148-
if pisIsEnabled && mcfgv1.StateProgress(cond.Type) == mcfgv1.MachineConfigNodePinnedImageSetsDegraded && cond.Status == metav1.ConditionTrue {
149-
degradedMachines = append(degradedMachines, ourNode)
150-
degradedReasons = append(degradedReasons, fmt.Sprintf("Node %s references an invalid PinnedImageSet. See the node's MachineConfigNode resource for details.", ourNode.Name))
151-
if mcfgv1.StateProgress(cond.Type) == mcfgv1.MachineConfigNodePinnedImageSetsDegraded {
152-
pinnedImageSetsDegraded = true
153-
}
154-
break
155-
}
156-
*/
157-
/*
158-
// TODO: (djoshy) Rework this block to use MCN conditions correctly. See: https://issues.redhat.com/browse/MCO-1228
159-
160-
// Specifically, the main concerns for the following block are:
161-
// (i) Why are only unknown conditions being evaluated? Shouldn't True/False conditions be used?
162-
// (ii) Multiple conditions can be unknown at the same time, resulting in certain machines being double counted
163-
164-
// Some background:
165-
// The MCN conditions are used to feed MCP statuses if the machine counts add up correctly to the total node count.
166-
// If this check fails, node conditions are used to determine machine counts. The MCN counts calculated in this block
167-
// seem incorrect most of the time, so the controller almost always defaults to using the node condition based counts.
168-
// On occasion, the MCN counts cause a false positive in aformentioned check, resulting in invalid values for the MCP
169-
// statuses. Commenting out this block will force the controller to always use node condition based counts to feed the
170-
// MCP status.
171-
172-
173-
if cond.Status == metav1.ConditionUnknown {
174-
// This switch case will cause a node to be double counted, maybe use a hash for node count
175-
switch mcfgv1.StateProgress(cond.Type) {
176-
case mcfgv1.MachineConfigNodeUpdatePrepared:
177-
updatingMachines = append(updatedMachines, ourNode) //nolint:gocritic
178-
case mcfgv1.MachineConfigNodeUpdateExecuted:
179-
updatingMachines = append(updatingMachines, ourNode)
180-
case mcfgv1.MachineConfigNodeUpdatePostActionComplete:
181-
updatingMachines = append(updatingMachines, ourNode)
182-
case mcfgv1.MachineConfigNodeUpdateComplete:
183-
updatingMachines = append(updatingMachines, ourNode)
184-
case mcfgv1.MachineConfigNodeResumed:
185-
updatingMachines = append(updatedMachines, ourNode) //nolint:gocritic
186-
readyMachines = append(readyMachines, ourNode)
187-
// Note (ijanssen): `MachineConfigNodeUpdateCompatible` was removed with MCO-1543. This case will need to be replaced/removed when working on MCO-1228.
188-
case mcfgv1.MachineConfigNodeUpdateCompatible:
189-
updatingMachines = append(updatedMachines, ourNode) //nolint:gocritic
190-
case mcfgv1.MachineConfigNodeUpdateDrained:
191-
unavailableMachines = append(unavailableMachines, ourNode)
192-
updatingMachines = append(updatingMachines, ourNode)
193-
case mcfgv1.MachineConfigNodeUpdateCordoned:
194-
unavailableMachines = append(unavailableMachines, ourNode)
195-
updatingMachines = append(updatingMachines, ourNode)
196-
case mcfgv1.MachineConfigNodeUpdated:
197-
updatedMachines = append(updatedMachines, ourNode)
198-
readyMachines = append(readyMachines, ourNode)
199-
}
200-
}
201-
*/
202-
}
203108
}
204-
degradedMachineCount := int32(len(degradedMachines))
109+
110+
// Get the updated, degraded, ready, and unavailable machine counts from the nodes' properties
111+
updatedMachines := getUpdatedMachines(pool, nodes, mosc, mosb, isLayeredPool)
205112
updatedMachineCount := int32(len(updatedMachines))
206-
unavailableMachineCount := int32(len(unavailableMachines))
207-
updatingMachineCount := int32(len(updatingMachines))
113+
readyMachines := getReadyMachines(pool, nodes, mosc, mosb, isLayeredPool)
208114
readyMachineCount := int32(len(readyMachines))
209-
210-
// this is # 1 priority, get the upgrade states actually reporting
211-
if degradedMachineCount+readyMachineCount+unavailableMachineCount+updatingMachineCount != int32(len(nodes)) {
212-
213-
updatedMachines = getUpdatedMachines(pool, nodes, mosc, mosb, isLayeredPool)
214-
updatedMachineCount = int32(len(updatedMachines))
215-
216-
readyMachines = getReadyMachines(pool, nodes, mosc, mosb, isLayeredPool)
217-
readyMachineCount = int32(len(readyMachines))
218-
219-
unavailableMachines = getUnavailableMachines(nodes, pool)
220-
unavailableMachineCount = int32(len(unavailableMachines))
221-
222-
degradedMachines = getDegradedMachines(nodes)
223-
degradedMachineCount = int32(len(degradedMachines))
224-
}
115+
unavailableMachines := getUnavailableMachines(nodes, pool)
116+
unavailableMachineCount := int32(len(unavailableMachines))
117+
degradedMachines := getDegradedMachines(nodes)
118+
degradedMachineCount := int32(len(degradedMachines))
225119

226120
for _, n := range degradedMachines {
227121
reason, ok := n.Annotations[daemonconsts.MachineConfigDaemonReasonAnnotationKey]

0 commit comments

Comments
 (0)