Skip to content

Commit b07c31d

Browse files
committed
GardenerNodeLifecycleController: add timeout for HA-disabled wait
When a node is terminating and offboarded, the controller waits for HaEnabled=False before allowing pod eviction. Previously it would wait indefinitely; now it times out after HaDisabledTimeout (default 15m), measured from the Offboarded condition's LastTransitionTime, so a stalled HA controller can no longer block node termination.
1 parent 9f13080 commit b07c31d

3 files changed

Lines changed: 106 additions & 14 deletions

File tree

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ require (
104104
k8s.io/component-base v0.35.0 // indirect
105105
k8s.io/klog/v2 v2.130.1 // indirect
106106
k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect
107-
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect
107+
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4
108108
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.33.0 // indirect
109109
sigs.k8s.io/gateway-api v1.4.0 // indirect
110110
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect

internal/controller/gardener_node_lifecycle_controller.go

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"context"
2222
"fmt"
2323
"maps"
24+
"time"
2425

2526
appsv1 "k8s.io/api/apps/v1"
2627
corev1 "k8s.io/api/core/v1"
@@ -32,6 +33,7 @@ import (
3233
corev1ac "k8s.io/client-go/applyconfigurations/core/v1"
3334
v1 "k8s.io/client-go/applyconfigurations/meta/v1"
3435
policyv1ac "k8s.io/client-go/applyconfigurations/policy/v1"
36+
"k8s.io/utils/clock"
3537
ctrl "sigs.k8s.io/controller-runtime"
3638
k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
3739
"sigs.k8s.io/controller-runtime/pkg/client/apiutil"
@@ -40,10 +42,19 @@ import (
4042
kvmv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
4143
)
4244

45+
const defaultHaDisabledTimeout = 15 * time.Minute
46+
4347
type GardenerNodeLifecycleController struct {
4448
k8sclient.Client
4549
Scheme *runtime.Scheme
4650
namespace string
51+
// HaDisabledTimeout is the maximum time to wait for ConditionTypeHaEnabled to
52+
// become False after the node is offboarded, measured from the lastTransitionTime
53+
// of the ConditionTypeOffboarded condition. After the deadline, Unknown and unset
54+
// are also accepted. Defaults to defaultHaDisabledTimeout when zero.
55+
HaDisabledTimeout time.Duration
56+
// Clock is used to determine the current time. Defaults to clock.RealClock{}.
57+
Clock clock.Clock
4758
}
4859

4960
const (
@@ -84,18 +95,34 @@ func (r *GardenerNodeLifecycleController) Reconcile(ctx context.Context, req ctr
8495
// We do not care about the particular value, as long as it isn't an error
8596
var minAvailable int32 = 1
8697

87-
// Onboarding is not in progress anymore, i.e. the host is onboarded
98+
// Onboarding is not in progress, i.e. the host is onboarded
8899
onboardingCompleted := meta.IsStatusConditionFalse(hv.Status.Conditions, kvmv1.ConditionTypeOnboarding)
89-
// Evicting is not in progress anymore, i.e. the host is empty
100+
// Evicting is not in progress, i.e. the host is empty
90101
offboarded := meta.IsStatusConditionTrue(hv.Status.Conditions, kvmv1.ConditionTypeOffboarded)
91102

92103
if offboarded {
93104
minAvailable = 0
94105

95106
if onboardingCompleted && isTerminating(node) {
96-
// Wait for HypervisorInstanceHa controller to disable HA
107+
// Wait for HypervisorInstanceHa controller to disable HA.
108+
// After the deadline (measured from the lastTransitionTime of the
109+
// Offboarded condition) we also accept Unknown and unset, so that a
110+
// stalled HA controller cannot block node termination indefinitely.
97111
if !meta.IsStatusConditionFalse(hv.Status.Conditions, kvmv1.ConditionTypeHaEnabled) {
98-
return ctrl.Result{}, nil // Will be reconciled again when condition changes
112+
offboardedCondition := meta.FindStatusCondition(hv.Status.Conditions, kvmv1.ConditionTypeOffboarded)
113+
timeout := r.HaDisabledTimeout
114+
if timeout == 0 {
115+
timeout = defaultHaDisabledTimeout
116+
}
117+
now := r.Clock.Now()
118+
offboardedAt := offboardedCondition.LastTransitionTime.Time
119+
deadline := offboardedAt.Add(timeout)
120+
if now.Before(deadline) {
121+
return ctrl.Result{RequeueAfter: deadline.Sub(now)}, nil
122+
}
123+
log.Info("HA disabled timeout exceeded, proceeding without waiting for HaEnabled=False",
124+
"timeout", timeout,
125+
"offboardedAt", offboardedAt)
99126
}
100127
}
101128
}
@@ -221,7 +248,9 @@ func (r *GardenerNodeLifecycleController) SetupWithManager(mgr ctrl.Manager, nam
221248
ctx := context.Background()
222249
_ = logger.FromContext(ctx)
223250
r.namespace = namespace
224-
251+
if r.Clock == nil {
252+
r.Clock = clock.RealClock{}
253+
}
225254
return ctrl.NewControllerManagedBy(mgr).
226255
Named(MaintenanceControllerName).
227256
For(&corev1.Node{}).

internal/controller/gardener_node_lifecycle_controller_test.go

Lines changed: 71 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package controller
1919

2020
import (
2121
"fmt"
22+
"time"
2223

2324
. "github.com/onsi/ginkgo/v2"
2425
. "github.com/onsi/gomega"
@@ -28,6 +29,7 @@ import (
2829
"k8s.io/apimachinery/pkg/api/meta"
2930
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3031
"k8s.io/apimachinery/pkg/types"
32+
"k8s.io/utils/clock"
3133
ctrl "sigs.k8s.io/controller-runtime"
3234
k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
3335

@@ -47,6 +49,7 @@ var _ = Describe("Gardener Maintenance Controller", func() {
4749
controller = &GardenerNodeLifecycleController{
4850
Client: k8sClient,
4951
Scheme: k8sClient.Scheme(),
52+
Clock: clock.RealClock{},
5053
}
5154

5255
By("creating the core resource for the Kind Node")
@@ -166,18 +169,21 @@ var _ = Describe("Gardener Maintenance Controller", func() {
166169

167170
Context("When node is terminating and offboarded", func() {
168171
BeforeEach(func(ctx SpecContext) {
169-
// Set node as terminating and add required labels for disableInstanceHA
172+
// Set node labels (spec/metadata update)
170173
node := &corev1.Node{}
171174
Expect(k8sClient.Get(ctx, name, node)).To(Succeed())
172175
node.Labels = map[string]string{
173176
corev1.LabelHostname: nodeName,
174177
"topology.kubernetes.io/zone": "test-zone",
175178
}
179+
Expect(k8sClient.Update(ctx, node)).To(Succeed())
180+
// Set node Terminating condition separately via the status subresource,
181+
// using a fresh Get to avoid the spec Update overwriting the status.
182+
Expect(k8sClient.Get(ctx, name, node)).To(Succeed())
176183
node.Status.Conditions = append(node.Status.Conditions, corev1.NodeCondition{
177184
Type: "Terminating",
178185
Status: corev1.ConditionTrue,
179186
})
180-
Expect(k8sClient.Update(ctx, node)).To(Succeed())
181187
Expect(k8sClient.Status().Update(ctx, node)).To(Succeed())
182188

183189
// Set hypervisor as onboarded and offboarded
@@ -198,13 +204,70 @@ var _ = Describe("Gardener Maintenance Controller", func() {
198204
Expect(k8sClient.Status().Update(ctx, hypervisor)).To(Succeed())
199205
})
200206

201-
It("should allow pod eviction by setting the PDB to minAvailable 0", func(ctx SpecContext) {
202-
_, err := controller.Reconcile(ctx, reconcileReq)
203-
Expect(err).NotTo(HaveOccurred())
207+
When("HaEnabled is explicitly False", func() {
208+
BeforeEach(func(ctx SpecContext) {
209+
hypervisor := &kvmv1.Hypervisor{}
210+
Expect(k8sClient.Get(ctx, name, hypervisor)).To(Succeed())
211+
meta.SetStatusCondition(&hypervisor.Status.Conditions, metav1.Condition{
212+
Type: kvmv1.ConditionTypeHaEnabled,
213+
Status: metav1.ConditionFalse,
214+
Reason: "Evicted",
215+
Message: "HA disabled due to eviction",
216+
})
217+
Expect(k8sClient.Status().Update(ctx, hypervisor)).To(Succeed())
218+
})
204219

205-
pdb := &policyv1.PodDisruptionBudget{}
206-
Expect(k8sClient.Get(ctx, maintenanceName, pdb)).To(Succeed())
207-
Expect(pdb.Spec.MinAvailable).To(HaveField("IntVal", BeNumerically("==", int32(0))))
220+
It("should allow pod eviction immediately by setting the PDB to minAvailable 0", func(ctx SpecContext) {
221+
result, err := controller.Reconcile(ctx, reconcileReq)
222+
Expect(err).NotTo(HaveOccurred())
223+
Expect(result.RequeueAfter).To(BeZero())
224+
225+
pdb := &policyv1.PodDisruptionBudget{}
226+
Expect(k8sClient.Get(ctx, maintenanceName, pdb)).To(Succeed())
227+
Expect(pdb.Spec.MinAvailable).To(HaveField("IntVal", BeNumerically("==", int32(0))))
228+
})
229+
})
230+
231+
When("HaEnabled is not yet False and the timeout has not elapsed", func() {
232+
BeforeEach(func() {
233+
// LastTransitionTime ≈ now (set by meta.SetStatusCondition above),
234+
// so deadline = now + 1h is in the future.
235+
controller.HaDisabledTimeout = time.Hour
236+
})
237+
238+
It("should requeue and not proceed", func(ctx SpecContext) {
239+
result, err := controller.Reconcile(ctx, reconcileReq)
240+
Expect(err).NotTo(HaveOccurred())
241+
// Should requeue before the deadline rather than returning immediately.
242+
Expect(result.RequeueAfter).To(BeNumerically(">", 0))
243+
})
244+
})
245+
246+
When("HaEnabled is not False but the timeout has elapsed", func() {
247+
BeforeEach(func(ctx SpecContext) {
248+
// Push LastTransitionTime 2h into the past so that
249+
// deadline = (now - 2h) + 1h = now - 1h, which has already elapsed.
250+
hypervisor := &kvmv1.Hypervisor{}
251+
Expect(k8sClient.Get(ctx, name, hypervisor)).To(Succeed())
252+
for i := range hypervisor.Status.Conditions {
253+
if hypervisor.Status.Conditions[i].Type == kvmv1.ConditionTypeOffboarded {
254+
hypervisor.Status.Conditions[i].LastTransitionTime = metav1.NewTime(time.Now().Add(-2 * time.Hour))
255+
break
256+
}
257+
}
258+
Expect(k8sClient.Status().Update(ctx, hypervisor)).To(Succeed())
259+
controller.HaDisabledTimeout = time.Hour
260+
})
261+
262+
It("should allow pod eviction by setting the PDB to minAvailable 0", func(ctx SpecContext) {
263+
result, err := controller.Reconcile(ctx, reconcileReq)
264+
Expect(err).NotTo(HaveOccurred())
265+
Expect(result.RequeueAfter).To(BeZero())
266+
267+
pdb := &policyv1.PodDisruptionBudget{}
268+
Expect(k8sClient.Get(ctx, maintenanceName, pdb)).To(Succeed())
269+
Expect(pdb.Spec.MinAvailable).To(HaveField("IntVal", BeNumerically("==", int32(0))))
270+
})
208271
})
209272
})
210273
})

0 commit comments

Comments (0)