Skip to content

Commit 21a6831

Browse files
committed
Offboarding via Hypervisor CRO
- Block gardener with PDB until offboarded - Communicate offboarding completion via a condition on the hypervisor - Do decommissioning only via the hypervisor CRO. Use "offboarded" instead of "decommissioning", as it is more clearly the opposite of "onboarding".
1 parent 3586af8 commit 21a6831

5 files changed

Lines changed: 70 additions & 119 deletions

File tree

api/v1/hypervisor_types.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ const (
3232
// ConditionTypeOnboarding is the type of condition for onboarding status
3333
ConditionTypeOnboarding = "Onboarding"
3434

35+
// ConditionTypeOffboarded is the type of condition for the completed offboarding
36+
ConditionTypeOffboarded = "Offboarded"
37+
3538
// ConditionTypeReady is the type of condition for ready status of a hypervisor
3639
ConditionTypeReady = "Ready"
3740

internal/controller/decomission_controller.go

Lines changed: 31 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -28,23 +28,19 @@ import (
2828
"github.com/gophercloud/gophercloud/v2/openstack/compute/v2/aggregates"
2929
"github.com/gophercloud/gophercloud/v2/openstack/compute/v2/services"
3030
"github.com/gophercloud/gophercloud/v2/openstack/placement/v1/resourceproviders"
31-
corev1 "k8s.io/api/core/v1"
3231
"k8s.io/apimachinery/pkg/api/meta"
3332
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3433
"k8s.io/apimachinery/pkg/runtime"
35-
"k8s.io/client-go/util/retry"
3634
ctrl "sigs.k8s.io/controller-runtime"
3735
k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
38-
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
3936
logger "sigs.k8s.io/controller-runtime/pkg/log"
4037

4138
kvmv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
4239
"github.com/cobaltcore-dev/openstack-hypervisor-operator/internal/openstack"
4340
)
4441

4542
const (
46-
decommissionFinalizerName = "cobaltcore.cloud.sap/decommission-hypervisor"
47-
DecommissionControllerName = "nodeDecommission"
43+
DecommissionControllerName = "offboarding"
4844
)
4945

5046
type NodeDecommissionReconciler struct {
@@ -57,66 +53,40 @@ type NodeDecommissionReconciler struct {
5753
// The counter-side in gardener is here:
5854
// https://github.com/gardener/machine-controller-manager/blob/rel-v0.56/pkg/util/provider/machinecontroller/machine.go#L646
5955

60-
// +kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch;patch;update
61-
// +kubebuilder:rbac:groups="",resources=nodes/finalizers,verbs=update
6256
// +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors,verbs=get;list;watch
6357
// +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors/status,verbs=get;list;watch;update;patch
6458
func (r *NodeDecommissionReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
6559
hostname := req.Name
6660
log := logger.FromContext(ctx).WithName(req.Name).WithValues("hostname", hostname)
6761
ctx = logger.IntoContext(ctx, log)
6862

69-
node := &corev1.Node{}
70-
if err := r.Get(ctx, req.NamespacedName, node); err != nil {
71-
return ctrl.Result{}, k8sclient.IgnoreNotFound(err)
72-
}
73-
74-
// Fetch HV to check if lifecycle management is enabled
7563
hv := &kvmv1.Hypervisor{}
7664
if err := r.Get(ctx, k8sclient.ObjectKey{Name: hostname}, hv); err != nil {
7765
// ignore not found errors, could be deleted
7866
return ctrl.Result{}, k8sclient.IgnoreNotFound(err)
7967
}
80-
if !hv.Spec.LifecycleEnabled {
81-
// Get out of the way
82-
return r.removeFinalizer(ctx, node)
83-
}
84-
85-
if !controllerutil.ContainsFinalizer(node, decommissionFinalizerName) {
86-
return ctrl.Result{}, retry.RetryOnConflict(retry.DefaultRetry, func() error {
87-
patch := k8sclient.MergeFrom(node.DeepCopy())
88-
controllerutil.AddFinalizer(node, decommissionFinalizerName)
89-
if err := r.Patch(ctx, node, patch); err != nil {
90-
return fmt.Errorf("failed to add finalizer due to %w", err)
91-
}
92-
log.Info("Added finalizer")
93-
return nil
94-
})
95-
}
9668

97-
// Not yet deleting hv, nothing more to do
98-
if node.DeletionTimestamp.IsZero() {
69+
if !hv.Spec.LifecycleEnabled || hv.Spec.Maintenance != kvmv1.MaintenanceTermination {
9970
return ctrl.Result{}, nil
10071
}
10172

102-
// Someone is just deleting the hv, without going through termination
103-
// See: https://github.com/gardener/machine-controller-manager/blob/rel-v0.56/pkg/util/provider/machinecontroller/machine.go#L658-L659
104-
if !IsNodeConditionTrue(node.Status.Conditions, "Terminating") {
105-
log.Info("removing finalizer since not terminating")
106-
// So we just get out of the way for now
107-
return r.removeFinalizer(ctx, node)
108-
}
109-
11073
if meta.IsStatusConditionTrue(hv.Status.Conditions, kvmv1.ConditionTypeReady) {
11174
return r.setDecommissioningCondition(ctx, hv, "Node is being decommissioned, removing host from nova")
11275
}
11376

77+
if meta.IsStatusConditionTrue(hv.Status.Conditions, kvmv1.ConditionTypeOffboarded) {
78+
return ctrl.Result{}, nil
79+
}
80+
11481
log.Info("removing host from nova")
11582

11683
hypervisor, err := openstack.GetHypervisorByName(ctx, r.computeClient, hostname, true)
117-
if errors.Is(err, openstack.ErrNoHypervisor) {
118-
// We are (hopefully) done
119-
return r.removeFinalizer(ctx, node)
84+
if err != nil {
85+
if errors.Is(err, openstack.ErrNoHypervisor) {
86+
// We are (hopefully) done
87+
return ctrl.Result{}, r.markOffboarded(ctx, hv)
88+
}
89+
return r.setDecommissioningCondition(ctx, hv, fmt.Sprintf("cannot get hypervisor by name %s due to %s", hostname, err))
12090
}
12191

12292
// TODO: remove since RunningVMs is only available until micro-version 2.87, and also is updated asynchronously
@@ -141,7 +111,7 @@ func (r *NodeDecommissionReconciler) Reconcile(ctx context.Context, req ctrl.Req
141111
return r.setDecommissioningCondition(ctx, hv, fmt.Sprintf("cannot list aggregates due to %v", err))
142112
}
143113

144-
host := node.Name
114+
host := hv.Name
145115
for name, aggregate := range aggs {
146116
if slices.Contains(aggregate.Hosts, host) {
147117
opts := aggregates.RemoveHostOpts{Host: host}
@@ -168,19 +138,7 @@ func (r *NodeDecommissionReconciler) Reconcile(ctx context.Context, req ctrl.Req
168138
return r.setDecommissioningCondition(ctx, hv, fmt.Sprintf("cannot clean up resource provider: %v", err))
169139
}
170140

171-
return r.removeFinalizer(ctx, node)
172-
}
173-
174-
func (r *NodeDecommissionReconciler) removeFinalizer(ctx context.Context, node *corev1.Node) (ctrl.Result, error) {
175-
if !controllerutil.ContainsFinalizer(node, decommissionFinalizerName) {
176-
return ctrl.Result{}, nil
177-
}
178-
179-
nodeBase := node.DeepCopy()
180-
controllerutil.RemoveFinalizer(node, decommissionFinalizerName)
181-
err := r.Patch(ctx, node, k8sclient.MergeFromWithOptions(nodeBase,
182-
k8sclient.MergeFromWithOptimisticLock{}), k8sclient.FieldOwner(DecommissionControllerName))
183-
return ctrl.Result{}, err
141+
return ctrl.Result{}, r.markOffboarded(ctx, hv)
184142
}
185143

186144
func (r *NodeDecommissionReconciler) setDecommissioningCondition(ctx context.Context, hv *kvmv1.Hypervisor, message string) (ctrl.Result, error) {
@@ -195,7 +153,22 @@ func (r *NodeDecommissionReconciler) setDecommissioningCondition(ctx context.Con
195153
k8sclient.MergeFromWithOptimisticLock{}), k8sclient.FieldOwner(DecommissionControllerName)); err != nil {
196154
return ctrl.Result{}, fmt.Errorf("cannot update hypervisor status due to %w", err)
197155
}
198-
return ctrl.Result{RequeueAfter: shortRetryTime}, nil
156+
return ctrl.Result{}, nil
157+
}
158+
159+
func (r *NodeDecommissionReconciler) markOffboarded(ctx context.Context, hv *kvmv1.Hypervisor) error {
160+
base := hv.DeepCopy()
161+
meta.SetStatusCondition(&hv.Status.Conditions, metav1.Condition{
162+
Type: kvmv1.ConditionTypeOffboarded,
163+
Status: metav1.ConditionTrue,
164+
Reason: "Offboarded",
165+
Message: "Offboarding successful",
166+
})
167+
if err := r.Status().Patch(ctx, hv, k8sclient.MergeFromWithOptions(base,
168+
k8sclient.MergeFromWithOptimisticLock{}), k8sclient.FieldOwner(DecommissionControllerName)); err != nil {
169+
return fmt.Errorf("cannot update hypervisor status due to %w", err)
170+
}
171+
return nil
199172
}
200173

201174
// SetupWithManager sets up the controller with the Manager.
@@ -217,6 +190,6 @@ func (r *NodeDecommissionReconciler) SetupWithManager(mgr ctrl.Manager) error {
217190

218191
return ctrl.NewControllerManagedBy(mgr).
219192
Named(DecommissionControllerName).
220-
For(&corev1.Node{}).
193+
For(&kvmv1.Hypervisor{}).
221194
Complete(r)
222195
}

internal/controller/decomission_controller_test.go

Lines changed: 25 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,9 @@ const (
6868
var _ = Describe("Decommission Controller", func() {
6969
var (
7070
r *NodeDecommissionReconciler
71-
nodeName = types.NamespacedName{Name: hypervisorName}
71+
resourceName = types.NamespacedName{Name: hypervisorName}
7272
reconcileReq = ctrl.Request{
73-
NamespacedName: nodeName,
73+
NamespacedName: resourceName,
7474
}
7575
fakeServer testhelper.FakeServer
7676
)
@@ -99,30 +99,10 @@ var _ = Describe("Decommission Controller", func() {
9999
Expect(k8sClient.Delete(ctx, ns)).To(Succeed())
100100
})
101101

102-
By("creating the core resource for the Kind Node")
103-
node := &corev1.Node{
104-
ObjectMeta: metav1.ObjectMeta{
105-
Name: nodeName.Name,
106-
Labels: map[string]string{labelEvictionRequired: "true"},
107-
},
108-
}
109-
Expect(k8sClient.Create(ctx, node)).To(Succeed())
110-
DeferCleanup(func(ctx SpecContext) {
111-
node := &corev1.Node{}
112-
Expect(k8sclient.IgnoreNotFound(k8sClient.Get(ctx, nodeName, node))).To(Succeed())
113-
if len(node.Finalizers) > 0 {
114-
node.Finalizers = make([]string, 0)
115-
Expect(k8sClient.Update(ctx, node)).To(Succeed())
116-
}
117-
if node.Name != "" {
118-
Expect(k8sclient.IgnoreNotFound(k8sClient.Delete(ctx, node))).To(Succeed())
119-
}
120-
})
121-
122102
By("Create the hypervisor resource with lifecycle enabled")
123103
hypervisor := &kvmv1.Hypervisor{
124104
ObjectMeta: metav1.ObjectMeta{
125-
Name: nodeName.Name,
105+
Name: resourceName.Name,
126106
},
127107
Spec: kvmv1.HypervisorSpec{
128108
LifecycleEnabled: true,
@@ -134,47 +114,32 @@ var _ = Describe("Decommission Controller", func() {
134114
})
135115
})
136116

137-
Context("When reconciling a node", func() {
138-
It("should set the finalizer", func(ctx SpecContext) {
139-
By("reconciling the created resource")
140-
_, err := r.Reconcile(ctx, reconcileReq)
141-
Expect(err).NotTo(HaveOccurred())
142-
node := &corev1.Node{}
143-
144-
Expect(k8sClient.Get(ctx, nodeName, node)).To(Succeed())
145-
Expect(node.Finalizers).To(ContainElement(decommissionFinalizerName))
146-
})
147-
})
148-
149-
Context("When terminating a node", func() {
117+
Context("When marking the hypervisor terminating", func() {
150118
JustBeforeEach(func(ctx SpecContext) {
151119
By("reconciling first reconciling the to add the finalizer")
152120
_, err := r.Reconcile(ctx, reconcileReq)
153121
Expect(err).NotTo(HaveOccurred())
154-
node := &corev1.Node{}
155122

156-
Expect(k8sClient.Get(ctx, nodeName, node)).To(Succeed())
157-
Expect(node.Finalizers).To(ContainElement(decommissionFinalizerName))
123+
hypervisor := &kvmv1.Hypervisor{}
124+
Expect(k8sClient.Get(ctx, resourceName, hypervisor)).To(Succeed())
158125

159-
By("and then terminating then node")
160-
node.Status.Conditions = append(node.Status.Conditions, corev1.NodeCondition{
161-
Type: "Terminating",
162-
Status: corev1.ConditionTrue,
126+
By("and then marking the hypervisor terminating")
127+
hypervisor.Spec.Maintenance = kvmv1.MaintenanceTermination
128+
Expect(k8sClient.Update(ctx, hypervisor)).To(Succeed())
129+
meta.SetStatusCondition(&hypervisor.Status.Conditions, metav1.Condition{
130+
Type: kvmv1.ConditionTypeTerminating,
131+
Status: metav1.ConditionTrue,
163132
Reason: "dontcare",
164133
Message: "dontcare",
165134
})
166-
Expect(k8sClient.Status().Update(ctx, node)).To(Succeed())
167-
Expect(k8sClient.Delete(ctx, node)).To(Succeed())
168-
nodelist := &corev1.NodeList{}
169-
Expect(k8sClient.List(ctx, nodelist)).To(Succeed())
170-
Expect(nodelist.Items).NotTo(BeEmpty())
135+
Expect(k8sClient.Status().Update(ctx, hypervisor)).To(Succeed())
171136
})
172137

173138
When("the hypervisor was set to ready", func() {
174139
getHypervisorsCalled := 0
175140
BeforeEach(func(ctx SpecContext) {
176141
hv := &kvmv1.Hypervisor{}
177-
Expect(k8sClient.Get(ctx, nodeName, hv)).To(Succeed())
142+
Expect(k8sClient.Get(ctx, resourceName, hv)).To(Succeed())
178143
meta.SetStatusCondition(&hv.Status.Conditions,
179144
metav1.Condition{
180145
Type: kvmv1.ConditionTypeReady,
@@ -239,12 +204,12 @@ var _ = Describe("Decommission Controller", func() {
239204
})
240205
})
241206

242-
It("should set the hypervisor condition", func(ctx SpecContext) {
207+
It("should set the hypervisor ready condition", func(ctx SpecContext) {
243208
By("reconciling the created resource")
244209
_, err := r.Reconcile(ctx, reconcileReq)
245210
Expect(err).NotTo(HaveOccurred())
246211
hypervisor := &kvmv1.Hypervisor{}
247-
Expect(k8sClient.Get(ctx, nodeName, hypervisor)).To(Succeed())
212+
Expect(k8sClient.Get(ctx, resourceName, hypervisor)).To(Succeed())
248213
Expect(hypervisor.Status.Conditions).To(ContainElement(
249214
SatisfyAll(
250215
HaveField("Type", kvmv1.ConditionTypeReady),
@@ -254,18 +219,22 @@ var _ = Describe("Decommission Controller", func() {
254219
))
255220
})
256221

257-
It("should remove the finalizer", func(ctx SpecContext) {
222+
It("should set the hypervisor offboarded condition", func(ctx SpecContext) {
258223
By("reconciling the created resource")
259224
for range 3 {
260225
_, err := r.Reconcile(ctx, reconcileReq)
261226
Expect(err).NotTo(HaveOccurred())
262227
}
263228
Expect(getHypervisorsCalled).To(BeNumerically(">", 0))
264229

265-
node := &corev1.Node{}
266-
err := k8sClient.Get(ctx, nodeName, node)
267-
Expect(err).To(HaveOccurred())
268-
Expect(k8sclient.IgnoreNotFound(err)).To(Succeed())
230+
hypervisor := &kvmv1.Hypervisor{}
231+
Expect(k8sClient.Get(ctx, resourceName, hypervisor)).To(Succeed())
232+
Expect(hypervisor.Status.Conditions).To(ContainElement(
233+
SatisfyAll(
234+
HaveField("Type", kvmv1.ConditionTypeOffboarded),
235+
HaveField("Status", metav1.ConditionTrue),
236+
),
237+
))
269238
})
270239
})
271240

internal/controller/gardener_node_lifecycle_controller.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,9 +87,9 @@ func (r *GardenerNodeLifecycleController) Reconcile(ctx context.Context, req ctr
8787
// Onboarding is not in progress anymore, i.e. the host is onboarded
8888
onboardingCompleted := meta.IsStatusConditionFalse(hv.Status.Conditions, kvmv1.ConditionTypeOnboarding)
8989
// Evicting is not in progress anymore, i.e. the host is empty
90-
evictionComplete := meta.IsStatusConditionFalse(hv.Status.Conditions, kvmv1.ConditionTypeEvicting)
90+
offboarded := meta.IsStatusConditionTrue(hv.Status.Conditions, kvmv1.ConditionTypeOffboarded)
9191

92-
if evictionComplete {
92+
if offboarded {
9393
minAvailable = 0
9494

9595
if onboardingCompleted && isTerminating(node) {

internal/controller/gardener_node_lifecycle_controller_test.go

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -114,18 +114,24 @@ var _ = Describe("Gardener Maintenance Controller", func() {
114114
})
115115
})
116116

117-
When("the node has been evicted", func() {
117+
When("the node has been offboarded", func() {
118118
BeforeEach(func(ctx SpecContext) {
119119
hypervisor := &kvmv1.Hypervisor{}
120120
Expect(k8sClient.Get(ctx, name, hypervisor)).To(Succeed())
121121
meta.SetStatusCondition(&hypervisor.Status.Conditions, metav1.Condition{
122-
Type: kvmv1.ConditionTypeEvicting,
123-
Status: metav1.ConditionFalse,
122+
Type: kvmv1.ConditionTypeOffboarded,
123+
Status: metav1.ConditionTrue,
124124
Reason: "dontcare",
125125
Message: "dontcare",
126126
})
127127
Expect(k8sClient.Status().Update(ctx, hypervisor)).To(Succeed())
128128
})
129+
130+
It("should update the poddisruptionbudget to minAvailable 0", func(ctx SpecContext) {
131+
pdb := &policyv1.PodDisruptionBudget{}
132+
Expect(k8sClient.Get(ctx, maintenanceName, pdb)).To(Succeed())
133+
Expect(pdb.Spec.MinAvailable).To(HaveField("IntVal", BeNumerically("==", int32(0))))
134+
})
129135
})
130136

131137
})

0 commit comments

Comments
 (0)