Skip to content

Commit c79bb87

Browse files
committed
Onboarding: Timeout on active instance eventually
The code currently assumes that an instance will eventually report the expected string, which in turn assumes that the metadata service lookup works. The active state only reflects that the hypervisor could boot the VM. As the instance usually boots within seconds, a five-minute timeout seems safe enough to catch this eventuality.
1 parent 9813dcb commit c79bb87

File tree

4 files changed

+91
-6
lines changed

4 files changed

+91
-6
lines changed

charts/openstack-hypervisor-operator/crds/hypervisor-crd.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
---
21
apiVersion: apiextensions.k8s.io/v1
32
kind: CustomResourceDefinition
43
metadata:
@@ -619,3 +618,4 @@ spec:
619618
storage: true
620619
subresources:
621620
status: {}
621+

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ require (
104104
k8s.io/component-base v0.35.0 // indirect
105105
k8s.io/klog/v2 v2.130.1 // indirect
106106
k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect
107-
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect
107+
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4
108108
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.33.0 // indirect
109109
sigs.k8s.io/gateway-api v1.4.0 // indirect
110110
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect

internal/controller/onboarding_controller.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ import (
3131
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3232
"k8s.io/apimachinery/pkg/runtime"
3333
"k8s.io/apimachinery/pkg/types"
34+
"k8s.io/utils/clock"
3435
ctrl "sigs.k8s.io/controller-runtime"
3536
k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
3637
logger "sigs.k8s.io/controller-runtime/pkg/log"
@@ -52,6 +53,7 @@ var errRequeue = errors.New("requeue requested")
5253

5354
const (
5455
defaultWaitTime = 1 * time.Minute
56+
smokeTestTimeout = 5 * time.Minute
5557
testProjectName = "test"
5658
testDomainName = "cc3test"
5759
testImageName = "cirros-kvm"
@@ -63,6 +65,7 @@ const (
6365
type OnboardingController struct {
6466
k8sclient.Client
6567
Scheme *runtime.Scheme
68+
Clock clock.Clock
6669
computeClient *gophercloud.ServiceClient
6770
testComputeClient *gophercloud.ServiceClient
6871
testImageClient *gophercloud.ServiceClient
@@ -286,6 +289,21 @@ func (r *OnboardingController) smokeTest(ctx context.Context, hv *kvmv1.Hypervis
286289
}
287290

288291
if !strings.Contains(consoleOutput, server.Name) {
292+
if !server.LaunchedAt.IsZero() && r.Clock.Now().After(server.LaunchedAt.Add(smokeTestTimeout)) {
293+
base := hv.DeepCopy()
294+
meta.SetStatusCondition(&hv.Status.Conditions, metav1.Condition{
295+
Type: kvmv1.ConditionTypeOnboarding,
296+
Status: metav1.ConditionTrue,
297+
Reason: kvmv1.ConditionReasonTesting,
298+
Message: fmt.Sprintf("timeout waiting for console output since %v", server.LaunchedAt),
299+
})
300+
if err := r.patchStatus(ctx, hv, base); err != nil {
301+
return ctrl.Result{}, err
302+
}
303+
if err = servers.Delete(ctx, r.testComputeClient, server.ID).ExtractErr(); err != nil {
304+
return ctrl.Result{}, fmt.Errorf("failed to delete timed out test instance %v: %w", server.ID, err)
305+
}
306+
}
289307
return ctrl.Result{RequeueAfter: defaultWaitTime}, nil
290308
}
291309

@@ -634,6 +652,8 @@ func (r *OnboardingController) SetupWithManager(mgr ctrl.Manager) error {
634652
}
635653
r.testNetworkClient.ResourceBase = fmt.Sprintf("%vv2.0/", r.testNetworkClient.Endpoint)
636654

655+
r.Clock = clock.RealClock{}
656+
637657
return ctrl.NewControllerManagedBy(mgr).
638658
Named(OnboardingControllerName).
639659
For(&kvmv1.Hypervisor{}).

internal/controller/onboarding_controller_test.go

Lines changed: 69 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"fmt"
2222
"net/http"
2323
"os"
24+
"time"
2425

2526
"github.com/gophercloud/gophercloud/v2/testhelper"
2627
"github.com/gophercloud/gophercloud/v2/testhelper/client"
@@ -30,6 +31,8 @@ import (
3031
"k8s.io/apimachinery/pkg/api/meta"
3132
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3233
"k8s.io/apimachinery/pkg/types"
34+
"k8s.io/utils/clock"
35+
clocktesting "k8s.io/utils/clock/testing"
3336
ctrl "sigs.k8s.io/controller-runtime"
3437

3538
kvmv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
@@ -162,7 +165,8 @@ var _ = Describe("Onboarding Controller", func() {
162165
createServerBody = `{
163166
"server": {
164167
"id": "server-id",
165-
"status": "ACTIVE"
168+
"status": "ACTIVE",
169+
"OS-SRV-USG:launched_at": "2025-01-01T12:00:00.000000"
166170
}
167171
}`
168172
)
@@ -220,6 +224,7 @@ var _ = Describe("Onboarding Controller", func() {
220224
onboardingReconciler = &OnboardingController{
221225
Client: k8sClient,
222226
Scheme: k8sClient.Scheme(),
227+
Clock: clock.RealClock{},
223228
computeClient: client.ServiceClient(fakeServer),
224229
testComputeClient: client.ServiceClient(fakeServer),
225230
testImageClient: client.ServiceClient(fakeServer),
@@ -339,6 +344,11 @@ var _ = Describe("Onboarding Controller", func() {
339344
})
340345

341346
Context("running tests after initial setup", func() {
347+
var (
348+
serverActionHandler func(http.ResponseWriter, *http.Request)
349+
serverDeleteHandler func(http.ResponseWriter, *http.Request)
350+
)
351+
342352
BeforeEach(func(ctx SpecContext) {
343353
hv := &kvmv1.Hypervisor{}
344354
Expect(k8sClient.Get(ctx, namespacedName, hv)).To(Succeed())
@@ -413,15 +423,21 @@ var _ = Describe("Onboarding Controller", func() {
413423
Expect(err).NotTo(HaveOccurred())
414424
})
415425

416-
fakeServer.Mux.HandleFunc("POST /servers/server-id/action", func(w http.ResponseWriter, r *http.Request) {
426+
serverActionHandler = func(w http.ResponseWriter, _ *http.Request) {
417427
w.Header().Add("Content-Type", "application/json")
418428
w.WriteHeader(http.StatusOK)
419429
_, err := fmt.Fprintf(w, `{"output": "FAKE CONSOLE OUTPUT\nANOTHER\nLAST LINE\nohooc--%v-%v\n"}`, hv.Name, hv.UID)
420430
Expect(err).NotTo(HaveOccurred())
421-
431+
}
432+
fakeServer.Mux.HandleFunc("POST /servers/server-id/action", func(w http.ResponseWriter, r *http.Request) {
433+
serverActionHandler(w, r)
422434
})
423-
fakeServer.Mux.HandleFunc("DELETE /servers/server-id", func(w http.ResponseWriter, r *http.Request) {
435+
436+
serverDeleteHandler = func(w http.ResponseWriter, _ *http.Request) {
424437
w.WriteHeader(http.StatusAccepted)
438+
}
439+
fakeServer.Mux.HandleFunc("DELETE /servers/server-id", func(w http.ResponseWriter, r *http.Request) {
440+
serverDeleteHandler(w, r)
425441
})
426442
})
427443

@@ -504,6 +520,7 @@ var _ = Describe("Onboarding Controller", func() {
504520
))
505521
})
506522
})
523+
507524
When("SkipTests is set to false", func() {
508525
BeforeEach(func(ctx SpecContext) {
509526
hv := &kvmv1.Hypervisor{}
@@ -578,5 +595,53 @@ var _ = Describe("Onboarding Controller", func() {
578595
})
579596
})
580597

598+
When("smoke test times out waiting for console output", func() {
599+
var serverDeletedCalled bool
600+
601+
BeforeEach(func(ctx SpecContext) {
602+
By("Overriding HV status to Testing state")
603+
hv := &kvmv1.Hypervisor{}
604+
Expect(k8sClient.Get(ctx, namespacedName, hv)).To(Succeed())
605+
meta.SetStatusCondition(&hv.Status.Conditions, metav1.Condition{
606+
Type: kvmv1.ConditionTypeOnboarding,
607+
Status: metav1.ConditionTrue,
608+
Reason: kvmv1.ConditionReasonTesting,
609+
})
610+
Expect(k8sClient.Status().Update(ctx, hv)).To(Succeed())
611+
612+
By("Setting fake clock past the smoke test timeout")
613+
// createServerBody has LaunchedAt "2025-01-01T12:00:00", so 6 minutes later is past the 5-minute deadline
614+
onboardingReconciler.Clock = clocktesting.NewFakeClock(time.Date(2025, 1, 1, 12, 6, 0, 0, time.UTC))
615+
serverDeletedCalled = false
616+
617+
serverActionHandler = func(w http.ResponseWriter, _ *http.Request) {
618+
w.Header().Add("Content-Type", "application/json")
619+
w.WriteHeader(http.StatusOK)
620+
_, err := fmt.Fprint(w, `{"output": "some unrelated console output\n"}`)
621+
Expect(err).NotTo(HaveOccurred())
622+
}
623+
serverDeleteHandler = func(w http.ResponseWriter, _ *http.Request) {
624+
serverDeletedCalled = true
625+
w.WriteHeader(http.StatusAccepted)
626+
}
627+
})
628+
629+
It("should delete the stalled server and record a timeout in the status", func(ctx SpecContext) {
630+
By("Reconciling smoke test past the timeout deadline")
631+
_, err := onboardingReconciler.Reconcile(ctx, reconcileReq)
632+
Expect(err).NotTo(HaveOccurred())
633+
634+
By("Verifying the timed-out server was deleted")
635+
Expect(serverDeletedCalled).To(BeTrue())
636+
637+
By("Verifying the onboarding condition message indicates a timeout")
638+
hv := &kvmv1.Hypervisor{}
639+
Expect(k8sClient.Get(ctx, namespacedName, hv)).To(Succeed())
640+
onboardingCond := meta.FindStatusCondition(hv.Status.Conditions, kvmv1.ConditionTypeOnboarding)
641+
Expect(onboardingCond).NotTo(BeNil())
642+
Expect(onboardingCond.Reason).To(Equal(kvmv1.ConditionReasonTesting))
643+
Expect(onboardingCond.Message).To(ContainSubstring("timeout"))
644+
})
645+
})
581646
})
582647
})

0 commit comments

Comments
 (0)