Skip to content

Commit 0c09786

Browse files
committed
Onboarding: Timeout on active instance eventually
The code currently assumes that an instance will eventually report the expected string, which in turn assumes that the metadata service lookup works. The active state only reflects that the hypervisor could boot the VM. As the instance usually boots within seconds, a five-minute timeout seems safe enough to catch this eventuality.
1 parent 40ef4bb commit 0c09786

3 files changed

Lines changed: 124 additions & 6 deletions

File tree

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ require (
104104
k8s.io/component-base v0.35.0 // indirect
105105
k8s.io/klog/v2 v2.130.1 // indirect
106106
k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect
107-
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect
107+
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4
108108
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.33.0 // indirect
109109
sigs.k8s.io/gateway-api v1.4.0 // indirect
110110
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect

internal/controller/onboarding_controller.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ import (
3131
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3232
"k8s.io/apimachinery/pkg/runtime"
3333
"k8s.io/apimachinery/pkg/types"
34+
"k8s.io/utils/clock"
3435
ctrl "sigs.k8s.io/controller-runtime"
3536
k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
3637
logger "sigs.k8s.io/controller-runtime/pkg/log"
@@ -51,6 +52,7 @@ var errRequeue = errors.New("requeue requested")
5152

5253
const (
5354
defaultWaitTime = 1 * time.Minute
55+
smokeTestTimeout = 5 * time.Minute
5456
testProjectName = "test"
5557
testDomainName = "cc3test"
5658
testImageName = "cirros-kvm"
@@ -63,6 +65,7 @@ type OnboardingController struct {
6365
k8sclient.Client
6466
Scheme *runtime.Scheme
6567
TestFlavorID string
68+
Clock clock.Clock
6669
computeClient *gophercloud.ServiceClient
6770
testComputeClient *gophercloud.ServiceClient
6871
testImageClient *gophercloud.ServiceClient
@@ -286,6 +289,23 @@ func (r *OnboardingController) smokeTest(ctx context.Context, hv *kvmv1.Hypervis
286289
}
287290

288291
if !strings.Contains(consoleOutput, server.Name) {
292+
if !server.LaunchedAt.IsZero() && r.Clock.Now().After(server.LaunchedAt.Add(smokeTestTimeout)) {
293+
base := hv.DeepCopy()
294+
meta.SetStatusCondition(&hv.Status.Conditions, metav1.Condition{
295+
Type: kvmv1.ConditionTypeOnboarding,
296+
Status: metav1.ConditionTrue,
297+
Reason: kvmv1.ConditionReasonTesting,
298+
Message: fmt.Sprintf("timeout waiting for console output since %v", server.LaunchedAt),
299+
})
300+
if err := r.patchStatus(ctx, hv, base); err != nil {
301+
return ctrl.Result{}, err
302+
}
303+
if err = servers.Delete(ctx, r.testComputeClient, server.ID).ExtractErr(); err != nil {
304+
if !gophercloud.ResponseCodeIs(err, http.StatusNotFound) {
305+
return ctrl.Result{}, fmt.Errorf("failed to delete timed out test instance %v: %w", server.ID, err)
306+
}
307+
}
308+
}
289309
return ctrl.Result{RequeueAfter: defaultWaitTime}, nil
290310
}
291311

@@ -613,6 +633,10 @@ func (r *OnboardingController) SetupWithManager(mgr ctrl.Manager) error {
613633
}
614634
r.testNetworkClient.ResourceBase = fmt.Sprintf("%vv2.0/", r.testNetworkClient.Endpoint)
615635

636+
if r.Clock == nil {
637+
r.Clock = clock.RealClock{}
638+
}
639+
616640
return ctrl.NewControllerManagedBy(mgr).
617641
Named(OnboardingControllerName).
618642
For(&kvmv1.Hypervisor{}).

internal/controller/onboarding_controller_test.go

Lines changed: 99 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"fmt"
2222
"net/http"
2323
"os"
24+
"time"
2425

2526
"github.com/gophercloud/gophercloud/v2/testhelper"
2627
"github.com/gophercloud/gophercloud/v2/testhelper/client"
@@ -30,6 +31,8 @@ import (
3031
"k8s.io/apimachinery/pkg/api/meta"
3132
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3233
"k8s.io/apimachinery/pkg/types"
34+
"k8s.io/utils/clock"
35+
clocktesting "k8s.io/utils/clock/testing"
3336
ctrl "sigs.k8s.io/controller-runtime"
3437

3538
kvmv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
@@ -221,6 +224,7 @@ var _ = Describe("Onboarding Controller", func() {
221224
Client: k8sClient,
222225
Scheme: k8sClient.Scheme(),
223226
TestFlavorID: "1",
227+
Clock: clock.RealClock{},
224228
computeClient: client.ServiceClient(fakeServer),
225229
testComputeClient: client.ServiceClient(fakeServer),
226230
testImageClient: client.ServiceClient(fakeServer),
@@ -340,6 +344,12 @@ var _ = Describe("Onboarding Controller", func() {
340344
})
341345

342346
Context("running tests after initial setup", func() {
347+
var (
348+
serverActionHandler func(http.ResponseWriter, *http.Request)
349+
serverDeleteHandler func(http.ResponseWriter, *http.Request)
350+
serverDetailHandler func(http.ResponseWriter, *http.Request)
351+
)
352+
343353
BeforeEach(func(ctx SpecContext) {
344354
hv := &kvmv1.Hypervisor{}
345355
Expect(k8sClient.Get(ctx, namespacedName, hv)).To(Succeed())
@@ -371,11 +381,14 @@ var _ = Describe("Onboarding Controller", func() {
371381
Expect(err).NotTo(HaveOccurred())
372382
})
373383

384+
serverDetailHandler = func(w http.ResponseWriter, _ *http.Request) {
385+
_, err := fmt.Fprint(w, emptyServersBody)
386+
Expect(err).NotTo(HaveOccurred())
387+
}
374388
fakeServer.Mux.HandleFunc("GET /servers/detail", func(w http.ResponseWriter, r *http.Request) {
375389
w.Header().Add("Content-Type", "application/json")
376390
w.WriteHeader(http.StatusOK)
377-
_, err := fmt.Fprint(w, emptyServersBody)
378-
Expect(err).NotTo(HaveOccurred())
391+
serverDetailHandler(w, r)
379392
})
380393

381394
fakeServer.Mux.HandleFunc("POST /instance-ha", func(w http.ResponseWriter, r *http.Request) {
@@ -406,15 +419,21 @@ var _ = Describe("Onboarding Controller", func() {
406419
Expect(err).NotTo(HaveOccurred())
407420
})
408421

409-
fakeServer.Mux.HandleFunc("POST /servers/server-id/action", func(w http.ResponseWriter, r *http.Request) {
422+
serverActionHandler = func(w http.ResponseWriter, _ *http.Request) {
410423
w.Header().Add("Content-Type", "application/json")
411424
w.WriteHeader(http.StatusOK)
412425
_, err := fmt.Fprintf(w, `{"output": "FAKE CONSOLE OUTPUT\nANOTHER\nLAST LINE\nohooc--%v-%v\n"}`, hv.Name, hv.UID)
413426
Expect(err).NotTo(HaveOccurred())
414-
427+
}
428+
fakeServer.Mux.HandleFunc("POST /servers/server-id/action", func(w http.ResponseWriter, r *http.Request) {
429+
serverActionHandler(w, r)
415430
})
416-
fakeServer.Mux.HandleFunc("DELETE /servers/server-id", func(w http.ResponseWriter, r *http.Request) {
431+
432+
serverDeleteHandler = func(w http.ResponseWriter, _ *http.Request) {
417433
w.WriteHeader(http.StatusAccepted)
434+
}
435+
fakeServer.Mux.HandleFunc("DELETE /servers/server-id", func(w http.ResponseWriter, r *http.Request) {
436+
serverDeleteHandler(w, r)
418437
})
419438
})
420439

@@ -571,5 +590,80 @@ var _ = Describe("Onboarding Controller", func() {
571590
})
572591
})
573592

593+
When("smoke test times out waiting for console output", func() {
594+
var serverDeletedCalled bool
595+
596+
BeforeEach(func(ctx SpecContext) {
597+
By("Overriding HV status to Testing state")
598+
hv := &kvmv1.Hypervisor{}
599+
Expect(k8sClient.Get(ctx, namespacedName, hv)).To(Succeed())
600+
meta.SetStatusCondition(&hv.Status.Conditions, metav1.Condition{
601+
Type: kvmv1.ConditionTypeOnboarding,
602+
Status: metav1.ConditionTrue,
603+
Reason: kvmv1.ConditionReasonTesting,
604+
})
605+
Expect(k8sClient.Status().Update(ctx, hv)).To(Succeed())
606+
607+
// Construct the server name the controller will look for.
608+
serverName := fmt.Sprintf("%s-%s-%s", testPrefixName, hypervisorName, string(hv.UID))
609+
detailCallCount := 0
610+
611+
// On the first reconcile GET /servers/detail returns empty so the controller
612+
// creates the server via POST (no launched_at → timeout cannot fire yet).
613+
// On the second reconcile GET /servers/detail returns the ACTIVE server with a
614+
// stale launched_at so createOrGetTestServer takes the "already-running" path and
615+
// smokeTest fires the timeout.
616+
serverDetailHandler = func(w http.ResponseWriter, _ *http.Request) {
617+
if detailCallCount == 0 {
618+
detailCallCount++
619+
_, err := fmt.Fprint(w, emptyServersBody)
620+
Expect(err).NotTo(HaveOccurred())
621+
} else {
622+
_, err := fmt.Fprintf(w,
623+
`{"servers": [{"id": "server-id", "name": %q, "status": "ACTIVE", "OS-SRV-USG:launched_at": "2025-01-01T12:00:00.000000"}], "servers_links": []}`,
624+
serverName)
625+
Expect(err).NotTo(HaveOccurred())
626+
}
627+
}
628+
629+
// Set the clock to 6 minutes after the launched_at above (past the 5-minute deadline).
630+
onboardingReconciler.Clock = clocktesting.NewFakeClock(time.Date(2025, 1, 1, 12, 6, 0, 0, time.UTC))
631+
serverDeletedCalled = false
632+
633+
// Console output that does NOT contain the server name, so the timeout path is exercised.
634+
serverActionHandler = func(w http.ResponseWriter, _ *http.Request) {
635+
w.Header().Add("Content-Type", "application/json")
636+
w.WriteHeader(http.StatusOK)
637+
_, err := fmt.Fprint(w, `{"output": "some unrelated console output\n"}`)
638+
Expect(err).NotTo(HaveOccurred())
639+
}
640+
serverDeleteHandler = func(w http.ResponseWriter, _ *http.Request) {
641+
serverDeletedCalled = true
642+
w.WriteHeader(http.StatusAccepted)
643+
}
644+
})
645+
646+
It("should delete the stalled server and record a timeout in the status", func(ctx SpecContext) {
647+
By("First reconcile: controller creates the ACTIVE server; launched_at is absent so timeout does not fire yet")
648+
_, err := onboardingReconciler.Reconcile(ctx, reconcileReq)
649+
Expect(err).NotTo(HaveOccurred())
650+
651+
By("Second reconcile: GET /servers/detail returns the stale server; timeout fires and the server is deleted")
652+
_, err = onboardingReconciler.Reconcile(ctx, reconcileReq)
653+
Expect(err).NotTo(HaveOccurred())
654+
655+
By("Verifying the timed-out server was deleted")
656+
Expect(serverDeletedCalled).To(BeTrue())
657+
658+
By("Verifying the onboarding condition message indicates a timeout")
659+
hv := &kvmv1.Hypervisor{}
660+
Expect(k8sClient.Get(ctx, namespacedName, hv)).To(Succeed())
661+
onboardingCond := meta.FindStatusCondition(hv.Status.Conditions, kvmv1.ConditionTypeOnboarding)
662+
Expect(onboardingCond).NotTo(BeNil())
663+
Expect(onboardingCond.Reason).To(Equal(kvmv1.ConditionReasonTesting))
664+
Expect(onboardingCond.Message).To(ContainSubstring("timeout"))
665+
})
666+
})
667+
574668
})
575669
})

0 commit comments

Comments
 (0)