diff --git a/coreapi/lifecycle.go b/coreapi/lifecycle.go index 90f61b1..f5af3ec 100644 --- a/coreapi/lifecycle.go +++ b/coreapi/lifecycle.go @@ -8,6 +8,7 @@ import ( "log/slog" "sort" "sync" + "time" ) // Service is the lifecycle contract every L11 plugin implements. @@ -145,9 +146,15 @@ func stopWithPanicRecovery(ctx context.Context, s Service) (err error) { // shutdown cannot crash the daemon; the panic is converted to an error // and all remaining services still get their Stop call. // -// Errors from individual Stop calls (including recovered panics) are -// collected; the first one is returned but every service still gets -// its Stop call invoked. +// To keep one slow plugin from stalling the entire shutdown sequence, +// each Stop call receives its own context, derived from ctx, that expires +// after 5 seconds. The timeout is cooperative: a plugin whose Stop honors +// the context returns early, its context.DeadlineExceeded is surfaced as a +// Stop error, and the remaining plugins continue shutting down. +// +// Errors from individual Stop calls (including recovered panics and +// deadline expirations) are collected; the first one is returned but +// every service still gets its Stop call invoked. func (sr *ServiceRegistry) StopAll(ctx context.Context) error { sr.mu.Lock() queue := append([]Service(nil), sr.started...) 
@@ -156,7 +163,10 @@ func (sr *ServiceRegistry) StopAll(ctx context.Context) error { var firstErr error for i := len(queue) - 1; i >= 0; i-- { - if err := stopWithPanicRecovery(ctx, queue[i]); err != nil && firstErr == nil { + pluginCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + err := stopWithPanicRecovery(pluginCtx, queue[i]) + cancel() + if err != nil && firstErr == nil { firstErr = err } } diff --git a/coreapi/zz_lifecycle_edge_test.go b/coreapi/zz_lifecycle_edge_test.go index 9612073..6cb8a45 100644 --- a/coreapi/zz_lifecycle_edge_test.go +++ b/coreapi/zz_lifecycle_edge_test.go @@ -59,6 +59,40 @@ func TestServiceRegistry_StopAllStopsAllEvenAfterError(t *testing.T) { } } +func TestServiceRegistry_StopAllTimingOutHangingPlugin(t *testing.T) { + t.Parallel() + sr := &coreapi.ServiceRegistry{} + bStopped := false + a := &hangingService{name: "a", order: 1} + bb := &recordingStopWithErr{name: "b", order: 2, stopped: &bStopped} + _ = sr.Register(a) + _ = sr.Register(bb) + _ = sr.StartAll(context.Background(), coreapi.Deps{}) + // StopAll should not block forever; the hanging plugin should time out + err := sr.StopAll(context.Background()) + if !errors.Is(err, context.DeadlineExceeded) { + t.Errorf("StopAll = %v, want DeadlineExceeded from hung plugin a", err) + } + if !bStopped { + t.Error("service b was not stopped despite a hanging") + } +} + +// hangingService blocks in Stop until its context is done, then returns +// ctx.Err() — simulates a stalled plugin to verify the per-plugin timeout in StopAll. +type hangingService struct { + name string + order int +} + +func (h *hangingService) Name() string { return h.name } +func (h *hangingService) Order() int { return h.order } +func (h *hangingService) Start(ctx context.Context, deps coreapi.Deps) error { return nil } +func (h *hangingService) Stop(ctx context.Context) error { + <-ctx.Done() + return ctx.Err() +} + type recordingStopWithErr struct { name string order int