From f66897e8f04d65baa4cee176635100e88984f2f7 Mon Sep 17 00:00:00 2001 From: matthew-pilot Date: Mon, 1 Jun 2026 07:51:29 +0000 Subject: [PATCH] fix(daemon): wrap bgWG.Wait() with 5s timeout in doStop() (PILOT-318) The doStop() shutdown path called bgWG.Wait() with no deadline. If any background goroutine was blocked on registry I/O during an outage, doStop() never returned, and the process only exited when the supervisor SIGKILL'd it. This wraps the Wait with a 5-second timeout via a goroutine + select. On timeout, slog.Warn records the leak event so operators can distinguish graceful shutdown from forced-exit-by-leak. Hung goroutines are the lesser evil compared to an unkillable daemon. Verified: build + vet clean, daemon tests pass (55.7s). Closes PILOT-318 --- pkg/daemon/daemon.go | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index c1219a04..214ad122 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -1325,7 +1325,19 @@ func (d *Daemon) Stop() error { func (d *Daemon) doStop() { // Wait for all daemon-scoped background goroutines to notice // stopCh and exit before tearing down shared infrastructure. - d.bgWG.Wait() + // Use a 5-second timeout to prevent a hung goroutine (e.g. + // blocked on registry I/O during an outage) from blocking + // shutdown forever. Leaked goroutines are the lesser evil. + done := make(chan struct{}) + go func() { + d.bgWG.Wait() + close(done) + }() + select { + case <-done: + case <-time.After(5 * time.Second): + slog.Warn("timed out waiting for background goroutines to exit", "leaked", true) + } // v1.9.1: emit a shutdown signal BEFORE any teardown so operators // can distinguish planned drain from crash. Auto-scalers and