From 58f4a2f805b40fd35f68044fd1f11fb1ee228b4c Mon Sep 17 00:00:00 2001 From: Amber Xue Date: Fri, 22 May 2026 10:51:41 -0700 Subject: [PATCH 1/4] fix: context deadline exceeded shouldn't throw an error Signed-off-by: Amber Xue --- internal/exporter/exporter.go | 10 ++++++++++ internal/exporter/exporter_test.go | 27 +++++++++++++++++++++++++++ internal/server/server.go | 14 ++++++++++++-- internal/server/server_test.go | 24 ++++++++++++++++++++++++ 4 files changed, 73 insertions(+), 2 deletions(-) diff --git a/internal/exporter/exporter.go b/internal/exporter/exporter.go index b9f6ec98..52e67d2d 100644 --- a/internal/exporter/exporter.go +++ b/internal/exporter/exporter.go @@ -116,6 +116,16 @@ func (e *healthExporter) Start() error { log.Logger.Infow("Starting health exporter") + // In offline mode, emit one export immediately so short runs don't exit + // before the first ticker cycle and leave an empty output directory. + if e.options.config.OfflineMode { + if err := e.export(); err != nil { + log.Logger.Errorw("Initial offline export failed", "error", err) + } else { + e.lastExport = time.Now().UTC() + } + } + // Start the health export ticker go func() { ticker := time.NewTicker(e.options.config.Interval.Duration) diff --git a/internal/exporter/exporter_test.go b/internal/exporter/exporter_test.go index 8739eae1..93564071 100644 --- a/internal/exporter/exporter_test.go +++ b/internal/exporter/exporter_test.go @@ -284,6 +284,33 @@ func TestStart(t *testing.T) { require.NoError(t, err) }) + t.Run("offline mode performs initial export before first tick", func(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.HealthExporterConfig{ + Interval: metav1.Duration{Duration: 1 * time.Hour}, + Timeout: metav1.Duration{Duration: 30 * time.Second}, + OfflineMode: true, + OutputPath: tmpDir, + OutputFormat: "json", + } + + exporter, err := New(ctx, WithConfig(cfg), WithMachineID("test-machine-id")) + require.NoError(t, err) + require.NotNil(t, exporter) + + err = exporter.Start() + require.NoError(t, err) + + // Initial export should be written immediately, without waiting for ticker. + time.Sleep(100 * time.Millisecond) + entries, err := os.ReadDir(tmpDir) + require.NoError(t, err) + assert.Greater(t, len(entries), 0, "Expected offline files from initial export") + + err = exporter.Stop() + require.NoError(t, err) + }) + } // TestStop tests the Stop function diff --git a/internal/server/server.go b/internal/server/server.go index 791ace43..7b6e1699 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -210,6 +210,16 @@ func getAttestationTimeout(cfg *config.Config) time.Duration { return config.DefaultAttestationTimeout } +func shouldLogLoopExitError(err error) bool { + if err == nil { + return false + } + if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + return false + } + return true +} + // shouldEnableComponent determines if a component should be enabled based on configuration func shouldEnableComponent(name string, enabledByDefault bool, config *config.Config) bool { shouldEnable := enabledByDefault @@ -463,7 +473,7 @@ func (s *Server) startInventoryLoop( }) go func() { - if err := manager.Run(ctx); err != nil && !errors.Is(err, context.Canceled) { + if err := manager.Run(ctx); shouldLogLoopExitError(err) { log.Logger.Errorw("inventory loop manager exited", "error", err) } }() @@ -498,7 +508,7 @@ func (s *Server) startAttestationLoop(ctx context.Context, cfg *config.Config) { ) go func() { - if err := manager.Run(ctx); err != nil && !errors.Is(err, context.Canceled) { + if err := manager.Run(ctx); shouldLogLoopExitError(err) { log.Logger.Errorw("attestation loop exited", "error", err) } }() diff --git a/internal/server/server_test.go b/internal/server/server_test.go index f00d1504..dee57c2f 100644 --- a/internal/server/server_test.go +++ b/internal/server/server_test.go @@ -17,6 +17,7 @@ package server import ( "context" + "errors" "net" "os" "path/filepath" @@ -158,6 +159,29 @@ func TestGetInventorySyncTimeout(t *testing.T) { } } +func TestShouldLogLoopExitError(t *testing.T) { + t.Run("nil error", func(t *testing.T) { + assert.False(t, shouldLogLoopExitError(nil)) + }) + + t.Run("context canceled", func(t *testing.T) { + assert.False(t, shouldLogLoopExitError(context.Canceled)) + }) + + t.Run("context deadline exceeded", func(t *testing.T) { + assert.False(t, shouldLogLoopExitError(context.DeadlineExceeded)) + }) + + t.Run("wrapped deadline exceeded", func(t *testing.T) { + err := errors.Join(errors.New("loop stopped"), context.DeadlineExceeded) + assert.False(t, shouldLogLoopExitError(err)) + }) + + t.Run("real error", func(t *testing.T) { + assert.True(t, shouldLogLoopExitError(errors.New("unexpected failure"))) + }) +} + func TestGetAttestationSettings(t *testing.T) { tests := []struct { name string From 6b39c2bd0305032f9cc1aefbd612d0ca056bfd19 Mon Sep 17 00:00:00 2001 From: Amber Xue Date: Fri, 22 May 2026 11:44:33 -0700 Subject: [PATCH 2/4] add waitgroup Signed-off-by: Amber Xue --- .../helm/fleet-intelligence-agent/README.md | 3 + .../templates/daemonset.yaml | 6 ++ .../helm/fleet-intelligence-agent/values.yaml | 4 ++ docs/install-helm.md | 17 +++++ internal/exporter/exporter_test.go | 52 ++++++++++++++ internal/server/server.go | 52 ++++++++++++-- internal/server/server_test.go | 68 +++++++++++++++++-- 7 files changed, 189 insertions(+), 13 deletions(-) diff --git a/deployments/helm/fleet-intelligence-agent/README.md b/deployments/helm/fleet-intelligence-agent/README.md index 1c28a645..c41c0a35 100644 --- a/deployments/helm/fleet-intelligence-agent/README.md +++ b/deployments/helm/fleet-intelligence-agent/README.md @@ -51,6 +51,8 @@ Common values (defaults from `values.yaml`): | `enroll.tokenSecretName` | `""` | Secret name for enrollment token. | | `enroll.tokenSecretKey` | `token` | Secret key for enrollment token. | | `enroll.tokenValue` | `""` | Inline token value (optional). | +| `enroll.nodeGroup` | `null` | Optional `--node-group` enroll flag. `null` omits flag, `""` clears stored value. | +| `enroll.computeZone` | `null` | Optional `--compute-zone` enroll flag. `null` omits flag, `""` clears stored value. | | `enroll.securityContext.runAsUser` | `0` | Run enrollment init as root. | | `ports.http` | `15133` | HTTP port. | | `resources.requests.cpu` | `100m` | CPU request. | @@ -70,6 +72,7 @@ Common values (defaults from `values.yaml`): `enroll.enabled` and `enroll.unenroll` are mutually exclusive; do not set both to `true`. Set `enroll.force=true` to append `--force` to `fleetint enroll`. +Set `enroll.nodeGroup` / `enroll.computeZone` to pass optional enrollment metadata via the init container command. See `docs/install-helm.md` for the enrollment flow and secret creation steps. diff --git a/deployments/helm/fleet-intelligence-agent/templates/daemonset.yaml b/deployments/helm/fleet-intelligence-agent/templates/daemonset.yaml index 2900555e..6520714a 100644 --- a/deployments/helm/fleet-intelligence-agent/templates/daemonset.yaml +++ b/deployments/helm/fleet-intelligence-agent/templates/daemonset.yaml @@ -74,6 +74,12 @@ spec: {{- if .Values.enroll.force }} --force {{- end }} + {{- if ne .Values.enroll.nodeGroup nil }} + --node-group {{ .Values.enroll.nodeGroup | quote }} + {{- end }} + {{- if ne .Values.enroll.computeZone nil }} + --compute-zone {{ .Values.enroll.computeZone | quote }} + {{- end }} securityContext: {{- toYaml .Values.securityContext | nindent 12 }} env: diff --git a/deployments/helm/fleet-intelligence-agent/values.yaml b/deployments/helm/fleet-intelligence-agent/values.yaml index 71834352..1238f41e 100644 --- a/deployments/helm/fleet-intelligence-agent/values.yaml +++ b/deployments/helm/fleet-intelligence-agent/values.yaml @@ -61,6 +61,10 @@ enroll: tokenSecretName: "" tokenSecretKey: "token" tokenValue: "" + # Optional enrollment metadata. Keep null to omit flag (preserve existing value). + # Set to a string to overwrite. Set to empty string "" to clear. + nodeGroup: "" + computeZone: "" ports: http: 15133 diff --git a/docs/install-helm.md b/docs/install-helm.md index 3165dbca..36b72531 100644 --- a/docs/install-helm.md +++ b/docs/install-helm.md @@ -75,6 +75,23 @@ helm upgrade fleet-intelligence-agent oci://ghcr.io/nvidia/charts/fleet-intellig --set enroll.tokenSecretName="$ENROLL_TOKEN_SECRET_NAME" ``` +Optional: include node metadata during automatic enrollment: + +```bash +helm upgrade fleet-intelligence-agent oci://ghcr.io/nvidia/charts/fleet-intelligence-agent \ + --version "$CHART_VERSION" \ + --namespace "$NS" \ + --set enroll.enabled=true \ + --set enroll.endpoint="$ENROLL_ENDPOINT" \ + --set enroll.tokenSecretName="$ENROLL_TOKEN_SECRET_NAME" \ + --set-string enroll.nodeGroup="prod-a" \ + --set-string enroll.computeZone="us-east-1c" +``` + +Notes: +- Keep `enroll.nodeGroup` / `enroll.computeZone` unset (`null`) to omit the flag and preserve existing stored values. +- Set either value to an empty string to clear it (for example: `--set-string enroll.nodeGroup=""`). + Upgrade (no enrollment): ```bash diff --git a/internal/exporter/exporter_test.go b/internal/exporter/exporter_test.go index 93564071..8bc470ab 100644 --- a/internal/exporter/exporter_test.go +++ b/internal/exporter/exporter_test.go @@ -311,6 +311,58 @@ func TestStart(t *testing.T) { require.NoError(t, err) }) + t.Run("offline initial export includes collected data", func(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.HealthExporterConfig{ + Interval: metav1.Duration{Duration: 1 * time.Hour}, + Timeout: metav1.Duration{Duration: 30 * time.Second}, + OfflineMode: true, + OutputPath: tmpDir, + OutputFormat: "json", + } + + exporter, err := New(ctx, WithConfig(cfg), WithMachineID("test-machine-id")) + require.NoError(t, err) + require.NotNil(t, exporter) + + he := exporter.(*healthExporter) + mockCollector := &MockCollector{} + mockCollector.On("Collect", mock.Anything).Return(&collector.HealthData{ + MachineID: "test-machine", + Timestamp: time.Now(), + Metrics: []pkgmetrics.Metric{ + { + Name: "test_metric", + Value: 42.0, + UnixMilliseconds: time.Now().UnixMilli(), + }, + }, + }, nil).Once() + he.collector = mockCollector + + err = exporter.Start() + require.NoError(t, err) + + entries, err := os.ReadDir(tmpDir) + require.NoError(t, err) + require.Greater(t, len(entries), 0, "Expected offline files from initial export") + + allContent := "" + for _, entry := range entries { + fullPath := filepath.Join(tmpDir, entry.Name()) + content, readErr := os.ReadFile(fullPath) + require.NoError(t, readErr) + allContent += string(content) + } + assert.Contains(t, allContent, "test_metric") + assert.Contains(t, allContent, "machine.id") + + mockCollector.AssertExpectations(t) + + err = exporter.Stop() + require.NoError(t, err) + }) + } // TestStop tests the Stop function diff --git a/internal/server/server.go b/internal/server/server.go index 7b6e1699..a737c4e2 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -89,6 +89,12 @@ type Server struct { // signal handler in run.go both call Stop(), so without this guard // components, databases, and the health exporter would be closed twice. stopOnce sync.Once + loopWG sync.WaitGroup + + // loopCtx/loopCancel control background inventory and attestation goroutines. + // Stop() cancels this context and waits on loopWG for graceful shutdown. + loopCtx context.Context + loopCancel context.CancelFunc machineID string } @@ -210,16 +216,34 @@ func getAttestationTimeout(cfg *config.Config) time.Duration { return config.DefaultAttestationTimeout } -func shouldLogLoopExitError(err error) bool { +func shouldLogLoopExitError(ctx context.Context, err error) bool { if err == nil { return false } - if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + if ctx != nil && ctx.Err() != nil && (errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded)) { return false } return true } +func waitForWaitGroup(wg *sync.WaitGroup, timeout time.Duration) bool { + done := make(chan struct{}) + go func() { + wg.Wait() + close(done) + }() + if timeout <= 0 { + <-done + return true + } + select { + case <-done: + return true + case <-time.After(timeout): + return false + } +} + // shouldEnableComponent determines if a component should be enabled based on configuration func shouldEnableComponent(name string, enabledByDefault bool, config *config.Config) bool { shouldEnable := enabledByDefault @@ -245,11 +269,14 @@ func New(ctx context.Context, auditLogger log.AuditLogger, config *config.Config return nil, err } + loopCtx, loopCancel := context.WithCancel(ctx) s := &Server{ auditLogger: auditLogger, dbRW: dbRW, dbRO: dbRO, config: config, + loopCtx: loopCtx, + loopCancel: loopCancel, } defer func() { if retErr != nil { @@ -393,8 +420,8 @@ func New(ctx context.Context, auditLogger log.AuditLogger, config *config.Config } } - s.startInventoryLoop(ctx, config, nvmlInstance, dcgmGPUIndexes) - s.startAttestationLoop(ctx, config) + s.startInventoryLoop(loopCtx, config, nvmlInstance, dcgmGPUIndexes) + s.startAttestationLoop(loopCtx, config) // Create and start health exporter with all dependencies if enabled if config.HealthExporter != nil { @@ -472,8 +499,10 @@ func (s *Server) startInventoryLoop( StartupJitter: inventory.DefaultStartupJitter, }) + s.loopWG.Add(1) go func() { - if err := manager.Run(ctx); shouldLogLoopExitError(err) { + defer s.loopWG.Done() + if err := manager.Run(ctx); shouldLogLoopExitError(ctx, err) { log.Logger.Errorw("inventory loop manager exited", "error", err) } }() @@ -507,8 +536,10 @@ func (s *Server) startAttestationLoop(ctx context.Context, cfg *config.Config) { }, ) + s.loopWG.Add(1) go func() { - if err := manager.Run(ctx); shouldLogLoopExitError(err) { + defer s.loopWG.Done() + if err := manager.Run(ctx); shouldLogLoopExitError(ctx, err) { log.Logger.Errorw("attestation loop exited", "error", err) } }() @@ -526,6 +557,15 @@ func (s *Server) GetHealthExporter() exporter.Exporter { // signal handler in run.go both invoke it. func (s *Server) Stop() { s.stopOnce.Do(func() { + // Signal inventory/attestation loops to stop and wait for graceful exit + // before we close dependencies they may still be using. + if s.loopCancel != nil { + s.loopCancel() + } + if !waitForWaitGroup(&s.loopWG, 10*time.Second) { + log.Logger.Warnw("timed out waiting for background loops to stop") + } + // Gracefully shut down the HTTP server so in-flight requests complete // before we close databases and components underneath them. if s.srv != nil { diff --git a/internal/server/server_test.go b/internal/server/server_test.go index dee57c2f..54dc0710 100644 --- a/internal/server/server_test.go +++ b/internal/server/server_test.go @@ -21,6 +21,7 @@ import ( "net" "os" "path/filepath" + "sync" "testing" "time" @@ -160,25 +161,60 @@ func TestGetInventorySyncTimeout(t *testing.T) { } func TestShouldLogLoopExitError(t *testing.T) { + canceledCtx, cancel := context.WithCancel(context.Background()) + cancel() + + deadlineCtx, deadlineCancel := context.WithTimeout(context.Background(), 1*time.Millisecond) + defer deadlineCancel() + <-deadlineCtx.Done() + t.Run("nil error", func(t *testing.T) { - assert.False(t, shouldLogLoopExitError(nil)) + assert.False(t, shouldLogLoopExitError(context.Background(), nil)) }) - t.Run("context canceled", func(t *testing.T) { - assert.False(t, shouldLogLoopExitError(context.Canceled)) + t.Run("context canceled during shutdown", func(t *testing.T) { + assert.False(t, shouldLogLoopExitError(canceledCtx, context.Canceled)) }) - t.Run("context deadline exceeded", func(t *testing.T) { - assert.False(t, shouldLogLoopExitError(context.DeadlineExceeded)) + t.Run("context deadline exceeded during shutdown", func(t *testing.T) { + assert.False(t, shouldLogLoopExitError(deadlineCtx, context.DeadlineExceeded)) }) t.Run("wrapped deadline exceeded", func(t *testing.T) { err := errors.Join(errors.New("loop stopped"), context.DeadlineExceeded) - assert.False(t, shouldLogLoopExitError(err)) + assert.False(t, shouldLogLoopExitError(deadlineCtx, err)) + }) + + t.Run("deadline exceeded while parent context still active", func(t *testing.T) { + assert.True(t, shouldLogLoopExitError(context.Background(), context.DeadlineExceeded)) + }) + + t.Run("canceled while parent context still active", func(t *testing.T) { + assert.True(t, shouldLogLoopExitError(context.Background(), context.Canceled)) }) t.Run("real error", func(t *testing.T) { - assert.True(t, shouldLogLoopExitError(errors.New("unexpected failure"))) + assert.True(t, shouldLogLoopExitError(context.Background(), errors.New("unexpected failure"))) + }) +} + +func TestWaitForWaitGroup(t *testing.T) { + t.Run("returns true when waitgroup completes", func(t *testing.T) { + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + time.Sleep(10 * time.Millisecond) + }() + assert.True(t, waitForWaitGroup(&wg, 200*time.Millisecond)) + }) + + t.Run("returns false on timeout", func(t *testing.T) { + var wg sync.WaitGroup + wg.Add(1) + assert.False(t, waitForWaitGroup(&wg, 10*time.Millisecond)) + wg.Done() + assert.True(t, waitForWaitGroup(&wg, 100*time.Millisecond)) }) } @@ -463,6 +499,24 @@ func TestServerStop(t *testing.T) { } } +func TestServerStopCancelsBackgroundLoops(t *testing.T) { + loopCtx, loopCancel := context.WithCancel(context.Background()) + s := &Server{ + loopCtx: loopCtx, + loopCancel: loopCancel, + } + s.loopWG.Add(1) + go func() { + defer s.loopWG.Done() + <-loopCtx.Done() + }() + + s.Stop() + + assert.ErrorIs(t, loopCtx.Err(), context.Canceled) + assert.True(t, waitForWaitGroup(&s.loopWG, 100*time.Millisecond)) +} + // TestServerStopWithDatabases tests Stop with actual database connections. func TestServerStopWithDatabases(t *testing.T) { ctx := context.Background() From 5707e8359c021e05c175840c509b5979142eebbf Mon Sep 17 00:00:00 2001 From: Amber Xue Date: Fri, 22 May 2026 12:55:08 -0700 Subject: [PATCH 3/4] add guard in values Signed-off-by: Amber Xue --- deployments/helm/fleet-intelligence-agent/README.md | 4 ++-- .../fleet-intelligence-agent/templates/daemonset.yaml | 4 ++-- deployments/helm/fleet-intelligence-agent/values.yaml | 11 +++++++---- docs/install-helm.md | 2 +- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/deployments/helm/fleet-intelligence-agent/README.md b/deployments/helm/fleet-intelligence-agent/README.md index c41c0a35..2a5a32c6 100644 --- a/deployments/helm/fleet-intelligence-agent/README.md +++ b/deployments/helm/fleet-intelligence-agent/README.md @@ -51,8 +51,8 @@ Common values (defaults from `values.yaml`): | `enroll.tokenSecretName` | `""` | Secret name for enrollment token. | | `enroll.tokenSecretKey` | `token` | Secret key for enrollment token. | | `enroll.tokenValue` | `""` | Inline token value (optional). | -| `enroll.nodeGroup` | `null` | Optional `--node-group` enroll flag. `null` omits flag, `""` clears stored value. | -| `enroll.computeZone` | `null` | Optional `--compute-zone` enroll flag. `null` omits flag, `""` clears stored value. | +| `enroll.nodeGroup` | `(not set)` | Optional `--node-group` enroll flag. Omit key to omit flag, set `""` to clear stored value. | +| `enroll.computeZone` | `(not set)` | Optional `--compute-zone` enroll flag. Omit key to omit flag, set `""` to clear stored value. | | `enroll.securityContext.runAsUser` | `0` | Run enrollment init as root. | | `ports.http` | `15133` | HTTP port. | | `resources.requests.cpu` | `100m` | CPU request. | diff --git a/deployments/helm/fleet-intelligence-agent/templates/daemonset.yaml b/deployments/helm/fleet-intelligence-agent/templates/daemonset.yaml index 6520714a..8c059f83 100644 --- a/deployments/helm/fleet-intelligence-agent/templates/daemonset.yaml +++ b/deployments/helm/fleet-intelligence-agent/templates/daemonset.yaml @@ -74,10 +74,10 @@ spec: {{- if .Values.enroll.force }} --force {{- end }} - {{- if ne .Values.enroll.nodeGroup nil }} + {{- if hasKey .Values.enroll "nodeGroup" }} --node-group {{ .Values.enroll.nodeGroup | quote }} {{- end }} - {{- if ne .Values.enroll.computeZone nil }} + {{- if hasKey .Values.enroll "computeZone" }} --compute-zone {{ .Values.enroll.computeZone | quote }} {{- end }} securityContext: diff --git a/deployments/helm/fleet-intelligence-agent/values.yaml b/deployments/helm/fleet-intelligence-agent/values.yaml index 1238f41e..747bc608 100644 --- a/deployments/helm/fleet-intelligence-agent/values.yaml +++ b/deployments/helm/fleet-intelligence-agent/values.yaml @@ -61,10 +61,13 @@ enroll: tokenSecretName: "" tokenSecretKey: "token" tokenValue: "" - # Optional enrollment metadata. Keep null to omit flag (preserve existing value). - # Set to a string to overwrite. Set to empty string "" to clear. - nodeGroup: "" - computeZone: "" + # Optional enrollment metadata: + # - omit key to preserve existing stored value + # - set non-empty string to overwrite + # - set empty string ("") to clear + # + # nodeGroup: "prod-a" + # computeZone: "us-east-1c" ports: http: 15133 diff --git a/docs/install-helm.md b/docs/install-helm.md index 36b72531..c7aac70b 100644 --- a/docs/install-helm.md +++ b/docs/install-helm.md @@ -89,7 +89,7 @@ helm upgrade fleet-intelligence-agent oci://ghcr.io/nvidia/charts/fleet-intellig ``` Notes: -- Keep `enroll.nodeGroup` / `enroll.computeZone` unset (`null`) to omit the flag and preserve existing stored values. +- Omit `enroll.nodeGroup` / `enroll.computeZone` keys to omit the flags and preserve existing stored values. - Set either value to an empty string to clear it (for example: `--set-string enroll.nodeGroup=""`). Upgrade (no enrollment): From 74c6009368f4f49572d1682a9a6551db15597a58 Mon Sep 17 00:00:00 2001 From: Amber Xue Date: Fri, 22 May 2026 13:02:01 -0700 Subject: [PATCH 4/4] revert shouldLogLoopExitError Signed-off-by: Amber Xue --- internal/server/server.go | 14 ++---------- internal/server/server_test.go | 39 ---------------------------------- 2 files changed, 2 insertions(+), 51 deletions(-) diff --git a/internal/server/server.go b/internal/server/server.go index a737c4e2..b3d69174 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -216,16 +216,6 @@ func getAttestationTimeout(cfg *config.Config) time.Duration { return config.DefaultAttestationTimeout } -func shouldLogLoopExitError(ctx context.Context, err error) bool { - if err == nil { - return false - } - if ctx != nil && ctx.Err() != nil && (errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded)) { - return false - } - return true -} - func waitForWaitGroup(wg *sync.WaitGroup, timeout time.Duration) bool { done := make(chan struct{}) go func() { @@ -502,7 +492,7 @@ func (s *Server) startInventoryLoop( s.loopWG.Add(1) go func() { defer s.loopWG.Done() - if err := manager.Run(ctx); shouldLogLoopExitError(ctx, err) { + if err := manager.Run(ctx); err != nil && !errors.Is(err, context.Canceled) { log.Logger.Errorw("inventory loop manager exited", "error", err) } }() @@ -539,7 +529,7 @@ func (s *Server) startAttestationLoop(ctx context.Context, cfg *config.Config) { s.loopWG.Add(1) go func() { defer s.loopWG.Done() - if err := manager.Run(ctx); shouldLogLoopExitError(ctx, err) { + if err := manager.Run(ctx); err != nil && !errors.Is(err, context.Canceled) { log.Logger.Errorw("attestation loop exited", "error", err) } }() diff --git a/internal/server/server_test.go b/internal/server/server_test.go index 54dc0710..010e4ff9 100644 --- a/internal/server/server_test.go +++ b/internal/server/server_test.go @@ -17,7 +17,6 @@ package server import ( "context" - "errors" "net" "os" "path/filepath" @@ -160,44 +159,6 @@ func TestGetInventorySyncTimeout(t *testing.T) { } } -func TestShouldLogLoopExitError(t *testing.T) { - canceledCtx, cancel := context.WithCancel(context.Background()) - cancel() - - deadlineCtx, deadlineCancel := context.WithTimeout(context.Background(), 1*time.Millisecond) - defer deadlineCancel() - <-deadlineCtx.Done() - - t.Run("nil error", func(t *testing.T) { - assert.False(t, shouldLogLoopExitError(context.Background(), nil)) - }) - - t.Run("context canceled during shutdown", func(t *testing.T) { - assert.False(t, shouldLogLoopExitError(canceledCtx, context.Canceled)) - }) - - t.Run("context deadline exceeded during shutdown", func(t *testing.T) { - assert.False(t, shouldLogLoopExitError(deadlineCtx, context.DeadlineExceeded)) - }) - - t.Run("wrapped deadline exceeded", func(t *testing.T) { - err := errors.Join(errors.New("loop stopped"), context.DeadlineExceeded) - assert.False(t, shouldLogLoopExitError(deadlineCtx, err)) - }) - - t.Run("deadline exceeded while parent context still active", func(t *testing.T) { - assert.True(t, shouldLogLoopExitError(context.Background(), context.DeadlineExceeded)) - }) - - t.Run("canceled while parent context still active", func(t *testing.T) { - assert.True(t, shouldLogLoopExitError(context.Background(), context.Canceled)) - }) - - t.Run("real error", func(t *testing.T) { - assert.True(t, shouldLogLoopExitError(context.Background(), errors.New("unexpected failure"))) - }) -} - func TestWaitForWaitGroup(t *testing.T) { t.Run("returns true when waitgroup completes", func(t *testing.T) { var wg sync.WaitGroup