-
Notifications
You must be signed in to change notification settings - Fork 143
Try to start the queue maintainer multiple times with backoff #1184
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -33,6 +33,7 @@ import ( | |
| "github.com/riverqueue/river/rivershared/testsignal" | ||
| "github.com/riverqueue/river/rivershared/util/dbutil" | ||
| "github.com/riverqueue/river/rivershared/util/maputil" | ||
| "github.com/riverqueue/river/rivershared/util/serviceutil" | ||
| "github.com/riverqueue/river/rivershared/util/sliceutil" | ||
| "github.com/riverqueue/river/rivershared/util/testutil" | ||
| "github.com/riverqueue/river/rivershared/util/valutil" | ||
|
|
@@ -619,7 +620,9 @@ type Client[TTx any] struct { | |
|
|
||
| // Test-only signals. | ||
| type clientTestSignals struct { | ||
| electedLeader testsignal.TestSignal[struct{}] // notifies when elected leader | ||
| electedLeader testsignal.TestSignal[struct{}] // notifies when elected leader | ||
| queueMaintainerStartError testsignal.TestSignal[error] // notifies on each failed queue maintainer start attempt | ||
| queueMaintainerStartRetriesExhausted testsignal.TestSignal[struct{}] // notifies when leader resignation is requested after all queue maintainer start retries have been exhausted | ||
|
|
||
| jobCleaner *maintenance.JobCleanerTestSignals | ||
| jobRescuer *maintenance.JobRescuerTestSignals | ||
|
|
@@ -631,6 +634,8 @@ type clientTestSignals struct { | |
|
|
||
| func (ts *clientTestSignals) Init(tb testutil.TestingTB) { | ||
| ts.electedLeader.Init(tb) | ||
| ts.queueMaintainerStartError.Init(tb) | ||
| ts.queueMaintainerStartRetriesExhausted.Init(tb) | ||
|
|
||
| if ts.jobCleaner != nil { | ||
| ts.jobCleaner.Init(tb) | ||
|
|
@@ -1279,26 +1284,6 @@ func (c *Client[TTx]) logStatsLoop(ctx context.Context, shouldStart bool, starte | |
| } | ||
|
|
||
| func (c *Client[TTx]) handleLeadershipChangeLoop(ctx context.Context, shouldStart bool, started, stopped func()) error { | ||
| handleLeadershipChange := func(ctx context.Context, notification *leadership.Notification) { | ||
| c.baseService.Logger.DebugContext(ctx, c.baseService.Name+": Election change received", | ||
| slog.String("client_id", c.config.ID), slog.Bool("is_leader", notification.IsLeader)) | ||
|
|
||
| switch { | ||
| case notification.IsLeader: | ||
| // Starting the queue maintainer can take a little time so send to | ||
| // this test signal _first_ so tests waiting on it can finish, | ||
| // cancel the queue maintainer start, and overall run much faster. | ||
| c.testSignals.electedLeader.Signal(struct{}{}) | ||
|
|
||
| if err := c.queueMaintainer.Start(ctx); err != nil { | ||
| c.baseService.Logger.ErrorContext(ctx, "Error starting queue maintainer", slog.String("err", err.Error())) | ||
| } | ||
|
|
||
| default: | ||
| c.queueMaintainer.Stop() | ||
| } | ||
| } | ||
|
|
||
| if !shouldStart { | ||
| return nil | ||
| } | ||
|
|
@@ -1310,20 +1295,116 @@ func (c *Client[TTx]) handleLeadershipChangeLoop(ctx context.Context, shouldStar | |
| sub := c.elector.Listen() | ||
| defer sub.Unlisten() | ||
|
|
||
| // Cancel function for an in-progress tryStartQueueMaintainer. If | ||
| // leadership is lost while the start process is still retrying, this | ||
| // is used to abort it promptly instead of waiting for retries to | ||
| // finish. | ||
| var cancelQueueMaintainerStart context.CancelCauseFunc = func(_ error) {} | ||
|
|
||
| for { | ||
| select { | ||
| case <-ctx.Done(): | ||
| cancelQueueMaintainerStart(context.Cause(ctx)) | ||
| return | ||
|
|
||
| case notification := <-sub.C(): | ||
| handleLeadershipChange(ctx, notification) | ||
| c.baseService.Logger.DebugContext(ctx, c.baseService.Name+": Election change received", | ||
| slog.String("client_id", c.config.ID), slog.Bool("is_leader", notification.IsLeader)) | ||
|
|
||
| switch { | ||
| case notification.IsLeader: | ||
| // Starting the queue maintainer can take a little time so | ||
| // send to this test signal first so tests waiting on it | ||
| // can finish, cancel the queue maintainer start, and | ||
| // overall run much faster. | ||
| c.testSignals.electedLeader.Signal(struct{}{}) | ||
|
|
||
| // Start the queue maintainer with a few retries and | ||
| // exponential backoff in a separate goroutine so the | ||
| // leadership change loop remains responsive to new | ||
| // notifications. startCtx is used for cancellation in case | ||
| // leadership is lost while retries are in progress. | ||
| var startCtx context.Context | ||
| startCtx, cancelQueueMaintainerStart = context.WithCancelCause(ctx) | ||
| go c.tryStartQueueMaintainer(startCtx) | ||
|
|
||
| default: | ||
| // Cancel any in-progress start attempts before stopping. We | ||
| // send a startstop.ErrStop to make sure services like | ||
| // Reindexer run any specific cleanup code for stops. | ||
| cancelQueueMaintainerStart(startstop.ErrStop) | ||
| cancelQueueMaintainerStart = func(_ error) {} | ||
|
|
||
| c.queueMaintainer.Stop() | ||
| } | ||
| } | ||
| } | ||
| }() | ||
|
|
||
| return nil | ||
| } | ||
|
|
||
| // Tries to start the queue maintainer after gaining leadership. We allow some | ||
| // retries with exponential backoff in case of failure, and in case the queue | ||
| // maintainer can't be started, we request resignation to allow another client | ||
| // to try and take over. | ||
| func (c *Client[TTx]) tryStartQueueMaintainer(ctx context.Context) { | ||
| const maxStartAttempts = 3 | ||
|
|
||
| ctxCancelled := func() bool { | ||
| if ctx.Err() != nil { | ||
| c.baseService.Logger.InfoContext(ctx, c.baseService.Name+": Queue maintainer start cancelled") | ||
| return true | ||
| } | ||
| return false | ||
| } | ||
|
|
||
| var lastErr error | ||
| for attempt := 1; attempt <= maxStartAttempts; attempt++ { | ||
| if ctxCancelled() { | ||
| return | ||
| } | ||
|
|
||
| if lastErr = c.queueMaintainer.Start(ctx); lastErr == nil { | ||
| return | ||
| } | ||
|
|
||
| c.baseService.Logger.ErrorContext(ctx, c.baseService.Name+": Error starting queue maintainer", | ||
| slog.String("err", lastErr.Error()), slog.Int("attempt", attempt)) | ||
|
|
||
| c.testSignals.queueMaintainerStartError.Signal(lastErr) | ||
|
|
||
| // If Start blocked long enough for this context to be cancelled | ||
| // (e.g. leadership was lost), bail out immediately. A newer | ||
| // leadership term may already have started the maintainer, and | ||
| // calling Stop here would tear it down. | ||
| if ctxCancelled() { | ||
| return | ||
| } | ||
|
|
||
| // Stop the queue maintainer to fully reset its state (and any | ||
| // sub-services) before retrying. | ||
| c.queueMaintainer.Stop() | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Useful? React with 👍 / 👎. |
||
|
|
||
| if attempt < maxStartAttempts { | ||
| serviceutil.CancellableSleep(ctx, serviceutil.ExponentialBackoff(attempt, serviceutil.MaxAttemptsBeforeResetDefault)) | ||
| } | ||
| } | ||
|
|
||
| if ctxCancelled() { | ||
| return | ||
| } | ||
|
|
||
| c.baseService.Logger.ErrorContext(ctx, c.baseService.Name+": Queue maintainer failed to start after all attempts, requesting leader resignation", | ||
| slog.String("err", lastErr.Error())) | ||
|
|
||
| c.testSignals.queueMaintainerStartRetriesExhausted.Signal(struct{}{}) | ||
|
|
||
| if err := c.clientNotifyBundle.RequestResign(ctx); err != nil { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Useful? React with 👍 / 👎.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Dang that is a subtle one @brandur 🤔
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To use Codex here, create a Codex account and connect to github.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @codex re-review |
||
| c.baseService.Logger.ErrorContext(ctx, c.baseService.Name+": Error requesting leader resignation", slog.String("err", err.Error())) | ||
| } | ||
| } | ||
|
|
||
| // Driver exposes the underlying driver used by the client. | ||
| // | ||
| // API is not stable. DO NOT USE. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -318,31 +318,42 @@ func (s *PeriodicJobEnqueuer) Start(ctx context.Context) error { | |
|
|
||
| s.StaggerStart(ctx) | ||
|
|
||
| initialPeriodicJobs, err := s.Config.Pilot.PeriodicJobGetAll(ctx, s.exec, &riverpilot.PeriodicJobGetAllParams{ | ||
| Schema: s.Config.Schema, | ||
| }) | ||
| if err != nil { | ||
| return err | ||
| } | ||
| var ( | ||
| initialPeriodicJobs []*riverpilot.PeriodicJob | ||
| subServices []startstop.Service | ||
| ) | ||
| if err := func() error { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ended up indenting all of this in so that there's a single |
||
| var err error | ||
| initialPeriodicJobs, err = s.Config.Pilot.PeriodicJobGetAll(ctx, s.exec, &riverpilot.PeriodicJobGetAllParams{ | ||
| Schema: s.Config.Schema, | ||
| }) | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
| for _, hook := range s.Config.HookLookupGlobal.ByHookKind(hooklookup.HookKindPeriodicJobsStart) { | ||
| if err := hook.(rivertype.HookPeriodicJobsStart).Start(ctx, &rivertype.HookPeriodicJobsStartParams{ //nolint:forcetypeassert | ||
| DurableJobs: sliceutil.Map(initialPeriodicJobs, func(job *riverpilot.PeriodicJob) *rivertype.DurablePeriodicJob { | ||
| return (*rivertype.DurablePeriodicJob)(job) | ||
| }), | ||
| }); err != nil { | ||
| for _, hook := range s.Config.HookLookupGlobal.ByHookKind(hooklookup.HookKindPeriodicJobsStart) { | ||
| if err := hook.(rivertype.HookPeriodicJobsStart).Start(ctx, &rivertype.HookPeriodicJobsStartParams{ //nolint:forcetypeassert | ||
| DurableJobs: sliceutil.Map(initialPeriodicJobs, func(job *riverpilot.PeriodicJob) *rivertype.DurablePeriodicJob { | ||
| return (*rivertype.DurablePeriodicJob)(job) | ||
| }), | ||
| }); err != nil { | ||
| return err | ||
| } | ||
| } | ||
|
|
||
| subServices = []startstop.Service{ | ||
| startstop.StartStopFunc(s.periodicJobKeepAliveAndReapPeriodically), | ||
| } | ||
| stopServicesOnError := func() { | ||
| startstop.StopAllParallel(subServices...) | ||
| } | ||
| if err := startstop.StartAll(ctx, subServices...); err != nil { | ||
| stopServicesOnError() | ||
| return err | ||
| } | ||
| } | ||
|
|
||
| subServices := []startstop.Service{ | ||
| startstop.StartStopFunc(s.periodicJobKeepAliveAndReapPeriodically), | ||
| } | ||
| stopServicesOnError := func() { | ||
| startstop.StopAllParallel(subServices...) | ||
| } | ||
| if err := startstop.StartAll(ctx, subServices...); err != nil { | ||
| stopServicesOnError() | ||
| return nil | ||
| }(); err != nil { | ||
| stopped() | ||
| return err | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -50,6 +50,8 @@ func (m *QueueMaintainer) Start(ctx context.Context) error { | |
|
|
||
| for _, service := range m.servicesByName { | ||
| if err := service.Start(ctx); err != nil { | ||
| startstop.StopAllParallel(maputil.Values(m.servicesByName)...) | ||
| stopped() | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Similarly, make sure |
||
| return err | ||
| } | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`tryStartQueueMaintainer` stops the shared maintainer on every start error, even when this retry context has already been canceled due to a leadership loss. If leadership flips (lose then regain) while an earlier `Start` call is still blocked (for example in a hook that does not return promptly on cancellation), that older goroutine can return later and execute `Stop` against the newer leader term’s successfully started maintainer, taking maintenance back offline unexpectedly. Add a cancellation/epoch check immediately after `Start` returns and before calling `Stop` so stale retries cannot affect a newer leadership term.Useful? React with 👍 / 👎.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@codex re-review