Skip to content

Commit 63f1cda

Browse files
committed
Startstop: Start successfully again even in the event of a partial start
This one's presented as an alternative to #1136. Basically, a current problem with the start/stop infrastructure is that in the event of a partial start where a service returns from its start function, but without `Stop` having been called on, we can get into a situation where the start/stop's `isRunning` flag is still set to true, and when the start/stop is started again, it'll fall through thinking it's already running. Here, we check for this condition on subsequent starts. If the `stopped` channel is non-nil but already closed, we reset all internal state including `isRunning` so the service can start again. To prove this works, I pull in the test case added in #1136 verbatim, and also add one more specific test in `start_stop_test.go` for a more precise version.
1 parent bcbcb6d commit 63f1cda

3 files changed

Lines changed: 69 additions & 1 deletion

File tree

client_test.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7133,6 +7133,31 @@ func Test_Client_Start_Error(t *testing.T) {
71337133
require.ErrorAs(t, err, &pgErr)
71347134
require.Equal(t, pgerrcode.InvalidCatalogName, pgErr.Code)
71357135
})
7136+
7137+
t.Run("CanRestartAfterFailure", func(t *testing.T) {
7138+
t.Parallel()
7139+
7140+
// Use a non-existent database to trigger a startup failure
7141+
dbConfig := riversharedtest.DBPool(ctx, t).Config().Copy()
7142+
dbConfig.ConnConfig.Database = "does-not-exist-and-dont-create-it"
7143+
7144+
dbPool, err := pgxpool.NewWithConfig(ctx, dbConfig)
7145+
require.NoError(t, err)
7146+
7147+
config := newTestConfig(t, "")
7148+
7149+
client := newTestClient(t, dbPool, config)
7150+
7151+
// First Start() should fail with a database error
7152+
err = client.Start(ctx)
7153+
require.Error(t, err, "first Start() should fail with database error")
7154+
7155+
// Second Start() should also fail with an error, NOT return nil.
7156+
// This verifies that the client's internal state was properly reset
7157+
// after the first failure, allowing it to attempt startup again.
7158+
err = client.Start(ctx)
7159+
require.Error(t, err, "second Start() should return an error, not nil; client state should be reset after failed start")
7160+
})
71367161
}
71377162

71387163
func Test_NewClient_BaseServiceName(t *testing.T) {

rivershared/startstop/start_stop.go

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,24 @@ func (s *BaseStartStop) StartInit(ctx context.Context) (context.Context, bool, f
109109
defer s.mu.Unlock()
110110

111111
if s.isRunning {
112-
return ctx, false, nil, nil
112+
// If stopped has already been closed (e.g. a previous Start failed and
113+
// called stopped()), reset state so the service can start again.
114+
//
115+
// Notably, for this branch to be taken, Stop will not have been called.
116+
// If it was, isRunning will have been set to false via finalizeStop.
117+
if s.stopped != nil {
118+
select {
119+
case <-s.stopped:
120+
s.isRunning = false
121+
s.started = nil
122+
s.stopped = nil
123+
default:
124+
}
125+
}
126+
127+
if s.isRunning {
128+
return ctx, false, nil, nil
129+
}
113130
}
114131

115132
s.isRunning = true

rivershared/startstop/start_stop_test.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,32 @@ func TestSampleService(t *testing.T) {
253253
riversharedtest.WaitOrTimeout(t, service.Started()) // start channel also closed on erroneous start
254254
riversharedtest.WaitOrTimeout(t, service.Stopped())
255255
})
256+
257+
t.Run("StartErrorThenSuccessfulRestart", func(t *testing.T) {
258+
t.Parallel()
259+
260+
service, _ := setup(t)
261+
service.startErr = errors.New("error on start")
262+
263+
// First start fails with our simulated error.
264+
require.ErrorIs(t, service.Start(ctx), service.startErr)
265+
266+
riversharedtest.WaitOrTimeout(t, service.Started())
267+
riversharedtest.WaitOrTimeout(t, service.Stopped())
268+
269+
// Clear error so the next start succeeds.
270+
service.startErr = nil
271+
272+
// Second start should succeed despite the prior failure. Without the
273+
// reset-on-failed-start logic in StartInit, isRunning would still be
274+
// true and StartInit would return shouldStart=false, causing Start to
275+
// return nil without actually starting the service.
276+
require.NoError(t, service.Start(ctx))
277+
t.Cleanup(service.Stop)
278+
279+
riversharedtest.WaitOrTimeout(t, service.Started())
280+
require.True(t, service.state)
281+
})
256282
}
257283

258284
// A service with the more unusual case.

0 commit comments

Comments
 (0)