diff --git a/network/network.go b/network/network.go
index a5b6fa51b9..ac4df70a0b 100644
--- a/network/network.go
+++ b/network/network.go
@@ -378,12 +378,31 @@ func (n *Network) Start() error {
 
 	// Resume all notifiers. Notifiers may access other components of the network stack.
 	// To prevent nil derefs run the notifiers last. https://github.com/nuts-foundation/nuts-node/issues/3155
-	for _, notifier := range n.state.Notifiers() {
-		if err = notifier.Run(); err != nil {
-			return fmt.Errorf("failed to start notifiers: %w", err)
+	// Start the notifiers in a goroutine with a 30 second retry delay; failures are logged by startNotifiers and no longer make Start() return an error.
+	// Rationale (per the issue linked below): connectToKnownNodes() above may already be loading transactions from peers while holding the BBolt write lock, causing notifier.Run() (which needs a read lock) to time out.
+	// See https://github.com/nuts-foundation/nuts-node/issues/4162
+	go n.startNotifiers(30 * time.Second)
+	return nil
+}
+
+// startNotifiers calls Run() on every notifier returned by n.state.Notifiers(). A notifier whose Run() returns an error is logged and retried after sleeping retryDelay; the call only returns once every notifier has started.
+// Start() invokes it in a goroutine so that Start() itself does not block on it. NOTE(review): the loop has no stop signal or retry limit, so it keeps retrying for as long as any notifier keeps failing - confirm this is acceptable when the node shuts down.
+// See https://github.com/nuts-foundation/nuts-node/issues/4162
+func (n *Network) startNotifiers(retryDelay time.Duration) {
+	pending := n.state.Notifiers()
+	for len(pending) > 0 {
+		var failed []dag.Notifier
+		for _, notifier := range pending {
+			if err := notifier.Run(); err != nil {
+				log.Logger().WithError(err).Errorf("Failed to start notifier '%s', retrying in %s", notifier.Name(), retryDelay)
+				failed = append(failed, notifier)
+			}
+		}
+		pending = failed
+		if len(pending) > 0 {
+			time.Sleep(retryDelay)
 		}
 	}
-	return nil
 }
 
 func (n *Network) connectToKnownNodes(nodeDID did.DID) error {
diff --git a/network/network_test.go b/network/network_test.go
index ea6a3f70f3..b8f4452423 100644
--- a/network/network_test.go
+++ b/network/network_test.go
@@ -567,6 +567,32 @@ func TestNetwork_Start(t *testing.T) {
 	})
 }
 
+func TestNetwork_startNotifiers(t *testing.T) {
+	t.Run("ok - succeeds on first try", func(t *testing.T) {
+		ctrl := gomock.NewController(t)
+		cxt := createNetwork(t, ctrl)
+		notifier := dag.NewMockNotifier(ctrl)
+		notifier.EXPECT().Run().Return(nil)
+		notifier.EXPECT().Name().AnyTimes().Return("test")
+		cxt.state.EXPECT().Notifiers().Return([]dag.Notifier{notifier})
+
+		cxt.network.startNotifiers(time.Millisecond)
+	})
+	t.Run("ok - retries on first failure", func(t *testing.T) {
+		ctrl := gomock.NewController(t)
+		cxt := createNetwork(t, ctrl)
+		notifier := dag.NewMockNotifier(ctrl)
+		gomock.InOrder(
+			notifier.EXPECT().Run().Return(errors.New("lock timeout")),
+			notifier.EXPECT().Run().Return(nil),
+		)
+		notifier.EXPECT().Name().AnyTimes().Return("test")
+		cxt.state.EXPECT().Notifiers().Return([]dag.Notifier{notifier})
+
+		cxt.network.startNotifiers(time.Millisecond)
+	})
+}
+
 func TestNetwork_selfTestNutsCommAddress(t *testing.T) {
 	t.Run("TLS", func(t *testing.T) {
 		certificate := testPKI.Certificate()