From 7beb03969faa199bbef3466f8e5825b0ca31f1b7 Mon Sep 17 00:00:00 2001
From: Rein Krul
Date: Thu, 9 Apr 2026 12:54:28 +0200
Subject: [PATCH] Fix #4162: start notifiers in goroutine with retry to avoid BBolt lock contention

connectToKnownNodes() may already be syncing transactions from peers when
notifier.Run() is called, holding the BBolt write lock and causing read
lock timeouts. Run notifiers in a background goroutine that retries failed
notifiers every 30 seconds until all have started successfully.

Co-Authored-By: Claude Sonnet 4.6
---
 network/network.go      | 27 +++++++++++++++++++++++----
 network/network_test.go | 26 ++++++++++++++++++++++++++
 2 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/network/network.go b/network/network.go
index a5b6fa51b9..ac4df70a0b 100644
--- a/network/network.go
+++ b/network/network.go
@@ -378,12 +378,31 @@ func (n *Network) Start() error {
 
 	// Resume all notifiers. Notifiers may access other components of the network stack.
 	// To prevent nil derefs run the notifiers last. https://github.com/nuts-foundation/nuts-node/issues/3155
-	for _, notifier := range n.state.Notifiers() {
-		if err = notifier.Run(); err != nil {
-			return fmt.Errorf("failed to start notifiers: %w", err)
+	// Run in a goroutine: connectToKnownNodes() above may already be loading transactions from peers,
+	// holding the BBolt write lock, causing notifier.Run() (which needs a read lock) to time out.
+	// See https://github.com/nuts-foundation/nuts-node/issues/4162
+	go n.startNotifiers(30 * time.Second)
+	return nil
+}
+
+// startNotifiers starts all notifiers, retrying failed ones every retryDelay until all have started.
+// It is called in a goroutine from Start() to avoid blocking on BBolt lock contention during initial peer sync.
+// See https://github.com/nuts-foundation/nuts-node/issues/4162
+func (n *Network) startNotifiers(retryDelay time.Duration) {
+	pending := n.state.Notifiers()
+	for len(pending) > 0 {
+		var failed []dag.Notifier
+		for _, notifier := range pending {
+			if err := notifier.Run(); err != nil {
+				log.Logger().WithError(err).Errorf("Failed to start notifier '%s', retrying in %s", notifier.Name(), retryDelay)
+				failed = append(failed, notifier)
+			}
+		}
+		pending = failed
+		if len(pending) > 0 {
+			time.Sleep(retryDelay)
 		}
 	}
-	return nil
 }
 
 func (n *Network) connectToKnownNodes(nodeDID did.DID) error {
diff --git a/network/network_test.go b/network/network_test.go
index ea6a3f70f3..b8f4452423 100644
--- a/network/network_test.go
+++ b/network/network_test.go
@@ -567,6 +567,32 @@ func TestNetwork_Start(t *testing.T) {
 	})
 }
 
+func TestNetwork_startNotifiers(t *testing.T) {
+	t.Run("ok - succeeds on first try", func(t *testing.T) {
+		ctrl := gomock.NewController(t)
+		cxt := createNetwork(t, ctrl)
+		notifier := dag.NewMockNotifier(ctrl)
+		notifier.EXPECT().Run().Return(nil)
+		notifier.EXPECT().Name().AnyTimes().Return("test")
+		cxt.state.EXPECT().Notifiers().Return([]dag.Notifier{notifier})
+
+		cxt.network.startNotifiers(time.Millisecond)
+	})
+	t.Run("ok - retries on first failure", func(t *testing.T) {
+		ctrl := gomock.NewController(t)
+		cxt := createNetwork(t, ctrl)
+		notifier := dag.NewMockNotifier(ctrl)
+		gomock.InOrder(
+			notifier.EXPECT().Run().Return(errors.New("lock timeout")),
+			notifier.EXPECT().Run().Return(nil),
+		)
+		notifier.EXPECT().Name().AnyTimes().Return("test")
+		cxt.state.EXPECT().Notifiers().Return([]dag.Notifier{notifier})
+
+		cxt.network.startNotifiers(time.Millisecond)
+	})
+}
+
 func TestNetwork_selfTestNutsCommAddress(t *testing.T) {
 	t.Run("TLS", func(t *testing.T) {
 		certificate := testPKI.Certificate()