Skip to content

Commit 1a18568

Browse files
committed
Merge remote-tracking branch 'origin/fix-race-local-registry-race' into codex/dx-3578-local-cre-startup-speedup
2 parents 709649b + 274c28b commit 1a18568

2 files changed

Lines changed: 42 additions & 3 deletions

File tree

core/capabilities/launcher.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -300,8 +300,14 @@ func (w *launcher) donPairsToUpdate(myID ragetypes.PeerID, localRegistry *regist
300300
}
301301

302302
func (w *launcher) OnNewRegistry(ctx context.Context, localRegistry *registrysyncer.LocalRegistry) error {
303-
w.lggr.Debug("CapabilitiesLauncher triggered...")
304-
w.registry.SetLocalRegistry(localRegistry)
303+
// Do not set an empty local registry: capability init (e.g. EVM) calls LocalNode() and fails with
304+
// "empty local registry. no DONs registered". Only set once we have at least one DON so that
305+
// capabilities that depend on the registry see valid data (or keep waiting until syncer pushes non-empty).
306+
if len(localRegistry.IDsToDONs) > 0 {
307+
w.registry.SetLocalRegistry(localRegistry)
308+
} else {
309+
w.lggr.Debugw("CapabilitiesLauncher skipping SetLocalRegistry (empty registry, waiting for first sync with DONs)")
310+
}
305311

306312
allDONIDs := w.allDONs(localRegistry)
307313
w.lggr.Debugw("All DONs in the local registry", "allDONIDs", allDONIDs)

core/services/standardcapabilities/standard_capabilities.go

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"errors"
66
"fmt"
7+
"strings"
78
"sync"
89
"time"
910

@@ -118,7 +119,9 @@ func (s *StandardCapabilities) Start(ctx context.Context) error {
118119
CRESettings: s.creSettings,
119120
TriggerEventStore: s.triggerEventStore,
120121
}
121-
if err = s.capabilitiesLoop.Service.Initialise(cctx, dependencies); err != nil {
122+
123+
s.log.Infow("StandardCapabilities calling Initialise on capability service", "command", s.command)
124+
if err = s.retryInitialiseUntilReady(cctx, dependencies); err != nil {
122125
s.log.Errorf("error initialising standard capabilities service: %v", err)
123126
return
124127
}
@@ -136,6 +139,36 @@ func (s *StandardCapabilities) Start(ctx context.Context) error {
136139
})
137140
}
138141

142+
// retryInitialiseUntilReady calls Initialise and retries on "empty local registry" or
143+
// "metadataRegistry information not available" so that capability init runs after the
144+
// registry syncer has pushed at least one non-empty local registry (startup race fix).
145+
const initRetryTimeout = 90 * time.Second
146+
const initRetryInterval = 3 * time.Second
147+
148+
func (s *StandardCapabilities) retryInitialiseUntilReady(ctx context.Context, dependencies core.StandardCapabilitiesDependencies) error {
149+
deadline := time.Now().Add(initRetryTimeout)
150+
var lastErr error
151+
for attempt := 0; time.Now().Before(deadline); attempt++ {
152+
lastErr = s.capabilitiesLoop.Service.Initialise(ctx, dependencies)
153+
if lastErr == nil {
154+
return nil
155+
}
156+
msg := lastErr.Error()
157+
if !strings.Contains(msg, "empty local registry") && !strings.Contains(msg, "metadataRegistry information not available") {
158+
return lastErr
159+
}
160+
if attempt > 0 {
161+
s.log.Infow("StandardCapabilities Initialise retry (waiting for registry sync)", "command", s.command, "attempt", attempt+1, "err", lastErr)
162+
}
163+
select {
164+
case <-ctx.Done():
165+
return ctx.Err()
166+
case <-time.After(initRetryInterval):
167+
}
168+
}
169+
return fmt.Errorf("initialise still failing after %v (registry never became ready): %w", initRetryTimeout, lastErr)
170+
}
171+
139172
// Ready is a non-blocking check for the service's ready state. Errors if not
140173
// ready when called.
141174
func (s *StandardCapabilities) Ready() error {

0 commit comments

Comments
 (0)