Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion core/capabilities/launcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,14 @@ func (w *launcher) donPairsToUpdate(myID ragetypes.PeerID, localRegistry *regist

func (w *launcher) OnNewRegistry(ctx context.Context, localRegistry *registrysyncer.LocalRegistry) error {
w.lggr.Debug("CapabilitiesLauncher triggered...")
w.registry.SetLocalRegistry(localRegistry)
// Do not set an empty local registry: capability init (e.g. EVM) calls LocalNode() and fails with
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm -- what is causing this to be called with an empty registry? Aren't we guaranteed to call it with a non-empty one?

// "empty local registry. no DONs registered". Only set once we have at least one DON so that
// capabilities that depend on the registry see valid data (or keep waiting until syncer pushes non-empty).
if len(localRegistry.IDsToDONs) > 0 {
w.registry.SetLocalRegistry(localRegistry)
} else {
w.lggr.Debugw("CapabilitiesLauncher skipping SetLocalRegistry (empty registry, waiting for first sync with DONs)")
}

allDONIDs := w.allDONs(localRegistry)
w.lggr.Debugw("All DONs in the local registry", "allDONIDs", allDONIDs)
Expand Down
35 changes: 34 additions & 1 deletion core/services/standardcapabilities/standard_capabilities.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"errors"
"fmt"
"strings"
"sync"
"time"

Expand Down Expand Up @@ -118,7 +119,9 @@ func (s *StandardCapabilities) Start(ctx context.Context) error {
CRESettings: s.creSettings,
TriggerEventStore: s.triggerEventStore,
}
if err = s.capabilitiesLoop.Service.Initialise(cctx, dependencies); err != nil {

s.log.Infow("StandardCapabilities calling Initialise on capability service", "command", s.command)
if err = s.retryInitialiseUntilReady(cctx, dependencies); err != nil {
s.log.Errorf("error initialising standard capabilities service: %v", err)
return
}
Expand All @@ -136,6 +139,36 @@ func (s *StandardCapabilities) Start(ctx context.Context) error {
})
}

// Timing for retryInitialiseUntilReady, which re-invokes Initialise while it fails with
// "empty local registry" or "metadataRegistry information not available", so that
// capability init runs after the registry syncer has pushed at least one non-empty
// local registry (startup race fix).
const (
	// initRetryTimeout bounds the total time spent retrying Initialise.
	initRetryTimeout = 90 * time.Second
	// initRetryInterval is the wait between consecutive Initialise attempts.
	initRetryInterval = 3 * time.Second
)

// retryInitialiseUntilReady calls s.capabilitiesLoop.Service.Initialise with the
// given dependencies and repeats the call every initRetryInterval until it
// succeeds or initRetryTimeout has elapsed.
//
// Only errors whose message contains "empty local registry" or
// "metadataRegistry information not available" are retried; any other error is
// returned immediately and unwrapped. If ctx is cancelled while waiting between
// attempts, ctx.Err() is returned. Once the deadline has passed, the last
// Initialise error is returned wrapped together with the timeout. Retries are
// logged at Info level from the second failed attempt onwards.
//
// NOTE(review): retryability is decided by substring match on the error text,
// so a wording change in whatever produces these errors would disable the retry.
func (s *StandardCapabilities) retryInitialiseUntilReady(ctx context.Context, dependencies core.StandardCapabilitiesDependencies) error {
deadline := time.Now().Add(initRetryTimeout)
var lastErr error
// The deadline is only checked before each attempt, so the final wait may
// run past it by up to initRetryInterval.
for attempt := 0; time.Now().Before(deadline); attempt++ {
lastErr = s.capabilitiesLoop.Service.Initialise(ctx, dependencies)
if lastErr == nil {
return nil
}
msg := lastErr.Error()
// Anything other than the two known "registry not ready yet" messages is
// treated as a permanent failure.
if !strings.Contains(msg, "empty local registry") && !strings.Contains(msg, "metadataRegistry information not available") {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if we go the retry-way can we use typed errors so that a change in error message wording won't break it?

return lastErr
}
if attempt > 0 {
s.log.Infow("StandardCapabilities Initialise retry (waiting for registry sync)", "command", s.command, "attempt", attempt+1, "err", lastErr)
}
select {
case <-ctx.Done():
return ctx.Err()
case <-time.After(initRetryInterval):
}
}
return fmt.Errorf("initialise still failing after %v (registry never became ready): %w", initRetryTimeout, lastErr)
}

// Ready is a non-blocking check for the service's ready state. Errors if not
// ready when called.
func (s *StandardCapabilities) Ready() error {
Expand Down
Loading