Skip to content

Commit 6fa1ee4

Browse files
committed
agent: implement hash-based config polling with configurable cache timeout
Replace aggressive 5-second full config polling with hash-based change detection.

The agent now:
- Checks config hash every 5 seconds (64 bytes)
- Only fetches and applies full config when hash changes
- Forces full config check after timeout (default 60s) as safety net

This dramatically reduces:
- Network bandwidth (99%+ when config unchanged)
- EOS device load (no config application when unchanged)
- Agent CPU (hash computed only when fetching new config)

Add --config-cache-timeout-in-seconds flag to control the forced full config check interval.

Refactor main loop:
- Split pollControllerAndConfigureDevice into focused functions
- Add computeChecksum() helper for SHA256 hashing
- Add fetchConfigFromController() to get config and compute hash
- Add applyConfig() to apply config to EOS device
- Rename variables: cachedConfigHash, configCacheTime, configCacheTimeout

Add GetConfigHashFromServer() client function to call new gRPC endpoint.
1 parent 20cf0ff commit 6fa1ee4

4 files changed

Lines changed: 93 additions & 28 deletions

File tree

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,8 @@ All notable changes to this project will be documented in this file.
253253
- feat(smartcontract): add use_onchain_deallocation flag to MulticastGroup ([#2748](https://github.com/malbeclabs/doublezero/pull/2748))
254254
- CLI
255255
- Remove restriction for a single tunnel per user; now a user can have a unicast and multicast tunnel concurrently (but can only be a publisher _or_ a subscriber) ([2728](https://github.com/malbeclabs/doublezero/pull/2728))
256+
- Device agents
257+
- Reduce config agent network and CPU usage by checking config checksums every 5 seconds, and reducing full config check frequency to 1m
256258

257259
## [v0.8.3](https://github.com/malbeclabs/doublezero/compare/client/v0.8.2...client/v0.8.3) – 2026-01-22
258260

controlplane/agent/cmd/agent/main.go

Lines changed: 68 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ package main
22

33
import (
44
"context"
5+
"crypto/sha256"
6+
"encoding/hex"
57
"flag"
68
"fmt"
79
"log"
@@ -20,52 +22,51 @@ import (
2022
)
2123

2224
var (
23-
localDevicePubkey = flag.String("pubkey", "frtyt4WKYudUpqTsvJzwN6Bd4btYxrkaYNhBNAaUVGWn", "This device's public key on the doublezero network")
24-
controllerAddress = flag.String("controller", "18.116.166.35:7000", "The DoubleZero controller IP address and port to connect to")
25-
device = flag.String("device", "127.0.0.1:9543", "IP Address and port of the Arist EOS API. Should always be the local switch at 127.0.0.1:9543.")
26-
sleepIntervalInSeconds = flag.Float64("sleep-interval-in-seconds", 5, "How long to sleep in between polls")
27-
controllerTimeoutInSeconds = flag.Float64("controller-timeout-in-seconds", 30, "How long to wait for a response from the controller before giving up")
28-
maxLockAge = flag.Int("max-lock-age-in-seconds", 3600, "If agent detects a config lock that older than the specified age, it will force unlock.")
29-
verbose = flag.Bool("verbose", false, "Enable verbose logging")
30-
showVersion = flag.Bool("version", false, "Print the version of the doublezero-agent and exit")
31-
metricsEnable = flag.Bool("metrics-enable", false, "Enable prometheus metrics")
32-
metricsAddr = flag.String("metrics-addr", ":8080", "Address to listen on for prometheus metrics")
25+
localDevicePubkey = flag.String("pubkey", "frtyt4WKYudUpqTsvJzwN6Bd4btYxrkaYNhBNAaUVGWn", "This device's public key on the doublezero network")
26+
controllerAddress = flag.String("controller", "18.116.166.35:7000", "The DoubleZero controller IP address and port to connect to")
27+
device = flag.String("device", "127.0.0.1:9543", "IP Address and port of the Arist EOS API. Should always be the local switch at 127.0.0.1:9543.")
28+
sleepIntervalInSeconds = flag.Float64("sleep-interval-in-seconds", 5, "How long to sleep in between polls")
29+
controllerTimeoutInSeconds = flag.Float64("controller-timeout-in-seconds", 30, "How long to wait for a response from the controller before giving up")
30+
configCacheTimeoutInSeconds = flag.Int("config-cache-timeout-in-seconds", 60, "Force full config fetch after this many seconds, even if hash unchanged")
31+
maxLockAge = flag.Int("max-lock-age-in-seconds", 3600, "If agent detects a config lock that older than the specified age, it will force unlock.")
32+
verbose = flag.Bool("verbose", false, "Enable verbose logging")
33+
showVersion = flag.Bool("version", false, "Print the version of the doublezero-agent and exit")
34+
metricsEnable = flag.Bool("metrics-enable", false, "Enable prometheus metrics")
35+
metricsAddr = flag.String("metrics-addr", ":8080", "Address to listen on for prometheus metrics")
3336

3437
// set by LDFLAGS
3538
version = "dev"
3639
commit = "none"
3740
date = "unknown"
3841
)
3942

40-
func pollControllerAndConfigureDevice(ctx context.Context, dzclient pb.ControllerClient, eapiClient *arista.EAPIClient, pubkey string, verbose *bool, maxLockAge int, agentVersion string, agentCommit string, agentDate string) error {
41-
var err error
42-
43-
// The dz controller needs to know what BGP sessions we have configured locally
44-
var neighborIpMap map[string][]string
45-
neighborIpMap, err = eapiClient.GetBgpNeighbors(ctx)
46-
if err != nil {
47-
log.Println("pollControllerAndConfigureDevice: eapiClient.GetBgpNeighbors returned error:", err)
48-
agent.ErrorsBgpNeighbors.Inc()
49-
}
43+
// computeChecksum returns the SHA-256 digest of data, encoded as a hex string.
func computeChecksum(data string) string {
44+
hash := sha256.Sum256([]byte(data))
45+
return hex.EncodeToString(hash[:])
46+
}
5047

51-
var configText string
48+
// fetchConfigFromController retrieves the device config text from the controller
// via agent.GetConfigFromServer (using the package-level controllerTimeoutInSeconds
// flag) and returns it together with its checksum from computeChecksum.
// On failure it logs the error, increments agent.ErrorsGetConfig, and returns
// empty strings with the error. When *verbose is true the received config is logged.
func fetchConfigFromController(ctx context.Context, dzclient pb.ControllerClient, pubkey string, neighborIpMap map[string][]string, verbose *bool, agentVersion string, agentCommit string, agentDate string) (configText string, configHash string, err error) {
5249
configText, err = agent.GetConfigFromServer(ctx, dzclient, pubkey, neighborIpMap, controllerTimeoutInSeconds, agentVersion, agentCommit, agentDate)
5350
if err != nil {
54-
log.Printf("pollControllerAndConfigureDevice failed to call agent.GetConfigFromServer: %q", err)
51+
log.Printf("fetchConfigFromController failed to call agent.GetConfigFromServer: %q", err)
5552
agent.ErrorsGetConfig.Inc()
56-
return err
53+
return "", "", err
5754
}
5855

5956
if *verbose {
6057
log.Printf("controller returned the following config: '%s'", configText)
6158
}
6259

60+
configHash = computeChecksum(configText)
61+
return configText, configHash, nil
62+
}
63+
64+
func applyConfig(ctx context.Context, eapiClient *arista.EAPIClient, configText string, maxLockAge int) error {
6365
if configText == "" {
64-
// Controller returned empty config
6566
return nil
6667
}
6768

68-
_, err = eapiClient.AddConfigToDevice(ctx, configText, nil, maxLockAge) // 3rd arg (diffCmd) is only used for testing
69+
_, err := eapiClient.AddConfigToDevice(ctx, configText, nil, maxLockAge)
6970
if err != nil {
7071
agent.ErrorsApplyConfig.Inc()
7172
return err
@@ -121,15 +122,55 @@ func main() {
121122
client := aristapb.NewEapiMgrServiceClient(clientConn)
122123
eapiClient = arista.NewEAPIClient(slog.Default(), client)
123124

125+
var cachedConfigHash string
126+
var configCacheTime time.Time
127+
configCacheTimeout := time.Duration(*configCacheTimeoutInSeconds) * time.Second
128+
124129
for {
125130
select {
126131
case <-ctx.Done():
127132
return
128133
case <-ticker.C:
129-
err := pollControllerAndConfigureDevice(ctx, dzclient, eapiClient, *localDevicePubkey, verbose, *maxLockAge, version, commit, date)
134+
neighborIpMap, err := eapiClient.GetBgpNeighbors(ctx)
135+
if err != nil {
136+
log.Println("ERROR: eapiClient.GetBgpNeighbors returned", err)
137+
agent.ErrorsBgpNeighbors.Inc()
138+
}
139+
140+
shouldFetchAndApply := false
141+
142+
if cachedConfigHash == "" {
143+
shouldFetchAndApply = true
144+
} else if time.Since(configCacheTime) >= configCacheTimeout {
145+
shouldFetchAndApply = true
146+
} else {
147+
hash, err := agent.GetConfigHashFromServer(ctx, dzclient, *localDevicePubkey, neighborIpMap, controllerTimeoutInSeconds, version, commit, date)
148+
if err != nil {
149+
log.Println("ERROR: GetConfigHashFromServer returned", err)
150+
continue
151+
}
152+
if hash != cachedConfigHash {
153+
shouldFetchAndApply = true
154+
}
155+
}
156+
157+
if !shouldFetchAndApply {
158+
continue
159+
}
160+
161+
configText, configHash, err := fetchConfigFromController(ctx, dzclient, *localDevicePubkey, neighborIpMap, verbose, version, commit, date)
162+
if err != nil {
163+
log.Println("ERROR: fetchConfigFromController returned", err)
164+
continue
165+
}
166+
167+
err = applyConfig(ctx, eapiClient, configText, *maxLockAge)
130168
if err != nil {
131-
log.Println("ERROR: pollAndConfigureDevice returned", err)
169+
log.Println("ERROR: applyConfig returned", err)
170+
continue
132171
}
172+
cachedConfigHash = configHash
173+
configCacheTime = time.Now()
133174
}
134175
}
135176
}

controlplane/agent/internal/agent/dzclient.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,28 @@ func GetConfigFromServer(ctx context.Context, client pb.ControllerClient, localD
3535
return config, nil
3636
}
3737

38+
// GetConfigHashFromServer calls the controller's GetConfigHash RPC and returns
// the hash carried in the response.
//
// The request contains the device pubkey, the BGP peers from neighborIpMap both
// grouped per VRF and as a single sorted flat list, and the agent version, commit
// and date. The call runs under a context deadline of *controllerTimeoutInSeconds
// seconds; the pointer is dereferenced, so it must be non-nil.
//
// If the RPC fails, the error is logged and returned with an empty hash.
func GetConfigHashFromServer(ctx context.Context, client pb.ControllerClient, localDevicePubkey string, neighborIpMap map[string][]string, controllerTimeoutInSeconds *float64, agentVersion string, agentCommit string, agentDate string) (hash string, err error) {
39+
ctx, cancel := context.WithTimeout(ctx, time.Duration(*controllerTimeoutInSeconds*float64(time.Second)))
40+
defer cancel()
41+
42+
var bgpPeers []string
43+
bgpPeersByVrf := make(map[string]*pb.BgpPeers)
44+
for vrf, peers := range neighborIpMap {
45+
bgpPeersByVrf[vrf] = &pb.BgpPeers{Peers: peers}
46+
bgpPeers = append(bgpPeers, peers...)
47+
}
48+
// Map iteration order is random; sort so the flat peer list is deterministic.
slices.Sort(bgpPeers)
49+
50+
req := &pb.ConfigRequest{Pubkey: localDevicePubkey, BgpPeers: bgpPeers, BgpPeersByVrf: bgpPeersByVrf, AgentVersion: &agentVersion, AgentCommit: &agentCommit, AgentDate: &agentDate}
51+
resp, err := client.GetConfigHash(ctx, req)
52+
if err != nil {
53+
log.Printf("Error calling GetConfigHash: %v\n", err)
54+
return "", err
55+
}
56+
57+
return resp.GetHash(), nil
58+
}
59+
3860
func GetDzClient(controllerAddressAndPort string) (pb.ControllerClient, error) {
3961
conn, err := grpc.NewClient(controllerAddressAndPort, grpc.WithTransportCredentials(insecure.NewCredentials()))
4062
log.Printf("controllerAddressAndPort %s\n", controllerAddressAndPort)

controlplane/controller/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ The design includes two optimizations:
1212
1. Applying configuration to an Arista EOS device causes the EOS ConfigAgent process CPU to spike, so the agent only applies the config when the config generated by the controller is different than the last polling cycle
1313
2. To make success more likely on lossy networks,
1414

15-
Here's how the agent uses the endpoints:
15+
Here's how the agent uses the endpoints:
1616

1717
```
1818
┌─────────┐ ┌────────────┐ ┌────────────┐ ┌─────────┐

0 commit comments

Comments
 (0)