Skip to content

Commit 69b9539

Browse files
committed
agent: reduce config agent CPU by caching and comparing config locally (#3026)
The agent still fetches the full config from the controller every 5 seconds, but now applies it to the EOS device only when the content has changed (detected by comparing a locally computed SHA-256 hash of the fetched config with the hash of the last successfully applied config) or when 60 seconds have elapsed since the last successful apply. This reduces CPU usage on Arista EOS devices by avoiding unnecessary config applications while remaining responsive to config changes.
1 parent 657e65a commit 69b9539

4 files changed

Lines changed: 404 additions & 27 deletions

File tree

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ All notable changes to this project will be documented in this file.
88

99
### Changes
1010

11+
- Device agents
12+
- Reduce agent CPU usage: the agent continues to fetch the full config every 5 seconds, but only applies it when the config has changed or after a 60s timeout
13+
1114
## [v0.14.0](https://github.com/malbeclabs/doublezero/compare/client/v0.13.0...client/v0.14.0) - 2026-03-24
1215

1316
### Breaking

controlplane/agent/cmd/agent/main.go

Lines changed: 65 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ package main
22

33
import (
44
"context"
5+
"crypto/sha256"
6+
"encoding/hex"
57
"flag"
68
"fmt"
79
"log"
@@ -20,52 +22,51 @@ import (
2022
)
2123

2224
var (
23-
localDevicePubkey = flag.String("pubkey", "frtyt4WKYudUpqTsvJzwN6Bd4btYxrkaYNhBNAaUVGWn", "This device's public key on the doublezero network")
24-
controllerAddress = flag.String("controller", "18.116.166.35:7000", "The DoubleZero controller IP address and port to connect to")
25-
device = flag.String("device", "127.0.0.1:9543", "IP Address and port of the Arist EOS API. Should always be the local switch at 127.0.0.1:9543.")
26-
sleepIntervalInSeconds = flag.Float64("sleep-interval-in-seconds", 5, "How long to sleep in between polls")
27-
controllerTimeoutInSeconds = flag.Float64("controller-timeout-in-seconds", 30, "How long to wait for a response from the controller before giving up")
28-
maxLockAge = flag.Int("max-lock-age-in-seconds", 3600, "If agent detects a config lock that older than the specified age, it will force unlock.")
29-
verbose = flag.Bool("verbose", false, "Enable verbose logging")
30-
showVersion = flag.Bool("version", false, "Print the version of the doublezero-agent and exit")
31-
metricsEnable = flag.Bool("metrics-enable", false, "Enable prometheus metrics")
32-
metricsAddr = flag.String("metrics-addr", ":8080", "Address to listen on for prometheus metrics")
25+
localDevicePubkey = flag.String("pubkey", "frtyt4WKYudUpqTsvJzwN6Bd4btYxrkaYNhBNAaUVGWn", "This device's public key on the doublezero network")
26+
controllerAddress = flag.String("controller", "18.116.166.35:7000", "The DoubleZero controller IP address and port to connect to")
27+
device = flag.String("device", "127.0.0.1:9543", "IP Address and port of the Arist EOS API. Should always be the local switch at 127.0.0.1:9543.")
28+
sleepIntervalInSeconds = flag.Float64("sleep-interval-in-seconds", 5, "How long to sleep in between polls")
29+
controllerTimeoutInSeconds = flag.Float64("controller-timeout-in-seconds", 30, "How long to wait for a response from the controller before giving up")
30+
configCacheTimeoutInSeconds = flag.Int("config-cache-timeout-in-seconds", 60, "Force full config fetch after this many seconds, even if hash unchanged")
31+
maxLockAge = flag.Int("max-lock-age-in-seconds", 3600, "If agent detects a config lock that older than the specified age, it will force unlock.")
32+
verbose = flag.Bool("verbose", false, "Enable verbose logging")
33+
showVersion = flag.Bool("version", false, "Print the version of the doublezero-agent and exit")
34+
metricsEnable = flag.Bool("metrics-enable", false, "Enable prometheus metrics")
35+
metricsAddr = flag.String("metrics-addr", ":8080", "Address to listen on for prometheus metrics")
3336

3437
// set by LDFLAGS
3538
version = "dev"
3639
commit = "none"
3740
date = "unknown"
3841
)
3942

40-
func pollControllerAndConfigureDevice(ctx context.Context, dzclient pb.ControllerClient, eapiClient *arista.EAPIClient, pubkey string, verbose *bool, maxLockAge int, agentVersion string, agentCommit string, agentDate string) error {
41-
var err error
42-
43-
// The dz controller needs to know what BGP sessions we have configured locally
44-
var neighborIpMap map[string][]string
45-
neighborIpMap, err = eapiClient.GetBgpNeighbors(ctx)
46-
if err != nil {
47-
log.Println("pollControllerAndConfigureDevice: eapiClient.GetBgpNeighbors returned error:", err)
48-
agent.ErrorsBgpNeighbors.Inc()
49-
}
43+
func computeChecksum(data string) string {
44+
hash := sha256.Sum256([]byte(data))
45+
return hex.EncodeToString(hash[:])
46+
}
5047

51-
var configText string
48+
func fetchConfigFromController(ctx context.Context, dzclient pb.ControllerClient, pubkey string, neighborIpMap map[string][]string, verbose *bool, agentVersion string, agentCommit string, agentDate string) (configText string, configHash string, err error) {
5249
configText, err = agent.GetConfigFromServer(ctx, dzclient, pubkey, neighborIpMap, controllerTimeoutInSeconds, agentVersion, agentCommit, agentDate)
5350
if err != nil {
54-
log.Printf("pollControllerAndConfigureDevice failed to call agent.GetConfigFromServer: %q", err)
51+
log.Printf("fetchConfigFromController failed to call agent.GetConfigFromServer: %q", err)
5552
agent.ErrorsGetConfig.Inc()
56-
return err
53+
return "", "", err
5754
}
5855

5956
if *verbose {
6057
log.Printf("controller returned the following config: '%s'", configText)
6158
}
6259

60+
configHash = computeChecksum(configText)
61+
return configText, configHash, nil
62+
}
63+
64+
func applyConfig(ctx context.Context, eapiClient *arista.EAPIClient, configText string, maxLockAge int) error {
6365
if configText == "" {
64-
// Controller returned empty config
6566
return nil
6667
}
6768

68-
_, err = eapiClient.AddConfigToDevice(ctx, configText, nil, maxLockAge) // 3rd arg (diffCmd) is only used for testing
69+
_, err := eapiClient.AddConfigToDevice(ctx, configText, nil, maxLockAge)
6970
if err != nil {
7071
agent.ErrorsApplyConfig.Inc()
7172
return err
@@ -121,15 +122,52 @@ func main() {
121122
client := aristapb.NewEapiMgrServiceClient(clientConn)
122123
eapiClient = arista.NewEAPIClient(slog.Default(), client)
123124

125+
var cachedConfigHash string
126+
var configCacheTime time.Time
127+
configCacheTimeout := time.Duration(*configCacheTimeoutInSeconds) * time.Second
128+
124129
for {
125130
select {
126131
case <-ctx.Done():
127132
return
128133
case <-ticker.C:
129-
err := pollControllerAndConfigureDevice(ctx, dzclient, eapiClient, *localDevicePubkey, verbose, *maxLockAge, version, commit, date)
134+
neighborIpMap, err := eapiClient.GetBgpNeighbors(ctx)
135+
if err != nil {
136+
log.Println("ERROR: eapiClient.GetBgpNeighbors returned", err)
137+
agent.ErrorsBgpNeighbors.Inc()
138+
}
139+
140+
// Fetch the full config from the controller on every tick (nominally every 5 seconds)
141+
configText, configHash, err := fetchConfigFromController(ctx, dzclient, *localDevicePubkey, neighborIpMap, verbose, version, commit, date)
142+
if err != nil {
143+
log.Println("ERROR: fetchConfigFromController returned", err)
144+
continue
145+
}
146+
147+
// Only apply on the first run, when the config hash changed, or when the cache timeout has elapsed since the last successful apply
148+
shouldApply := false
149+
if cachedConfigHash == "" {
150+
// First run
151+
shouldApply = true
152+
} else if configHash != cachedConfigHash {
153+
// Config changed
154+
shouldApply = true
155+
} else if time.Since(configCacheTime) >= configCacheTimeout {
156+
// Force apply after timeout
157+
shouldApply = true
158+
}
159+
160+
if !shouldApply {
161+
continue
162+
}
163+
164+
err = applyConfig(ctx, eapiClient, configText, *maxLockAge)
130165
if err != nil {
131-
log.Println("ERROR: pollAndConfigureDevice returned", err)
166+
log.Println("ERROR: applyConfig returned", err)
167+
continue
132168
}
169+
cachedConfigHash = configHash
170+
configCacheTime = time.Now()
133171
}
134172
}
135173
}

0 commit comments

Comments
 (0)