Skip to content

Commit 89aa601

Browse files
committed
agent: simplify config polling by removing GetConfigHash RPC
Remove the separate GetConfigHash RPC endpoint and simplify the agent's config polling logic based on PR feedback.

The agent now:
- Fetches the full config every 5 seconds (as before)
- Computes a SHA256 hash locally to detect changes
- Only applies the config when it has changed or after a 60s timeout

This achieves the same CPU/load reduction goals with a simpler architecture:
- No duplicate logic between GetConfig and GetConfigHash
- Same performance (config only applied when changed)
- ~400 lines of code removed

The optimization benefits remain: EOS devices aren't hammered with unchanged configs, reducing device CPU usage while maintaining the 5-second polling interval for responsiveness.
1 parent e7858cc commit 89aa601

10 files changed

Lines changed: 90 additions & 493 deletions

File tree

CHANGELOG.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ All notable changes to this project will be documented in this file.
88

99
### Changes
1010

11+
- Device agents
12+
- Reduce agent CPU usage by continuing to fetch the full config every 5 seconds but only applying it when it has changed or after a 60-second timeout
13+
1114
## [v0.12.0](https://github.com/malbeclabs/doublezero/compare/client/v0.11.0...client/v0.12.0) - 2026-03-16
1215

1316
### Breaking
@@ -33,8 +36,6 @@ All notable changes to this project will be documented in this file.
3336
- Add onchain parent DZD discovery to geoprobe-agent: periodically queries the Geolocation program for this probe's parent devices and resolves their metrics publisher keys from Serviceability, replacing the need for static `--parent-dzd` CLI flags. Static parents from CLI are merged with onchain parents, with onchain taking precedence for duplicate keys.
3437
- Optimize inbound probe-measured RTT accuracy: pre-sign both TWAMP probes before network I/O so probe 1 fires immediately after reply 0 with no signing delay, measure Tx-to-Rx interval (reply 0 Tx → probe 1 Rx) instead of Rx-to-Rx to exclude processing overhead on both sides, use kernel `SO_TIMESTAMPNS` receive timestamps on the reflector, and add a 15ms busy-poll window on the sender to avoid scheduler wakeup latency
3538
- Optimize outbound probe RTT accuracy: send a staggered warmup probe on a separate socket 2ms before the measurement probe to wake the reflector's thread, then take the min RTT of both
36-
- Device agents
37-
- Reduce config agent network and CPU usage by checking config checksums every 5 seconds, and reducing full config check frequency to 1m
3839

3940
## [v0.11.0](https://github.com/malbeclabs/doublezero/compare/client/v0.10.0...client/v0.11.0) - 2026-03-12
4041

controlplane/agent/cmd/agent/main.go

Lines changed: 16 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -137,30 +137,27 @@ func main() {
137137
agent.ErrorsBgpNeighbors.Inc()
138138
}
139139

140-
shouldFetchAndApply := false
140+
// Fetch config every 5 seconds
141+
configText, configHash, err := fetchConfigFromController(ctx, dzclient, *localDevicePubkey, neighborIpMap, verbose, version, commit, date)
142+
if err != nil {
143+
log.Println("ERROR: fetchConfigFromController returned", err)
144+
continue
145+
}
141146

147+
// Only apply if config changed or timeout elapsed
148+
shouldApply := false
142149
if cachedConfigHash == "" {
143-
shouldFetchAndApply = true
150+
// First run
151+
shouldApply = true
152+
} else if configHash != cachedConfigHash {
153+
// Config changed
154+
shouldApply = true
144155
} else if time.Since(configCacheTime) >= configCacheTimeout {
145-
shouldFetchAndApply = true
146-
} else {
147-
hash, err := agent.GetConfigHashFromServer(ctx, dzclient, *localDevicePubkey, neighborIpMap, controllerTimeoutInSeconds, version, commit, date)
148-
if err != nil {
149-
log.Println("ERROR: GetConfigHashFromServer returned", err)
150-
continue
151-
}
152-
if hash != cachedConfigHash {
153-
shouldFetchAndApply = true
154-
}
156+
// Force apply after timeout
157+
shouldApply = true
155158
}
156159

157-
if !shouldFetchAndApply {
158-
continue
159-
}
160-
161-
configText, configHash, err := fetchConfigFromController(ctx, dzclient, *localDevicePubkey, neighborIpMap, verbose, version, commit, date)
162-
if err != nil {
163-
log.Println("ERROR: fetchConfigFromController returned", err)
160+
if !shouldApply {
164161
continue
165162
}
166163

controlplane/agent/internal/agent/dzclient.go

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -35,28 +35,6 @@ func GetConfigFromServer(ctx context.Context, client pb.ControllerClient, localD
3535
return config, nil
3636
}
3737

38-
func GetConfigHashFromServer(ctx context.Context, client pb.ControllerClient, localDevicePubkey string, neighborIpMap map[string][]string, controllerTimeoutInSeconds *float64, agentVersion string, agentCommit string, agentDate string) (hash string, err error) {
39-
ctx, cancel := context.WithTimeout(ctx, time.Duration(*controllerTimeoutInSeconds*float64(time.Second)))
40-
defer cancel()
41-
42-
var bgpPeers []string
43-
bgpPeersByVrf := make(map[string]*pb.BgpPeers)
44-
for vrf, peers := range neighborIpMap {
45-
bgpPeersByVrf[vrf] = &pb.BgpPeers{Peers: peers}
46-
bgpPeers = append(bgpPeers, peers...)
47-
}
48-
slices.Sort(bgpPeers)
49-
50-
req := &pb.ConfigRequest{Pubkey: localDevicePubkey, BgpPeers: bgpPeers, BgpPeersByVrf: bgpPeersByVrf, AgentVersion: &agentVersion, AgentCommit: &agentCommit, AgentDate: &agentDate}
51-
resp, err := client.GetConfigHash(ctx, req)
52-
if err != nil {
53-
log.Printf("Error calling GetConfigHash: %v\n", err)
54-
return "", err
55-
}
56-
57-
return resp.GetHash(), nil
58-
}
59-
6038
func GetDzClient(controllerAddressAndPort string) (pb.ControllerClient, error) {
6139
conn, err := grpc.NewClient(controllerAddressAndPort, grpc.WithTransportCredentials(insecure.NewCredentials()))
6240
log.Printf("controllerAddressAndPort %s\n", controllerAddressAndPort)

controlplane/controller/README.md

Lines changed: 43 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -6,93 +6,57 @@ The controller generates device configurations from Solana smart contract state
66

77
### Agent-Controller Communication Flow
88

9-
The controller provides two gRPC endpoints, GetConfig and GetConfigHash, that the config agent (in ../agent/) uses to detect and apply configuration changes. The agent polls the controller every 5 seconds by default.
9+
The controller provides a gRPC endpoint (GetConfig) that returns both the configuration and its hash. The agent polls the controller every 5 seconds, but only applies the configuration to the EOS device when it has changed (based on hash comparison) or after a 60-second timeout.
1010

11-
The design includes two optimizations:
12-
1. Applying configuration to an Arista EOS device causes the EOS ConfigAgent process CPU to spike, so the agent only applies the config when the config generated by the controller is different than the last polling cycle
13-
2. To make success more likely on lossy networks, GetConfigHash returns only the hash (64 bytes) instead of the full config (~50KB+)
11+
The design includes an optimization to reduce EOS device CPU usage:
12+
- Applying configuration to an Arista EOS device causes the EOS ConfigAgent process CPU to spike
13+
- The agent computes a SHA256 hash of the received config and only applies it when:
14+
1. The hash differs from the last applied configuration, OR
15+
2. 60 seconds have elapsed since the last application (as a safety measure)
1416

15-
Here's how the agent uses the endpoints:
17+
Here's how the agent uses the endpoint:
1618

1719
```
18-
┌─────────┐ ┌────────────┐ ┌────────────┐ ┌─────────┐
19-
│ Agent │ │ Controller │ │ Controller │ │ EOS │
20-
│ main() │ │GetConfigHash │ Config │ │ Device │
21-
│ │ │ GetConfig()│ │ Generator │ │ │
22-
└────┬────┘ └─────┬──────┘ └─────┬──────┘ └────┬────┘
23-
│ │ │ │
24-
│ Every 5s: │ │ │
25-
│ │ │ │
26-
│ GetBgpNeighbors() │ │ │
27-
├─────────────────────────────────────────────────────────────────────────────────────────►│
28-
│◄─────────────────────────────────────────────────────────────────────────────────────────┤
29-
│ [peer IPs] │ │ │
30-
│ │ │ │
31-
│ Decision: should fetch? │ │ │
32-
│ • First run (no hash)? │ │ │
33-
│ • 1m since last apply? │ │ │
34-
│ • Hash changed? │ │ │
35-
│ │ │ │
36-
│ GetConfigHashFromServer() │ │ │
37-
├───────────────────────────►│ │ │
38-
│ │ processConfigRequest() │ │
39-
│ ├─────────────────────────────►│ │
40-
│ │ │ generateConfig() │
41-
│ │ │ • deduplicateTunnels() │
42-
│ │ │ • renderConfig() │
43-
│ │ │ SHA256(config) │
44-
│ │◄─────────────────────────────┤ │
45-
│ │ [hash only] │ │
46-
│◄───────────────────────────┤ │ │
47-
│ ConfigHashResponse │ │ │
48-
│ {hash: "abc123..."} │ │ │
49-
│ (64 bytes) │ │ │
50-
│ │ │ │
51-
│ Compare: hash != lastHash? │ │ │
52-
│ │ │ │
53-
├─── if YES (or first run or 1m timeout): │
54-
│ │ │ │
55-
│ fetchConfigFromController() │ │
56-
│ ├─► GetConfigFromServer() │ │
57-
│ │ ──────────────────► │ │ │
58-
│ │ │ processConfigRequest() │ │
59-
│ │ ├─────────────────────────────►│ │
60-
│ │ │ │ generateConfig() │
61-
│ │ │ │ • deduplicateTunnels() │
62-
│ │ │ │ • renderConfig() │
63-
│ │ │ │ (entire config text) │
64-
│ │ │◄─────────────────────────────┤ │
65-
│ │ ◄──────────────────│ [config string] │ │
66-
│ │ ConfigResponse │ │ │
67-
│ │ {config: "..."} │ │ │
68-
│ │ │ │ │
69-
│ ├─► computeChecksum(config) │ │
70-
│ │ [local SHA256] │ │ │
71-
│ │ │ │ │
72-
│ └─► return config+hash │ │ │
73-
│ │ │ │
74-
│ applyConfig() │ │ │
75-
│ └─► AddConfigToDevice(config) │ │
76-
│ ─────────────────────────────────────────────────────────────────────────────────►│
77-
│ ◄─────────────────────────────────────────────────────────────────────────────────┤
78-
│ [config applied] │ │ │
79-
│ │ │ │
80-
│ lastChecksum = hash │ │ │
81-
│ lastApplyTime = now │ │ │
82-
│ │ │ │
83-
├─── else: skip this cycle (hash unchanged, no work needed) | │
84-
│ │ │ │
85-
│ sleep(5s) │ │ │
86-
│ goto top │ │ │
87-
│ │ │ │
20+
┌─────────┐ ┌────────────┐ ┌────────────┐ ┌─────────┐
21+
│ Agent │ │ Controller │ │ Controller │ │ EOS │
22+
│ main() │ │ GetConfig()│ │ Config │ │ Device │
23+
│ │ │ (gRPC) │ │ Generator │ │ │
24+
└────┬────┘ └─────┬──────┘ └─────┬──────┘ └────┬────┘
25+
│ │ │ │
26+
│ Every 5s: │ │ │
27+
│ │ │ │
28+
│ GetBgpNeighbors() │ │ │
29+
├──────────────────────────────────────────────────────────────────────────────────────────►│
30+
│◄──────────────────────────────────────────────────────────────────────────────────────────┤
31+
│ [peer IPs] │ │ │
32+
│ │ │ │
33+
│ GetConfigFromServer() │ │ │
34+
├────────────────────────────►│ │ │
35+
│ │ processConfigRequest() │ │
36+
│ ├─────────────────────────────►│ │
37+
│ │ │ generateConfig() │
38+
│ │ │ • deduplicateTunnels() │
39+
│ │ │ • renderConfig() │
40+
│ │ │ (~50KB config text) │
41+
│ │ │ • compute SHA256 hash │
42+
│ │◄─────────────────────────────┤ │
43+
│ │ [config string + hash] │ │
44+
│◄────────────────────────────┤ │ │
45+
│ ConfigResponse │ │ │
46+
│ {config: "...", hash: "..."}│ │ │
47+
│ │ │ │
48+
│ Compare hash with cached │ │ │
49+
│ If changed OR 60s elapsed: │ │ │
50+
│ AddConfigToDevice(config) │ │ │
51+
├──────────────────────────────────────────────────────────────────────────────────────────►│
8852
```
8953

9054
**Key Benefits:**
91-
- **Network**: 64 bytes vs ~50KB on most cycles (99%+ reduction when config unchanged)
92-
- **CPU**: Config generation still happens on controller (for hash), but EOS device skips apply
93-
- **Safety**: Full config check every 60s as fallback
55+
- **CPU**: EOS device only processes config when it actually changes (or every 60s as a safety measure)
9456
- **Responsiveness**: Still checks for changes every 5 seconds
95-
- **Decision points**: First run, 60s timeout, or hash mismatch triggers full fetch
57+
- **Simplicity**: Single endpoint, agent handles caching logic
58+
- **Safety**: Full config application every 60s ensures eventual consistency
59+
- **Backward Compatibility**: Hash field maintained in response for older agents
9660

9761
## Configuration
9862

0 commit comments

Comments
 (0)