Skip to content

Commit 6c8d006

Browse files
committed
Respect latency thresholds in node switching
Add configurable latency thresholds to monitoring and use them to avoid insignificant latency-based node switches. Changes include: - config.yml: add latency_threshold and latency_threshold_percent. - config/mod.rs: add optional latency fields and is_latency_improvement_significant() helper. - monitoring/sync.rs: refactor selection logic (find_best_candidate, evaluate_switch), make select_best_node accept monitoring config and chain, and honor latency thresholds and in_sync rules when deciding switches; add many tests covering threshold behavior. - monitoring/request_health.rs: expose test helpers via testkit::sync::url import. - monitoring/mod.rs: make request_health module crate-visible. - monitoring/service.rs & monitoring/worker.rs: expose/adopt RequestAdaptiveMonitor (add adaptive_monitor accessor in NodeService, wire adaptive_monitor through NodeMonitor, mark adaptive monitor on successful switches via try_switch). - monitoring/telemetry.rs: normalize in_sync field serialization to "false" string and minor formatting. - monitoring/switch_reason.rs: derive PartialEq for test assertions. - testkit: add sync helpers (url, healthy_observation, not_in_sync_observation) and include them in mod; update test monitoring config defaults. Overall goal: reduce noisy or insignificant latency-driven switches and integrate adaptive monitoring updates when switches occur. Tests added/updated to validate behavior.
1 parent e1045b0 commit 6c8d006

13 files changed

Lines changed: 265 additions & 98 deletions

File tree

apps/dynode/config.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ monitoring:
1313
poll_interval: 10m
1414
max_sync_delay: 24s
1515
max_sync_blocks: 20
16+
latency_threshold: 250ms
17+
latency_threshold_percent: 20
1618
adaptive:
1719
enabled: true
1820
window: 30s

apps/dynode/src/config/mod.rs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,10 @@ pub struct NodeMonitoringConfig {
2828
#[serde(deserialize_with = "duration::deserialize")]
2929
pub max_sync_delay: Duration,
3030
pub max_sync_blocks: u64,
31+
#[serde(default, deserialize_with = "duration::deserialize_option")]
32+
pub latency_threshold: Option<Duration>,
33+
#[serde(default)]
34+
pub latency_threshold_percent: Option<f64>,
3135
pub adaptive: AdaptiveMonitoringConfig,
3236
}
3337

@@ -44,6 +48,24 @@ impl NodeMonitoringConfig {
4448
let computed = self.max_sync_delay.as_millis() as u64 / block_time_ms;
4549
computed.clamp(1, self.max_sync_blocks)
4650
}
51+
52+
pub fn is_latency_improvement_significant(&self, old: Duration, new: Duration) -> bool {
53+
if new >= old {
54+
return false;
55+
}
56+
let diff = old - new;
57+
if let Some(threshold) = self.latency_threshold
58+
&& diff < threshold
59+
{
60+
return false;
61+
}
62+
if let Some(percent) = self.latency_threshold_percent
63+
&& (diff.as_millis() as f64 / old.as_millis() as f64) * 100.0 < percent
64+
{
65+
return false;
66+
}
67+
true
68+
}
4769
}
4870

4971
#[derive(Debug, Deserialize, Clone)]

apps/dynode/src/main.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
154154
Arc::clone(&node_service.nodes),
155155
Arc::clone(&node_service.metrics),
156156
node_service.monitoring_config.clone(),
157+
node_service.adaptive_monitor(),
157158
);
158159

159160
rocket::tokio::spawn(async move {

apps/dynode/src/monitoring/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
mod chain_client;
2-
mod request_health;
2+
pub(crate) mod request_health;
33
mod service;
44
mod switch_reason;
55
mod sync;

apps/dynode/src/monitoring/request_health.rs

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,7 @@ impl RequestWindow {
263263
mod tests {
264264
use super::*;
265265
use crate::testkit::config as testkit;
266+
use crate::testkit::sync::url;
266267
use std::time::Duration;
267268

268269
fn config() -> AdaptiveMonitoringConfig {
@@ -274,13 +275,6 @@ mod tests {
274275
}
275276
}
276277

277-
fn url(value: &str) -> Url {
278-
Url {
279-
url: value.to_string(),
280-
headers: None,
281-
}
282-
}
283-
284278
#[tokio::test]
285279
async fn opens_host_when_error_threshold_is_reached() {
286280
let monitor = RequestAdaptiveMonitor::new(config());

apps/dynode/src/monitoring/service.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,10 @@ impl NodeService {
6363
}
6464
}
6565

66+
pub fn adaptive_monitor(&self) -> Arc<RequestAdaptiveMonitor> {
67+
Arc::clone(&self.request_adaptive_monitor)
68+
}
69+
6670
pub async fn get_node_domain(nodes: &Arc<RwLock<HashMap<Chain, NodeDomain>>>, chain: Chain) -> Option<NodeDomain> {
6771
nodes.read().await.get(&chain).cloned()
6872
}

apps/dynode/src/monitoring/switch_reason.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use std::fmt;
22

3-
#[derive(Debug, Clone)]
3+
#[derive(Debug, Clone, PartialEq)]
44
pub enum NodeSwitchReason {
55
BlockHeight { old_block: u64, new_block: u64 },
66
Latency { old_latency_ms: u64, new_latency_ms: u64 },
@@ -17,7 +17,6 @@ impl NodeSwitchReason {
1717
Self::AdaptiveError { .. } => "adaptive_error",
1818
}
1919
}
20-
2120
}
2221

2322
impl fmt::Display for NodeSwitchReason {

0 commit comments

Comments
 (0)