From 8728919440f9f993278f08de300594ea35f991ed Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Thu, 30 Apr 2026 14:53:13 -0700 Subject: [PATCH 01/33] feat(tonic-xds): add OutlierDetector sweep engine (gRFC A50) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement the core gRFC A50 outlier-detection algorithm: per-endpoint success/failure counters, the success-rate and failure-percentage ejection algorithms, the ejection-multiplier state machine, and a periodic sweep task that emits ejection/un-ejection decisions on a channel. `run_sweep` is pure (returns a Vec); the sweep loop spawned by `OutlierDetector::spawn` owns the channel sender and forwards decisions, so dropping the returned `AbortOnDrop` ends the loop and closes the receiver. Tests drive `run_sweep` directly without the channel or tokio time mechanics. Algorithm coverage matches the gRFC: - Success-rate ejection with configurable `stdev_factor`, `enforcing_success_rate`, `minimum_hosts`, `request_volume`. - Failure-percentage ejection with `threshold`, `enforcing_failure_percentage`, `minimum_hosts`, `request_volume`. - Ejection multiplier increments on each ejection, decays on healthy intervals; ejection duration is `base * multiplier` capped at `max(base, max_ejection_time)`. - `max_ejection_percent` caps total concurrent ejections. Probability rolls go through an injectable `Rng` trait (defaulting to `fastrand`) so tests can pin enforcement decisions. Standalone in this PR — no integration with the load balancer yet. That lands in a follow-up alongside the per-endpoint outcome interception layer. 
Refs: https://github.com/grpc/proposal/blob/master/A50-xds-outlier-detection.md --- tonic-xds/src/client/loadbalance/mod.rs | 1 + .../client/loadbalance/outlier_detection.rs | 856 ++++++++++++++++++ 2 files changed, 857 insertions(+) create mode 100644 tonic-xds/src/client/loadbalance/outlier_detection.rs diff --git a/tonic-xds/src/client/loadbalance/mod.rs b/tonic-xds/src/client/loadbalance/mod.rs index 66ccb1772..1c4ffa395 100644 --- a/tonic-xds/src/client/loadbalance/mod.rs +++ b/tonic-xds/src/client/loadbalance/mod.rs @@ -3,4 +3,5 @@ pub(crate) mod channel_state; pub(crate) mod errors; pub(crate) mod keyed_futures; pub(crate) mod loadbalancer; +pub(crate) mod outlier_detection; pub(crate) mod pickers; diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs new file mode 100644 index 000000000..93e63ed46 --- /dev/null +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -0,0 +1,856 @@ +//! gRFC A50 outlier-detection sweep engine. +//! +//! Owns per-endpoint counters and an ejection state machine. Periodically +//! reads the counters, runs the success-rate and failure-percentage +//! ejection algorithms, and emits [`EjectionDecision`]s. Knows nothing +//! about the data path: callers feed it RPC outcomes via the lock-free +//! [`EndpointCounters`] handle returned by [`OutlierDetector::add_endpoint`], +//! and consume decisions from a channel returned by [`OutlierDetector::spawn`]. +//! +//! 
[gRFC A50]: https://github.com/grpc/proposal/blob/master/A50-xds-outlier-detection.md + +use std::collections::HashMap; +use std::sync::Arc; +use std::sync::Mutex; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::time::Instant; + +use tokio::sync::mpsc; + +use crate::client::endpoint::EndpointAddress; +use crate::common::async_util::AbortOnDrop; +use crate::xds::resource::outlier_detection::{ + FailurePercentageConfig, OutlierDetectionConfig, SuccessRateConfig, +}; + +/// Lock-free per-endpoint success/failure counter handle. +/// +/// Cloned freely. Callers (typically a request-outcome interceptor) +/// invoke [`record_success`] / [`record_failure`] from the data path. +/// The detector reads and resets the counters during each sweep. +/// +/// [`record_success`]: EndpointCounters::record_success +/// [`record_failure`]: EndpointCounters::record_failure +#[derive(Debug, Default)] +pub(crate) struct EndpointCounters { + success: AtomicU64, + failure: AtomicU64, +} + +impl EndpointCounters { + pub(crate) fn record_success(&self) { + self.success.fetch_add(1, Ordering::Relaxed); + } + + pub(crate) fn record_failure(&self) { + self.failure.fetch_add(1, Ordering::Relaxed); + } + + /// Atomically read and zero both counters. Returns `(success, failure)`. + fn snapshot_and_reset(&self) -> (u64, u64) { + let s = self.success.swap(0, Ordering::Relaxed); + let f = self.failure.swap(0, Ordering::Relaxed); + (s, f) + } +} + +/// A decision emitted by an [`OutlierDetector`] sweep. +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) enum EjectionDecision { + /// Eject this endpoint from the load-balancing pool. The caller + /// should keep its underlying connection alive (A50 requires + /// preserving connections across ejection). + Eject(EndpointAddress), + /// Restore a previously-ejected endpoint to the pool. + Uneject(EndpointAddress), +} + +/// Probability source for `enforcing_*` rolls. Abstracted so tests can +/// inject deterministic outcomes. 
+pub(crate) trait Rng: Send + Sync + 'static { + /// Return a uniform random `u32` in `0..100`. + fn pct_roll(&self) -> u32; +} + +/// Default RNG backed by `fastrand` (already a workspace dep). +struct FastRandRng; + +impl Rng for FastRandRng { + fn pct_roll(&self) -> u32 { + fastrand::u32(0..100) + } +} + +/// Per-endpoint state held inside the detector. +struct EndpointState { + counters: Arc, + /// Number of times this endpoint has been ejected. Grows on each + /// re-ejection and decays on each healthy interval. + ejection_multiplier: u32, + /// `Some(at)` when currently ejected; `None` otherwise. + ejected_at: Option, +} + +impl EndpointState { + fn new() -> Self { + Self { + counters: Arc::new(EndpointCounters::default()), + ejection_multiplier: 0, + ejected_at: None, + } + } +} + +/// gRFC A50 outlier detector. +/// +/// `run_sweep` is pure — it returns a list of [`EjectionDecision`]s +/// rather than sending them. The sweep loop spawned by [`spawn`] owns +/// the channel sender and forwards decisions to the receiver, so +/// dropping the [`AbortOnDrop`] handle ends the loop and closes the +/// receiver. `OutlierDetector` itself holds no I/O resources, which +/// makes algorithm-level tests trivial to write. +/// +/// [`spawn`]: OutlierDetector::spawn +pub(crate) struct OutlierDetector { + config: OutlierDetectionConfig, + state: Mutex>, + rng: Box, +} + +impl OutlierDetector { + /// Build the detector and spawn its sweep task on the current Tokio + /// runtime. The sweep runs every `config.interval` until the returned + /// [`AbortOnDrop`] is dropped. + pub(crate) fn spawn( + config: OutlierDetectionConfig, + ) -> ( + Arc, + mpsc::UnboundedReceiver, + AbortOnDrop, + ) { + Self::spawn_with_rng(config, Box::new(FastRandRng)) + } + + /// Variant of [`spawn`] that accepts an injected [`Rng`]. 
+ pub(crate) fn spawn_with_rng( + config: OutlierDetectionConfig, + rng: Box, + ) -> ( + Arc, + mpsc::UnboundedReceiver, + AbortOnDrop, + ) { + let (tx, rx) = mpsc::unbounded_channel(); + let detector = Arc::new(Self { + config, + state: Mutex::new(HashMap::new()), + rng, + }); + let task = tokio::spawn(sweep_loop(detector.clone(), tx)); + (detector, rx, AbortOnDrop(task)) + } + + /// Register an endpoint and return its lock-free counter handle. + /// The caller wires this handle into the data-path RPC interceptor so + /// that completed calls increment success/failure atomics. + /// + /// Adding an already-registered address is a no-op and returns the + /// existing handle (so callers can re-add idempotently). + pub(crate) fn add_endpoint(&self, addr: EndpointAddress) -> Arc { + let mut state = self.state.lock().expect("outlier_detector mutex poisoned"); + state + .entry(addr) + .or_insert_with(EndpointState::new) + .counters + .clone() + } + + /// Forget a previously-registered endpoint. Drops its counters and + /// any ejection state. If the endpoint was ejected, no `Uneject` + /// decision is emitted — the caller is expected to handle the removal + /// directly (e.g., by dropping its slot in the load balancer). + pub(crate) fn remove_endpoint(&self, addr: &EndpointAddress) { + let mut state = self.state.lock().expect("outlier_detector mutex poisoned"); + state.remove(addr); + } + + /// Run a single sweep at logical time `now` and return the resulting + /// ejection/un-ejection decisions. Pure — does no I/O. The sweep loop + /// invokes this on each interval tick and forwards the decisions on + /// the channel; tests call it directly. + pub(crate) fn run_sweep(&self, now: Instant) -> Vec { + let mut state = self.state.lock().expect("outlier_detector mutex poisoned"); + + // Snapshot per-endpoint stats and update ejection-time multiplier + // bookkeeping. 
A50: for each endpoint that received traffic and is + // not currently ejected, decrement the multiplier toward zero. + let mut snapshots: Vec<(EndpointAddress, u64, u64)> = Vec::with_capacity(state.len()); + for (addr, ep) in state.iter_mut() { + let (success, failure) = ep.counters.snapshot_and_reset(); + let total = success + failure; + if ep.ejected_at.is_none() && total > 0 { + ep.ejection_multiplier = ep.ejection_multiplier.saturating_sub(1); + } + snapshots.push((addr.clone(), success, failure)); + } + + // Un-eject endpoints whose backoff has elapsed. A50: + // actual_duration = min(base * multiplier, max(base, max_ejection_time)) + let cap = self + .config + .base_ejection_time + .max(self.config.max_ejection_time); + let mut to_uneject: Vec = Vec::new(); + for (addr, ep) in state.iter_mut() { + if let Some(at) = ep.ejected_at + && let Some(scaled) = self + .config + .base_ejection_time + .checked_mul(ep.ejection_multiplier) + && now.duration_since(at) >= scaled.min(cap) + { + ep.ejected_at = None; + to_uneject.push(addr.clone()); + } + } + + // Build candidate list (non-ejected endpoints) once for both + // algorithms. A50 wants both algorithms to share the snapshot. + // Note: we only build the rate slice; per-algorithm filters + // (request_volume, minimum_hosts) are applied below. + let candidates: Vec = snapshots + .iter() + .filter_map(|(addr, success, failure)| { + let total = success + failure; + let ep = state.get(addr)?; + if ep.ejected_at.is_some() { + return None; + } + Some(Candidate { + addr: addr.clone(), + success: *success, + failure: *failure, + total, + }) + }) + .collect(); + + // Compute the cap on currently-ejected endpoints. A50: + // if ejected_count >= max_ejection_percent of total, stop ejecting. + // We compute the cap once and decrement the available budget as + // each algorithm ejects. 
+ let total_endpoints = state.len(); + let max_ejections = (total_endpoints as u64 + * u64::from(self.config.max_ejection_percent.get()) + / 100) as usize; + let already_ejected = state.values().filter(|ep| ep.ejected_at.is_some()).count(); + let mut budget = max_ejections.saturating_sub(already_ejected); + + let mut to_eject: Vec = Vec::new(); + + if let Some(sr) = self.config.success_rate.as_ref() { + self.run_success_rate(sr, &candidates, &mut budget, &mut to_eject); + } + if let Some(fp) = self.config.failure_percentage.as_ref() { + self.run_failure_percentage(fp, &candidates, &mut budget, &mut to_eject); + } + + for addr in &to_eject { + if let Some(ep) = state.get_mut(addr) { + ep.ejected_at = Some(now); + ep.ejection_multiplier = ep.ejection_multiplier.saturating_add(1); + } + } + + drop(state); + + let mut decisions = Vec::with_capacity(to_uneject.len() + to_eject.len()); + for addr in to_uneject { + decisions.push(EjectionDecision::Uneject(addr)); + } + for addr in to_eject { + decisions.push(EjectionDecision::Eject(addr)); + } + decisions + } + + /// A50 success-rate algorithm. + fn run_success_rate( + &self, + cfg: &SuccessRateConfig, + all: &[Candidate], + budget: &mut usize, + out: &mut Vec, + ) { + // Filter to candidates with enough traffic. + let qualifying: Vec<&Candidate> = all + .iter() + .filter(|c| c.total >= u64::from(cfg.request_volume)) + .collect(); + if qualifying.len() < cfg.minimum_hosts as usize { + return; + } + + // success_rate = success / total (in [0.0, 1.0]). 
+ let rates: Vec = qualifying + .iter() + .map(|c| c.success as f64 / c.total as f64) + .collect(); + let n = rates.len() as f64; + let mean = rates.iter().sum::() / n; + let variance = rates.iter().map(|r| (r - mean).powi(2)).sum::() / n; + let stdev = variance.sqrt(); + + // threshold = mean - stdev * (stdev_factor / 1000) + let factor = f64::from(cfg.stdev_factor) / 1000.0; + let threshold = mean - stdev * factor; + + for (c, rate) in qualifying.iter().zip(rates.iter()) { + if *budget == 0 { + break; + } + if *rate < threshold && self.roll(cfg.enforcing_success_rate.get()) { + out.push(c.addr.clone()); + *budget -= 1; + } + } + } + + /// A50 failure-percentage algorithm. + fn run_failure_percentage( + &self, + cfg: &FailurePercentageConfig, + all: &[Candidate], + budget: &mut usize, + out: &mut Vec, + ) { + let qualifying: Vec<&Candidate> = all + .iter() + .filter(|c| c.total >= u64::from(cfg.request_volume)) + .filter(|c| !out.contains(&c.addr)) // skip endpoints already ejected this sweep + .collect(); + if qualifying.len() < cfg.minimum_hosts as usize { + return; + } + + let threshold = u64::from(cfg.threshold.get()); + for c in qualifying { + if *budget == 0 { + break; + } + // failure_pct = 100 * failure / total + let failure_pct = 100 * c.failure / c.total; + if failure_pct >= threshold && self.roll(cfg.enforcing_failure_percentage.get()) { + out.push(c.addr.clone()); + *budget -= 1; + } + } + } + + /// Return true with probability `pct / 100` (clamped at 100 ⇒ always). + fn roll(&self, pct: u8) -> bool { + if pct >= 100 { + return true; + } + if pct == 0 { + return false; + } + self.rng.pct_roll() < u32::from(pct) + } +} + +/// Cached per-endpoint snapshot used during a sweep. +struct Candidate { + addr: EndpointAddress, + success: u64, + failure: u64, + total: u64, +} + +/// Background task: runs `detector.run_sweep` on each interval tick and +/// forwards each decision on the channel. 
The task ends (and `tx` is +/// dropped, closing the receiver) when [`AbortOnDrop`] is dropped or +/// when the receiver itself is dropped. +async fn sweep_loop(detector: Arc, tx: mpsc::UnboundedSender) { + let mut ticker = tokio::time::interval(detector.config.interval); + // Skip missed ticks rather than burst-catching up — the goal is + // periodic observation, not making up for paused time. + ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + // The first tick fires immediately; consume it so the first real + // sweep is `interval` after spawn (matches A50 semantics). + ticker.tick().await; + + loop { + ticker.tick().await; + for decision in detector.run_sweep(Instant::now()) { + if tx.send(decision).is_err() { + // Receiver gone — nobody is listening. + return; + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::xds::resource::outlier_detection::Percentage; + use std::sync::atomic::AtomicU32; + use std::time::Duration; + + // ----- Fixtures ----- + + fn addr(port: u16) -> EndpointAddress { + EndpointAddress::new("10.0.0.1", port) + } + + fn pct(v: u32) -> Percentage { + Percentage::new(v).unwrap() + } + + /// Base config with both algorithms disabled; tests opt in. + fn base_config() -> OutlierDetectionConfig { + OutlierDetectionConfig { + interval: Duration::from_secs(1), + base_ejection_time: Duration::from_secs(30), + max_ejection_time: Duration::from_secs(300), + max_ejection_percent: pct(100), + success_rate: None, + failure_percentage: None, + } + } + + /// Deterministic RNG: `pct_roll()` returns a fixed value, configurable. + struct FixedRng(AtomicU32); + + impl FixedRng { + fn new(value: u32) -> Self { + Self(AtomicU32::new(value)) + } + fn boxed(value: u32) -> Box { + Box::new(Self::new(value)) + } + } + + impl Rng for FixedRng { + fn pct_roll(&self) -> u32 { + self.0.load(Ordering::Relaxed) + } + } + + /// Build a detector with no sweep loop running. 
Tests drive + /// `run_sweep` directly and inspect the returned decisions. + fn detector_no_loop(config: OutlierDetectionConfig, rng: Box) -> Arc { + Arc::new(OutlierDetector { + config, + state: Mutex::new(HashMap::new()), + rng, + }) + } + + /// Sort a decision list deterministically so equality checks can rely + /// on a canonical order without coupling to `HashMap` iteration order. + fn sort(mut ds: Vec) -> Vec { + ds.sort_by(|a, b| format!("{a:?}").cmp(&format!("{b:?}"))); + ds + } + + // ----- EndpointCounters ----- + + #[test] + fn counters_record_and_reset() { + let c = EndpointCounters::default(); + c.record_success(); + c.record_success(); + c.record_failure(); + assert_eq!(c.snapshot_and_reset(), (2, 1)); + assert_eq!(c.snapshot_and_reset(), (0, 0)); + } + + // ----- add_endpoint / remove_endpoint ----- + + #[test] + fn add_endpoint_returns_shared_counter() { + let detector = detector_no_loop(base_config(), FixedRng::boxed(99)); + let h1 = detector.add_endpoint(addr(8080)); + let h2 = detector.add_endpoint(addr(8080)); + assert!( + Arc::ptr_eq(&h1, &h2), + "second add should return same handle" + ); + h1.record_success(); + assert_eq!(h2.snapshot_and_reset(), (1, 0)); + } + + #[test] + fn remove_endpoint_drops_state() { + let detector = detector_no_loop(base_config(), FixedRng::boxed(99)); + detector.add_endpoint(addr(8080)); + detector.remove_endpoint(&addr(8080)); + assert!(detector.state.lock().unwrap().is_empty()); + } + + // ----- Failure-percentage algorithm ----- + + fn fp_config( + threshold: u32, + request_volume: u32, + minimum_hosts: u32, + ) -> OutlierDetectionConfig { + let mut c = base_config(); + c.failure_percentage = Some(FailurePercentageConfig { + threshold: pct(threshold), + enforcing_failure_percentage: pct(100), + minimum_hosts, + request_volume, + }); + c + } + + #[test] + fn failure_percentage_ejects_above_threshold() { + let detector = detector_no_loop(fp_config(50, 10, 3), FixedRng::boxed(99)); + // 4 healthy endpoints + 1 bad 
one. + for port in 8080..=8083 { + let h = detector.add_endpoint(addr(port)); + for _ in 0..100 { + h.record_success(); + } + } + let bad = detector.add_endpoint(addr(8084)); + for _ in 0..90 { + bad.record_failure(); + } + for _ in 0..10 { + bad.record_success(); + } + + let decisions = detector.run_sweep(Instant::now()); + assert_eq!(decisions, vec![EjectionDecision::Eject(addr(8084))]); + } + + #[test] + fn failure_percentage_skips_below_threshold() { + let detector = detector_no_loop(fp_config(50, 10, 3), FixedRng::boxed(99)); + for port in 8080..=8084 { + let h = detector.add_endpoint(addr(port)); + // 30% failure → below threshold of 50%. + for _ in 0..70 { + h.record_success(); + } + for _ in 0..30 { + h.record_failure(); + } + } + assert!(detector.run_sweep(Instant::now()).is_empty()); + } + + #[test] + fn minimum_hosts_gates_failure_percentage() { + let detector = detector_no_loop(fp_config(50, 10, 5), FixedRng::boxed(99)); + // Only 2 hosts have request_volume ≥ 10; minimum_hosts is 5 ⇒ skip. + for port in 8080..=8081 { + let h = detector.add_endpoint(addr(port)); + for _ in 0..100 { + h.record_failure(); + } + } + assert!(detector.run_sweep(Instant::now()).is_empty()); + } + + #[test] + fn request_volume_filters_low_traffic_endpoints() { + let detector = detector_no_loop(fp_config(50, 100, 3), FixedRng::boxed(99)); + // Bad endpoint, but only 5 requests — below request_volume=100. 
+ let bad = detector.add_endpoint(addr(8080)); + for _ in 0..5 { + bad.record_failure(); + } + for port in 8081..=8084 { + let h = detector.add_endpoint(addr(port)); + for _ in 0..200 { + h.record_success(); + } + } + assert!(detector.run_sweep(Instant::now()).is_empty()); + } + + #[test] + fn enforcement_zero_percent_never_ejects() { + let mut config = fp_config(50, 10, 3); + config + .failure_percentage + .as_mut() + .unwrap() + .enforcing_failure_percentage = pct(0); + // Roll = 0 wouldn't trigger anyway since `roll(0)` short-circuits; + // pin the RNG to 0 just to be explicit. + let detector = detector_no_loop(config, FixedRng::boxed(0)); + for port in 8080..=8084 { + let h = detector.add_endpoint(addr(port)); + for _ in 0..100 { + h.record_failure(); + } + } + assert!(detector.run_sweep(Instant::now()).is_empty()); + } + + // ----- Success-rate algorithm ----- + + fn sr_config( + stdev_factor: u32, + request_volume: u32, + minimum_hosts: u32, + ) -> OutlierDetectionConfig { + let mut c = base_config(); + c.success_rate = Some(SuccessRateConfig { + stdev_factor, + enforcing_success_rate: pct(100), + minimum_hosts, + request_volume, + }); + c + } + + #[test] + fn success_rate_ejects_outlier_below_threshold() { + let detector = detector_no_loop(sr_config(1900, 10, 5), FixedRng::boxed(99)); + // 4 endpoints at 99% success, 1 at 50% — outlier. 
+ for port in 8080..=8083 { + let h = detector.add_endpoint(addr(port)); + for _ in 0..99 { + h.record_success(); + } + h.record_failure(); + } + let bad = detector.add_endpoint(addr(8084)); + for _ in 0..50 { + bad.record_success(); + } + for _ in 0..50 { + bad.record_failure(); + } + assert_eq!( + detector.run_sweep(Instant::now()), + vec![EjectionDecision::Eject(addr(8084))], + ); + } + + #[test] + fn success_rate_no_ejection_when_all_uniform() { + let detector = detector_no_loop(sr_config(1900, 10, 5), FixedRng::boxed(99)); + for port in 8080..=8084 { + let h = detector.add_endpoint(addr(port)); + for _ in 0..95 { + h.record_success(); + } + for _ in 0..5 { + h.record_failure(); + } + } + assert!(detector.run_sweep(Instant::now()).is_empty()); + } + + // ----- Ejection multiplier / un-ejection ----- + + #[test] + fn unejects_after_base_time() { + let mut config = fp_config(50, 10, 3); + config.base_ejection_time = Duration::from_secs(10); + config.max_ejection_time = Duration::from_secs(60); + let detector = detector_no_loop(config, FixedRng::boxed(99)); + + for port in 8080..=8084 { + let h = detector.add_endpoint(addr(port)); + if port == 8084 { + for _ in 0..100 { + h.record_failure(); + } + } else { + for _ in 0..100 { + h.record_success(); + } + } + } + + let t0 = Instant::now(); + assert_eq!( + detector.run_sweep(t0), + vec![EjectionDecision::Eject(addr(8084))], + ); + + // Still ejected just before base_ejection_time elapses. + assert!(detector.run_sweep(t0 + Duration::from_secs(9)).is_empty()); + + // Un-eject after `base * multiplier(=1)` = 10s. 
+ assert_eq!( + detector.run_sweep(t0 + Duration::from_secs(10)), + vec![EjectionDecision::Uneject(addr(8084))], + ); + } + + #[test] + fn re_ejection_doubles_duration() { + // The multiplier doubles only when un-ejection and re-ejection + // happen in the *same* sweep — at that point the multiplier- + // decrement step has skipped the (still-ejected-at-start) + // endpoint, so re-ejection increments it from 1 to 2. + let mut config = fp_config(50, 10, 3); + config.base_ejection_time = Duration::from_secs(10); + config.max_ejection_time = Duration::from_secs(60); + let detector = detector_no_loop(config, FixedRng::boxed(99)); + + let bad = addr(8084); + let bad_h = detector.add_endpoint(bad.clone()); + for port in 8080..=8083 { + let h = detector.add_endpoint(addr(port)); + for _ in 0..100 { + h.record_success(); + } + } + for _ in 0..100 { + bad_h.record_failure(); + } + + // Sweep 1: eject. Multiplier 0 → 1. + let t0 = Instant::now(); + assert_eq!( + detector.run_sweep(t0), + vec![EjectionDecision::Eject(bad.clone())], + ); + + // Re-record stats so sweep 2's snapshot has volume to evaluate. + for port in 8080..=8083 { + let h = detector.add_endpoint(addr(port)); + for _ in 0..100 { + h.record_success(); + } + } + for _ in 0..100 { + bad_h.record_failure(); + } + + // Sweep 2 at t0+10: same-sweep un-eject + re-eject. + // Multiplier stays 1 through un-eject, then 1 → 2 on re-eject. + assert_eq!( + detector.run_sweep(t0 + Duration::from_secs(10)), + vec![ + EjectionDecision::Uneject(bad.clone()), + EjectionDecision::Eject(bad.clone()), + ], + ); + + // Re-ejection started at t0+10 with multiplier=2 → duration 20s. + // Still ejected 19s later (29s after t0). + assert!(detector.run_sweep(t0 + Duration::from_secs(29)).is_empty()); + + // Un-ejects at the 20s mark (30s after t0). 
+ assert_eq!( + detector.run_sweep(t0 + Duration::from_secs(30)), + vec![EjectionDecision::Uneject(bad)], + ); + } + + #[test] + fn ejection_capped_by_max_ejection_time() { + // base=10s, max=15s, multiplier=10 → cap at 15s rather than 100s. + let mut config = fp_config(50, 10, 3); + config.base_ejection_time = Duration::from_secs(10); + config.max_ejection_time = Duration::from_secs(15); + let detector = detector_no_loop(config, FixedRng::boxed(99)); + + for port in 8080..=8084 { + detector.add_endpoint(addr(port)); + } + let t0 = Instant::now(); + // Force multiplier=10 directly. + { + let mut state = detector.state.lock().unwrap(); + let ep = state.get_mut(&addr(8084)).unwrap(); + ep.ejection_multiplier = 10; + ep.ejected_at = Some(t0); + } + // After base*multiplier (= 100s) the cap (= 15s) has long passed, + // so a sweep at 16s should un-eject. + let decisions = detector.run_sweep(t0 + Duration::from_secs(16)); + assert_eq!(decisions, vec![EjectionDecision::Uneject(addr(8084))]); + } + + #[test] + fn max_ejection_percent_caps_concurrent_ejections() { + // 5 hosts, all bad, but max_ejection_percent=20 ⇒ at most 1 ejected. + let mut config = fp_config(50, 10, 3); + config.max_ejection_percent = pct(20); + let detector = detector_no_loop(config, FixedRng::boxed(99)); + + for port in 8080..=8084 { + let h = detector.add_endpoint(addr(port)); + for _ in 0..100 { + h.record_failure(); + } + } + let decisions = sort(detector.run_sweep(Instant::now())); + let ejects = decisions + .iter() + .filter(|d| matches!(d, EjectionDecision::Eject(_))) + .count(); + assert_eq!(ejects, 1, "max_ejection_percent=20% of 5 hosts ⇒ 1"); + } + + #[test] + fn multiplier_decrements_on_healthy_interval() { + let detector = detector_no_loop(base_config(), FixedRng::boxed(99)); + let h = detector.add_endpoint(addr(8080)); + // Force multiplier to 3 without ejecting. 
+ { + let mut state = detector.state.lock().unwrap(); + state.get_mut(&addr(8080)).unwrap().ejection_multiplier = 3; + } + // Healthy interval (some traffic, no ejection). + h.record_success(); + detector.run_sweep(Instant::now()); + let state = detector.state.lock().unwrap(); + assert_eq!(state.get(&addr(8080)).unwrap().ejection_multiplier, 2); + } + + // ----- Sweep loop ----- + + #[tokio::test(start_paused = true)] + async fn sweep_loop_emits_decisions_on_tick() { + let mut config = fp_config(50, 10, 3); + config.interval = Duration::from_millis(100); + let (detector, mut rx, _abort) = + OutlierDetector::spawn_with_rng(config, FixedRng::boxed(99)); + + for port in 8080..=8083 { + let h = detector.add_endpoint(addr(port)); + for _ in 0..100 { + h.record_success(); + } + } + let bad = detector.add_endpoint(addr(8084)); + for _ in 0..100 { + bad.record_failure(); + } + + // Advance just past the first sweep tick. + tokio::time::sleep(Duration::from_millis(150)).await; + + let decision = rx.recv().await.expect("sweep should emit a decision"); + assert_eq!(decision, EjectionDecision::Eject(addr(8084))); + } + + #[tokio::test(start_paused = true)] + async fn dropping_abort_stops_sweep_loop() { + let mut config = base_config(); + config.interval = Duration::from_millis(50); + let (_detector, mut rx, abort) = OutlierDetector::spawn(config); + + // Drop the AbortOnDrop; the loop must terminate. + drop(abort); + tokio::time::sleep(Duration::from_millis(200)).await; + + // Sender should be dropped along with the task; recv returns None. 
+ assert!(rx.recv().await.is_none()); + } +} From b03fb6982d78ec16bfbfb07e634b6c6b510e90db Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Thu, 30 Apr 2026 14:58:19 -0700 Subject: [PATCH 02/33] docs(tonic-xds): clarify outlier-detection config docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address two follow-up review comments from #2604 (the merged config PR) by folding the doc updates into this PR: - Module docstring: describe the actual integration plan (an mpsc channel of EjectionDecisions polled by LoadBalancer, leveraging EjectedChannel) instead of the original "filter on the Discover stream" wording. Add intra-doc links to the relevant types. - enforcing_success_rate / enforcing_failure_percentage: clarify that each is the *enforcement probability* — distinct from the per-algorithm threshold (stdev_factor for success-rate, threshold for failure-percentage). Note that 0 disables enforcement while still computing statistics. Also fix an unresolved intra-doc link in the algorithm module. --- .../client/loadbalance/outlier_detection.rs | 2 +- .../src/xds/resource/outlier_detection.rs | 21 +++++++++++++------ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 93e63ed46..5e53883c0 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -131,7 +131,7 @@ impl OutlierDetector { Self::spawn_with_rng(config, Box::new(FastRandRng)) } - /// Variant of [`spawn`] that accepts an injected [`Rng`]. + /// Variant of [`Self::spawn`] that accepts an injected [`Rng`]. 
pub(crate) fn spawn_with_rng( config: OutlierDetectionConfig, rng: Box, diff --git a/tonic-xds/src/xds/resource/outlier_detection.rs b/tonic-xds/src/xds/resource/outlier_detection.rs index a31fd6c60..159ff7735 100644 --- a/tonic-xds/src/xds/resource/outlier_detection.rs +++ b/tonic-xds/src/xds/resource/outlier_detection.rs @@ -4,12 +4,17 @@ //! algorithm. The two sub-configs gate which ejection algorithms run. //! //! Note: A50 specifies outlier detection as a load-balancing policy -//! wrapping a `child_policy`. `tonic-xds` currently runs P2C as its only -//! load balancer and integrates outlier detection as a filter on the -//! `Discover` stream feeding it, so there is no `child_policy` field -//! here yet. It will be added when more balancers are supported. +//! wrapping a `child_policy`. `tonic-xds` currently runs P2C as its +//! only load balancer, so there is no `child_policy` field here yet — +//! it will be added when more balancers are supported. Integration +//! with the data path is via an mpsc channel of ejection decisions +//! polled by the [`LoadBalancer`] tower service, which marks the +//! corresponding [`ReadyChannel`] as ejected via [`EjectedChannel`]. //! //! [gRFC A50]: https://github.com/grpc/proposal/blob/master/A50-xds-outlier-detection.md +//! [`LoadBalancer`]: crate::client::loadbalance::loadbalancer::LoadBalancer +//! [`ReadyChannel`]: crate::client::loadbalance::channel_state::ReadyChannel +//! [`EjectedChannel`]: crate::client::loadbalance::channel_state::EjectedChannel use std::time::Duration; @@ -68,7 +73,9 @@ pub(crate) struct SuccessRateConfig { /// An endpoint is a candidate for ejection when its success rate falls /// below `mean - stdev * (stdev_factor / 1000.0)`. pub stdev_factor: u32, - /// Probability that a candidate is actually ejected. + /// Probability that a flagged candidate is actually ejected — *not* + /// the success-rate threshold (which is derived from `stdev_factor`). 
+ /// Set to 0 to disable enforcement while still computing statistics. pub enforcing_success_rate: Percentage, /// Minimum number of candidate endpoints required to run the algorithm. pub minimum_hosts: u32, @@ -83,7 +90,9 @@ pub(crate) struct FailurePercentageConfig { /// Failure rate at or above which an endpoint is a candidate for /// ejection. pub threshold: Percentage, - /// Probability that a candidate is actually ejected. + /// Probability that a flagged candidate is actually ejected — *not* + /// the failure-rate threshold (that is `threshold` above). Set to 0 + /// to disable enforcement while still computing statistics. pub enforcing_failure_percentage: Percentage, /// Minimum number of candidate endpoints required to run the algorithm. pub minimum_hosts: u32, From 1da4063e55c4ab397addd483299dccd4f0e97880 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Thu, 30 Apr 2026 15:11:19 -0700 Subject: [PATCH 03/33] fix(tonic-xds): align outlier-detection algorithm with gRFC A50 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three spec-compliance fixes to `run_sweep` and the failure-percentage algorithm: 1. Reorder the sweep to match A50 step order: snapshot counters → run success-rate algorithm → run failure-percentage algorithm → step-5 housekeeping (decrement non-ejected multipliers, un-eject elapsed ejections). The previous order (un-eject before algorithms) caused spurious `Uneject` decisions whenever the same sweep also re-ejected the address. Per spec, re-ejection refreshes `ejected_at` to `now` before the un-eject check runs, so no transient un-eject is emitted. 2. Drop the `total > 0` traffic gate from the multiplier-decrement step. A50 says a non-ejected address with multiplier > 0 has its multiplier decremented every sweep, regardless of whether it received traffic that interval. 3. Failure-percentage now uses strict `>` against the threshold (was `>=`). 
Per A50: "If the address's failure percentage is greater than `failure_percentage_ejection.threshold`..." — an address sitting exactly at the threshold is not ejected. Also: drop the explicit "skip ejected hosts from candidate list" pre- filter. Per spec the algorithms iterate every address; ejected hosts naturally fail the `request_volume` gate since they receive no traffic in production. Behavior on real workloads is unchanged. Test changes: - `re_ejection_doubles_duration` now asserts a single `Eject` decision (no transient `Uneject`) under the corrected sweep order. - New `failure_percentage_at_threshold_does_not_eject` covers the strict-`>` boundary. - New `multiplier_decrements_even_without_traffic` covers the no-traffic-gate fix. --- .../client/loadbalance/outlier_detection.rs | 167 +++++++++++------- 1 file changed, 101 insertions(+), 66 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 5e53883c0..04ba1f734 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -178,67 +178,34 @@ impl OutlierDetector { /// ejection/un-ejection decisions. Pure — does no I/O. The sweep loop /// invokes this on each interval tick and forwards the decisions on /// the channel; tests call it directly. + /// + /// The order of operations follows gRFC A50: + /// 1. Record the timestamp. + /// 2. Swap each address's call-counter buckets. + /// 3. Run the success-rate algorithm if configured. + /// 4. Run the failure-percentage algorithm if configured. + /// 5. For each address: decrement the multiplier of non-ejected + /// addresses with multiplier > 0, and un-eject ejected addresses + /// whose backoff has elapsed. 
pub(crate) fn run_sweep(&self, now: Instant) -> Vec { let mut state = self.state.lock().expect("outlier_detector mutex poisoned"); - // Snapshot per-endpoint stats and update ejection-time multiplier - // bookkeeping. A50: for each endpoint that received traffic and is - // not currently ejected, decrement the multiplier toward zero. - let mut snapshots: Vec<(EndpointAddress, u64, u64)> = Vec::with_capacity(state.len()); + // Step 2: snapshot every endpoint's counters. + let mut snapshots: Vec = Vec::with_capacity(state.len()); for (addr, ep) in state.iter_mut() { let (success, failure) = ep.counters.snapshot_and_reset(); - let total = success + failure; - if ep.ejected_at.is_none() && total > 0 { - ep.ejection_multiplier = ep.ejection_multiplier.saturating_sub(1); - } - snapshots.push((addr.clone(), success, failure)); - } - - // Un-eject endpoints whose backoff has elapsed. A50: - // actual_duration = min(base * multiplier, max(base, max_ejection_time)) - let cap = self - .config - .base_ejection_time - .max(self.config.max_ejection_time); - let mut to_uneject: Vec = Vec::new(); - for (addr, ep) in state.iter_mut() { - if let Some(at) = ep.ejected_at - && let Some(scaled) = self - .config - .base_ejection_time - .checked_mul(ep.ejection_multiplier) - && now.duration_since(at) >= scaled.min(cap) - { - ep.ejected_at = None; - to_uneject.push(addr.clone()); - } - } - - // Build candidate list (non-ejected endpoints) once for both - // algorithms. A50 wants both algorithms to share the snapshot. - // Note: we only build the rate slice; per-algorithm filters - // (request_volume, minimum_hosts) are applied below. 
- let candidates: Vec = snapshots - .iter() - .filter_map(|(addr, success, failure)| { - let total = success + failure; - let ep = state.get(addr)?; - if ep.ejected_at.is_some() { - return None; - } - Some(Candidate { - addr: addr.clone(), - success: *success, - failure: *failure, - total, - }) - }) - .collect(); - - // Compute the cap on currently-ejected endpoints. A50: - // if ejected_count >= max_ejection_percent of total, stop ejecting. - // We compute the cap once and decrement the available budget as - // each algorithm ejects. + snapshots.push(Candidate { + addr: addr.clone(), + success, + failure, + total: success + failure, + }); + } + + // Compute a cap on the number of new ejections this sweep so we + // don't exceed `max_ejection_percent` of the total. Per A50, the + // check is performed before each candidate ejection; we model that + // as a budget that algorithms decrement. let total_endpoints = state.len(); let max_ejections = (total_endpoints as u64 * u64::from(self.config.max_ejection_percent.get()) @@ -246,13 +213,18 @@ impl OutlierDetector { let already_ejected = state.values().filter(|ep| ep.ejected_at.is_some()).count(); let mut budget = max_ejections.saturating_sub(already_ejected); + // Steps 3 & 4: run the algorithms on the snapshot. Hosts that are + // currently ejected naturally fail the `request_volume` gate + // because they receive no traffic in production, so iterating + // every address (per spec) and ejected-only candidates produce + // the same outcome on real workloads. 
let mut to_eject: Vec = Vec::new(); if let Some(sr) = self.config.success_rate.as_ref() { - self.run_success_rate(sr, &candidates, &mut budget, &mut to_eject); + self.run_success_rate(sr, &snapshots, &mut budget, &mut to_eject); } if let Some(fp) = self.config.failure_percentage.as_ref() { - self.run_failure_percentage(fp, &candidates, &mut budget, &mut to_eject); + self.run_failure_percentage(fp, &snapshots, &mut budget, &mut to_eject); } for addr in &to_eject { @@ -262,6 +234,32 @@ impl OutlierDetector { } } + // Step 5: decrement multipliers for non-ejected addresses, and + // un-eject any ejected addresses whose backoff has elapsed. This + // runs *after* re-ejection, so a same-sweep re-ejection updates + // `ejected_at` to `now` and the un-eject check sees zero elapsed + // time — no spurious uneject decision is emitted. + let cap = self + .config + .base_ejection_time + .max(self.config.max_ejection_time); + let mut to_uneject: Vec = Vec::new(); + for (addr, ep) in state.iter_mut() { + if let Some(at) = ep.ejected_at { + if let Some(scaled) = self + .config + .base_ejection_time + .checked_mul(ep.ejection_multiplier) + && now.duration_since(at) >= scaled.min(cap) + { + ep.ejected_at = None; + to_uneject.push(addr.clone()); + } + } else if ep.ejection_multiplier > 0 { + ep.ejection_multiplier -= 1; + } + } + drop(state); let mut decisions = Vec::with_capacity(to_uneject.len() + to_eject.len()); @@ -338,9 +336,11 @@ impl OutlierDetector { if *budget == 0 { break; } - // failure_pct = 100 * failure / total + // failure_pct = 100 * failure / total. A50 specifies a strict + // "greater than" comparison: an address sitting exactly at + // the threshold is not ejected. 
let failure_pct = 100 * c.failure / c.total; - if failure_pct >= threshold && self.roll(cfg.enforcing_failure_percentage.get()) { + if failure_pct > threshold && self.roll(cfg.enforcing_failure_percentage.get()) { out.push(c.addr.clone()); *budget -= 1; } @@ -545,6 +545,24 @@ mod tests { assert!(detector.run_sweep(Instant::now()).is_empty()); } + #[test] + fn failure_percentage_at_threshold_does_not_eject() { + // A50 specifies a strict "greater than" comparison: an address + // sitting exactly at the threshold should *not* be ejected. + let detector = detector_no_loop(fp_config(50, 10, 3), FixedRng::boxed(0)); + for port in 8080..=8084 { + let h = detector.add_endpoint(addr(port)); + // Exactly 50% failure rate — equal to the threshold. + for _ in 0..50 { + h.record_success(); + } + for _ in 0..50 { + h.record_failure(); + } + } + assert!(detector.run_sweep(Instant::now()).is_empty()); + } + #[test] fn minimum_hosts_gates_failure_percentage() { let detector = detector_no_loop(fp_config(50, 10, 5), FixedRng::boxed(99)); @@ -730,14 +748,14 @@ mod tests { bad_h.record_failure(); } - // Sweep 2 at t0+10: same-sweep un-eject + re-eject. - // Multiplier stays 1 through un-eject, then 1 → 2 on re-eject. + // Sweep 2 at t0+10: re-ejection happens before the un-eject + // housekeeping step (per A50 ordering), so `ejected_at` is + // refreshed to `now` and the un-eject check sees zero elapsed + // time. Only an Eject decision is emitted; the multiplier moves + // 1 → 2. assert_eq!( detector.run_sweep(t0 + Duration::from_secs(10)), - vec![ - EjectionDecision::Uneject(bad.clone()), - EjectionDecision::Eject(bad.clone()), - ], + vec![EjectionDecision::Eject(bad.clone())], ); // Re-ejection started at t0+10 with multiplier=2 → duration 20s. 
@@ -813,6 +831,23 @@ mod tests { assert_eq!(state.get(&addr(8080)).unwrap().ejection_multiplier, 2); } + #[test] + fn multiplier_decrements_even_without_traffic() { + // A50: a non-ejected address with multiplier > 0 has its + // multiplier decremented every sweep, regardless of whether it + // received any RPCs that interval. + let detector = detector_no_loop(base_config(), FixedRng::boxed(99)); + detector.add_endpoint(addr(8080)); + { + let mut state = detector.state.lock().unwrap(); + state.get_mut(&addr(8080)).unwrap().ejection_multiplier = 3; + } + // No traffic recorded. + detector.run_sweep(Instant::now()); + let state = detector.state.lock().unwrap(); + assert_eq!(state.get(&addr(8080)).unwrap().ejection_multiplier, 2); + } + // ----- Sweep loop ----- #[tokio::test(start_paused = true)] From 1663b1c85ff99211ddb31cb1fe2ec2bf1622f3f1 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Thu, 30 Apr 2026 15:15:25 -0700 Subject: [PATCH 04/33] refactor(tonic-xds): defer success-rate algorithm to a follow-up PR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop the success-rate algorithm and its tests from this PR so the outlier-detection PR is minimal and stand-alone. The scaffolding (sweep loop, multiplier state, counters, max-ejection-percent budget) is unchanged and still exercised by the failure-percentage algorithm plus the multiplier / un-eject / cap tests. If `OutlierDetectionConfig.success_rate` is set on the cluster, it is currently ignored. Documented in the module docstring with a pointer to the follow-up PR. Removes: - `OutlierDetector::run_success_rate` (mean / variance / sqrt math). - `success_rate` dispatch in `run_sweep`. - `run_failure_percentage`'s `!out.contains` filter — dead now that only one algorithm runs per sweep. - `success_rate_ejects_outlier_below_threshold` test. - `success_rate_no_ejection_when_all_uniform` test. - The `sr_config` test helper. 
- Unused `SuccessRateConfig` import. --- .../client/loadbalance/outlier_detection.rs | 126 +++--------------- 1 file changed, 16 insertions(+), 110 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 04ba1f734..a30286c98 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -1,13 +1,20 @@ //! gRFC A50 outlier-detection sweep engine. //! //! Owns per-endpoint counters and an ejection state machine. Periodically -//! reads the counters, runs the success-rate and failure-percentage -//! ejection algorithms, and emits [`EjectionDecision`]s. Knows nothing -//! about the data path: callers feed it RPC outcomes via the lock-free -//! [`EndpointCounters`] handle returned by [`OutlierDetector::add_endpoint`], -//! and consume decisions from a channel returned by [`OutlierDetector::spawn`]. +//! reads the counters, runs the failure-percentage ejection algorithm, +//! and emits [`EjectionDecision`]s. Knows nothing about the data path: +//! callers feed it RPC outcomes via the lock-free [`EndpointCounters`] +//! handle returned by [`OutlierDetector::add_endpoint`], and consume +//! decisions from a channel returned by [`OutlierDetector::spawn`]. +//! +//! Only the **failure-percentage** algorithm is implemented in this +//! module. The success-rate algorithm — which adds float-math (mean +//! and standard deviation across the qualifying hosts) — lands in a +//! follow-up PR. If [`OutlierDetectionConfig::success_rate`] is set, +//! it is currently ignored. //! //! [gRFC A50]: https://github.com/grpc/proposal/blob/master/A50-xds-outlier-detection.md +//! 
[`OutlierDetectionConfig::success_rate`]: crate::xds::resource::outlier_detection::OutlierDetectionConfig::success_rate use std::collections::HashMap; use std::sync::Arc; @@ -19,9 +26,7 @@ use tokio::sync::mpsc; use crate::client::endpoint::EndpointAddress; use crate::common::async_util::AbortOnDrop; -use crate::xds::resource::outlier_detection::{ - FailurePercentageConfig, OutlierDetectionConfig, SuccessRateConfig, -}; +use crate::xds::resource::outlier_detection::{FailurePercentageConfig, OutlierDetectionConfig}; /// Lock-free per-endpoint success/failure counter handle. /// @@ -218,11 +223,11 @@ impl OutlierDetector { // because they receive no traffic in production, so iterating // every address (per spec) and ejected-only candidates produce // the same outcome on real workloads. + // + // Step 3 (`success_rate_ejection`) is intentionally not yet + // dispatched in this PR; it lands in a follow-up. let mut to_eject: Vec = Vec::new(); - if let Some(sr) = self.config.success_rate.as_ref() { - self.run_success_rate(sr, &snapshots, &mut budget, &mut to_eject); - } if let Some(fp) = self.config.failure_percentage.as_ref() { self.run_failure_percentage(fp, &snapshots, &mut budget, &mut to_eject); } @@ -272,48 +277,6 @@ impl OutlierDetector { decisions } - /// A50 success-rate algorithm. - fn run_success_rate( - &self, - cfg: &SuccessRateConfig, - all: &[Candidate], - budget: &mut usize, - out: &mut Vec, - ) { - // Filter to candidates with enough traffic. - let qualifying: Vec<&Candidate> = all - .iter() - .filter(|c| c.total >= u64::from(cfg.request_volume)) - .collect(); - if qualifying.len() < cfg.minimum_hosts as usize { - return; - } - - // success_rate = success / total (in [0.0, 1.0]). 
- let rates: Vec = qualifying - .iter() - .map(|c| c.success as f64 / c.total as f64) - .collect(); - let n = rates.len() as f64; - let mean = rates.iter().sum::() / n; - let variance = rates.iter().map(|r| (r - mean).powi(2)).sum::() / n; - let stdev = variance.sqrt(); - - // threshold = mean - stdev * (stdev_factor / 1000) - let factor = f64::from(cfg.stdev_factor) / 1000.0; - let threshold = mean - stdev * factor; - - for (c, rate) in qualifying.iter().zip(rates.iter()) { - if *budget == 0 { - break; - } - if *rate < threshold && self.roll(cfg.enforcing_success_rate.get()) { - out.push(c.addr.clone()); - *budget -= 1; - } - } - } - /// A50 failure-percentage algorithm. fn run_failure_percentage( &self, @@ -325,7 +288,6 @@ impl OutlierDetector { let qualifying: Vec<&Candidate> = all .iter() .filter(|c| c.total >= u64::from(cfg.request_volume)) - .filter(|c| !out.contains(&c.addr)) // skip endpoints already ejected this sweep .collect(); if qualifying.len() < cfg.minimum_hosts as usize { return; @@ -613,62 +575,6 @@ mod tests { assert!(detector.run_sweep(Instant::now()).is_empty()); } - // ----- Success-rate algorithm ----- - - fn sr_config( - stdev_factor: u32, - request_volume: u32, - minimum_hosts: u32, - ) -> OutlierDetectionConfig { - let mut c = base_config(); - c.success_rate = Some(SuccessRateConfig { - stdev_factor, - enforcing_success_rate: pct(100), - minimum_hosts, - request_volume, - }); - c - } - - #[test] - fn success_rate_ejects_outlier_below_threshold() { - let detector = detector_no_loop(sr_config(1900, 10, 5), FixedRng::boxed(99)); - // 4 endpoints at 99% success, 1 at 50% — outlier. 
- for port in 8080..=8083 { - let h = detector.add_endpoint(addr(port)); - for _ in 0..99 { - h.record_success(); - } - h.record_failure(); - } - let bad = detector.add_endpoint(addr(8084)); - for _ in 0..50 { - bad.record_success(); - } - for _ in 0..50 { - bad.record_failure(); - } - assert_eq!( - detector.run_sweep(Instant::now()), - vec![EjectionDecision::Eject(addr(8084))], - ); - } - - #[test] - fn success_rate_no_ejection_when_all_uniform() { - let detector = detector_no_loop(sr_config(1900, 10, 5), FixedRng::boxed(99)); - for port in 8080..=8084 { - let h = detector.add_endpoint(addr(port)); - for _ in 0..95 { - h.record_success(); - } - for _ in 0..5 { - h.record_failure(); - } - } - assert!(detector.run_sweep(Instant::now()).is_empty()); - } - // ----- Ejection multiplier / un-ejection ----- #[test] From 83530f88706b00a976e029cb57c08684e7b262c7 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Thu, 30 Apr 2026 16:22:57 -0700 Subject: [PATCH 05/33] refactor(tonic-xds): use bounded mpsc for ejection decisions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch from `mpsc::unbounded_channel` to `mpsc::channel(256)` for the ejection-decision stream that the sweep loop emits. The decisions are edge-triggered (`Eject`/`Uneject` transitions, not state snapshots), so the consumer must process every event in order; we can't drop or coalesce. But we don't want unbounded memory growth either if the consumer stalls. A bounded channel gives us: - Same correctness as unbounded — no events dropped, ordered delivery. - Bounded memory. - Natural backpressure: when the buffer fills, `tx.send().await` parks the sweep task, which (combined with `MissedTickBehavior::Skip`) throttles sweep cadence to whatever rate the consumer can drain. Computing more decisions than the consumer can apply just widens the desync.
Capacity is 256 — at most `2 * num_endpoints` decisions per sweep, so this buffers several sweeps' worth of decisions for clusters of typical size. A docstring on `DECISIONS_CHANNEL_CAPACITY` captures the rationale for future readers. --- .../client/loadbalance/outlier_detection.rs | 38 ++++++++++++------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index a30286c98..43ae8e2fd 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -28,6 +28,22 @@ use crate::client::endpoint::EndpointAddress; use crate::common::async_util::AbortOnDrop; use crate::xds::resource::outlier_detection::{FailurePercentageConfig, OutlierDetectionConfig}; +/// Capacity of the bounded mpsc channel that carries ejection decisions +/// from the sweep loop to the consumer. +/// +/// Decisions are edge-triggered (`Eject`/`Uneject` transitions, not +/// state snapshots), so the consumer must process every event in order +/// to stay in sync with the detector. We therefore can't drop or +/// coalesce — but we don't want unbounded growth either if the consumer +/// stalls. With sweep cadence on the order of seconds and at most +/// `2 * num_endpoints` decisions per sweep, 256 buffers several sweeps' +/// worth of decisions for clusters of typical size. When the buffer +/// fills, `tx.send().await` parks the sweep task, which naturally +/// throttles sweep cadence to whatever rate the consumer can drain — +/// the right behavior, since computing more decisions than the consumer +/// can apply just widens the desync. +const DECISIONS_CHANNEL_CAPACITY: usize = 256; + /// Lock-free per-endpoint success/failure counter handle. /// /// Cloned freely. Callers (typically a request-outcome interceptor) @@ -128,11 +144,7 @@ impl OutlierDetector { /// [`AbortOnDrop`] is dropped. 
pub(crate) fn spawn( config: OutlierDetectionConfig, - ) -> ( - Arc, - mpsc::UnboundedReceiver, - AbortOnDrop, - ) { + ) -> (Arc, mpsc::Receiver, AbortOnDrop) { Self::spawn_with_rng(config, Box::new(FastRandRng)) } @@ -140,12 +152,8 @@ impl OutlierDetector { pub(crate) fn spawn_with_rng( config: OutlierDetectionConfig, rng: Box, - ) -> ( - Arc, - mpsc::UnboundedReceiver, - AbortOnDrop, - ) { - let (tx, rx) = mpsc::unbounded_channel(); + ) -> (Arc, mpsc::Receiver, AbortOnDrop) { + let (tx, rx) = mpsc::channel(DECISIONS_CHANNEL_CAPACITY); let detector = Arc::new(Self { config, state: Mutex::new(HashMap::new()), @@ -333,7 +341,11 @@ struct Candidate { /// forwards each decision on the channel. The task ends (and `tx` is /// dropped, closing the receiver) when [`AbortOnDrop`] is dropped or /// when the receiver itself is dropped. -async fn sweep_loop(detector: Arc, tx: mpsc::UnboundedSender) { +/// +/// `tx.send().await` is fallible (returns `Err` if the receiver was +/// dropped) and may park briefly when the channel is full — see +/// [`DECISIONS_CHANNEL_CAPACITY`]. +async fn sweep_loop(detector: Arc, tx: mpsc::Sender) { let mut ticker = tokio::time::interval(detector.config.interval); // Skip missed ticks rather than burst-catching up — the goal is // periodic observation, not making up for paused time. @@ -345,7 +357,7 @@ async fn sweep_loop(detector: Arc, tx: mpsc::UnboundedSender Date: Fri, 1 May 2026 13:55:45 -0700 Subject: [PATCH 06/33] refactor(tonic-xds): make OutlierDetector runtime options configurable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace `spawn_with_rng` with `spawn_with`, taking an `OutlierDetectorOptions` struct that bundles the RNG and the new configurable `decisions_channel_capacity`. Defaults are unchanged (`fastrand` RNG, capacity 256). 
The hard-coded constant becomes `DEFAULT_DECISIONS_CHANNEL_CAPACITY` and is no longer the only knob — production callers may want to bump the bound for clusters with very large endpoint sets (worst case `2 * num_endpoints` decisions per sweep) or unusually slow consumers. Using a struct instead of a long argument list means future runtime knobs (custom Tokio runtime, alternate backoff policies, observability hooks, …) can be added without breaking call sites — callers typically construct via `..Default::default()`. The xDS-derived `OutlierDetectionConfig` stays separate from these host-side runtime knobs, keeping a clean line between "what the xDS proto specifies" and "how this binary chooses to host it." --- .../client/loadbalance/outlier_detection.rs | 75 +++++++++++++++---- 1 file changed, 61 insertions(+), 14 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 43ae8e2fd..a76594d80 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -28,8 +28,8 @@ use crate::client::endpoint::EndpointAddress; use crate::common::async_util::AbortOnDrop; use crate::xds::resource::outlier_detection::{FailurePercentageConfig, OutlierDetectionConfig}; -/// Capacity of the bounded mpsc channel that carries ejection decisions -/// from the sweep loop to the consumer. +/// Default capacity of the bounded mpsc channel that carries ejection +/// decisions from the sweep loop to the consumer. /// /// Decisions are edge-triggered (`Eject`/`Uneject` transitions, not /// state snapshots), so the consumer must process every event in order @@ -42,7 +42,11 @@ use crate::xds::resource::outlier_detection::{FailurePercentageConfig, OutlierDe /// throttles sweep cadence to whatever rate the consumer can drain — /// the right behavior, since computing more decisions than the consumer /// can apply just widens the desync. 
-const DECISIONS_CHANNEL_CAPACITY: usize = 256; +/// +/// Override via [`OutlierDetectorOptions::decisions_channel_capacity`] +/// for clusters with very large endpoint sets or unusually slow +/// consumers. +pub(crate) const DEFAULT_DECISIONS_CHANNEL_CAPACITY: usize = 256; /// Lock-free per-endpoint success/failure counter handle. /// @@ -122,6 +126,44 @@ impl EndpointState { } } +/// Runtime knobs that don't come from the xDS config (`OutlierDetection` +/// proto) — the channel capacity, the RNG, etc. Kept separate from +/// [`OutlierDetectionConfig`] so xDS-derived state stays distinct from +/// host-side runtime tuning. +/// +/// New fields can be added without breaking call sites because callers +/// typically construct via `..Default::default()`. +pub(crate) struct OutlierDetectorOptions { + /// Capacity of the bounded mpsc channel that carries + /// [`EjectionDecision`]s from the sweep loop to the consumer. + /// See [`DEFAULT_DECISIONS_CHANNEL_CAPACITY`] for the rationale. + pub decisions_channel_capacity: usize, + /// Probability source for the `enforcing_*` rolls. Tests inject a + /// deterministic [`Rng`]; production uses `fastrand`. + pub rng: Box, +} + +impl Default for OutlierDetectorOptions { + fn default() -> Self { + Self { + decisions_channel_capacity: DEFAULT_DECISIONS_CHANNEL_CAPACITY, + rng: Box::new(FastRandRng), + } + } +} + +impl std::fmt::Debug for OutlierDetectorOptions { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("OutlierDetectorOptions") + .field( + "decisions_channel_capacity", + &self.decisions_channel_capacity, + ) + .field("rng", &"") + .finish() + } +} + /// gRFC A50 outlier detector. /// /// `run_sweep` is pure — it returns a list of [`EjectionDecision`]s @@ -139,25 +181,25 @@ pub(crate) struct OutlierDetector { } impl OutlierDetector { - /// Build the detector and spawn its sweep task on the current Tokio - /// runtime. 
The sweep runs every `config.interval` until the returned - /// [`AbortOnDrop`] is dropped. + /// Build the detector with default runtime options and spawn its + /// sweep task on the current Tokio runtime. The sweep runs every + /// `config.interval` until the returned [`AbortOnDrop`] is dropped. pub(crate) fn spawn( config: OutlierDetectionConfig, ) -> (Arc, mpsc::Receiver, AbortOnDrop) { - Self::spawn_with_rng(config, Box::new(FastRandRng)) + Self::spawn_with(config, OutlierDetectorOptions::default()) } - /// Variant of [`Self::spawn`] that accepts an injected [`Rng`]. - pub(crate) fn spawn_with_rng( + /// Variant of [`Self::spawn`] that accepts custom runtime options. + pub(crate) fn spawn_with( config: OutlierDetectionConfig, - rng: Box, + options: OutlierDetectorOptions, ) -> (Arc, mpsc::Receiver, AbortOnDrop) { - let (tx, rx) = mpsc::channel(DECISIONS_CHANNEL_CAPACITY); + let (tx, rx) = mpsc::channel(options.decisions_channel_capacity); let detector = Arc::new(Self { config, state: Mutex::new(HashMap::new()), - rng, + rng: options.rng, }); let task = tokio::spawn(sweep_loop(detector.clone(), tx)); (detector, rx, AbortOnDrop(task)) @@ -772,8 +814,13 @@ mod tests { async fn sweep_loop_emits_decisions_on_tick() { let mut config = fp_config(50, 10, 3); config.interval = Duration::from_millis(100); - let (detector, mut rx, _abort) = - OutlierDetector::spawn_with_rng(config, FixedRng::boxed(99)); + let (detector, mut rx, _abort) = OutlierDetector::spawn_with( + config, + OutlierDetectorOptions { + rng: FixedRng::boxed(99), + ..Default::default() + }, + ); for port in 8080..=8083 { let h = detector.add_endpoint(addr(port)); From 8eacb782a6c7e961916570d28f74182fad618c19 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Mon, 4 May 2026 09:44:03 -0700 Subject: [PATCH 07/33] test(tonic-xds): use tokio::time::advance instead of sleep in paused tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Both `sweep_loop_emits_decisions_on_tick` and `dropping_abort_stops_sweep_loop` previously used `tokio::time::sleep` in `start_paused = true` mode. That works through the runtime's auto-advance heuristic for parked tasks, but the heuristic is sensitive to the order of pending wake-ups across multiple tasks and can be flaky in practice. - `sweep_loop_emits_decisions_on_tick`: switch to `tokio::time::advance(150ms)` which explicitly moves the clock and yields until pending wake-ups have been polled — deterministic. - `dropping_abort_stops_sweep_loop`: drop the artificial sleep altogether. Aborting the JoinHandle wakes the spawned task synchronously; the runtime polls it, the harness observes the abort, and the task ends — dropping its sender. `rx.recv().await` parks briefly while that happens and then returns `None`. No time advancement needed. Stress-tested both tests 50× back-to-back: all pass. --- .../client/loadbalance/outlier_detection.rs | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index a76594d80..345a2ee85 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -833,8 +833,12 @@ mod tests { bad.record_failure(); } - // Advance just past the first sweep tick. - tokio::time::sleep(Duration::from_millis(150)).await; + // Explicitly advance virtual time past the first sweep tick. + // `advance` is preferred over `sleep` for paused-time tests — it + // moves the clock deterministically and yields until pending + // task wake-ups have been polled, instead of relying on the + // runtime's auto-advance heuristic for parked tasks. 
+ tokio::time::advance(Duration::from_millis(150)).await; let decision = rx.recv().await.expect("sweep should emit a decision"); assert_eq!(decision, EjectionDecision::Eject(addr(8084))); @@ -846,11 +850,13 @@ mod tests { config.interval = Duration::from_millis(50); let (_detector, mut rx, abort) = OutlierDetector::spawn(config); - // Drop the AbortOnDrop; the loop must terminate. + // Aborting the JoinHandle wakes the spawned task synchronously; + // the runtime polls it, the task harness observes the abort, + // and the task ends — dropping its sender clone. No time + // advancement is needed: `rx.recv().await` parks briefly, the + // runtime drives the aborted task to completion, then `recv` + // returns `None` because the sender is gone. drop(abort); - tokio::time::sleep(Duration::from_millis(200)).await; - - // Sender should be dropped along with the task; recv returns None. assert!(rx.recv().await.is_none()); } } From 54255c38fcdc8d35113dbf7e45de101761971214 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Mon, 4 May 2026 10:29:00 -0700 Subject: [PATCH 08/33] docs(tonic-xds): tighten DEFAULT_DECISIONS_CHANNEL_CAPACITY doc Rewrite the doc comment to be reference documentation rather than a design narrative. Drops the editorializing ("the right behavior") and the first-person reasoning, keeps the four things a developer needs: what the constant controls, why this size, what happens at capacity (and why decisions can't be dropped or coalesced), and how to override.
--- .../client/loadbalance/outlier_detection.rs | 24 +++++++------------ 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 345a2ee85..15dddb5a5 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -28,24 +28,16 @@ use crate::client::endpoint::EndpointAddress; use crate::common::async_util::AbortOnDrop; use crate::xds::resource::outlier_detection::{FailurePercentageConfig, OutlierDetectionConfig}; -/// Default capacity of the bounded mpsc channel that carries ejection -/// decisions from the sweep loop to the consumer. +/// Default capacity for the channel that delivers [`EjectionDecision`]s +/// from the sweep task to its consumer. /// -/// Decisions are edge-triggered (`Eject`/`Uneject` transitions, not -/// state snapshots), so the consumer must process every event in order -/// to stay in sync with the detector. We therefore can't drop or -/// coalesce — but we don't want unbounded growth either if the consumer -/// stalls. With sweep cadence on the order of seconds and at most -/// `2 * num_endpoints` decisions per sweep, 256 buffers several sweeps' -/// worth of decisions for clusters of typical size. When the buffer -/// fills, `tx.send().await` parks the sweep task, which naturally -/// throttles sweep cadence to whatever rate the consumer can drain — -/// the right behavior, since computing more decisions than the consumer -/// can apply just widens the desync. +/// Sized for several sweeps' worth of decisions on typical clusters — +/// each sweep emits at most `2 * num_endpoints`. At capacity, the sweep +/// task waits on `send` rather than dropping or coalescing decisions: +/// the channel is edge-triggered, so missing or merging events would +/// desynchronize the consumer's view of which endpoints are ejected. 
/// -/// Override via [`OutlierDetectorOptions::decisions_channel_capacity`] -/// for clusters with very large endpoint sets or unusually slow -/// consumers. +/// Override via [`OutlierDetectorOptions::decisions_channel_capacity`]. pub(crate) const DEFAULT_DECISIONS_CHANNEL_CAPACITY: usize = 256; /// Lock-free per-endpoint success/failure counter handle. From ab5be120634638dcb8ca25b8fc074750ee60b5d4 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Mon, 4 May 2026 10:32:26 -0700 Subject: [PATCH 09/33] docs(tonic-xds): drop "workspace dep" parenthetical from FastRandRng --- tonic-xds/src/client/loadbalance/outlier_detection.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 15dddb5a5..84335c970 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -89,7 +89,7 @@ pub(crate) trait Rng: Send + Sync + 'static { fn pct_roll(&self) -> u32; } -/// Default RNG backed by `fastrand` (already a workspace dep). +/// Default RNG backed by `fastrand`. struct FastRandRng; impl Rng for FastRandRng { From 0cc008565bc8830bb2c0f1b0b0cdfc889e870763 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Mon, 4 May 2026 10:38:44 -0700 Subject: [PATCH 10/33] refactor(tonic-xds): pack EndpointCounters into a single AtomicU64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous design used two separate `AtomicU64`s and snapshotted via two independent `swap` calls — the doc comment claimed this was atomic across the pair, but it isn't: an RPC completing between the two swaps inflates the next snapshot by one event, biasing the failure-percentage computation slightly under contention. 
Pack both counters into one `AtomicU64` (high 32 bits: successes, low 32 bits: failures). `record_*` becomes a single `fetch_add` (same hot-path cost as before), `snapshot_and_reset` becomes a single `swap(0)`, and the snapshot is now genuinely atomic across the pair — matching the bucket-swap semantics the gRFC describes. Each counter is capped at `u32::MAX` per sweep interval. Exceeding it would carry into the other counter's bits, but the cap is unreachable for realistic workloads (> 4 × 10⁹ RPCs to one endpoint within one interval). Documented on the struct. --- .../client/loadbalance/outlier_detection.rs | 30 ++++++++++++++----- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 84335c970..04a9cd055 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -46,28 +46,44 @@ pub(crate) const DEFAULT_DECISIONS_CHANNEL_CAPACITY: usize = 256; /// invoke [`record_success`] / [`record_failure`] from the data path. /// The detector reads and resets the counters during each sweep. /// +/// Both counters are packed into a single `AtomicU64` (high 32 bits: +/// successes, low 32 bits: failures) so each increment is a single +/// `fetch_add` and a sweep is a single `swap(0)` — the snapshot is +/// truly atomic across the pair. Each counter is capped at +/// `u32::MAX` per sweep interval; exceeding that carries into the +/// other counter's bits, but the cap is unreachable for realistic +/// workloads (> 4 × 10⁹ RPCs to one endpoint within a single +/// interval). +/// /// [`record_success`]: EndpointCounters::record_success /// [`record_failure`]: EndpointCounters::record_failure #[derive(Debug, Default)] pub(crate) struct EndpointCounters { - success: AtomicU64, - failure: AtomicU64, + /// High 32 bits: successes since last sweep.
+ /// Low 32 bits: failures since last sweep. + packed: AtomicU64, } +/// Increment to apply to [`EndpointCounters::packed`] for one success. +const SUCCESS_INC: u64 = 1 << 32; +/// Increment to apply to [`EndpointCounters::packed`] for one failure. +const FAILURE_INC: u64 = 1; +/// Mask for the failure half of the packed counter. +const FAILURE_MASK: u64 = 0xFFFF_FFFF; + impl EndpointCounters { pub(crate) fn record_success(&self) { - self.success.fetch_add(1, Ordering::Relaxed); + self.packed.fetch_add(SUCCESS_INC, Ordering::Relaxed); } pub(crate) fn record_failure(&self) { - self.failure.fetch_add(1, Ordering::Relaxed); + self.packed.fetch_add(FAILURE_INC, Ordering::Relaxed); } /// Atomically read and zero both counters. Returns `(success, failure)`. fn snapshot_and_reset(&self) -> (u64, u64) { - let s = self.success.swap(0, Ordering::Relaxed); - let f = self.failure.swap(0, Ordering::Relaxed); - (s, f) + let v = self.packed.swap(0, Ordering::Relaxed); + (v >> 32, v & FAILURE_MASK) } } From eb10e3f03abd41306ca545045c5cff6e775a1aeb Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Mon, 4 May 2026 10:45:27 -0700 Subject: [PATCH 11/33] docs(tonic-xds): consolidate EndpointCounters doc comment --- .../client/loadbalance/outlier_detection.rs | 22 ++++++------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 04a9cd055..7729de92f 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -40,23 +40,15 @@ use crate::xds::resource::outlier_detection::{FailurePercentageConfig, OutlierDe /// Override via [`OutlierDetectorOptions::decisions_channel_capacity`]. pub(crate) const DEFAULT_DECISIONS_CHANNEL_CAPACITY: usize = 256; -/// Lock-free per-endpoint success/failure counter handle. 
+/// Lock-free success/failure counter for one endpoint. The data path +/// records RPC outcomes via `record_success` / `record_failure`; the +/// sweep snapshots and resets atomically. /// -/// Cloned freely. Callers (typically a request-outcome interceptor) -/// invoke [`record_success`] / [`record_failure`] from the data path. -/// The detector reads and resets the counters during each sweep. -/// -/// Both counters are packed into a single `AtomicU64` (high 32 bits: -/// successes, low 32 bits: failures) so each increment is a single -/// `fetch_add` and a sweep is a single `swap(0)` — the snapshot is -/// truly atomic across the pair. Each counter is capped at +/// Counts are packed into a single `AtomicU64` (high 32 bits: +/// successes, low 32 bits: failures), so each record is one `fetch_add` +/// and a snapshot is one `swap(0)`. Each counter is capped at /// `u32::MAX` per sweep interval; exceeding that carries into the -/// other counter's bits, but the cap is unreachable for realistic -/// workloads (> 4 × 10⁹ RPCs to one endpoint within a single -/// interval). -/// -/// [`record_success`]: EndpointCounters::record_success -/// [`record_failure`]: EndpointCounters::record_failure +/// other counter's bits but is unreachable for realistic workloads. #[derive(Debug, Default)] pub(crate) struct EndpointCounters { /// High 32 bits: successes since last sweep. From 0d2c2644a02798fb43173784c6ec8b26fe465b14 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Mon, 4 May 2026 15:23:40 -0700 Subject: [PATCH 12/33] fix(tonic-xds): skip zero-traffic candidates in failure-percentage algo Guard the `100 * failure / total` division against `total == 0`. gRFC A50 doesn't forbid `request_volume == 0`, in which case the qualifying filter `c.total >= request_volume` admits candidates with zero traffic; the spec is silent on `0/0`, so skip those endpoints rather than panic. 
--- tonic-xds/src/client/loadbalance/outlier_detection.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 7729de92f..61a064030 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -348,6 +348,12 @@ impl OutlierDetector { if *budget == 0 { break; } + // A50 doesn't forbid `request_volume == 0`, in which case a + // candidate may have `total == 0`. The spec is silent on + // `0/0`; skip these endpoints rather than divide by zero. + if c.total == 0 { + continue; + } // failure_pct = 100 * failure / total. A50 specifies a strict // "greater than" comparison: an address sitting exactly at // the threshold is not ejected. From 3c946845b199d1da6d16fd526e6b9a03f9e1b907 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Mon, 4 May 2026 15:27:20 -0700 Subject: [PATCH 13/33] docs(tonic-xds): fix stale DECISIONS_CHANNEL_CAPACITY doc link --- tonic-xds/src/client/loadbalance/outlier_detection.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 61a064030..44c11feec 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -392,7 +392,7 @@ struct Candidate { /// /// `tx.send().await` is fallible (returns `Err` if the receiver was /// dropped) and may park briefly when the channel is full — see -/// [`DECISIONS_CHANNEL_CAPACITY`]. +/// [`DEFAULT_DECISIONS_CHANNEL_CAPACITY`]. 
async fn sweep_loop(detector: Arc, tx: mpsc::Sender) { let mut ticker = tokio::time::interval(detector.config.interval); // Skip missed ticks rather than burst-catching up — the goal is From 5aca8c088255bbca43104e30a55a69e8b7369115 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Tue, 5 May 2026 11:00:15 -0700 Subject: [PATCH 14/33] refactor(tonic-xds): use derived Ord for EjectionDecision sorting in tests Drop the test-only `sort` helper that compared `EjectionDecision`s by their `Debug` string representation, which was fragile (any change to the `Debug` impl would silently change ordering). Derive `PartialOrd` and `Ord` on `EjectionDecision` (and on `EndpointAddress` / `EndpointHost`, since the address is the inner field) and call `Vec::sort` directly at the one test site. --- tonic-xds/src/client/endpoint.rs | 4 ++-- .../src/client/loadbalance/outlier_detection.rs | 12 +++--------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/tonic-xds/src/client/endpoint.rs b/tonic-xds/src/client/endpoint.rs index 81767414d..ec23012bb 100644 --- a/tonic-xds/src/client/endpoint.rs +++ b/tonic-xds/src/client/endpoint.rs @@ -5,7 +5,7 @@ use std::task::{Context, Poll}; use tower::{Service, load::Load}; /// Represents the host part of an endpoint address -#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] enum EndpointHost { Ipv4(std::net::Ipv4Addr), Ipv6(std::net::Ipv6Addr), @@ -25,7 +25,7 @@ impl From for EndpointHost { } /// Represents a validated endpoint address extracted from xDS -#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub(crate) struct EndpointAddress { /// The IP address or hostname host: EndpointHost, diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 44c11feec..f9b46f037 100644 --- 
a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -80,7 +80,7 @@ impl EndpointCounters { } /// A decision emitted by an [`OutlierDetector`] sweep. -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] pub(crate) enum EjectionDecision { /// Eject this endpoint from the load-balancing pool. The caller /// should keep its underlying connection alive (A50 requires @@ -470,13 +470,6 @@ mod tests { }) } - /// Sort a decision list deterministically so equality checks can rely - /// on a canonical order without coupling to `HashMap` iteration order. - fn sort(mut ds: Vec) -> Vec { - ds.sort_by(|a, b| format!("{a:?}").cmp(&format!("{b:?}"))); - ds - } - // ----- EndpointCounters ----- #[test] @@ -773,7 +766,8 @@ mod tests { h.record_failure(); } } - let decisions = sort(detector.run_sweep(Instant::now())); + let mut decisions = detector.run_sweep(Instant::now()); + decisions.sort(); let ejects = decisions .iter() .filter(|d| matches!(d, EjectionDecision::Eject(_))) From bbf935cb1798197e04c9c3eef9c3eaef76224126 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Tue, 5 May 2026 11:24:49 -0700 Subject: [PATCH 15/33] fix(tonic-xds): exclude re-ejections from max_ejection_percent budget MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When an already-ejected endpoint has in-flight RPCs that complete during its ejection backoff, those completions accumulate on its counter. At the next sweep the algorithm may "re-eject" the host (refreshing its `ejected_at` timestamp and bumping the multiplier). That action does not change the count of currently-ejected addresses, so per A50's `max_ejection_percent` check it must not consume a slot in the cap — but the previous code decremented the budget for it, under-counting how many *new* ejections the cap allows. 
Track the pre-sweep ejection state on each `Candidate` and only decrement the budget for new ejections in the failure-percentage algorithm. Add a regression test covering the specific scenario. --- .../client/loadbalance/outlier_detection.rs | 59 ++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index f9b46f037..761d843a5 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -254,6 +254,7 @@ impl OutlierDetector { success, failure, total: success + failure, + already_ejected: ep.ejected_at.is_some(), }); } @@ -360,7 +361,13 @@ impl OutlierDetector { let failure_pct = 100 * c.failure / c.total; if failure_pct > threshold && self.roll(cfg.enforcing_failure_percentage.get()) { out.push(c.addr.clone()); - *budget -= 1; + // Only NEW ejections consume a budget slot; re-ejecting + // an already-ejected address only refreshes its + // timestamp and multiplier, leaving the count of + // currently-ejected addresses unchanged. + if !c.already_ejected { + *budget -= 1; + } } } } @@ -383,6 +390,12 @@ struct Candidate { success: u64, failure: u64, total: u64, + /// Whether this address was already ejected at the start of the sweep. + /// "Re-ejecting" an already-ejected address only refreshes its + /// ejection timestamp and bumps the multiplier; it does not change + /// the count of currently-ejected addresses, so it must not consume + /// a `max_ejection_percent` budget slot. + already_ejected: bool, } /// Background task: runs `detector.run_sweep` on each interval tick and @@ -775,6 +788,50 @@ mod tests { assert_eq!(ejects, 1, "max_ejection_percent=20% of 5 hosts ⇒ 1"); } + #[test] + fn already_ejected_re_ejection_does_not_consume_budget() { + // 5 hosts: one already ejected (with stats from in-flight RPCs + // accumulated during its backoff), four newly bad. 
Cap permits + // 3 concurrently ejected hosts (60% of 5), with 1 already taken + // by the pre-ejected host — so 2 new ejections remain in budget. + // + // This test would fail before the fix that excludes re-ejections + // from budget accounting: the algorithm would "re-eject" the + // already-ejected host (consuming the second slot), leaving only + // 1 new ejection from the four bad hosts. + let mut config = fp_config(50, 10, 3); + config.max_ejection_percent = pct(60); + let detector = detector_no_loop(config, FixedRng::boxed(99)); + + // Pre-eject host 8080 directly and give it bad in-flight stats. + let already_bad = detector.add_endpoint(addr(8080)); + for _ in 0..100 { + already_bad.record_failure(); + } + { + let mut state = detector.state.lock().unwrap(); + let ep = state.get_mut(&addr(8080)).unwrap(); + ep.ejected_at = Some(Instant::now()); + ep.ejection_multiplier = 1; + } + + // Four more bad hosts. + for port in 8081..=8084 { + let h = detector.add_endpoint(addr(port)); + for _ in 0..100 { + h.record_failure(); + } + } + + let mut decisions = detector.run_sweep(Instant::now()); + decisions.sort(); + let new_ejects = decisions + .iter() + .filter(|d| matches!(d, EjectionDecision::Eject(a) if *a != addr(8080))) + .count(); + assert_eq!(new_ejects, 2, "expected 2 new ejections under the cap"); + } + #[test] fn multiplier_decrements_on_healthy_interval() { let detector = detector_no_loop(base_config(), FixedRng::boxed(99)); From b8ea266497208c6e61b3eb843bfa69bf0fea2796 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Tue, 5 May 2026 11:45:19 -0700 Subject: [PATCH 16/33] refactor(tonic-xds): drive sweeps on demand from poll_ready MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the spawned sweep loop + mpsc channel with an on-demand model: the detector exposes `maybe_run_sweep(&mut self, now: Instant) -> Vec<EjectionDecision>` and the consumer (the load balancer in a follow-up PR)
calls it from its own event loop — typically `poll_ready` — gated by wallclock time. This eliminates a significant amount of machinery: - `tokio::spawn`, `sweep_loop`, `AbortOnDrop`, the mpsc channel. - The bounded-channel capacity option, its constant, and its docs (`OutlierDetectorOptions::decisions_channel_capacity`, `DEFAULT_DECISIONS_CHANNEL_CAPACITY`). - `OutlierDetectorOptions` itself — collapses to two constructors `new(config)` and `with_rng(config, rng)`. - The `Mutex` on `state` — the consumer's `&mut self` already serializes access. - Two `#[tokio::test(start_paused = true)]` tests that exercised the spawned task and its abort handle. Sweep timing now depends on RPC traffic: when no RPCs flow, no sweeps run. This matches A50's intent (sweeps happen approximately every `interval` while traffic is flowing) and is observably equivalent because ejection only matters during endpoint picking, which only happens during RPCs. Suggested by the PR review. Tests: - All algorithm-level tests rewritten to use owned `OutlierDetector` + `&mut self` calls, no `Mutex::lock()`, no Arc. - Three new `maybe_run_sweep_*` tests cover the interval gate: runs on first call, skips before interval elapsed, runs after. - Existing failure-percentage and multiplier/un-ejection tests unchanged in spirit; just adjusted to the new ownership model. --- .../client/loadbalance/outlier_detection.rs | 485 ++++++++---------- 1 file changed, 224 insertions(+), 261 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 761d843a5..f3614e403 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -1,11 +1,14 @@ //! gRFC A50 outlier-detection sweep engine. //! -//! Owns per-endpoint counters and an ejection state machine. Periodically -//! reads the counters, runs the failure-percentage ejection algorithm, -//! 
and emits [`EjectionDecision`]s. Knows nothing about the data path: +//! Owns per-endpoint counters and an ejection state machine. Runs the +//! failure-percentage ejection algorithm on demand and returns the +//! resulting [`EjectionDecision`]s. Knows nothing about the data path: //! callers feed it RPC outcomes via the lock-free [`EndpointCounters`] -//! handle returned by [`OutlierDetector::add_endpoint`], and consume -//! decisions from a channel returned by [`OutlierDetector::spawn`]. +//! handle returned by [`OutlierDetector::add_endpoint`], and pump the +//! sweep by calling [`OutlierDetector::maybe_run_sweep`] from their own +//! event loop (typically the load balancer's `poll_ready`). The wall +//! clock supplied to `maybe_run_sweep` decides when each sweep actually +//! runs — at most once per `config.interval`. //! //! Only the **failure-percentage** algorithm is implemented in this //! module. The success-rate algorithm — which adds float-math (mean @@ -18,28 +21,12 @@ use std::collections::HashMap; use std::sync::Arc; -use std::sync::Mutex; use std::sync::atomic::{AtomicU64, Ordering}; use std::time::Instant; -use tokio::sync::mpsc; - use crate::client::endpoint::EndpointAddress; -use crate::common::async_util::AbortOnDrop; use crate::xds::resource::outlier_detection::{FailurePercentageConfig, OutlierDetectionConfig}; -/// Default capacity for the channel that delivers [`EjectionDecision`]s -/// from the sweep task to its consumer. -/// -/// Sized for several sweeps' worth of decisions on typical clusters — -/// each sweep emits at most `2 * num_endpoints`. At capacity, the sweep -/// task waits on `send` rather than dropping or coalescing decisions: -/// the channel is edge-triggered, so missing or merging events would -/// desynchronize the consumer's view of which endpoints are ejected. -/// -/// Override via [`OutlierDetectorOptions::decisions_channel_capacity`]. 
-pub(crate) const DEFAULT_DECISIONS_CHANNEL_CAPACITY: usize = 256; - /// Lock-free success/failure counter for one endpoint. The data path /// records RPC outcomes via `record_success` / `record_failure`; the /// sweep snapshots and resets atomically. @@ -126,83 +113,38 @@ impl EndpointState { } } -/// Runtime knobs that don't come from the xDS config (`OutlierDetection` -/// proto) — the channel capacity, the RNG, etc. Kept separate from -/// [`OutlierDetectionConfig`] so xDS-derived state stays distinct from -/// host-side runtime tuning. -/// -/// New fields can be added without breaking call sites because callers -/// typically construct via `..Default::default()`. -pub(crate) struct OutlierDetectorOptions { - /// Capacity of the bounded mpsc channel that carries - /// [`EjectionDecision`]s from the sweep loop to the consumer. - /// See [`DEFAULT_DECISIONS_CHANNEL_CAPACITY`] for the rationale. - pub decisions_channel_capacity: usize, - /// Probability source for the `enforcing_*` rolls. Tests inject a - /// deterministic [`Rng`]; production uses `fastrand`. - pub rng: Box, -} - -impl Default for OutlierDetectorOptions { - fn default() -> Self { - Self { - decisions_channel_capacity: DEFAULT_DECISIONS_CHANNEL_CAPACITY, - rng: Box::new(FastRandRng), - } - } -} - -impl std::fmt::Debug for OutlierDetectorOptions { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("OutlierDetectorOptions") - .field( - "decisions_channel_capacity", - &self.decisions_channel_capacity, - ) - .field("rng", &"") - .finish() - } -} - /// gRFC A50 outlier detector. /// -/// `run_sweep` is pure — it returns a list of [`EjectionDecision`]s -/// rather than sending them. The sweep loop spawned by [`spawn`] owns -/// the channel sender and forwards decisions to the receiver, so -/// dropping the [`AbortOnDrop`] handle ends the loop and closes the -/// receiver. 
`OutlierDetector` itself holds no I/O resources, which -/// makes algorithm-level tests trivial to write. -/// -/// [`spawn`]: OutlierDetector::spawn +/// State is owned (no `Mutex`, no `Arc`): the consumer holds the +/// detector by `&mut` and calls [`Self::maybe_run_sweep`] from its own +/// event loop, typically the load balancer's `poll_ready`. The wall +/// clock argument decides when each sweep actually runs — at most once +/// per `config.interval`. pub(crate) struct OutlierDetector { config: OutlierDetectionConfig, - state: Mutex>, + state: HashMap, + /// Wall-clock time of the last sweep that actually ran. `None` + /// before the first sweep, so the first call to `maybe_run_sweep` + /// always runs. + last_sweep_at: Option, rng: Box, } impl OutlierDetector { - /// Build the detector with default runtime options and spawn its - /// sweep task on the current Tokio runtime. The sweep runs every - /// `config.interval` until the returned [`AbortOnDrop`] is dropped. - pub(crate) fn spawn( - config: OutlierDetectionConfig, - ) -> (Arc, mpsc::Receiver, AbortOnDrop) { - Self::spawn_with(config, OutlierDetectorOptions::default()) - } - - /// Variant of [`Self::spawn`] that accepts custom runtime options. - pub(crate) fn spawn_with( - config: OutlierDetectionConfig, - options: OutlierDetectorOptions, - ) -> (Arc, mpsc::Receiver, AbortOnDrop) { - let (tx, rx) = mpsc::channel(options.decisions_channel_capacity); - let detector = Arc::new(Self { + /// Build the detector with the default RNG (`fastrand`). + pub(crate) fn new(config: OutlierDetectionConfig) -> Self { + Self::with_rng(config, Box::new(FastRandRng)) + } + + /// Build the detector with an injected [`Rng`]. Tests use this to + /// pin the `enforcing_*` rolls. 
+ pub(crate) fn with_rng(config: OutlierDetectionConfig, rng: Box) -> Self { + Self { config, - state: Mutex::new(HashMap::new()), - rng: options.rng, - }); - let task = tokio::spawn(sweep_loop(detector.clone(), tx)); - (detector, rx, AbortOnDrop(task)) + state: HashMap::new(), + last_sweep_at: None, + rng, + } } /// Register an endpoint and return its lock-free counter handle. @@ -211,9 +153,8 @@ impl OutlierDetector { /// /// Adding an already-registered address is a no-op and returns the /// existing handle (so callers can re-add idempotently). - pub(crate) fn add_endpoint(&self, addr: EndpointAddress) -> Arc { - let mut state = self.state.lock().expect("outlier_detector mutex poisoned"); - state + pub(crate) fn add_endpoint(&mut self, addr: EndpointAddress) -> Arc { + self.state .entry(addr) .or_insert_with(EndpointState::new) .counters @@ -224,15 +165,30 @@ impl OutlierDetector { /// any ejection state. If the endpoint was ejected, no `Uneject` /// decision is emitted — the caller is expected to handle the removal /// directly (e.g., by dropping its slot in the load balancer). - pub(crate) fn remove_endpoint(&self, addr: &EndpointAddress) { - let mut state = self.state.lock().expect("outlier_detector mutex poisoned"); - state.remove(addr); + pub(crate) fn remove_endpoint(&mut self, addr: &EndpointAddress) { + self.state.remove(addr); + } + + /// Run a sweep at logical time `now` if at least `config.interval` + /// has elapsed since the last sweep, returning the resulting + /// ejection / un-ejection decisions. Otherwise returns an empty + /// vector and leaves the detector state untouched. + /// + /// The first call after construction always runs a sweep + /// (`last_sweep_at` starts as `None`). 
+ pub(crate) fn maybe_run_sweep(&mut self, now: Instant) -> Vec { + if let Some(last) = self.last_sweep_at + && now.duration_since(last) < self.config.interval + { + return Vec::new(); + } + self.last_sweep_at = Some(now); + self.run_sweep(now) } - /// Run a single sweep at logical time `now` and return the resulting - /// ejection/un-ejection decisions. Pure — does no I/O. The sweep loop - /// invokes this on each interval tick and forwards the decisions on - /// the channel; tests call it directly. + /// Unconditionally run one sweep at logical time `now` and return the + /// resulting decisions. Used by [`Self::maybe_run_sweep`] and by tests + /// that want to drive sweeps without modeling the interval gate. /// /// The order of operations follows gRFC A50: /// 1. Record the timestamp. @@ -242,12 +198,10 @@ impl OutlierDetector { /// 5. For each address: decrement the multiplier of non-ejected /// addresses with multiplier > 0, and un-eject ejected addresses /// whose backoff has elapsed. - pub(crate) fn run_sweep(&self, now: Instant) -> Vec { - let mut state = self.state.lock().expect("outlier_detector mutex poisoned"); - + pub(crate) fn run_sweep(&mut self, now: Instant) -> Vec { // Step 2: snapshot every endpoint's counters. - let mut snapshots: Vec = Vec::with_capacity(state.len()); - for (addr, ep) in state.iter_mut() { + let mut snapshots: Vec = Vec::with_capacity(self.state.len()); + for (addr, ep) in self.state.iter_mut() { let (success, failure) = ep.counters.snapshot_and_reset(); snapshots.push(Candidate { addr: addr.clone(), @@ -262,11 +216,15 @@ impl OutlierDetector { // don't exceed `max_ejection_percent` of the total. Per A50, the // check is performed before each candidate ejection; we model that // as a budget that algorithms decrement. 
- let total_endpoints = state.len(); + let total_endpoints = self.state.len(); let max_ejections = (total_endpoints as u64 * u64::from(self.config.max_ejection_percent.get()) / 100) as usize; - let already_ejected = state.values().filter(|ep| ep.ejected_at.is_some()).count(); + let already_ejected = self + .state + .values() + .filter(|ep| ep.ejected_at.is_some()) + .count(); let mut budget = max_ejections.saturating_sub(already_ejected); // Steps 3 & 4: run the algorithms on the snapshot. Hosts that are @@ -280,11 +238,11 @@ impl OutlierDetector { let mut to_eject: Vec = Vec::new(); if let Some(fp) = self.config.failure_percentage.as_ref() { - self.run_failure_percentage(fp, &snapshots, &mut budget, &mut to_eject); + run_failure_percentage(fp, &snapshots, &mut budget, &mut to_eject, &*self.rng); } for addr in &to_eject { - if let Some(ep) = state.get_mut(addr) { + if let Some(ep) = self.state.get_mut(addr) { ep.ejected_at = Some(now); ep.ejection_multiplier = ep.ejection_multiplier.saturating_add(1); } @@ -300,7 +258,7 @@ impl OutlierDetector { .base_ejection_time .max(self.config.max_ejection_time); let mut to_uneject: Vec = Vec::new(); - for (addr, ep) in state.iter_mut() { + for (addr, ep) in self.state.iter_mut() { if let Some(at) = ep.ejected_at { if let Some(scaled) = self .config @@ -316,8 +274,6 @@ impl OutlierDetector { } } - drop(state); - let mut decisions = Vec::with_capacity(to_uneject.len() + to_eject.len()); for addr in to_uneject { decisions.push(EjectionDecision::Uneject(addr)); @@ -327,61 +283,61 @@ impl OutlierDetector { } decisions } +} - /// A50 failure-percentage algorithm. 
- fn run_failure_percentage( - &self, - cfg: &FailurePercentageConfig, - all: &[Candidate], - budget: &mut usize, - out: &mut Vec, - ) { - let qualifying: Vec<&Candidate> = all - .iter() - .filter(|c| c.total >= u64::from(cfg.request_volume)) - .collect(); - if qualifying.len() < cfg.minimum_hosts as usize { - return; - } - - let threshold = u64::from(cfg.threshold.get()); - for c in qualifying { - if *budget == 0 { - break; - } - // A50 doesn't forbid `request_volume == 0`, in which case a - // candidate may have `total == 0`. The spec is silent on - // `0/0`; skip these endpoints rather than divide by zero. - if c.total == 0 { - continue; - } - // failure_pct = 100 * failure / total. A50 specifies a strict - // "greater than" comparison: an address sitting exactly at - // the threshold is not ejected. - let failure_pct = 100 * c.failure / c.total; - if failure_pct > threshold && self.roll(cfg.enforcing_failure_percentage.get()) { - out.push(c.addr.clone()); - // Only NEW ejections consume a budget slot; re-ejecting - // an already-ejected address only refreshes its - // timestamp and multiplier, leaving the count of - // currently-ejected addresses unchanged. - if !c.already_ejected { - *budget -= 1; - } +/// A50 failure-percentage algorithm. +fn run_failure_percentage( + cfg: &FailurePercentageConfig, + all: &[Candidate], + budget: &mut usize, + out: &mut Vec, + rng: &dyn Rng, +) { + let qualifying: Vec<&Candidate> = all + .iter() + .filter(|c| c.total >= u64::from(cfg.request_volume)) + .collect(); + if qualifying.len() < cfg.minimum_hosts as usize { + return; + } + + let threshold = u64::from(cfg.threshold.get()); + for c in qualifying { + if *budget == 0 { + break; + } + // A50 doesn't forbid `request_volume == 0`, in which case a + // candidate may have `total == 0`. The spec is silent on + // `0/0`; skip these endpoints rather than divide by zero. + if c.total == 0 { + continue; + } + // failure_pct = 100 * failure / total. 
A50 specifies a strict + // "greater than" comparison: an address sitting exactly at + // the threshold is not ejected. + let failure_pct = 100 * c.failure / c.total; + if failure_pct > threshold && roll(rng, cfg.enforcing_failure_percentage.get()) { + out.push(c.addr.clone()); + // Only NEW ejections consume a budget slot; re-ejecting + // an already-ejected address only refreshes its + // timestamp and multiplier, leaving the count of + // currently-ejected addresses unchanged. + if !c.already_ejected { + *budget -= 1; } } } +} - /// Return true with probability `pct / 100` (clamped at 100 ⇒ always). - fn roll(&self, pct: u8) -> bool { - if pct >= 100 { - return true; - } - if pct == 0 { - return false; - } - self.rng.pct_roll() < u32::from(pct) +/// Return true with probability `pct / 100` (clamped at 100 ⇒ always). +fn roll(rng: &dyn Rng, pct: u8) -> bool { + if pct >= 100 { + return true; + } + if pct == 0 { + return false; } + rng.pct_roll() < u32::from(pct) } /// Cached per-endpoint snapshot used during a sweep. @@ -398,34 +354,6 @@ struct Candidate { already_ejected: bool, } -/// Background task: runs `detector.run_sweep` on each interval tick and -/// forwards each decision on the channel. The task ends (and `tx` is -/// dropped, closing the receiver) when [`AbortOnDrop`] is dropped or -/// when the receiver itself is dropped. -/// -/// `tx.send().await` is fallible (returns `Err` if the receiver was -/// dropped) and may park briefly when the channel is full — see -/// [`DEFAULT_DECISIONS_CHANNEL_CAPACITY`]. -async fn sweep_loop(detector: Arc, tx: mpsc::Sender) { - let mut ticker = tokio::time::interval(detector.config.interval); - // Skip missed ticks rather than burst-catching up — the goal is - // periodic observation, not making up for paused time. - ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); - // The first tick fires immediately; consume it so the first real - // sweep is `interval` after spawn (matches A50 semantics). 
- ticker.tick().await; - - loop { - ticker.tick().await; - for decision in detector.run_sweep(Instant::now()) { - if tx.send(decision).await.is_err() { - // Receiver gone — nobody is listening. - return; - } - } - } -} - #[cfg(test)] mod tests { use super::*; @@ -473,14 +401,8 @@ mod tests { } } - /// Build a detector with no sweep loop running. Tests drive - /// `run_sweep` directly and inspect the returned decisions. - fn detector_no_loop(config: OutlierDetectionConfig, rng: Box) -> Arc { - Arc::new(OutlierDetector { - config, - state: Mutex::new(HashMap::new()), - rng, - }) + fn detector_with_rng(config: OutlierDetectionConfig, rng: Box) -> OutlierDetector { + OutlierDetector::with_rng(config, rng) } // ----- EndpointCounters ----- @@ -499,7 +421,7 @@ mod tests { #[test] fn add_endpoint_returns_shared_counter() { - let detector = detector_no_loop(base_config(), FixedRng::boxed(99)); + let mut detector = detector_with_rng(base_config(), FixedRng::boxed(99)); let h1 = detector.add_endpoint(addr(8080)); let h2 = detector.add_endpoint(addr(8080)); assert!( @@ -512,10 +434,10 @@ mod tests { #[test] fn remove_endpoint_drops_state() { - let detector = detector_no_loop(base_config(), FixedRng::boxed(99)); + let mut detector = detector_with_rng(base_config(), FixedRng::boxed(99)); detector.add_endpoint(addr(8080)); detector.remove_endpoint(&addr(8080)); - assert!(detector.state.lock().unwrap().is_empty()); + assert!(detector.state.is_empty()); } // ----- Failure-percentage algorithm ----- @@ -537,7 +459,7 @@ mod tests { #[test] fn failure_percentage_ejects_above_threshold() { - let detector = detector_no_loop(fp_config(50, 10, 3), FixedRng::boxed(99)); + let mut detector = detector_with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); // 4 healthy endpoints + 1 bad one. 
for port in 8080..=8083 { let h = detector.add_endpoint(addr(port)); @@ -559,7 +481,7 @@ mod tests { #[test] fn failure_percentage_skips_below_threshold() { - let detector = detector_no_loop(fp_config(50, 10, 3), FixedRng::boxed(99)); + let mut detector = detector_with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); for port in 8080..=8084 { let h = detector.add_endpoint(addr(port)); // 30% failure → below threshold of 50%. @@ -577,7 +499,7 @@ mod tests { fn failure_percentage_at_threshold_does_not_eject() { // A50 specifies a strict "greater than" comparison: an address // sitting exactly at the threshold should *not* be ejected. - let detector = detector_no_loop(fp_config(50, 10, 3), FixedRng::boxed(0)); + let mut detector = detector_with_rng(fp_config(50, 10, 3), FixedRng::boxed(0)); for port in 8080..=8084 { let h = detector.add_endpoint(addr(port)); // Exactly 50% failure rate — equal to the threshold. @@ -593,7 +515,7 @@ mod tests { #[test] fn minimum_hosts_gates_failure_percentage() { - let detector = detector_no_loop(fp_config(50, 10, 5), FixedRng::boxed(99)); + let mut detector = detector_with_rng(fp_config(50, 10, 5), FixedRng::boxed(99)); // Only 2 hosts have request_volume ≥ 10; minimum_hosts is 5 ⇒ skip. for port in 8080..=8081 { let h = detector.add_endpoint(addr(port)); @@ -606,7 +528,7 @@ mod tests { #[test] fn request_volume_filters_low_traffic_endpoints() { - let detector = detector_no_loop(fp_config(50, 100, 3), FixedRng::boxed(99)); + let mut detector = detector_with_rng(fp_config(50, 100, 3), FixedRng::boxed(99)); // Bad endpoint, but only 5 requests — below request_volume=100. let bad = detector.add_endpoint(addr(8080)); for _ in 0..5 { @@ -631,7 +553,7 @@ mod tests { .enforcing_failure_percentage = pct(0); // Roll = 0 wouldn't trigger anyway since `roll(0)` short-circuits; // pin the RNG to 0 just to be explicit. 
- let detector = detector_no_loop(config, FixedRng::boxed(0)); + let mut detector = detector_with_rng(config, FixedRng::boxed(0)); for port in 8080..=8084 { let h = detector.add_endpoint(addr(port)); for _ in 0..100 { @@ -648,7 +570,7 @@ mod tests { let mut config = fp_config(50, 10, 3); config.base_ejection_time = Duration::from_secs(10); config.max_ejection_time = Duration::from_secs(60); - let detector = detector_no_loop(config, FixedRng::boxed(99)); + let mut detector = detector_with_rng(config, FixedRng::boxed(99)); for port in 8080..=8084 { let h = detector.add_endpoint(addr(port)); @@ -688,7 +610,7 @@ mod tests { let mut config = fp_config(50, 10, 3); config.base_ejection_time = Duration::from_secs(10); config.max_ejection_time = Duration::from_secs(60); - let detector = detector_no_loop(config, FixedRng::boxed(99)); + let mut detector = detector_with_rng(config, FixedRng::boxed(99)); let bad = addr(8084); let bad_h = detector.add_endpoint(bad.clone()); @@ -747,7 +669,7 @@ mod tests { let mut config = fp_config(50, 10, 3); config.base_ejection_time = Duration::from_secs(10); config.max_ejection_time = Duration::from_secs(15); - let detector = detector_no_loop(config, FixedRng::boxed(99)); + let mut detector = detector_with_rng(config, FixedRng::boxed(99)); for port in 8080..=8084 { detector.add_endpoint(addr(port)); @@ -755,8 +677,7 @@ mod tests { let t0 = Instant::now(); // Force multiplier=10 directly. { - let mut state = detector.state.lock().unwrap(); - let ep = state.get_mut(&addr(8084)).unwrap(); + let ep = detector.state.get_mut(&addr(8084)).unwrap(); ep.ejection_multiplier = 10; ep.ejected_at = Some(t0); } @@ -771,7 +692,7 @@ mod tests { // 5 hosts, all bad, but max_ejection_percent=20 ⇒ at most 1 ejected. 
let mut config = fp_config(50, 10, 3); config.max_ejection_percent = pct(20); - let detector = detector_no_loop(config, FixedRng::boxed(99)); + let mut detector = detector_with_rng(config, FixedRng::boxed(99)); for port in 8080..=8084 { let h = detector.add_endpoint(addr(port)); @@ -801,7 +722,7 @@ mod tests { // 1 new ejection from the four bad hosts. let mut config = fp_config(50, 10, 3); config.max_ejection_percent = pct(60); - let detector = detector_no_loop(config, FixedRng::boxed(99)); + let mut detector = detector_with_rng(config, FixedRng::boxed(99)); // Pre-eject host 8080 directly and give it bad in-flight stats. let already_bad = detector.add_endpoint(addr(8080)); @@ -809,8 +730,7 @@ mod tests { already_bad.record_failure(); } { - let mut state = detector.state.lock().unwrap(); - let ep = state.get_mut(&addr(8080)).unwrap(); + let ep = detector.state.get_mut(&addr(8080)).unwrap(); ep.ejected_at = Some(Instant::now()); ep.ejection_multiplier = 1; } @@ -834,18 +754,21 @@ mod tests { #[test] fn multiplier_decrements_on_healthy_interval() { - let detector = detector_no_loop(base_config(), FixedRng::boxed(99)); + let mut detector = detector_with_rng(base_config(), FixedRng::boxed(99)); let h = detector.add_endpoint(addr(8080)); // Force multiplier to 3 without ejecting. - { - let mut state = detector.state.lock().unwrap(); - state.get_mut(&addr(8080)).unwrap().ejection_multiplier = 3; - } + detector + .state + .get_mut(&addr(8080)) + .unwrap() + .ejection_multiplier = 3; // Healthy interval (some traffic, no ejection). 
h.record_success(); detector.run_sweep(Instant::now()); - let state = detector.state.lock().unwrap(); - assert_eq!(state.get(&addr(8080)).unwrap().ejection_multiplier, 2); + assert_eq!( + detector.state.get(&addr(8080)).unwrap().ejection_multiplier, + 2, + ); } #[test] @@ -853,32 +776,47 @@ mod tests { // A50: a non-ejected address with multiplier > 0 has its // multiplier decremented every sweep, regardless of whether it // received any RPCs that interval. - let detector = detector_no_loop(base_config(), FixedRng::boxed(99)); + let mut detector = detector_with_rng(base_config(), FixedRng::boxed(99)); detector.add_endpoint(addr(8080)); - { - let mut state = detector.state.lock().unwrap(); - state.get_mut(&addr(8080)).unwrap().ejection_multiplier = 3; - } + detector + .state + .get_mut(&addr(8080)) + .unwrap() + .ejection_multiplier = 3; // No traffic recorded. detector.run_sweep(Instant::now()); - let state = detector.state.lock().unwrap(); - assert_eq!(state.get(&addr(8080)).unwrap().ejection_multiplier, 2); + assert_eq!( + detector.state.get(&addr(8080)).unwrap().ejection_multiplier, + 2, + ); } - // ----- Sweep loop ----- + // ----- maybe_run_sweep gating ----- - #[tokio::test(start_paused = true)] - async fn sweep_loop_emits_decisions_on_tick() { - let mut config = fp_config(50, 10, 3); - config.interval = Duration::from_millis(100); - let (detector, mut rx, _abort) = OutlierDetector::spawn_with( - config, - OutlierDetectorOptions { - rng: FixedRng::boxed(99), - ..Default::default() - }, - ); + #[test] + fn maybe_run_sweep_runs_on_first_call() { + // `last_sweep_at` starts as `None`, so the first call always + // sweeps regardless of the wall clock argument. 
+ let mut detector = detector_with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); + for port in 8080..=8083 { + let h = detector.add_endpoint(addr(port)); + for _ in 0..100 { + h.record_success(); + } + } + let bad = detector.add_endpoint(addr(8084)); + for _ in 0..100 { + bad.record_failure(); + } + let decisions = detector.maybe_run_sweep(Instant::now()); + assert_eq!(decisions, vec![EjectionDecision::Eject(addr(8084))]); + } + #[test] + fn maybe_run_sweep_skips_when_interval_not_elapsed() { + let mut config = fp_config(50, 10, 3); + config.interval = Duration::from_secs(10); + let mut detector = detector_with_rng(config, FixedRng::boxed(99)); for port in 8080..=8083 { let h = detector.add_endpoint(addr(port)); for _ in 0..100 { @@ -890,30 +828,55 @@ mod tests { bad.record_failure(); } - // Explicitly advance virtual time past the first sweep tick. - // `advance` is preferred over `sleep` for paused-time tests — it - // moves the clock deterministically and yields until pending - // task wake-ups have been polled, instead of relying on the - // runtime's auto-advance heuristic for parked tasks. - tokio::time::advance(Duration::from_millis(150)).await; + // First call always runs. + let t0 = Instant::now(); + assert_eq!( + detector.maybe_run_sweep(t0), + vec![EjectionDecision::Eject(addr(8084))], + ); - let decision = rx.recv().await.expect("sweep should emit a decision"); - assert_eq!(decision, EjectionDecision::Eject(addr(8084))); + // Re-arm with bad stats; second call Date: Tue, 5 May 2026 13:32:48 -0700 Subject: [PATCH 17/33] docs(tonic-xds): scrub narrative from outlier_detection comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pass through every doc comment and inline comment, removing rationale, timeline language, and explanations that don't help a future reader. 
Notable trims: - Module docstring drops "Knows nothing about the data path:" framing, the "lands in a follow-up PR" timeline (regression — flagged and removed earlier on a different doc), and the "(mean and standard deviation across the qualifying hosts)" parenthetical. - `Rng` trait drops the "Abstracted so tests can inject" rationale. - `OutlierDetector` struct drops "State is owned (no `Mutex`, no `Arc`):" framing. - `add_endpoint` / `remove_endpoint` / `with_rng` lose the trailing usage hints / explanatory parentheticals. - `maybe_run_sweep` / `run_sweep` tightened to facts-only. - Inline comments inside `run_sweep` drop "we model that" and "intentionally not yet dispatched in this PR" timeline. - Inline comment for the budget-decrement guard now points at `Candidate::already_ejected` instead of duplicating its doc. - Test `already_ejected_re_ejection_does_not_consume_budget` drops the "this would fail before the fix" git-history paragraph. --- .../client/loadbalance/outlier_detection.rs | 118 +++++++----------- 1 file changed, 44 insertions(+), 74 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index f3614e403..e28e81df1 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -1,20 +1,15 @@ //! gRFC A50 outlier-detection sweep engine. //! -//! Owns per-endpoint counters and an ejection state machine. Runs the -//! failure-percentage ejection algorithm on demand and returns the -//! resulting [`EjectionDecision`]s. Knows nothing about the data path: -//! callers feed it RPC outcomes via the lock-free [`EndpointCounters`] -//! handle returned by [`OutlierDetector::add_endpoint`], and pump the -//! sweep by calling [`OutlierDetector::maybe_run_sweep`] from their own -//! event loop (typically the load balancer's `poll_ready`). The wall -//! 
clock supplied to `maybe_run_sweep` decides when each sweep actually -//! runs — at most once per `config.interval`. +//! Tracks per-endpoint success/failure counters and an ejection state +//! machine. Callers feed RPC outcomes via the lock-free +//! [`EndpointCounters`] handle returned by +//! [`OutlierDetector::add_endpoint`], and drive sweeps by calling +//! [`OutlierDetector::maybe_run_sweep`] from their own event loop +//! (typically the load balancer's `poll_ready`); a sweep runs at most +//! once per `config.interval`. //! -//! Only the **failure-percentage** algorithm is implemented in this -//! module. The success-rate algorithm — which adds float-math (mean -//! and standard deviation across the qualifying hosts) — lands in a -//! follow-up PR. If [`OutlierDetectionConfig::success_rate`] is set, -//! it is currently ignored. +//! Only the failure-percentage algorithm is currently dispatched. If +//! [`OutlierDetectionConfig::success_rate`] is set, it is ignored. //! //! [gRFC A50]: https://github.com/grpc/proposal/blob/master/A50-xds-outlier-detection.md //! [`OutlierDetectionConfig::success_rate`]: crate::xds::resource::outlier_detection::OutlierDetectionConfig::success_rate @@ -77,8 +72,7 @@ pub(crate) enum EjectionDecision { Uneject(EndpointAddress), } -/// Probability source for `enforcing_*` rolls. Abstracted so tests can -/// inject deterministic outcomes. +/// Probability source for `enforcing_*` rolls. pub(crate) trait Rng: Send + Sync + 'static { /// Return a uniform random `u32` in `0..100`. fn pct_roll(&self) -> u32; @@ -115,11 +109,9 @@ impl EndpointState { /// gRFC A50 outlier detector. /// -/// State is owned (no `Mutex`, no `Arc`): the consumer holds the -/// detector by `&mut` and calls [`Self::maybe_run_sweep`] from its own -/// event loop, typically the load balancer's `poll_ready`. The wall -/// clock argument decides when each sweep actually runs — at most once -/// per `config.interval`. 
+/// Held by `&mut`; the consumer drives sweeps by calling +/// [`Self::maybe_run_sweep`] from its own event loop (typically the +/// load balancer's `poll_ready`). pub(crate) struct OutlierDetector { config: OutlierDetectionConfig, state: HashMap, @@ -136,8 +128,7 @@ impl OutlierDetector { Self::with_rng(config, Box::new(FastRandRng)) } - /// Build the detector with an injected [`Rng`]. Tests use this to - /// pin the `enforcing_*` rolls. + /// Build the detector with a custom [`Rng`]. pub(crate) fn with_rng(config: OutlierDetectionConfig, rng: Box) -> Self { Self { config, @@ -148,11 +139,9 @@ impl OutlierDetector { } /// Register an endpoint and return its lock-free counter handle. - /// The caller wires this handle into the data-path RPC interceptor so - /// that completed calls increment success/failure atomics. + /// The caller wires this handle into the data-path RPC interceptor. /// - /// Adding an already-registered address is a no-op and returns the - /// existing handle (so callers can re-add idempotently). + /// Adding an already-registered address returns the existing handle. pub(crate) fn add_endpoint(&mut self, addr: EndpointAddress) -> Arc { self.state .entry(addr) @@ -161,21 +150,17 @@ impl OutlierDetector { .clone() } - /// Forget a previously-registered endpoint. Drops its counters and - /// any ejection state. If the endpoint was ejected, no `Uneject` - /// decision is emitted — the caller is expected to handle the removal - /// directly (e.g., by dropping its slot in the load balancer). + /// Forget a previously-registered endpoint, dropping its counters + /// and ejection state. No `Uneject` decision is emitted if the + /// endpoint was ejected; the caller handles removal directly. 
pub(crate) fn remove_endpoint(&mut self, addr: &EndpointAddress) { self.state.remove(addr); } - /// Run a sweep at logical time `now` if at least `config.interval` - /// has elapsed since the last sweep, returning the resulting - /// ejection / un-ejection decisions. Otherwise returns an empty - /// vector and leaves the detector state untouched. - /// - /// The first call after construction always runs a sweep - /// (`last_sweep_at` starts as `None`). + /// Run a sweep at logical time `now`, returning the resulting + /// decisions. Sweeps are gated to at most one per `config.interval`; + /// calls inside the gate return an empty vector and leave state + /// untouched. The first call after construction always sweeps. pub(crate) fn maybe_run_sweep(&mut self, now: Instant) -> Vec { if let Some(last) = self.last_sweep_at && now.duration_since(last) < self.config.interval @@ -186,11 +171,8 @@ impl OutlierDetector { self.run_sweep(now) } - /// Unconditionally run one sweep at logical time `now` and return the - /// resulting decisions. Used by [`Self::maybe_run_sweep`] and by tests - /// that want to drive sweeps without modeling the interval gate. - /// - /// The order of operations follows gRFC A50: + /// Run one sweep at logical time `now` unconditionally and return + /// the resulting decisions, in gRFC A50 step order: /// 1. Record the timestamp. /// 2. Swap each address's call-counter buckets. /// 3. Run the success-rate algorithm if configured. @@ -212,10 +194,9 @@ impl OutlierDetector { }); } - // Compute a cap on the number of new ejections this sweep so we - // don't exceed `max_ejection_percent` of the total. Per A50, the - // check is performed before each candidate ejection; we model that - // as a budget that algorithms decrement. + // Per-sweep cap on new ejections, enforced as a budget the + // algorithms decrement. Per A50, the check happens before each + // candidate. 
let total_endpoints = self.state.len(); let max_ejections = (total_endpoints as u64 * u64::from(self.config.max_ejection_percent.get()) @@ -227,14 +208,11 @@ impl OutlierDetector { .count(); let mut budget = max_ejections.saturating_sub(already_ejected); - // Steps 3 & 4: run the algorithms on the snapshot. Hosts that are - // currently ejected naturally fail the `request_volume` gate - // because they receive no traffic in production, so iterating - // every address (per spec) and ejected-only candidates produce - // the same outcome on real workloads. - // - // Step 3 (`success_rate_ejection`) is intentionally not yet - // dispatched in this PR; it lands in a follow-up. + // Steps 3 & 4: run the algorithms on the snapshot. Ejected + // hosts have no in-interval traffic in production and so + // naturally fail the `request_volume` gate; iterating every + // address (per spec) is equivalent to iterating non-ejected + // ones. Step 3 (success-rate ejection) is not yet dispatched. let mut to_eject: Vec = Vec::new(); if let Some(fp) = self.config.failure_percentage.as_ref() { @@ -248,11 +226,10 @@ impl OutlierDetector { } } - // Step 5: decrement multipliers for non-ejected addresses, and - // un-eject any ejected addresses whose backoff has elapsed. This - // runs *after* re-ejection, so a same-sweep re-ejection updates - // `ejected_at` to `now` and the un-eject check sees zero elapsed - // time — no spurious uneject decision is emitted. + // Step 5: decrement multipliers for non-ejected addresses; + // un-eject ejected addresses whose backoff has elapsed. Runs + // *after* re-ejection, so a same-sweep re-eject refreshes + // `ejected_at` and the un-eject check sees zero elapsed time. 
let cap = self .config .base_ejection_time @@ -318,10 +295,8 @@ fn run_failure_percentage( let failure_pct = 100 * c.failure / c.total; if failure_pct > threshold && roll(rng, cfg.enforcing_failure_percentage.get()) { out.push(c.addr.clone()); - // Only NEW ejections consume a budget slot; re-ejecting - // an already-ejected address only refreshes its - // timestamp and multiplier, leaving the count of - // currently-ejected addresses unchanged. + // See `Candidate::already_ejected` for why re-ejections + // don't consume the budget. if !c.already_ejected { *budget -= 1; } @@ -346,11 +321,11 @@ struct Candidate { success: u64, failure: u64, total: u64, - /// Whether this address was already ejected at the start of the sweep. - /// "Re-ejecting" an already-ejected address only refreshes its - /// ejection timestamp and bumps the multiplier; it does not change - /// the count of currently-ejected addresses, so it must not consume - /// a `max_ejection_percent` budget slot. + /// Whether this address was already ejected at the start of the + /// sweep. Re-ejecting an already-ejected address refreshes its + /// timestamp and bumps its multiplier but doesn't change the count + /// of currently-ejected addresses, so it must not consume a + /// `max_ejection_percent` budget slot. already_ejected: bool, } @@ -715,11 +690,6 @@ mod tests { // accumulated during its backoff), four newly bad. Cap permits // 3 concurrently ejected hosts (60% of 5), with 1 already taken // by the pre-ejected host — so 2 new ejections remain in budget. - // - // This test would fail before the fix that excludes re-ejections - // from budget accounting: the algorithm would "re-eject" the - // already-ejected host (consuming the second slot), leaving only - // 1 new ejection from the four bad hosts. 
let mut config = fp_config(50, 10, 3); config.max_ejection_percent = pct(60); let mut detector = detector_with_rng(config, FixedRng::boxed(99)); From 026efc8e7ca53a6fe14c91b21323fc2931f22433 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Fri, 8 May 2026 11:24:21 -0700 Subject: [PATCH 18/33] refactor(tonic-xds): unpack EndpointCounters into two AtomicU64s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The packed-AtomicU64 design fixed a specific gap raised earlier in review (the snapshot's two swaps weren't atomic against each other), but the cost in readability and the marginal correctness benefit no longer justify it: the snapshot boundary is approximate either way — RPCs land continuously, so some always cross between "this interval" and "next interval" regardless of how the swap is implemented. For a statistical threshold at 85% over typically hundreds-to-thousands of RPCs per interval, the bias is well below the precision of the check. Replace the packing with two plain `AtomicU64` counters and document the snapshot's non-atomicity honestly on `snapshot_and_reset`. --- .../client/loadbalance/outlier_detection.rs | 34 +++++++------------ 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index e28e81df1..11b8e902b 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -24,40 +24,30 @@ use crate::xds::resource::outlier_detection::{FailurePercentageConfig, OutlierDe /// Lock-free success/failure counter for one endpoint. The data path /// records RPC outcomes via `record_success` / `record_failure`; the -/// sweep snapshots and resets atomically. 
-/// -/// Counts are packed into a single `AtomicU64` (high 32 bits: -/// successes, low 32 bits: failures), so each record is one `fetch_add` -/// and a snapshot is one `swap(0)`. Each counter is capped at -/// `u32::MAX` per sweep interval; exceeding that carries into the -/// other counter's bits but is unreachable for realistic workloads. +/// sweep reads and resets between intervals. #[derive(Debug, Default)] pub(crate) struct EndpointCounters { - /// High 32 bits: successes since last sweep. - /// Low 32 bits: failures since last sweep. - packed: AtomicU64, + success: AtomicU64, + failure: AtomicU64, } -/// Increment to apply to [`EndpointCounters::packed`] for one success. -const SUCCESS_INC: u64 = 1 << 32; -/// Increment to apply to [`EndpointCounters::packed`] for one failure. -const FAILURE_INC: u64 = 1; -/// Mask for the failure half of the packed counter. -const FAILURE_MASK: u64 = 0xFFFF_FFFF; - impl EndpointCounters { pub(crate) fn record_success(&self) { - self.packed.fetch_add(SUCCESS_INC, Ordering::Relaxed); + self.success.fetch_add(1, Ordering::Relaxed); } pub(crate) fn record_failure(&self) { - self.packed.fetch_add(FAILURE_INC, Ordering::Relaxed); + self.failure.fetch_add(1, Ordering::Relaxed); } - /// Atomically read and zero both counters. Returns `(success, failure)`. + /// Read and zero both counters. Returns `(success, failure)`. The + /// two swaps are not atomic against each other — RPCs landing + /// between them may bias the snapshot by a small number of events, + /// well below the precision of the failure-percentage threshold. 
fn snapshot_and_reset(&self) -> (u64, u64) { - let v = self.packed.swap(0, Ordering::Relaxed); - (v >> 32, v & FAILURE_MASK) + let s = self.success.swap(0, Ordering::Relaxed); + let f = self.failure.swap(0, Ordering::Relaxed); + (s, f) } } From 6656304859046f7cde0060d11164476f47c3d7d4 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Fri, 8 May 2026 14:16:05 -0700 Subject: [PATCH 19/33] refactor(tonic-xds): outlier detection via shared DashMap + actor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move outlier-detection state onto the channels themselves and run the sweep in a spawned actor task that mutates the shared state. The load balancer's `poll_ready` will observe ejection events O(1) per change via per-channel `watch::Receiver::changed()` futures (wired in a follow-up integration PR), so the O(n) scan stays off the LB's critical path. In `channel_state.rs`: - Add `EndpointCounters` (lock-free success/failure atomics) and `OutlierChannelState` (counters + edge-triggered `watch::Sender<bool>` ejection signal). Both `pub(crate)`. - `ReadyChannel` gains `outlier: Arc<OutlierChannelState>`. `ConnectingChannel::new` generates a fresh state; `with_outlier` preserves an existing one (for reconnect paths). - `EjectedChannel` carries the outlier state through the cooldown so it survives the eject → un-eject cycle. In `outlier_detection.rs`: - `OutlierDetector` no longer owns counters; it owns only algorithm-private state (per-endpoint multiplier and last-ejection timestamp) and config + RNG. - `OutlierStatsRegistry = Arc<DashMap<EndpointAddress, Arc<OutlierChannelState>>>` is the shared structure between the detector and the LB. - `run_sweep(&mut self, now, &OutlierStatsRegistry)` scans the DashMap, snapshots counters via the channel state, decides ejections, and applies them inline by calling `OutlierChannelState::eject()` / `uneject()`. Algorithm state for removed channels is GC'd per-sweep.
- `OutlierDetector::spawn(config, channels)` spawns the actor task on a `tokio::time::interval` ticker; returns `AbortOnDrop` for lifecycle control. `EjectionDecision` enum, `maybe_run_sweep`, `last_sweep_at`, `add_endpoint`, and `remove_endpoint` are all removed — the actor + shared state replaces them. - Tests rewritten to drive the new shape: construct a `DashMap<EndpointAddress, Arc<OutlierChannelState>>`, populate counters, call `run_sweep` and observe `is_ejected()` directly. Adds two actor-level tests covering `spawn` + `AbortOnDrop`. --- .../src/client/loadbalance/channel_state.rs | 164 +++- .../client/loadbalance/outlier_detection.rs | 723 +++++++++--------- 2 files changed, 503 insertions(+), 384 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/channel_state.rs b/tonic-xds/src/client/loadbalance/channel_state.rs index 7916c9bb8..fb534b9cd 100644 --- a/tonic-xds/src/client/loadbalance/channel_state.rs +++ b/tonic-xds/src/client/loadbalance/channel_state.rs @@ -26,16 +26,135 @@ use std::future::Future; use std::pin::Pin; use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; use std::task::{Context, Poll}; use std::time::Duration; use pin_project_lite::pin_project; +use tokio::sync::watch; use tower::Service; use tower::load::Load; use crate::client::endpoint::{Connector, EndpointAddress}; use crate::common::async_util::BoxFuture; +// --------------------------------------------------------------------------- +// EndpointCounters / OutlierChannelState +// --------------------------------------------------------------------------- + +/// Lock-free success/failure counter for one endpoint. Records RPC +/// outcomes from the data path; the outlier-detection actor reads and +/// resets between intervals.
+#[derive(Debug, Default)] +pub(crate) struct EndpointCounters { + success: AtomicU64, + failure: AtomicU64, +} + +impl EndpointCounters { + pub(crate) fn record_success(&self) { + self.success.fetch_add(1, Ordering::Relaxed); + } + + pub(crate) fn record_failure(&self) { + self.failure.fetch_add(1, Ordering::Relaxed); + } + + /// Read and zero both counters. The two swaps are not atomic against + /// each other — RPCs landing between them may bias the snapshot by + /// a small number of events, well below the precision of the + /// failure-percentage threshold. + pub(crate) fn snapshot_and_reset(&self) -> (u64, u64) { + let s = self.success.swap(0, Ordering::Relaxed); + let f = self.failure.swap(0, Ordering::Relaxed); + (s, f) + } +} + +/// Per-channel outlier-detection state, shared between the data path +/// (for outcome recording) and the outlier-detection actor (for sweeps +/// and ejection signalling). +/// +/// The ejection signal is edge-triggered: the actor calls [`eject`] / +/// [`uneject`] to flip the flag; observers subscribe via +/// [`subscribe`] and poll `Receiver::changed()` (typically inside a +/// `FuturesUnordered`) to react in O(1) on each transition. +/// +/// [`eject`]: Self::eject +/// [`uneject`]: Self::uneject +/// [`subscribe`]: Self::subscribe +#[derive(Debug)] +pub(crate) struct OutlierChannelState { + counters: EndpointCounters, + eject_tx: watch::Sender, +} + +impl Default for OutlierChannelState { + fn default() -> Self { + Self::new() + } +} + +impl OutlierChannelState { + pub(crate) fn new() -> Self { + let (eject_tx, _) = watch::channel(false); + Self { + counters: EndpointCounters::default(), + eject_tx, + } + } + + pub(crate) fn record_success(&self) { + self.counters.record_success(); + } + + pub(crate) fn record_failure(&self) { + self.counters.record_failure(); + } + + /// Atomically read and zero the counters. Returns `(success, failure)`. 
+ pub(crate) fn snapshot_and_reset(&self) -> (u64, u64) { + self.counters.snapshot_and_reset() + } + + /// Flip the ejection flag to `true`. No-op if already ejected. + pub(crate) fn eject(&self) { + self.eject_tx.send_if_modified(|state| { + if *state { + false + } else { + *state = true; + true + } + }); + } + + /// Flip the ejection flag back to `false`. No-op if not ejected. + pub(crate) fn uneject(&self) { + self.eject_tx.send_if_modified(|state| { + if *state { + *state = false; + true + } else { + false + } + }); + } + + /// Current ejection state. + pub(crate) fn is_ejected(&self) -> bool { + *self.eject_tx.borrow() + } + + /// Subscribe to ejection-state changes. The returned receiver's + /// `changed()` future resolves on each transition; consumers + /// typically push it into a `FuturesUnordered`. + #[allow(dead_code)] // wired by the LoadBalancer in a follow-up PR. + pub(crate) fn subscribe(&self) -> watch::Receiver { + self.eject_tx.subscribe() + } +} + /// Configuration for an ejected channel. #[derive(Debug, Clone)] pub(crate) struct EjectionConfig { @@ -92,12 +211,27 @@ pub(crate) struct ConnectingChannel { } impl ConnectingChannel { + /// Start a connection, generating a fresh per-channel outlier + /// state. Used for first-time connects from `IdleChannel`. pub(crate) fn new(fut: BoxFuture, addr: EndpointAddress) -> Self { + Self::with_outlier(fut, addr, Arc::new(OutlierChannelState::new())) + } + + /// Start a connection that inherits an existing + /// [`OutlierChannelState`]. Used by reconnect paths so the + /// per-channel counters and ejection signal survive across the + /// connection cycle. + pub(crate) fn with_outlier( + fut: BoxFuture, + addr: EndpointAddress, + outlier: Arc, + ) -> Self { Self { inner: Box::pin(async move { ReadyChannel { addr, inner: fut.await, + outlier, } }), } @@ -119,14 +253,23 @@ impl Future for ConnectingChannel { /// A channel that is connected and ready to serve requests. 
/// /// Holds the raw service `S` and delegates [`Service`] calls directly, -/// preserving `S::Future` and `S::Error` with no wrapping or type erasure. +/// preserving `S::Future` and `S::Error` with no wrapping or type +/// erasure. The `Arc` is shared with the outlier- +/// detection actor for stats accumulation and edge-triggered ejection. #[derive(Clone)] pub(crate) struct ReadyChannel { addr: EndpointAddress, inner: S, + outlier: Arc, } impl ReadyChannel { + /// Per-channel outlier-detection state. Cloned cheaply via `Arc`. + #[allow(dead_code)] // consumed by the LoadBalancer in a follow-up PR. + pub(crate) fn outlier(&self) -> &Arc { + &self.outlier + } + /// Eject this channel (e.g., due to outlier detection). Consumes self. pub(crate) fn eject(self, config: EjectionConfig, connector: Arc) -> EjectedChannel where @@ -136,13 +279,15 @@ impl ReadyChannel { EjectedChannel { addr: self.addr, inner: self.inner, + outlier: self.outlier, config, connector, ejection_timer, } } - /// Start reconnecting. Consumes self, dropping the old connection. + /// Start reconnecting. Consumes self, dropping the old connection + /// but preserving the outlier-detection state. pub(crate) fn reconnect>( self, connector: Arc, @@ -150,7 +295,7 @@ impl ReadyChannel { where S: Send + 'static, { - ConnectingChannel::new(connector.connect(&self.addr), self.addr) + ConnectingChannel::with_outlier(connector.connect(&self.addr), self.addr, self.outlier) } } @@ -193,6 +338,7 @@ pin_project! 
{ pub(crate) struct EjectedChannel { addr: EndpointAddress, inner: S, + outlier: Arc, config: EjectionConfig, connector: Arc + Send + Sync>, #[pin] @@ -209,14 +355,18 @@ impl Future for EjectedChannel { Poll::Ready(()) => { if this.config.needs_reconnect { let fut = this.connector.connect(this.addr); - Poll::Ready(UnejectedChannel::Connecting(ConnectingChannel::new( - fut, - this.addr.clone(), - ))) + Poll::Ready(UnejectedChannel::Connecting( + ConnectingChannel::with_outlier( + fut, + this.addr.clone(), + this.outlier.clone(), + ), + )) } else { Poll::Ready(UnejectedChannel::Ready(ReadyChannel { addr: this.addr.clone(), inner: this.inner.clone(), + outlier: this.outlier.clone(), })) } } diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 11b8e902b..37023ad95 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -1,66 +1,39 @@ //! gRFC A50 outlier-detection sweep engine. //! -//! Tracks per-endpoint success/failure counters and an ejection state -//! machine. Callers feed RPC outcomes via the lock-free -//! [`EndpointCounters`] handle returned by -//! [`OutlierDetector::add_endpoint`], and drive sweeps by calling -//! [`OutlierDetector::maybe_run_sweep`] from their own event loop -//! (typically the load balancer's `poll_ready`); a sweep runs at most -//! once per `config.interval`. +//! Reads per-endpoint counters from a shared +//! [`DashMap>`] and applies +//! ejection / un-ejection decisions in place by toggling each entry's +//! ejection signal. The load balancer registers each [`ReadyChannel`]'s +//! [`OutlierChannelState`] in the same map and observes the signal via +//! a `FuturesUnordered` of `watch::Receiver::changed()` futures, so the +//! O(n) sweep runs in a spawned actor task off the LB's critical path. //! //! Only the failure-percentage algorithm is currently dispatched. If //! 
[`OutlierDetectionConfig::success_rate`] is set, it is ignored. //! //! [gRFC A50]: https://github.com/grpc/proposal/blob/master/A50-xds-outlier-detection.md +//! [`ReadyChannel`]: crate::client::loadbalance::channel_state::ReadyChannel +//! [`OutlierChannelState`]: crate::client::loadbalance::channel_state::OutlierChannelState //! [`OutlierDetectionConfig::success_rate`]: crate::xds::resource::outlier_detection::OutlierDetectionConfig::success_rate -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::sync::Arc; -use std::sync::atomic::{AtomicU64, Ordering}; use std::time::Instant; +use dashmap::DashMap; + use crate::client::endpoint::EndpointAddress; +use crate::client::loadbalance::channel_state::OutlierChannelState; +use crate::common::async_util::AbortOnDrop; use crate::xds::resource::outlier_detection::{FailurePercentageConfig, OutlierDetectionConfig}; -/// Lock-free success/failure counter for one endpoint. The data path -/// records RPC outcomes via `record_success` / `record_failure`; the -/// sweep reads and resets between intervals. -#[derive(Debug, Default)] -pub(crate) struct EndpointCounters { - success: AtomicU64, - failure: AtomicU64, -} - -impl EndpointCounters { - pub(crate) fn record_success(&self) { - self.success.fetch_add(1, Ordering::Relaxed); - } - - pub(crate) fn record_failure(&self) { - self.failure.fetch_add(1, Ordering::Relaxed); - } - - /// Read and zero both counters. Returns `(success, failure)`. The - /// two swaps are not atomic against each other — RPCs landing - /// between them may bias the snapshot by a small number of events, - /// well below the precision of the failure-percentage threshold. - fn snapshot_and_reset(&self) -> (u64, u64) { - let s = self.success.swap(0, Ordering::Relaxed); - let f = self.failure.swap(0, Ordering::Relaxed); - (s, f) - } -} - -/// A decision emitted by an [`OutlierDetector`] sweep. 
-#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] -pub(crate) enum EjectionDecision { - /// Eject this endpoint from the load-balancing pool. The caller - /// should keep its underlying connection alive (A50 requires - /// preserving connections across ejection). - Eject(EndpointAddress), - /// Restore a previously-ejected endpoint to the pool. - Uneject(EndpointAddress), -} +/// Shared map of per-endpoint outlier state, keyed by address. The +/// load balancer inserts each [`ReadyChannel`]'s +/// [`OutlierChannelState`] on connect and removes it on disconnect; the +/// detector iterates the map on each sweep. +/// +/// [`ReadyChannel`]: crate::client::loadbalance::channel_state::ReadyChannel +pub(crate) type OutlierStatsRegistry = Arc>>; /// Probability source for `enforcing_*` rolls. pub(crate) trait Rng: Send + Sync + 'static { @@ -77,9 +50,11 @@ impl Rng for FastRandRng { } } -/// Per-endpoint state held inside the detector. -struct EndpointState { - counters: Arc, +/// Algorithm-private per-endpoint state. Tracks the ejection-time +/// multiplier and the last ejection timestamp; counters and the +/// outward-facing ejection signal live on the channel's +/// [`OutlierChannelState`]. +struct AlgState { /// Number of times this endpoint has been ejected. Grows on each /// re-ejection and decays on each healthy interval. ejection_multiplier: u32, @@ -87,10 +62,9 @@ struct EndpointState { ejected_at: Option, } -impl EndpointState { +impl AlgState { fn new() -> Self { Self { - counters: Arc::new(EndpointCounters::default()), ejection_multiplier: 0, ejected_at: None, } @@ -99,16 +73,14 @@ impl EndpointState { /// gRFC A50 outlier detector. /// -/// Held by `&mut`; the consumer drives sweeps by calling -/// [`Self::maybe_run_sweep`] from its own event loop (typically the -/// load balancer's `poll_ready`). +/// Held by an actor task that ticks once per `config.interval` and +/// calls [`Self::run_sweep`] over the shared [`OutlierStatsRegistry`]. 
+/// Stats and ejection signals live on the channels themselves; the +/// detector owns only algorithm-private metadata (per-endpoint +/// multiplier and last-ejection timestamp). pub(crate) struct OutlierDetector { config: OutlierDetectionConfig, - state: HashMap, - /// Wall-clock time of the last sweep that actually ran. `None` - /// before the first sweep, so the first call to `maybe_run_sweep` - /// always runs. - last_sweep_at: Option, + state: HashMap, rng: Box, } @@ -123,66 +95,43 @@ impl OutlierDetector { Self { config, state: HashMap::new(), - last_sweep_at: None, rng, } } - /// Register an endpoint and return its lock-free counter handle. - /// The caller wires this handle into the data-path RPC interceptor. + /// Run one sweep at logical time `now` over the shared registry. + /// Applies ejection decisions inline by calling + /// [`OutlierChannelState::eject`] / [`OutlierChannelState::uneject`] + /// on each affected entry. /// - /// Adding an already-registered address returns the existing handle. - pub(crate) fn add_endpoint(&mut self, addr: EndpointAddress) -> Arc { - self.state - .entry(addr) - .or_insert_with(EndpointState::new) - .counters - .clone() - } - - /// Forget a previously-registered endpoint, dropping its counters - /// and ejection state. No `Uneject` decision is emitted if the - /// endpoint was ejected; the caller handles removal directly. - pub(crate) fn remove_endpoint(&mut self, addr: &EndpointAddress) { - self.state.remove(addr); - } - - /// Run a sweep at logical time `now`, returning the resulting - /// decisions. Sweeps are gated to at most one per `config.interval`; - /// calls inside the gate return an empty vector and leave state - /// untouched. The first call after construction always sweeps. 
- pub(crate) fn maybe_run_sweep(&mut self, now: Instant) -> Vec { - if let Some(last) = self.last_sweep_at - && now.duration_since(last) < self.config.interval - { - return Vec::new(); - } - self.last_sweep_at = Some(now); - self.run_sweep(now) - } - - /// Run one sweep at logical time `now` unconditionally and return - /// the resulting decisions, in gRFC A50 step order: + /// Order of operations follows gRFC A50: /// 1. Record the timestamp. - /// 2. Swap each address's call-counter buckets. - /// 3. Run the success-rate algorithm if configured. + /// 2. Snapshot each address's call-counter buckets. + /// 3. Run the success-rate algorithm if configured (not yet dispatched). /// 4. Run the failure-percentage algorithm if configured. - /// 5. For each address: decrement the multiplier of non-ejected - /// addresses with multiplier > 0, and un-eject ejected addresses - /// whose backoff has elapsed. - pub(crate) fn run_sweep(&mut self, now: Instant) -> Vec { - // Step 2: snapshot every endpoint's counters. - let mut snapshots: Vec = Vec::with_capacity(self.state.len()); - for (addr, ep) in self.state.iter_mut() { - let (success, failure) = ep.counters.snapshot_and_reset(); + /// 5. Decrement the multiplier of non-ejected addresses with + /// multiplier > 0; un-eject ejected addresses whose backoff has + /// elapsed. + pub(crate) fn run_sweep(&mut self, now: Instant, channels: &OutlierStatsRegistry) { + // Step 2: snapshot every channel's counters and record which + // addresses are still in the registry. 
+ let mut snapshots: Vec = Vec::with_capacity(channels.len()); + let mut seen: HashSet = HashSet::with_capacity(channels.len()); + for entry in channels.iter() { + let addr = entry.key().clone(); + let (success, failure) = entry.value().snapshot_and_reset(); + let alg = self.state.entry(addr.clone()).or_insert_with(AlgState::new); snapshots.push(Candidate { addr: addr.clone(), success, failure, total: success + failure, - already_ejected: ep.ejected_at.is_some(), + already_ejected: alg.ejected_at.is_some(), }); + seen.insert(addr); } + // Drop algorithm state for addresses no longer in the registry. + self.state.retain(|addr, _| seen.contains(addr)); // Per-sweep cap on new ejections, enforced as a budget the // algorithms decrement. Per A50, the check happens before each @@ -194,25 +143,27 @@ impl OutlierDetector { let already_ejected = self .state .values() - .filter(|ep| ep.ejected_at.is_some()) + .filter(|s| s.ejected_at.is_some()) .count(); let mut budget = max_ejections.saturating_sub(already_ejected); - // Steps 3 & 4: run the algorithms on the snapshot. Ejected - // hosts have no in-interval traffic in production and so - // naturally fail the `request_volume` gate; iterating every - // address (per spec) is equivalent to iterating non-ejected - // ones. Step 3 (success-rate ejection) is not yet dispatched. + // Steps 3 & 4: run the algorithms. Ejected hosts have no + // in-interval traffic in production and so naturally fail the + // `request_volume` gate; iterating every address (per spec) is + // equivalent to iterating non-ejected ones. Step 3 (success- + // rate ejection) is not yet dispatched. 
let mut to_eject: Vec = Vec::new(); - if let Some(fp) = self.config.failure_percentage.as_ref() { run_failure_percentage(fp, &snapshots, &mut budget, &mut to_eject, &*self.rng); } for addr in &to_eject { - if let Some(ep) = self.state.get_mut(addr) { - ep.ejected_at = Some(now); - ep.ejection_multiplier = ep.ejection_multiplier.saturating_add(1); + if let Some(alg) = self.state.get_mut(addr) { + alg.ejected_at = Some(now); + alg.ejection_multiplier = alg.ejection_multiplier.saturating_add(1); + } + if let Some(state) = channels.get(addr) { + state.eject(); } } @@ -224,31 +175,58 @@ impl OutlierDetector { .config .base_ejection_time .max(self.config.max_ejection_time); - let mut to_uneject: Vec = Vec::new(); - for (addr, ep) in self.state.iter_mut() { - if let Some(at) = ep.ejected_at { + for (addr, alg) in self.state.iter_mut() { + if let Some(at) = alg.ejected_at { if let Some(scaled) = self .config .base_ejection_time - .checked_mul(ep.ejection_multiplier) + .checked_mul(alg.ejection_multiplier) && now.duration_since(at) >= scaled.min(cap) { - ep.ejected_at = None; - to_uneject.push(addr.clone()); + alg.ejected_at = None; + if let Some(state) = channels.get(addr) { + state.uneject(); + } } - } else if ep.ejection_multiplier > 0 { - ep.ejection_multiplier -= 1; + } else if alg.ejection_multiplier > 0 { + alg.ejection_multiplier -= 1; } } + } - let mut decisions = Vec::with_capacity(to_uneject.len() + to_eject.len()); - for addr in to_uneject { - decisions.push(EjectionDecision::Uneject(addr)); - } - for addr in to_eject { - decisions.push(EjectionDecision::Eject(addr)); - } - decisions + /// Spawn the detector as an actor task with the default RNG. The + /// task ticks every `config.interval` and runs a sweep over the + /// shared registry. Dropping the returned [`AbortOnDrop`] stops + /// the task. 
+ pub(crate) fn spawn( + config: OutlierDetectionConfig, + channels: OutlierStatsRegistry, + ) -> AbortOnDrop { + Self::spawn_inner(Self::new(config), channels) + } + + /// Variant of [`Self::spawn`] that accepts a custom [`Rng`]. + pub(crate) fn spawn_with_rng( + config: OutlierDetectionConfig, + rng: Box, + channels: OutlierStatsRegistry, + ) -> AbortOnDrop { + Self::spawn_inner(Self::with_rng(config, rng), channels) + } + + fn spawn_inner(mut detector: Self, channels: OutlierStatsRegistry) -> AbortOnDrop { + let interval = detector.config.interval; + let task = tokio::spawn(async move { + let mut ticker = tokio::time::interval(interval); + ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + // First tick fires immediately so the actor runs an initial + // sweep on startup; subsequent ticks fire on the interval. + loop { + ticker.tick().await; + detector.run_sweep(Instant::now(), &channels); + } + }); + AbortOnDrop(task) } } @@ -322,8 +300,10 @@ struct Candidate { #[cfg(test)] mod tests { use super::*; - use crate::xds::resource::outlier_detection::Percentage; - use std::sync::atomic::AtomicU32; + use crate::xds::resource::outlier_detection::{ + FailurePercentageConfig, OutlierDetectionConfig, Percentage, + }; + use std::sync::atomic::{AtomicU32, Ordering}; use std::time::Duration; // ----- Fixtures ----- @@ -336,7 +316,6 @@ mod tests { Percentage::new(v).unwrap() } - /// Base config with both algorithms disabled; tests opt in. fn base_config() -> OutlierDetectionConfig { OutlierDetectionConfig { interval: Duration::from_secs(1), @@ -348,15 +327,27 @@ mod tests { } } - /// Deterministic RNG: `pct_roll()` returns a fixed value, configurable. 
+ fn fp_config( + threshold: u32, + request_volume: u32, + minimum_hosts: u32, + ) -> OutlierDetectionConfig { + let mut c = base_config(); + c.failure_percentage = Some(FailurePercentageConfig { + threshold: pct(threshold), + enforcing_failure_percentage: pct(100), + minimum_hosts, + request_volume, + }); + c + } + + /// Deterministic RNG: `pct_roll()` returns a fixed value. struct FixedRng(AtomicU32); impl FixedRng { - fn new(value: u32) -> Self { - Self(AtomicU32::new(value)) - } fn boxed(value: u32) -> Box { - Box::new(Self::new(value)) + Box::new(Self(AtomicU32::new(value))) } } @@ -370,69 +361,63 @@ mod tests { OutlierDetector::with_rng(config, rng) } - // ----- EndpointCounters ----- + fn registry() -> OutlierStatsRegistry { + Arc::new(DashMap::new()) + } + + fn add(channels: &OutlierStatsRegistry, port: u16) -> Arc { + let state = Arc::new(OutlierChannelState::new()); + channels.insert(addr(port), state.clone()); + state + } - #[test] - fn counters_record_and_reset() { - let c = EndpointCounters::default(); - c.record_success(); - c.record_success(); - c.record_failure(); - assert_eq!(c.snapshot_and_reset(), (2, 1)); - assert_eq!(c.snapshot_and_reset(), (0, 0)); + fn ejected(channels: &OutlierStatsRegistry, port: u16) -> bool { + channels + .get(&addr(port)) + .map(|e| e.value().is_ejected()) + .unwrap_or(false) + } + + fn ejected_count(channels: &OutlierStatsRegistry) -> usize { + channels.iter().filter(|e| e.value().is_ejected()).count() } - // ----- add_endpoint / remove_endpoint ----- + // ----- OutlierChannelState (sanity) ----- #[test] - fn add_endpoint_returns_shared_counter() { - let mut detector = detector_with_rng(base_config(), FixedRng::boxed(99)); - let h1 = detector.add_endpoint(addr(8080)); - let h2 = detector.add_endpoint(addr(8080)); - assert!( - Arc::ptr_eq(&h1, &h2), - "second add should return same handle" - ); - h1.record_success(); - assert_eq!(h2.snapshot_and_reset(), (1, 0)); + fn channel_state_records_and_resets() { + let s = 
OutlierChannelState::new(); + s.record_success(); + s.record_success(); + s.record_failure(); + assert_eq!(s.snapshot_and_reset(), (2, 1)); + assert_eq!(s.snapshot_and_reset(), (0, 0)); } #[test] - fn remove_endpoint_drops_state() { - let mut detector = detector_with_rng(base_config(), FixedRng::boxed(99)); - detector.add_endpoint(addr(8080)); - detector.remove_endpoint(&addr(8080)); - assert!(detector.state.is_empty()); + fn channel_state_eject_uneject_flips_signal() { + let s = OutlierChannelState::new(); + assert!(!s.is_ejected()); + s.eject(); + assert!(s.is_ejected()); + s.uneject(); + assert!(!s.is_ejected()); } // ----- Failure-percentage algorithm ----- - fn fp_config( - threshold: u32, - request_volume: u32, - minimum_hosts: u32, - ) -> OutlierDetectionConfig { - let mut c = base_config(); - c.failure_percentage = Some(FailurePercentageConfig { - threshold: pct(threshold), - enforcing_failure_percentage: pct(100), - minimum_hosts, - request_volume, - }); - c - } - #[test] fn failure_percentage_ejects_above_threshold() { let mut detector = detector_with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); - // 4 healthy endpoints + 1 bad one. 
+ let channels = registry(); + for port in 8080..=8083 { - let h = detector.add_endpoint(addr(port)); + let s = add(&channels, port); for _ in 0..100 { - h.record_success(); + s.record_success(); } } - let bad = detector.add_endpoint(addr(8084)); + let bad = add(&channels, 8084); for _ in 0..90 { bad.record_failure(); } @@ -440,72 +425,80 @@ mod tests { bad.record_success(); } - let decisions = detector.run_sweep(Instant::now()); - assert_eq!(decisions, vec![EjectionDecision::Eject(addr(8084))]); + detector.run_sweep(Instant::now(), &channels); + assert!(bad.is_ejected()); + for port in 8080..=8083 { + assert!(!ejected(&channels, port)); + } } #[test] fn failure_percentage_skips_below_threshold() { let mut detector = detector_with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); + let channels = registry(); for port in 8080..=8084 { - let h = detector.add_endpoint(addr(port)); + let s = add(&channels, port); // 30% failure → below threshold of 50%. for _ in 0..70 { - h.record_success(); + s.record_success(); } for _ in 0..30 { - h.record_failure(); + s.record_failure(); } } - assert!(detector.run_sweep(Instant::now()).is_empty()); + detector.run_sweep(Instant::now(), &channels); + assert_eq!(ejected_count(&channels), 0); } #[test] fn failure_percentage_at_threshold_does_not_eject() { - // A50 specifies a strict "greater than" comparison: an address - // sitting exactly at the threshold should *not* be ejected. + // A50 specifies a strict "greater than" comparison. let mut detector = detector_with_rng(fp_config(50, 10, 3), FixedRng::boxed(0)); + let channels = registry(); for port in 8080..=8084 { - let h = detector.add_endpoint(addr(port)); - // Exactly 50% failure rate — equal to the threshold. 
+ let s = add(&channels, port); for _ in 0..50 { - h.record_success(); + s.record_success(); } for _ in 0..50 { - h.record_failure(); + s.record_failure(); } } - assert!(detector.run_sweep(Instant::now()).is_empty()); + detector.run_sweep(Instant::now(), &channels); + assert_eq!(ejected_count(&channels), 0); } #[test] fn minimum_hosts_gates_failure_percentage() { let mut detector = detector_with_rng(fp_config(50, 10, 5), FixedRng::boxed(99)); + let channels = registry(); // Only 2 hosts have request_volume ≥ 10; minimum_hosts is 5 ⇒ skip. for port in 8080..=8081 { - let h = detector.add_endpoint(addr(port)); + let s = add(&channels, port); for _ in 0..100 { - h.record_failure(); + s.record_failure(); } } - assert!(detector.run_sweep(Instant::now()).is_empty()); + detector.run_sweep(Instant::now(), &channels); + assert_eq!(ejected_count(&channels), 0); } #[test] fn request_volume_filters_low_traffic_endpoints() { let mut detector = detector_with_rng(fp_config(50, 100, 3), FixedRng::boxed(99)); - // Bad endpoint, but only 5 requests — below request_volume=100. - let bad = detector.add_endpoint(addr(8080)); + let channels = registry(); + let bad = add(&channels, 8080); for _ in 0..5 { bad.record_failure(); } for port in 8081..=8084 { - let h = detector.add_endpoint(addr(port)); + let s = add(&channels, port); for _ in 0..200 { - h.record_success(); + s.record_success(); } } - assert!(detector.run_sweep(Instant::now()).is_empty()); + detector.run_sweep(Instant::now(), &channels); + assert_eq!(ejected_count(&channels), 0); } #[test] @@ -516,16 +509,16 @@ mod tests { .as_mut() .unwrap() .enforcing_failure_percentage = pct(0); - // Roll = 0 wouldn't trigger anyway since `roll(0)` short-circuits; - // pin the RNG to 0 just to be explicit. 
let mut detector = detector_with_rng(config, FixedRng::boxed(0)); + let channels = registry(); for port in 8080..=8084 { - let h = detector.add_endpoint(addr(port)); + let s = add(&channels, port); for _ in 0..100 { - h.record_failure(); + s.record_failure(); } } - assert!(detector.run_sweep(Instant::now()).is_empty()); + detector.run_sweep(Instant::now(), &channels); + assert_eq!(ejected_count(&channels), 0); } // ----- Ejection multiplier / un-ejection ----- @@ -536,195 +529,193 @@ mod tests { config.base_ejection_time = Duration::from_secs(10); config.max_ejection_time = Duration::from_secs(60); let mut detector = detector_with_rng(config, FixedRng::boxed(99)); + let channels = registry(); for port in 8080..=8084 { - let h = detector.add_endpoint(addr(port)); + let s = add(&channels, port); if port == 8084 { for _ in 0..100 { - h.record_failure(); + s.record_failure(); } } else { for _ in 0..100 { - h.record_success(); + s.record_success(); } } } let t0 = Instant::now(); - assert_eq!( - detector.run_sweep(t0), - vec![EjectionDecision::Eject(addr(8084))], - ); + detector.run_sweep(t0, &channels); + assert!(ejected(&channels, 8084)); // Still ejected just before base_ejection_time elapses. - assert!(detector.run_sweep(t0 + Duration::from_secs(9)).is_empty()); + detector.run_sweep(t0 + Duration::from_secs(9), &channels); + assert!(ejected(&channels, 8084)); // Un-eject after `base * multiplier(=1)` = 10s. - assert_eq!( - detector.run_sweep(t0 + Duration::from_secs(10)), - vec![EjectionDecision::Uneject(addr(8084))], - ); + detector.run_sweep(t0 + Duration::from_secs(10), &channels); + assert!(!ejected(&channels, 8084)); } #[test] fn re_ejection_doubles_duration() { - // The multiplier doubles only when un-ejection and re-ejection - // happen in the *same* sweep — at that point the multiplier- - // decrement step has skipped the (still-ejected-at-start) - // endpoint, so re-ejection increments it from 1 to 2. 
+ // Same-sweep un-eject + re-eject grows the multiplier 1 → 2. let mut config = fp_config(50, 10, 3); config.base_ejection_time = Duration::from_secs(10); config.max_ejection_time = Duration::from_secs(60); let mut detector = detector_with_rng(config, FixedRng::boxed(99)); + let channels = registry(); - let bad = addr(8084); - let bad_h = detector.add_endpoint(bad.clone()); + let bad = add(&channels, 8084); for port in 8080..=8083 { - let h = detector.add_endpoint(addr(port)); + let s = add(&channels, port); for _ in 0..100 { - h.record_success(); + s.record_success(); } } for _ in 0..100 { - bad_h.record_failure(); + bad.record_failure(); } // Sweep 1: eject. Multiplier 0 → 1. let t0 = Instant::now(); - assert_eq!( - detector.run_sweep(t0), - vec![EjectionDecision::Eject(bad.clone())], - ); + detector.run_sweep(t0, &channels); + assert!(bad.is_ejected()); - // Re-record stats so sweep 2's snapshot has volume to evaluate. + // Re-record stats so sweep 2 has volume to evaluate. for port in 8080..=8083 { - let h = detector.add_endpoint(addr(port)); + let s = channels.get(&addr(port)).unwrap().value().clone(); for _ in 0..100 { - h.record_success(); + s.record_success(); } } for _ in 0..100 { - bad_h.record_failure(); + bad.record_failure(); } - // Sweep 2 at t0+10: re-ejection happens before the un-eject - // housekeeping step (per A50 ordering), so `ejected_at` is - // refreshed to `now` and the un-eject check sees zero elapsed - // time. Only an Eject decision is emitted; the multiplier moves - // 1 → 2. - assert_eq!( - detector.run_sweep(t0 + Duration::from_secs(10)), - vec![EjectionDecision::Eject(bad.clone())], - ); + // Sweep 2 at t0+10: re-ejection refreshes timestamp, multiplier 1 → 2. + detector.run_sweep(t0 + Duration::from_secs(10), &channels); + assert!(bad.is_ejected()); // Re-ejection started at t0+10 with multiplier=2 → duration 20s. - // Still ejected 19s later (29s after t0). 
- assert!(detector.run_sweep(t0 + Duration::from_secs(29)).is_empty()); + detector.run_sweep(t0 + Duration::from_secs(29), &channels); + assert!(bad.is_ejected()); // Un-ejects at the 20s mark (30s after t0). - assert_eq!( - detector.run_sweep(t0 + Duration::from_secs(30)), - vec![EjectionDecision::Uneject(bad)], - ); + detector.run_sweep(t0 + Duration::from_secs(30), &channels); + assert!(!bad.is_ejected()); } #[test] fn ejection_capped_by_max_ejection_time() { - // base=10s, max=15s, multiplier=10 → cap at 15s rather than 100s. let mut config = fp_config(50, 10, 3); config.base_ejection_time = Duration::from_secs(10); config.max_ejection_time = Duration::from_secs(15); let mut detector = detector_with_rng(config, FixedRng::boxed(99)); + let channels = registry(); for port in 8080..=8084 { - detector.add_endpoint(addr(port)); + add(&channels, port); } let t0 = Instant::now(); - // Force multiplier=10 directly. - { - let ep = detector.state.get_mut(&addr(8084)).unwrap(); - ep.ejection_multiplier = 10; - ep.ejected_at = Some(t0); - } - // After base*multiplier (= 100s) the cap (= 15s) has long passed, - // so a sweep at 16s should un-eject. - let decisions = detector.run_sweep(t0 + Duration::from_secs(16)); - assert_eq!(decisions, vec![EjectionDecision::Uneject(addr(8084))]); + // Force multiplier=10 on 8084 directly. We need to drive a + // first sweep to populate `state[8084]`, then fix it up. + detector.run_sweep(t0, &channels); + let alg = detector.state.get_mut(&addr(8084)).unwrap(); + alg.ejection_multiplier = 10; + alg.ejected_at = Some(t0); + channels.get(&addr(8084)).unwrap().value().eject(); + + // base*multiplier = 100s; cap = 15s → un-eject after 16s. + detector.run_sweep(t0 + Duration::from_secs(16), &channels); + assert!(!ejected(&channels, 8084)); } #[test] fn max_ejection_percent_caps_concurrent_ejections() { - // 5 hosts, all bad, but max_ejection_percent=20 ⇒ at most 1 ejected. 
let mut config = fp_config(50, 10, 3); config.max_ejection_percent = pct(20); let mut detector = detector_with_rng(config, FixedRng::boxed(99)); + let channels = registry(); for port in 8080..=8084 { - let h = detector.add_endpoint(addr(port)); + let s = add(&channels, port); for _ in 0..100 { - h.record_failure(); + s.record_failure(); } } - let mut decisions = detector.run_sweep(Instant::now()); - decisions.sort(); - let ejects = decisions - .iter() - .filter(|d| matches!(d, EjectionDecision::Eject(_))) - .count(); - assert_eq!(ejects, 1, "max_ejection_percent=20% of 5 hosts ⇒ 1"); + detector.run_sweep(Instant::now(), &channels); + assert_eq!(ejected_count(&channels), 1); } #[test] fn already_ejected_re_ejection_does_not_consume_budget() { - // 5 hosts: one already ejected (with stats from in-flight RPCs - // accumulated during its backoff), four newly bad. Cap permits - // 3 concurrently ejected hosts (60% of 5), with 1 already taken - // by the pre-ejected host — so 2 new ejections remain in budget. + // 5 hosts: one already ejected, four newly bad. Cap permits 3 + // concurrently ejected, with 1 already taken — so 2 new + // ejections remain in budget. let mut config = fp_config(50, 10, 3); config.max_ejection_percent = pct(60); let mut detector = detector_with_rng(config, FixedRng::boxed(99)); + let channels = registry(); - // Pre-eject host 8080 directly and give it bad in-flight stats. - let already_bad = detector.add_endpoint(addr(8080)); + // Pre-eject 8080 by driving one sweep with bad stats. + let already_bad = add(&channels, 8080); for _ in 0..100 { already_bad.record_failure(); } - { - let ep = detector.state.get_mut(&addr(8080)).unwrap(); - ep.ejected_at = Some(Instant::now()); - ep.ejection_multiplier = 1; + // Use a tiny first sweep to enter ejected state via the algorithm. 
+ // Need at least minimum_hosts=3 candidates with volume; add three + // healthy hosts with ≥10 requests so the algorithm runs and the + // single bad one is ejected (cap 60% of 4 hosts = 2 → budget 2 → 1 + // new ejection). + for port in 8085..=8087 { + let s = add(&channels, port); + for _ in 0..100 { + s.record_success(); + } } - - // Four more bad hosts. + let t0 = Instant::now(); + detector.run_sweep(t0, &channels); + assert!(already_bad.is_ejected()); + + // Now grow the cluster to 5 hosts (8080 + 8081..=8084) and feed + // bad stats. 8085..=8087 are no longer relevant — drop them. + channels.remove(&addr(8085)); + channels.remove(&addr(8086)); + channels.remove(&addr(8087)); for port in 8081..=8084 { - let h = detector.add_endpoint(addr(port)); + let s = add(&channels, port); for _ in 0..100 { - h.record_failure(); + s.record_failure(); } } + for _ in 0..100 { + already_bad.record_failure(); + } - let mut decisions = detector.run_sweep(Instant::now()); - decisions.sort(); - let new_ejects = decisions - .iter() - .filter(|d| matches!(d, EjectionDecision::Eject(a) if *a != addr(8080))) - .count(); - assert_eq!(new_ejects, 2, "expected 2 new ejections under the cap"); + detector.run_sweep(t0 + Duration::from_secs(2), &channels); + // Cap = 60% of 5 = 3. already_ejected = 1. Budget = 2. Plus + // 8080's re-eject which doesn't consume budget. So 2 NEW + // ejections among 8081..=8084. + let new_ejects = (8081..=8084).filter(|p| ejected(&channels, *p)).count(); + assert_eq!(new_ejects, 2); } #[test] fn multiplier_decrements_on_healthy_interval() { let mut detector = detector_with_rng(base_config(), FixedRng::boxed(99)); - let h = detector.add_endpoint(addr(8080)); + let channels = registry(); + let s = add(&channels, 8080); + // First sweep populates the alg state. + detector.run_sweep(Instant::now(), &channels); // Force multiplier to 3 without ejecting. 
detector .state .get_mut(&addr(8080)) .unwrap() .ejection_multiplier = 3; - // Healthy interval (some traffic, no ejection). - h.record_success(); - detector.run_sweep(Instant::now()); + s.record_success(); + detector.run_sweep(Instant::now(), &channels); assert_eq!( detector.state.get(&addr(8080)).unwrap().ejection_multiplier, 2, @@ -734,109 +725,87 @@ mod tests { #[test] fn multiplier_decrements_even_without_traffic() { // A50: a non-ejected address with multiplier > 0 has its - // multiplier decremented every sweep, regardless of whether it - // received any RPCs that interval. + // multiplier decremented every sweep, regardless of traffic. let mut detector = detector_with_rng(base_config(), FixedRng::boxed(99)); - detector.add_endpoint(addr(8080)); + let channels = registry(); + add(&channels, 8080); + detector.run_sweep(Instant::now(), &channels); detector .state .get_mut(&addr(8080)) .unwrap() .ejection_multiplier = 3; - // No traffic recorded. - detector.run_sweep(Instant::now()); + detector.run_sweep(Instant::now(), &channels); assert_eq!( detector.state.get(&addr(8080)).unwrap().ejection_multiplier, 2, ); } - // ----- maybe_run_sweep gating ----- - #[test] - fn maybe_run_sweep_runs_on_first_call() { - // `last_sweep_at` starts as `None`, so the first call always - // sweeps regardless of the wall clock argument. 
- let mut detector = detector_with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); - for port in 8080..=8083 { - let h = detector.add_endpoint(addr(port)); - for _ in 0..100 { - h.record_success(); - } - } - let bad = detector.add_endpoint(addr(8084)); - for _ in 0..100 { - bad.record_failure(); - } - let decisions = detector.maybe_run_sweep(Instant::now()); - assert_eq!(decisions, vec![EjectionDecision::Eject(addr(8084))]); + fn alg_state_dropped_when_channel_removed() { + let mut detector = detector_with_rng(base_config(), FixedRng::boxed(99)); + let channels = registry(); + add(&channels, 8080); + add(&channels, 8081); + detector.run_sweep(Instant::now(), &channels); + assert_eq!(detector.state.len(), 2); + + channels.remove(&addr(8080)); + detector.run_sweep(Instant::now(), &channels); + assert_eq!(detector.state.len(), 1); + assert!(detector.state.contains_key(&addr(8081))); } - #[test] - fn maybe_run_sweep_skips_when_interval_not_elapsed() { + // ----- Spawned actor ----- + + #[tokio::test(start_paused = true)] + async fn spawned_actor_runs_sweeps_on_tick() { let mut config = fp_config(50, 10, 3); - config.interval = Duration::from_secs(10); - let mut detector = detector_with_rng(config, FixedRng::boxed(99)); + config.interval = Duration::from_millis(100); + let channels = registry(); + for port in 8080..=8083 { - let h = detector.add_endpoint(addr(port)); + let s = add(&channels, port); for _ in 0..100 { - h.record_success(); + s.record_success(); } } - let bad = detector.add_endpoint(addr(8084)); + let bad = add(&channels, 8084); for _ in 0..100 { bad.record_failure(); } - // First call always runs. 
- let t0 = Instant::now(); - assert_eq!( - detector.maybe_run_sweep(t0), - vec![EjectionDecision::Eject(addr(8084))], - ); + let _abort = OutlierDetector::spawn_with_rng(config, FixedRng::boxed(99), channels.clone()); - // Re-arm with bad stats; second call Date: Fri, 8 May 2026 15:09:05 -0700 Subject: [PATCH 20/33] refactor(tonic-xds): per-RPC outlier detection + actor for housekeeping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pivot the algorithm split per design feedback: - Per-RPC detection runs inline on each call completion via `OutlierStatsRegistry::record_outcome`. The wrapper records the outcome on the channel's `OutlierChannelState`, evaluates the failure-percentage threshold against the channel's local counters, and ejects the channel directly by flipping its `watch::Sender`. Cluster-wide gates (`minimum_hosts`, `max_ejection_percent`) are enforced via two atomic counters on the registry, kept in sync as channels cross thresholds. - The spawned actor runs only interval-boundary housekeeping: counter reset, un-eject if backoff has elapsed, decrement multipliers for non-ejected channels. The actor never makes ejection decisions. Reaction latency drops from up to one `interval` (default 10s) to the first failed RPC after `request_volume` is reached, while `LoadBalancer::poll_ready` stays O(1) — ejections are observed via per-channel `watch::Receiver::changed()` futures in a `FuturesUnordered`, which the integration PR will wire. Implementation: - `OutlierChannelState` (channel_state.rs) gains atomic ejection-time state: `is_qualifying: AtomicBool`, `ejection_multiplier: AtomicU32`, `ejected_at_nanos: AtomicU64` with a constant `epoch: Instant`. `try_eject` / `try_uneject` are CAS-style and return whether the call performed the transition, so callers can update registry counters exactly once. - `OutlierStatsRegistry` (outlier_detection.rs) is the new central type. 
Holds the `DashMap<EndpointAddress, Arc<OutlierChannelState>>`, cluster-wide atomic counters, config, and RNG. All methods take `&self` (concurrent access from data path and actor). - `OutlierDetector` struct removed; everything lives on the registry. The actor is spawned via the free `spawn_actor(registry)` function. - Tests rewritten: drive `record_outcome` and observe `is_ejected()`; drive `run_housekeeping` for interval-boundary scenarios. --- .../src/client/loadbalance/channel_state.rs | 131 ++- .../client/loadbalance/outlier_detection.rs | 949 +++++++----------- 2 files changed, 471 insertions(+), 609 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/channel_state.rs b/tonic-xds/src/client/loadbalance/channel_state.rs index fb534b9cd..7885705a1 100644 --- a/tonic-xds/src/client/loadbalance/channel_state.rs +++ b/tonic-xds/src/client/loadbalance/channel_state.rs @@ -26,9 +26,9 @@ use std::future::Future; use std::pin::Pin; use std::sync::Arc; -use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering}; use std::task::{Context, Poll}; -use std::time::Duration; +use std::time::{Duration, Instant}; use pin_project_lite::pin_project; use tokio::sync::watch; @@ -71,22 +71,37 @@ impl EndpointCounters { } } -/// Per-channel outlier-detection state, shared between the data path -/// (for outcome recording) and the outlier-detection actor (for sweeps -/// and ejection signalling). +/// Per-channel outlier-detection state, shared (via `Arc`) between +/// the data path (per-RPC outcome recording + threshold-based ejection) +/// and the outlier-detection actor (interval-based housekeeping). /// -/// The ejection signal is edge-triggered: the actor calls [`eject`] / -/// [`uneject`] to flip the flag; observers subscribe via -/// [`subscribe`] and poll `Receiver::changed()` (typically inside a -/// `FuturesUnordered`) to react in O(1) on each transition. 
+/// Ejection is edge-triggered: callers flip the flag via [`eject`] / +/// [`uneject`]; observers poll `Receiver::changed()` (typically inside +/// a `FuturesUnordered`) to react in O(1) on each transition. +/// +/// All fields are atomics or wrapped in lock-free primitives so the +/// data path can mutate them without locking. /// /// [`eject`]: Self::eject /// [`uneject`]: Self::uneject -/// [`subscribe`]: Self::subscribe #[derive(Debug)] pub(crate) struct OutlierChannelState { counters: EndpointCounters, eject_tx: watch::Sender, + /// Whether this channel currently contributes to the registry's + /// `qualifying_count`. Set when `total` first reaches + /// `request_volume` in the current interval; cleared on counter + /// reset. + is_qualifying: AtomicBool, + /// Number of times this channel has been ejected. Bumped on each + /// ejection; decremented (saturating) on each healthy interval. + ejection_multiplier: AtomicU32, + /// `0` when not ejected. Otherwise nanos since [`Self::epoch`] of + /// the current ejection's start. + ejected_at_nanos: AtomicU64, + /// Reference instant used as the origin for `ejected_at_nanos`. + /// Established at construction and never changes. + epoch: Instant, } impl Default for OutlierChannelState { @@ -101,6 +116,10 @@ impl OutlierChannelState { Self { counters: EndpointCounters::default(), eject_tx, + is_qualifying: AtomicBool::new(false), + ejection_multiplier: AtomicU32::new(0), + ejected_at_nanos: AtomicU64::new(0), + epoch: Instant::now(), } } @@ -112,14 +131,39 @@ impl OutlierChannelState { self.counters.record_failure(); } - /// Atomically read and zero the counters. Returns `(success, failure)`. + /// Read the current counter values without resetting. Returns + /// `(success, failure)`. The two reads are not atomic against + /// each other but the difference is bounded by concurrent in-flight + /// RPCs and is below the precision of the failure-percentage check. 
+ pub(crate) fn counters(&self) -> (u64, u64) { + let s = self.counters.success.load(Ordering::Relaxed); + let f = self.counters.failure.load(Ordering::Relaxed); + (s, f) + } + + /// Read and zero the counters. Returns `(success, failure)`. pub(crate) fn snapshot_and_reset(&self) -> (u64, u64) { self.counters.snapshot_and_reset() } - /// Flip the ejection flag to `true`. No-op if already ejected. - pub(crate) fn eject(&self) { - self.eject_tx.send_if_modified(|state| { + /// Try to set `is_qualifying` to `true`. Returns `true` if this + /// call performed the false → true transition, so callers can + /// increment a registry-level counter exactly once per crossing. + pub(crate) fn mark_qualifying(&self) -> bool { + !self.is_qualifying.swap(true, Ordering::AcqRel) + } + + /// Clear `is_qualifying`. Returns the previous value. + pub(crate) fn clear_qualifying(&self) -> bool { + self.is_qualifying.swap(false, Ordering::AcqRel) + } + + /// Flip the ejection flag to `true`. Returns `true` if this call + /// performed the false → true transition (so callers can update + /// registry-level counters exactly once per ejection). + /// Records the ejection timestamp and bumps the multiplier. + pub(crate) fn try_eject(&self, now: Instant) -> bool { + let won = self.eject_tx.send_if_modified(|state| { if *state { false } else { @@ -127,11 +171,24 @@ impl OutlierChannelState { true } }); - } - - /// Flip the ejection flag back to `false`. No-op if not ejected. - pub(crate) fn uneject(&self) { - self.eject_tx.send_if_modified(|state| { + if !won { + return false; + } + let nanos = now + .saturating_duration_since(self.epoch) + .as_nanos() + .min(u64::MAX as u128) as u64; + // Use 1 as a sentinel if the channel was created at exactly + // `now`, since 0 means "not ejected". + self.ejected_at_nanos.store(nanos.max(1), Ordering::Relaxed); + self.ejection_multiplier.fetch_add(1, Ordering::Relaxed); + true + } + + /// Flip the ejection flag back to `false`. 
Returns `true` if this + /// call performed the true → false transition. + pub(crate) fn try_uneject(&self) -> bool { + let won = self.eject_tx.send_if_modified(|state| { if *state { *state = false; true @@ -139,6 +196,10 @@ impl OutlierChannelState { false } }); + if won { + self.ejected_at_nanos.store(0, Ordering::Relaxed); + } + won } /// Current ejection state. @@ -146,6 +207,31 @@ impl OutlierChannelState { *self.eject_tx.borrow() } + /// Returns the elapsed time since this channel was ejected, or + /// `None` if it is not currently ejected. + pub(crate) fn ejected_duration(&self, now: Instant) -> Option { + let nanos = self.ejected_at_nanos.load(Ordering::Relaxed); + if nanos == 0 { + return None; + } + let ejected_at = self.epoch + Duration::from_nanos(nanos); + Some(now.saturating_duration_since(ejected_at)) + } + + /// Current ejection multiplier. + pub(crate) fn ejection_multiplier(&self) -> u32 { + self.ejection_multiplier.load(Ordering::Relaxed) + } + + /// Decrement the multiplier saturating at zero. Called by the + /// actor on healthy intervals. + pub(crate) fn decrement_multiplier(&self) { + let prev = self.ejection_multiplier.load(Ordering::Relaxed); + if prev > 0 { + self.ejection_multiplier.store(prev - 1, Ordering::Relaxed); + } + } + /// Subscribe to ejection-state changes. The returned receiver's /// `changed()` future resolves on each transition; consumers /// typically push it into a `FuturesUnordered`. @@ -153,6 +239,13 @@ impl OutlierChannelState { pub(crate) fn subscribe(&self) -> watch::Receiver { self.eject_tx.subscribe() } + + /// Test-only setter for the ejection multiplier; lets tests drive + /// housekeeping behavior without going through `try_eject`. + #[cfg(test)] + pub(crate) fn set_ejection_multiplier(&self, value: u32) { + self.ejection_multiplier.store(value, Ordering::Relaxed); + } } /// Configuration for an ejected channel. 
diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 37023ad95..5295a09c7 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -1,23 +1,34 @@ -//! gRFC A50 outlier-detection sweep engine. +//! gRFC A50 outlier detection. //! -//! Reads per-endpoint counters from a shared -//! [`DashMap>`] and applies -//! ejection / un-ejection decisions in place by toggling each entry's -//! ejection signal. The load balancer registers each [`ReadyChannel`]'s -//! [`OutlierChannelState`] in the same map and observes the signal via -//! a `FuturesUnordered` of `watch::Receiver::changed()` futures, so the -//! O(n) sweep runs in a spawned actor task off the LB's critical path. +//! The algorithm is split between the data path and a spawned actor: //! -//! Only the failure-percentage algorithm is currently dispatched. If -//! [`OutlierDetectionConfig::success_rate`] is set, it is ignored. +//! - **Per-RPC detection** runs inline on each call completion via +//! [`OutlierStatsRegistry::record_outcome`]. The wrapper records the +//! outcome on the channel's [`OutlierChannelState`], evaluates the +//! failure-percentage threshold against the channel's local +//! counters, and ejects the channel directly by flipping its +//! `watch::Sender`. Cluster-wide gates (`minimum_hosts`, +//! `max_ejection_percent`) are enforced via two atomic counters on +//! the registry, kept in sync as channels cross thresholds. +//! - **Interval-based housekeeping** runs in a spawned actor (see +//! [`spawn_actor`]). It resets per-channel counters at the +//! `config.interval` boundary, un-ejects channels whose +//! `base × multiplier` backoff has elapsed, and decrements +//! multipliers for non-ejected channels. The actor never makes +//! ejection decisions. +//! +//! `LoadBalancer::poll_ready` observes ejections in O(1) per +//! 
transition by polling a `FuturesUnordered` +//! over each channel's signal. +//! +//! Only the failure-percentage algorithm is dispatched. The +//! success-rate algorithm (cross-endpoint mean/stdev) is left to a +//! follow-up. //! //! [gRFC A50]: https://github.com/grpc/proposal/blob/master/A50-xds-outlier-detection.md -//! [`ReadyChannel`]: crate::client::loadbalance::channel_state::ReadyChannel -//! [`OutlierChannelState`]: crate::client::loadbalance::channel_state::OutlierChannelState -//! [`OutlierDetectionConfig::success_rate`]: crate::xds::resource::outlier_detection::OutlierDetectionConfig::success_rate -use std::collections::{HashMap, HashSet}; use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; use std::time::Instant; use dashmap::DashMap; @@ -25,15 +36,7 @@ use dashmap::DashMap; use crate::client::endpoint::EndpointAddress; use crate::client::loadbalance::channel_state::OutlierChannelState; use crate::common::async_util::AbortOnDrop; -use crate::xds::resource::outlier_detection::{FailurePercentageConfig, OutlierDetectionConfig}; - -/// Shared map of per-endpoint outlier state, keyed by address. The -/// load balancer inserts each [`ReadyChannel`]'s -/// [`OutlierChannelState`] on connect and removes it on disconnect; the -/// detector iterates the map on each sweep. -/// -/// [`ReadyChannel`]: crate::client::loadbalance::channel_state::ReadyChannel -pub(crate) type OutlierStatsRegistry = Arc>>; +use crate::xds::resource::outlier_detection::OutlierDetectionConfig; /// Probability source for `enforcing_*` rolls. pub(crate) trait Rng: Send + Sync + 'static { @@ -50,226 +53,184 @@ impl Rng for FastRandRng { } } -/// Algorithm-private per-endpoint state. Tracks the ejection-time -/// multiplier and the last ejection timestamp; counters and the -/// outward-facing ejection signal live on the channel's -/// [`OutlierChannelState`]. -struct AlgState { - /// Number of times this endpoint has been ejected. 
Grows on each - /// re-ejection and decays on each healthy interval. - ejection_multiplier: u32, - /// `Some(at)` when currently ejected; `None` otherwise. - ejected_at: Option<Instant>, -} - -impl AlgState { - fn new() -> Self { - Self { - ejection_multiplier: 0, - ejected_at: None, - } - } -} - -/// gRFC A50 outlier detector. -/// -/// Held by an actor task that ticks once per `config.interval` and -/// calls [`Self::run_sweep`] over the shared [`OutlierStatsRegistry`]. -/// Stats and ejection signals live on the channels themselves; the -/// detector owns only algorithm-private metadata (per-endpoint -/// multiplier and last-ejection timestamp). -pub(crate) struct OutlierDetector { +/// Shared outlier-detection state, owned by `Arc` and accessed +/// concurrently by: +/// - The load balancer's call wrapper, which calls +/// [`Self::record_outcome`] after each RPC completion. +/// - The spawned actor task, which calls [`Self::run_housekeeping`] +/// on every `config.interval` tick. +/// - The load balancer's `poll_ready`, which subscribes to per-channel +/// ejection signals via [`OutlierChannelState::subscribe`]. +pub(crate) struct OutlierStatsRegistry { + /// Per-endpoint state, keyed by address. Inserted by the LB on + /// channel creation and removed on disconnect. + channels: DashMap<EndpointAddress, Arc<OutlierChannelState>>, + /// Number of channels currently with `total >= request_volume` in + /// the active interval. Drives the `minimum_hosts` gate. + qualifying_count: AtomicU64, + /// Number of channels currently ejected. Drives the + /// `max_ejection_percent` cap. + ejected_count: AtomicU64, config: OutlierDetectionConfig, - state: HashMap<EndpointAddress, AlgState>, rng: Box<dyn Rng>, } -impl OutlierDetector { - /// Build the detector with the default RNG (`fastrand`). - pub(crate) fn new(config: OutlierDetectionConfig) -> Self { +impl OutlierStatsRegistry { + /// Build a registry with the default RNG. 
+ pub(crate) fn new(config: OutlierDetectionConfig) -> Arc { Self::with_rng(config, Box::new(FastRandRng)) } - /// Build the detector with a custom [`Rng`]. - pub(crate) fn with_rng(config: OutlierDetectionConfig, rng: Box) -> Self { - Self { + /// Build a registry with a custom [`Rng`]. + pub(crate) fn with_rng(config: OutlierDetectionConfig, rng: Box) -> Arc { + Arc::new(Self { + channels: DashMap::new(), + qualifying_count: AtomicU64::new(0), + ejected_count: AtomicU64::new(0), config, - state: HashMap::new(), rng, - } + }) + } + + /// Register a new channel. Returns the `Arc` + /// the load balancer wires into the channel; the same `Arc` is + /// retained in the registry so the actor can iterate it. + pub(crate) fn add_channel(&self, addr: EndpointAddress) -> Arc { + let state = Arc::new(OutlierChannelState::new()); + self.channels.insert(addr, state.clone()); + state } - /// Run one sweep at logical time `now` over the shared registry. - /// Applies ejection decisions inline by calling - /// [`OutlierChannelState::eject`] / [`OutlierChannelState::uneject`] - /// on each affected entry. - /// - /// Order of operations follows gRFC A50: - /// 1. Record the timestamp. - /// 2. Snapshot each address's call-counter buckets. - /// 3. Run the success-rate algorithm if configured (not yet dispatched). - /// 4. Run the failure-percentage algorithm if configured. - /// 5. Decrement the multiplier of non-ejected addresses with - /// multiplier > 0; un-eject ejected addresses whose backoff has - /// elapsed. - pub(crate) fn run_sweep(&mut self, now: Instant, channels: &OutlierStatsRegistry) { - // Step 2: snapshot every channel's counters and record which - // addresses are still in the registry. 
- let mut snapshots: Vec = Vec::with_capacity(channels.len()); - let mut seen: HashSet = HashSet::with_capacity(channels.len()); - for entry in channels.iter() { - let addr = entry.key().clone(); - let (success, failure) = entry.value().snapshot_and_reset(); - let alg = self.state.entry(addr.clone()).or_insert_with(AlgState::new); - snapshots.push(Candidate { - addr: addr.clone(), - success, - failure, - total: success + failure, - already_ejected: alg.ejected_at.is_some(), - }); - seen.insert(addr); - } - // Drop algorithm state for addresses no longer in the registry. - self.state.retain(|addr, _| seen.contains(addr)); - - // Per-sweep cap on new ejections, enforced as a budget the - // algorithms decrement. Per A50, the check happens before each - // candidate. - let total_endpoints = self.state.len(); - let max_ejections = (total_endpoints as u64 - * u64::from(self.config.max_ejection_percent.get()) - / 100) as usize; - let already_ejected = self - .state - .values() - .filter(|s| s.ejected_at.is_some()) - .count(); - let mut budget = max_ejections.saturating_sub(already_ejected); - - // Steps 3 & 4: run the algorithms. Ejected hosts have no - // in-interval traffic in production and so naturally fail the - // `request_volume` gate; iterating every address (per spec) is - // equivalent to iterating non-ejected ones. Step 3 (success- - // rate ejection) is not yet dispatched. - let mut to_eject: Vec = Vec::new(); - if let Some(fp) = self.config.failure_percentage.as_ref() { - run_failure_percentage(fp, &snapshots, &mut budget, &mut to_eject, &*self.rng); - } - - for addr in &to_eject { - if let Some(alg) = self.state.get_mut(addr) { - alg.ejected_at = Some(now); - alg.ejection_multiplier = alg.ejection_multiplier.saturating_add(1); + /// Forget a channel. Drops the registry's reference; cluster-wide + /// counters are decremented if the channel was qualifying or + /// ejected. 
+ pub(crate) fn remove_channel(&self, addr: &EndpointAddress) { + if let Some((_, state)) = self.channels.remove(addr) { + if state.clear_qualifying() { + self.qualifying_count.fetch_sub(1, Ordering::Relaxed); } - if let Some(state) = channels.get(addr) { - state.eject(); + if state.is_ejected() { + self.ejected_count.fetch_sub(1, Ordering::Relaxed); } } + } + + /// Number of registered channels. + pub(crate) fn len(&self) -> usize { + self.channels.len() + } + + /// Per-RPC entry point. Called by the load balancer's call wrapper + /// after each RPC completion. Increments the channel's success or + /// failure counter and then evaluates the failure-percentage + /// threshold; if all gates pass, ejects the channel inline. + pub(crate) fn record_outcome(&self, state: &OutlierChannelState, success: bool) { + if success { + state.record_success(); + } else { + state.record_failure(); + } + + let Some(fp) = self.config.failure_percentage.as_ref() else { + return; + }; + + let (s, f) = state.counters(); + let total = s + f; + let request_volume = u64::from(fp.request_volume); + + // Track when each channel first qualifies in the current + // interval, so the `minimum_hosts` gate can be checked with a + // single atomic load. + if total >= request_volume && state.mark_qualifying() { + self.qualifying_count.fetch_add(1, Ordering::Relaxed); + } + + if state.is_ejected() { + return; + } + if total < request_volume { + return; + } + if self.qualifying_count.load(Ordering::Relaxed) < u64::from(fp.minimum_hosts) { + return; + } + if self.ejected_count.load(Ordering::Relaxed) >= self.max_ejections() { + return; + } - // Step 5: decrement multipliers for non-ejected addresses; - // un-eject ejected addresses whose backoff has elapsed. Runs - // *after* re-ejection, so a same-sweep re-eject refreshes - // `ejected_at` and the un-eject check sees zero elapsed time. + // failure_pct = 100 * failure / total. A50 uses strict ">". 
+ let failure_pct = 100 * f / total; + if failure_pct <= u64::from(fp.threshold.get()) { + return; + } + if !roll(&*self.rng, fp.enforcing_failure_percentage.get()) { + return; + } + + if state.try_eject(Instant::now()) { + self.ejected_count.fetch_add(1, Ordering::Relaxed); + } + } + + /// Interval-boundary housekeeping. Called by the spawned actor on + /// each `config.interval` tick. Resets counters, un-ejects + /// channels whose backoff has elapsed, and decrements multipliers + /// for non-ejected channels. + pub(crate) fn run_housekeeping(&self, now: Instant) { + // Cap the un-ejection backoff at `max(base, max_ejection_time)`. let cap = self .config .base_ejection_time .max(self.config.max_ejection_time); - for (addr, alg) in self.state.iter_mut() { - if let Some(at) = alg.ejected_at { - if let Some(scaled) = self - .config - .base_ejection_time - .checked_mul(alg.ejection_multiplier) - && now.duration_since(at) >= scaled.min(cap) + + for entry in self.channels.iter() { + let state = entry.value(); + + // Reset counters; clear `is_qualifying` and adjust the + // registry-level counter in lockstep. + state.snapshot_and_reset(); + if state.clear_qualifying() { + self.qualifying_count.fetch_sub(1, Ordering::Relaxed); + } + + if state.is_ejected() { + let multiplier = state.ejection_multiplier(); + let elapsed = state.ejected_duration(now).unwrap_or_default(); + if let Some(scaled) = self.config.base_ejection_time.checked_mul(multiplier) + && elapsed >= scaled.min(cap) + && state.try_uneject() { - alg.ejected_at = None; - if let Some(state) = channels.get(addr) { - state.uneject(); - } + self.ejected_count.fetch_sub(1, Ordering::Relaxed); } - } else if alg.ejection_multiplier > 0 { - alg.ejection_multiplier -= 1; + } else { + state.decrement_multiplier(); } } } - /// Spawn the detector as an actor task with the default RNG. The - /// task ticks every `config.interval` and runs a sweep over the - /// shared registry. 
Dropping the returned [`AbortOnDrop`] stops - /// the task. - pub(crate) fn spawn( - config: OutlierDetectionConfig, - channels: OutlierStatsRegistry, - ) -> AbortOnDrop { - Self::spawn_inner(Self::new(config), channels) - } - - /// Variant of [`Self::spawn`] that accepts a custom [`Rng`]. - pub(crate) fn spawn_with_rng( - config: OutlierDetectionConfig, - rng: Box, - channels: OutlierStatsRegistry, - ) -> AbortOnDrop { - Self::spawn_inner(Self::with_rng(config, rng), channels) - } - - fn spawn_inner(mut detector: Self, channels: OutlierStatsRegistry) -> AbortOnDrop { - let interval = detector.config.interval; - let task = tokio::spawn(async move { - let mut ticker = tokio::time::interval(interval); - ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); - // First tick fires immediately so the actor runs an initial - // sweep on startup; subsequent ticks fire on the interval. - loop { - ticker.tick().await; - detector.run_sweep(Instant::now(), &channels); - } - }); - AbortOnDrop(task) + /// `max_ejection_percent` resolved against the current channel + /// count. Updated as channels come and go. + fn max_ejections(&self) -> u64 { + self.channels.len() as u64 * u64::from(self.config.max_ejection_percent.get()) / 100 } } -/// A50 failure-percentage algorithm. -fn run_failure_percentage( - cfg: &FailurePercentageConfig, - all: &[Candidate], - budget: &mut usize, - out: &mut Vec, - rng: &dyn Rng, -) { - let qualifying: Vec<&Candidate> = all - .iter() - .filter(|c| c.total >= u64::from(cfg.request_volume)) - .collect(); - if qualifying.len() < cfg.minimum_hosts as usize { - return; - } - - let threshold = u64::from(cfg.threshold.get()); - for c in qualifying { - if *budget == 0 { - break; - } - // A50 doesn't forbid `request_volume == 0`, in which case a - // candidate may have `total == 0`. The spec is silent on - // `0/0`; skip these endpoints rather than divide by zero. - if c.total == 0 { - continue; - } - // failure_pct = 100 * failure / total. 
A50 specifies a strict - // "greater than" comparison: an address sitting exactly at - // the threshold is not ejected. - let failure_pct = 100 * c.failure / c.total; - if failure_pct > threshold && roll(rng, cfg.enforcing_failure_percentage.get()) { - out.push(c.addr.clone()); - // See `Candidate::already_ejected` for why re-ejections - // don't consume the budget. - if !c.already_ejected { - *budget -= 1; - } - } - } +/// Spawn the housekeeping actor. The task ticks every +/// `config.interval` and calls +/// [`OutlierStatsRegistry::run_housekeeping`]. Dropping the returned +/// [`AbortOnDrop`] stops the task. +pub(crate) fn spawn_actor(registry: Arc) -> AbortOnDrop { + let interval = registry.config.interval; + let task = tokio::spawn(async move { + let mut ticker = tokio::time::interval(interval); + ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + loop { + ticker.tick().await; + registry.run_housekeeping(Instant::now()); + } + }); + AbortOnDrop(task) } /// Return true with probability `pct / 100` (clamped at 100 ⇒ always). @@ -283,20 +244,6 @@ fn roll(rng: &dyn Rng, pct: u8) -> bool { rng.pct_roll() < u32::from(pct) } -/// Cached per-endpoint snapshot used during a sweep. -struct Candidate { - addr: EndpointAddress, - success: u64, - failure: u64, - total: u64, - /// Whether this address was already ejected at the start of the - /// sweep. Re-ejecting an already-ejected address refreshes its - /// timestamp and bumps its multiplier but doesn't change the count - /// of currently-ejected addresses, so it must not consume a - /// `max_ejection_percent` budget slot. 
- already_ejected: bool, -} - #[cfg(test)] mod tests { use super::*; @@ -306,8 +253,6 @@ mod tests { use std::sync::atomic::{AtomicU32, Ordering}; use std::time::Duration; - // ----- Fixtures ----- - fn addr(port: u16) -> EndpointAddress { EndpointAddress::new("10.0.0.1", port) } @@ -357,148 +302,91 @@ mod tests { } } - fn detector_with_rng(config: OutlierDetectionConfig, rng: Box) -> OutlierDetector { - OutlierDetector::with_rng(config, rng) - } - - fn registry() -> OutlierStatsRegistry { - Arc::new(DashMap::new()) - } - - fn add(channels: &OutlierStatsRegistry, port: u16) -> Arc { - let state = Arc::new(OutlierChannelState::new()); - channels.insert(addr(port), state.clone()); - state - } - - fn ejected(channels: &OutlierStatsRegistry, port: u16) -> bool { - channels - .get(&addr(port)) - .map(|e| e.value().is_ejected()) - .unwrap_or(false) - } - - fn ejected_count(channels: &OutlierStatsRegistry) -> usize { - channels.iter().filter(|e| e.value().is_ejected()).count() - } - - // ----- OutlierChannelState (sanity) ----- - - #[test] - fn channel_state_records_and_resets() { - let s = OutlierChannelState::new(); - s.record_success(); - s.record_success(); - s.record_failure(); - assert_eq!(s.snapshot_and_reset(), (2, 1)); - assert_eq!(s.snapshot_and_reset(), (0, 0)); - } - - #[test] - fn channel_state_eject_uneject_flips_signal() { - let s = OutlierChannelState::new(); - assert!(!s.is_ejected()); - s.eject(); - assert!(s.is_ejected()); - s.uneject(); - assert!(!s.is_ejected()); + /// Drive `n` outcomes through `record_outcome` for one channel. 
+ fn drive( + registry: &OutlierStatsRegistry, + state: &OutlierChannelState, + successes: u64, + failures: u64, + ) { + for _ in 0..successes { + registry.record_outcome(state, true); + } + for _ in 0..failures { + registry.record_outcome(state, false); + } } - // ----- Failure-percentage algorithm ----- + // ----- record_outcome: failure-percentage detection ----- #[test] - fn failure_percentage_ejects_above_threshold() { - let mut detector = detector_with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); - let channels = registry(); - + fn ejects_above_threshold_inline() { + let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); + let bad = registry.add_channel(addr(8084)); for port in 8080..=8083 { - let s = add(&channels, port); - for _ in 0..100 { - s.record_success(); - } - } - let bad = add(&channels, 8084); - for _ in 0..90 { - bad.record_failure(); + let s = registry.add_channel(addr(port)); + drive(®istry, &s, 100, 0); } - for _ in 0..10 { - bad.record_success(); - } - - detector.run_sweep(Instant::now(), &channels); + drive(®istry, &bad, 10, 90); assert!(bad.is_ejected()); - for port in 8080..=8083 { - assert!(!ejected(&channels, port)); - } + assert_eq!(registry.ejected_count.load(Ordering::Relaxed), 1); } #[test] - fn failure_percentage_skips_below_threshold() { - let mut detector = detector_with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); - let channels = registry(); + fn skips_below_threshold() { + let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); + let mut all = vec![]; for port in 8080..=8084 { - let s = add(&channels, port); - // 30% failure → below threshold of 50%. - for _ in 0..70 { - s.record_success(); - } - for _ in 0..30 { - s.record_failure(); - } + let s = registry.add_channel(addr(port)); + // 30% failure → below 50% threshold. 
+ drive(®istry, &s, 70, 30); + all.push(s); + } + for s in &all { + assert!(!s.is_ejected()); } - detector.run_sweep(Instant::now(), &channels); - assert_eq!(ejected_count(&channels), 0); } #[test] - fn failure_percentage_at_threshold_does_not_eject() { + fn at_threshold_does_not_eject() { // A50 specifies a strict "greater than" comparison. - let mut detector = detector_with_rng(fp_config(50, 10, 3), FixedRng::boxed(0)); - let channels = registry(); + let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(0)); + let mut all = vec![]; for port in 8080..=8084 { - let s = add(&channels, port); - for _ in 0..50 { - s.record_success(); - } - for _ in 0..50 { - s.record_failure(); - } + let s = registry.add_channel(addr(port)); + drive(®istry, &s, 50, 50); + all.push(s); + } + for s in &all { + assert!(!s.is_ejected()); } - detector.run_sweep(Instant::now(), &channels); - assert_eq!(ejected_count(&channels), 0); } #[test] - fn minimum_hosts_gates_failure_percentage() { - let mut detector = detector_with_rng(fp_config(50, 10, 5), FixedRng::boxed(99)); - let channels = registry(); + fn minimum_hosts_gates_ejection() { + let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 5), FixedRng::boxed(99)); // Only 2 hosts have request_volume ≥ 10; minimum_hosts is 5 ⇒ skip. 
+ let mut all = vec![]; for port in 8080..=8081 { - let s = add(&channels, port); - for _ in 0..100 { - s.record_failure(); - } + let s = registry.add_channel(addr(port)); + drive(®istry, &s, 0, 100); + all.push(s); + } + for s in &all { + assert!(!s.is_ejected()); } - detector.run_sweep(Instant::now(), &channels); - assert_eq!(ejected_count(&channels), 0); } #[test] - fn request_volume_filters_low_traffic_endpoints() { - let mut detector = detector_with_rng(fp_config(50, 100, 3), FixedRng::boxed(99)); - let channels = registry(); - let bad = add(&channels, 8080); - for _ in 0..5 { - bad.record_failure(); - } + fn request_volume_filters_low_traffic() { + let registry = OutlierStatsRegistry::with_rng(fp_config(50, 100, 3), FixedRng::boxed(99)); + let bad = registry.add_channel(addr(8080)); + drive(®istry, &bad, 0, 5); for port in 8081..=8084 { - let s = add(&channels, port); - for _ in 0..200 { - s.record_success(); - } + let s = registry.add_channel(addr(port)); + drive(®istry, &s, 200, 0); } - detector.run_sweep(Instant::now(), &channels); - assert_eq!(ejected_count(&channels), 0); + assert!(!bad.is_ejected()); } #[test] @@ -509,303 +397,184 @@ mod tests { .as_mut() .unwrap() .enforcing_failure_percentage = pct(0); - let mut detector = detector_with_rng(config, FixedRng::boxed(0)); - let channels = registry(); + let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(0)); + let mut all = vec![]; for port in 8080..=8084 { - let s = add(&channels, port); - for _ in 0..100 { - s.record_failure(); - } + let s = registry.add_channel(addr(port)); + drive(®istry, &s, 0, 100); + all.push(s); + } + for s in &all { + assert!(!s.is_ejected()); } - detector.run_sweep(Instant::now(), &channels); - assert_eq!(ejected_count(&channels), 0); } - // ----- Ejection multiplier / un-ejection ----- - #[test] - fn unejects_after_base_time() { + fn max_ejection_percent_caps_concurrent_ejections() { let mut config = fp_config(50, 10, 3); - config.base_ejection_time = 
Duration::from_secs(10); - config.max_ejection_time = Duration::from_secs(60); - let mut detector = detector_with_rng(config, FixedRng::boxed(99)); - let channels = registry(); + config.max_ejection_percent = pct(20); + let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); + let mut all = vec![]; for port in 8080..=8084 { - let s = add(&channels, port); - if port == 8084 { - for _ in 0..100 { - s.record_failure(); - } - } else { - for _ in 0..100 { - s.record_success(); - } - } + let s = registry.add_channel(addr(port)); + all.push(s); + } + // Drive all hosts to bad state in parallel pseudo-order. + for s in &all { + drive(®istry, s, 0, 100); } - let t0 = Instant::now(); - detector.run_sweep(t0, &channels); - assert!(ejected(&channels, 8084)); - - // Still ejected just before base_ejection_time elapses. - detector.run_sweep(t0 + Duration::from_secs(9), &channels); - assert!(ejected(&channels, 8084)); - - // Un-eject after `base * multiplier(=1)` = 10s. - detector.run_sweep(t0 + Duration::from_secs(10), &channels); - assert!(!ejected(&channels, 8084)); + let ejected = all.iter().filter(|s| s.is_ejected()).count(); + // 5 hosts × 20% = 1 max ejection. + assert_eq!(ejected, 1); } #[test] - fn re_ejection_doubles_duration() { - // Same-sweep un-eject + re-eject grows the multiplier 1 → 2. - let mut config = fp_config(50, 10, 3); - config.base_ejection_time = Duration::from_secs(10); - config.max_ejection_time = Duration::from_secs(60); - let mut detector = detector_with_rng(config, FixedRng::boxed(99)); - let channels = registry(); - - let bad = add(&channels, 8084); - for port in 8080..=8083 { - let s = add(&channels, port); - for _ in 0..100 { - s.record_success(); - } - } - for _ in 0..100 { - bad.record_failure(); - } - - // Sweep 1: eject. Multiplier 0 → 1. - let t0 = Instant::now(); - detector.run_sweep(t0, &channels); - assert!(bad.is_ejected()); - - // Re-record stats so sweep 2 has volume to evaluate. 
+ fn remove_channel_decrements_counters() { + let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); + let mut all = vec![]; for port in 8080..=8083 { - let s = channels.get(&addr(port)).unwrap().value().clone(); - for _ in 0..100 { - s.record_success(); - } + let s = registry.add_channel(addr(port)); + drive(®istry, &s, 100, 0); + all.push(s); } - for _ in 0..100 { - bad.record_failure(); - } - - // Sweep 2 at t0+10: re-ejection refreshes timestamp, multiplier 1 → 2. - detector.run_sweep(t0 + Duration::from_secs(10), &channels); - assert!(bad.is_ejected()); - - // Re-ejection started at t0+10 with multiplier=2 → duration 20s. - detector.run_sweep(t0 + Duration::from_secs(29), &channels); + let bad = registry.add_channel(addr(8084)); + drive(®istry, &bad, 0, 100); assert!(bad.is_ejected()); + assert_eq!(registry.ejected_count.load(Ordering::Relaxed), 1); + // Each healthy host crossed request_volume; bad too. So + // qualifying_count = 5. + assert_eq!(registry.qualifying_count.load(Ordering::Relaxed), 5); - // Un-ejects at the 20s mark (30s after t0). 
- detector.run_sweep(t0 + Duration::from_secs(30), &channels); - assert!(!bad.is_ejected()); + registry.remove_channel(&addr(8084)); + assert_eq!(registry.ejected_count.load(Ordering::Relaxed), 0); + assert_eq!(registry.qualifying_count.load(Ordering::Relaxed), 4); } - #[test] - fn ejection_capped_by_max_ejection_time() { - let mut config = fp_config(50, 10, 3); - config.base_ejection_time = Duration::from_secs(10); - config.max_ejection_time = Duration::from_secs(15); - let mut detector = detector_with_rng(config, FixedRng::boxed(99)); - let channels = registry(); + // ----- Housekeeping ----- - for port in 8080..=8084 { - add(&channels, port); + #[test] + fn housekeeping_resets_counters_and_qualifying() { + let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); + for port in 8080..=8083 { + let s = registry.add_channel(addr(port)); + drive(®istry, &s, 100, 0); } - let t0 = Instant::now(); - // Force multiplier=10 on 8084 directly. We need to drive a - // first sweep to populate `state[8084]`, then fix it up. - detector.run_sweep(t0, &channels); - let alg = detector.state.get_mut(&addr(8084)).unwrap(); - alg.ejection_multiplier = 10; - alg.ejected_at = Some(t0); - channels.get(&addr(8084)).unwrap().value().eject(); + assert_eq!(registry.qualifying_count.load(Ordering::Relaxed), 4); - // base*multiplier = 100s; cap = 15s → un-eject after 16s. 
- detector.run_sweep(t0 + Duration::from_secs(16), &channels); - assert!(!ejected(&channels, 8084)); + registry.run_housekeeping(Instant::now()); + assert_eq!(registry.qualifying_count.load(Ordering::Relaxed), 0); + for port in 8080..=8083 { + let s = registry.channels.get(&addr(port)).unwrap(); + assert_eq!(s.counters(), (0, 0)); + } } #[test] - fn max_ejection_percent_caps_concurrent_ejections() { + fn housekeeping_unejects_after_base_time() { let mut config = fp_config(50, 10, 3); - config.max_ejection_percent = pct(20); - let mut detector = detector_with_rng(config, FixedRng::boxed(99)); - let channels = registry(); + config.base_ejection_time = Duration::from_secs(10); + config.max_ejection_time = Duration::from_secs(60); + let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); - for port in 8080..=8084 { - let s = add(&channels, port); - for _ in 0..100 { - s.record_failure(); - } + let bad = registry.add_channel(addr(8084)); + for port in 8080..=8083 { + let s = registry.add_channel(addr(port)); + drive(®istry, &s, 100, 0); } - detector.run_sweep(Instant::now(), &channels); - assert_eq!(ejected_count(&channels), 1); - } + drive(®istry, &bad, 0, 100); + assert!(bad.is_ejected()); - #[test] - fn already_ejected_re_ejection_does_not_consume_budget() { - // 5 hosts: one already ejected, four newly bad. Cap permits 3 - // concurrently ejected, with 1 already taken — so 2 new - // ejections remain in budget. - let mut config = fp_config(50, 10, 3); - config.max_ejection_percent = pct(60); - let mut detector = detector_with_rng(config, FixedRng::boxed(99)); - let channels = registry(); - - // Pre-eject 8080 by driving one sweep with bad stats. - let already_bad = add(&channels, 8080); - for _ in 0..100 { - already_bad.record_failure(); - } - // Use a tiny first sweep to enter ejected state via the algorithm. 
- // Need at least minimum_hosts=3 candidates with volume; add three - // healthy hosts with ≥10 requests so the algorithm runs and the - // single bad one is ejected (cap 60% of 4 hosts = 2 → budget 2 → 1 - // new ejection). - for port in 8085..=8087 { - let s = add(&channels, port); - for _ in 0..100 { - s.record_success(); - } - } + // Advance fewer than base_ejection_time ⇒ stays ejected. let t0 = Instant::now(); - detector.run_sweep(t0, &channels); - assert!(already_bad.is_ejected()); - - // Now grow the cluster to 5 hosts (8080 + 8081..=8084) and feed - // bad stats. 8085..=8087 are no longer relevant — drop them. - channels.remove(&addr(8085)); - channels.remove(&addr(8086)); - channels.remove(&addr(8087)); - for port in 8081..=8084 { - let s = add(&channels, port); - for _ in 0..100 { - s.record_failure(); - } - } - for _ in 0..100 { - already_bad.record_failure(); - } + registry.run_housekeeping(t0 + Duration::from_secs(9)); + assert!(bad.is_ejected()); - detector.run_sweep(t0 + Duration::from_secs(2), &channels); - // Cap = 60% of 5 = 3. already_ejected = 1. Budget = 2. Plus - // 8080's re-eject which doesn't consume budget. So 2 NEW - // ejections among 8081..=8084. - let new_ejects = (8081..=8084).filter(|p| ejected(&channels, *p)).count(); - assert_eq!(new_ejects, 2); + // After base_ejection_time × 1 elapsed ⇒ uneject. + registry.run_housekeeping(t0 + Duration::from_secs(20)); + assert!(!bad.is_ejected()); + assert_eq!(registry.ejected_count.load(Ordering::Relaxed), 0); } #[test] - fn multiplier_decrements_on_healthy_interval() { - let mut detector = detector_with_rng(base_config(), FixedRng::boxed(99)); - let channels = registry(); - let s = add(&channels, 8080); - // First sweep populates the alg state. - detector.run_sweep(Instant::now(), &channels); - // Force multiplier to 3 without ejecting. 
- detector - .state - .get_mut(&addr(8080)) - .unwrap() - .ejection_multiplier = 3; - s.record_success(); - detector.run_sweep(Instant::now(), &channels); - assert_eq!( - detector.state.get(&addr(8080)).unwrap().ejection_multiplier, - 2, - ); - } + fn housekeeping_decrements_multiplier_on_healthy_interval() { + let registry = OutlierStatsRegistry::with_rng(base_config(), FixedRng::boxed(99)); + let s = registry.add_channel(addr(8080)); + // Force multiplier to 3 directly (no traffic, no eject). + s.set_ejection_multiplier(3); - #[test] - fn multiplier_decrements_even_without_traffic() { - // A50: a non-ejected address with multiplier > 0 has its - // multiplier decremented every sweep, regardless of traffic. - let mut detector = detector_with_rng(base_config(), FixedRng::boxed(99)); - let channels = registry(); - add(&channels, 8080); - detector.run_sweep(Instant::now(), &channels); - detector - .state - .get_mut(&addr(8080)) - .unwrap() - .ejection_multiplier = 3; - detector.run_sweep(Instant::now(), &channels); - assert_eq!( - detector.state.get(&addr(8080)).unwrap().ejection_multiplier, - 2, - ); + registry.run_housekeeping(Instant::now()); + assert_eq!(s.ejection_multiplier(), 2); } #[test] - fn alg_state_dropped_when_channel_removed() { - let mut detector = detector_with_rng(base_config(), FixedRng::boxed(99)); - let channels = registry(); - add(&channels, 8080); - add(&channels, 8081); - detector.run_sweep(Instant::now(), &channels); - assert_eq!(detector.state.len(), 2); - - channels.remove(&addr(8080)); - detector.run_sweep(Instant::now(), &channels); - assert_eq!(detector.state.len(), 1); - assert!(detector.state.contains_key(&addr(8081))); - } - - // ----- Spawned actor ----- - - #[tokio::test(start_paused = true)] - async fn spawned_actor_runs_sweeps_on_tick() { + fn housekeeping_caps_ejection_at_max_ejection_time() { let mut config = fp_config(50, 10, 3); - config.interval = Duration::from_millis(100); - let channels = registry(); - - for port in 
8080..=8083 { - let s = add(&channels, port); - for _ in 0..100 { - s.record_success(); - } - } - let bad = add(&channels, 8084); - for _ in 0..100 { - bad.record_failure(); - } - - let _abort = OutlierDetector::spawn_with_rng(config, FixedRng::boxed(99), channels.clone()); + config.base_ejection_time = Duration::from_secs(10); + config.max_ejection_time = Duration::from_secs(15); + let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); - // Advance past the first sweep tick. The yield gives the - // spawned actor a turn to run after time advances. - tokio::time::advance(Duration::from_millis(150)).await; - tokio::task::yield_now().await; - tokio::task::yield_now().await; + let s = registry.add_channel(addr(8080)); + // Pretend 8080 was ejected long ago with a huge multiplier. + s.try_eject(Instant::now()); + s.set_ejection_multiplier(10); + registry.ejected_count.fetch_add(0, Ordering::Relaxed); // try_eject already added 1 - assert!(bad.is_ejected()); - for port in 8080..=8083 { - assert!(!ejected(&channels, port)); - } + // base * multiplier = 100s, but cap = 15s. Sweep at 16s ⇒ uneject. + let t0 = Instant::now(); + registry.run_housekeeping(t0 + Duration::from_secs(16)); + assert!(!s.is_ejected()); } + // ----- Spawned actor ----- + // + // The actor's algorithmic behavior is fully exercised by the + // synchronous `housekeeping_*` tests above; here we only verify + // that dropping the `AbortOnDrop` handle reliably stops the task. 
+ #[tokio::test(start_paused = true)] async fn dropping_abort_stops_actor() { let mut config = base_config(); config.interval = Duration::from_millis(50); - let channels = registry(); - let bad = add(&channels, 8080); + let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); + let s = registry.add_channel(addr(8080)); + s.set_ejection_multiplier(5); - let abort = OutlierDetector::spawn(config, channels.clone()); + let abort = spawn_actor(registry.clone()); drop(abort); - // Even after several tick periods, no sweep should have run - // because the task was aborted. + // Even with several tick periods elapsed, no housekeeping + // should have run because the task was aborted. tokio::time::advance(Duration::from_millis(500)).await; + tokio::task::yield_now().await; - // The bad channel had no traffic recorded, so neither side - // would eject — but verify nothing happened to the signal. - assert!(!bad.is_ejected()); + assert_eq!(s.ejection_multiplier(), 5); + } + + // ----- OutlierChannelState sanity (kept in this file as it is the + // primary consumer of the type) ----- + + #[test] + fn channel_state_records_and_resets() { + let s = OutlierChannelState::new(); + s.record_success(); + s.record_success(); + s.record_failure(); + assert_eq!(s.snapshot_and_reset(), (2, 1)); + assert_eq!(s.snapshot_and_reset(), (0, 0)); + } + + #[test] + fn channel_state_try_eject_uneject_flips_signal() { + let s = OutlierChannelState::new(); + assert!(!s.is_ejected()); + assert!(s.try_eject(Instant::now())); + assert!(s.is_ejected()); + // Second call is a no-op. 
+ assert!(!s.try_eject(Instant::now())); + assert!(s.try_uneject()); + assert!(!s.is_ejected()); + assert!(!s.try_uneject()); } } From 47944fda6b0ea158317025b97511c06570f81a00 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Mon, 11 May 2026 10:38:06 -0700 Subject: [PATCH 21/33] refactor(tonic-xds): lift outlier state out of Connecting/EjectedChannel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Outlier-detection state belongs to `ReadyChannel` — the only state machine variant that serves traffic. `ConnectingChannel` is just a connect future and `EjectedChannel` is just a cooldown timer; neither reads or writes counters or the ejection signal, so neither should carry the `Arc`. Changes: - `ConnectingChannel::Output` is now bare `S` (was `ReadyChannel`). The captured async block no longer holds an outlier state; the address is kept by the caller (typically as the key in `KeyedFutures`). - `EjectedChannel` drops its `outlier` field. `UnejectedChannel:: Ready(S)` now carries a bare service; the consumer re-attaches the registry-supplied outlier state when wrapping it back into a `ReadyChannel`. - `ReadyChannel` gains an explicit `new(addr, inner, outlier)` constructor so the outlier state is required at construction time. - `ReadyChannel::eject` and `ReadyChannel::reconnect` drop the outlier reference — it lives in the registry, keyed by address, and survives the cycle. - `LoadBalancer::connecting` is now `KeyedFutures` (was over `ReadyChannel`). `poll_connecting` wraps the resolved service into a `ReadyChannel` with a fresh `OutlierChannelState`; the integration PR replaces the fresh state with one supplied by the `OutlierStatsRegistry`. Tests in `channel_state.rs` use a small `wrap_ready` helper to build `ReadyChannel` instances from the bare services returned by `IdleChannel::connect()`. 
--- .../src/client/loadbalance/channel_state.rs | 130 ++++++++++-------- .../src/client/loadbalance/loadbalancer.rs | 17 ++- 2 files changed, 84 insertions(+), 63 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/channel_state.rs b/tonic-xds/src/client/loadbalance/channel_state.rs index 7885705a1..b87414bc1 100644 --- a/tonic-xds/src/client/loadbalance/channel_state.rs +++ b/tonic-xds/src/client/loadbalance/channel_state.rs @@ -259,8 +259,11 @@ pub(crate) struct EjectionConfig { /// Result of an ejection expiring. pub(crate) enum UnejectedChannel { - /// The channel is ready to serve again (ejection expired, no reconnect needed). - Ready(ReadyChannel), + /// The channel is ready to serve again (ejection expired, no + /// reconnect needed). The consumer wraps the bare service into a + /// [`ReadyChannel`] using the registry-supplied + /// [`OutlierChannelState`]. + Ready(S), /// A fresh connection has been started. Connecting(ConnectingChannel), } @@ -295,44 +298,31 @@ impl IdleChannel { /// A channel that is in the process of connecting. /// -/// Implements [`Future`] -- resolves to [`ReadyChannel`] when connected. +/// Implements [`Future`] -- resolves to the connected service `S` +/// when the connection completes. The consumer wraps that into a +/// [`ReadyChannel`] (attaching its [`OutlierChannelState`]). /// Cancellation is handled externally via [`KeyedFutures::cancel`]. /// +/// `ConnectingChannel` deliberately does not carry an +/// [`OutlierChannelState`]: it does not serve traffic, so it has +/// nothing to count or signal. +/// /// [`KeyedFutures::cancel`]: crate::client::loadbalance::keyed_futures::KeyedFutures::cancel pub(crate) struct ConnectingChannel { - inner: Pin> + Send>>, + inner: Pin + Send>>, } impl ConnectingChannel { - /// Start a connection, generating a fresh per-channel outlier - /// state. Used for first-time connects from `IdleChannel`. 
- pub(crate) fn new(fut: BoxFuture, addr: EndpointAddress) -> Self { - Self::with_outlier(fut, addr, Arc::new(OutlierChannelState::new())) - } - - /// Start a connection that inherits an existing - /// [`OutlierChannelState`]. Used by reconnect paths so the - /// per-channel counters and ejection signal survive across the - /// connection cycle. - pub(crate) fn with_outlier( - fut: BoxFuture, - addr: EndpointAddress, - outlier: Arc, - ) -> Self { - Self { - inner: Box::pin(async move { - ReadyChannel { - addr, - inner: fut.await, - outlier, - } - }), - } + /// Start a connection. The address is kept by the caller (it is + /// typically the key in a `KeyedFutures` map); only the future is + /// stored here. + pub(crate) fn new(fut: BoxFuture, _addr: EndpointAddress) -> Self { + Self { inner: fut } } } impl Future for ConnectingChannel { - type Output = ReadyChannel; + type Output = S; fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { self.get_mut().inner.as_mut().poll(cx) @@ -348,7 +338,9 @@ impl Future for ConnectingChannel { /// Holds the raw service `S` and delegates [`Service`] calls directly, /// preserving `S::Future` and `S::Error` with no wrapping or type /// erasure. The `Arc` is shared with the outlier- -/// detection actor for stats accumulation and edge-triggered ejection. +/// detection actor for stats accumulation and edge-triggered ejection; +/// because only `ReadyChannel` serves traffic, only `ReadyChannel` +/// carries this state. #[derive(Clone)] pub(crate) struct ReadyChannel { addr: EndpointAddress, @@ -357,13 +349,26 @@ pub(crate) struct ReadyChannel { } impl ReadyChannel { + /// Wrap a connected service `S` into a [`ReadyChannel`] using the + /// caller-supplied outlier state. + pub(crate) fn new(addr: EndpointAddress, inner: S, outlier: Arc) -> Self { + Self { + addr, + inner, + outlier, + } + } + /// Per-channel outlier-detection state. Cloned cheaply via `Arc`. 
#[allow(dead_code)] // consumed by the LoadBalancer in a follow-up PR. pub(crate) fn outlier(&self) -> &Arc { &self.outlier } - /// Eject this channel (e.g., due to outlier detection). Consumes self. + /// Eject this channel (e.g., due to outlier detection). Consumes + /// self. The outlier state remains in the registry; only the + /// service and address are passed into [`EjectedChannel`] (which + /// just times the cooldown). pub(crate) fn eject(self, config: EjectionConfig, connector: Arc) -> EjectedChannel where C: Connector + Send + Sync + 'static, @@ -372,15 +377,15 @@ impl ReadyChannel { EjectedChannel { addr: self.addr, inner: self.inner, - outlier: self.outlier, config, connector, ejection_timer, } } - /// Start reconnecting. Consumes self, dropping the old connection - /// but preserving the outlier-detection state. + /// Start reconnecting. Consumes self, dropping the old connection. + /// The outlier state remains in the registry; the consumer + /// re-attaches it when the new [`ReadyChannel`] is constructed. pub(crate) fn reconnect>( self, connector: Arc, @@ -388,7 +393,7 @@ impl ReadyChannel { where S: Send + 'static, { - ConnectingChannel::with_outlier(connector.connect(&self.addr), self.addr, self.outlier) + ConnectingChannel::new(connector.connect(&self.addr), self.addr) } } @@ -424,14 +429,19 @@ impl Load for ReadyChannel { pin_project! { /// A channel that has been ejected and is cooling down. /// - /// The underlying connection is kept alive but cannot serve requests. - /// Implements [`Future`] -- resolves once the ejection timer expires to either: + /// The underlying connection is kept alive but cannot serve + /// requests. 
Implements [`Future`] -- resolves once the ejection + /// timer expires to either: /// - [`UnejectedChannel::Ready`] if no reconnect is needed /// - [`UnejectedChannel::Connecting`] if a fresh connection is required + /// + /// `EjectedChannel` deliberately does not carry an + /// [`OutlierChannelState`]: the state lives in the registry, keyed + /// by address, and the consumer re-attaches it when the channel + /// transitions back to [`ReadyChannel`]. pub(crate) struct EjectedChannel { addr: EndpointAddress, inner: S, - outlier: Arc, config: EjectionConfig, connector: Arc + Send + Sync>, #[pin] @@ -448,19 +458,12 @@ impl Future for EjectedChannel { Poll::Ready(()) => { if this.config.needs_reconnect { let fut = this.connector.connect(this.addr); - Poll::Ready(UnejectedChannel::Connecting( - ConnectingChannel::with_outlier( - fut, - this.addr.clone(), - this.outlier.clone(), - ), - )) + Poll::Ready(UnejectedChannel::Connecting(ConnectingChannel::new( + fut, + this.addr.clone(), + ))) } else { - Poll::Ready(UnejectedChannel::Ready(ReadyChannel { - addr: this.addr.clone(), - inner: this.inner.clone(), - outlier: this.outlier.clone(), - })) + Poll::Ready(UnejectedChannel::Ready(this.inner.clone())) } } Poll::Pending => Poll::Pending, @@ -529,17 +532,23 @@ mod tests { assert_eq!(connector.connect_count.load(Ordering::SeqCst), 1); } + fn wrap_ready(addr: EndpointAddress, svc: MockService) -> ReadyChannel { + ReadyChannel::new(addr, svc, Arc::new(OutlierChannelState::new())) + } + #[tokio::test] - async fn test_connecting_future_yields_ready() { + async fn test_connecting_future_yields_service() { let connector = MockConnector::new(); - let ready = IdleChannel::new(test_addr()).connect(connector).await; - assert_eq!(ready.addr, test_addr()); + let svc: MockService = IdleChannel::new(test_addr()).connect(connector).await; + // The bare service is what `ConnectingChannel` resolves to. 
+ let _ready = wrap_ready(test_addr(), svc); } #[tokio::test] async fn test_ready_service_delegates() { let connector = MockConnector::new(); - let mut ready = IdleChannel::new(test_addr()).connect(connector).await; + let svc = IdleChannel::new(test_addr()).connect(connector).await; + let mut ready = wrap_ready(test_addr(), svc); let resp: &str = ready.call("hello").await.unwrap(); assert_eq!(resp, "ok"); } @@ -547,9 +556,10 @@ mod tests { #[tokio::test] async fn test_ready_to_connecting_via_reconnect() { let connector = MockConnector::new(); - let ready = IdleChannel::new(test_addr()) + let svc = IdleChannel::new(test_addr()) .connect(connector.clone()) .await; + let ready = wrap_ready(test_addr(), svc); let _reconnecting = ready.reconnect(connector.clone()); assert_eq!(connector.connect_count.load(Ordering::SeqCst), 2); } @@ -562,7 +572,7 @@ mod tests { let connecting = ConnectingChannel::new(Box::pin(async move { rx.await.unwrap() }), test_addr()); - let mut set: KeyedFutures> = KeyedFutures::new(); + let mut set: KeyedFutures = KeyedFutures::new(); set.add(test_addr(), connecting).unwrap(); assert!(matches!(set.poll_next(&mut noop_cx()), Poll::Pending)); @@ -580,7 +590,7 @@ mod tests { let connecting = ConnectingChannel::new(Box::pin(future::pending::()), test_addr()); - let mut set: KeyedFutures> = KeyedFutures::new(); + let mut set: KeyedFutures = KeyedFutures::new(); set.add(test_addr(), connecting).unwrap(); assert!(matches!(set.poll_next(&mut noop_cx()), Poll::Pending)); @@ -592,9 +602,10 @@ mod tests { #[tokio::test(start_paused = true)] async fn test_ejected_in_keyed_futures_ready() { let connector = MockConnector::new(); - let ready = IdleChannel::new(test_addr()) + let svc = IdleChannel::new(test_addr()) .connect(connector.clone()) .await; + let ready = wrap_ready(test_addr(), svc); let ejected = ready.eject( EjectionConfig { timeout: Duration::from_secs(5), @@ -617,9 +628,10 @@ mod tests { #[tokio::test(start_paused = true)] async fn 
test_ejected_in_keyed_futures_needs_reconnect() { let connector = MockConnector::new(); - let ready = IdleChannel::new(test_addr()) + let svc = IdleChannel::new(test_addr()) .connect(connector.clone()) .await; + let ready = wrap_ready(test_addr(), svc); let ejected = ready.eject( EjectionConfig { timeout: Duration::from_secs(5), diff --git a/tonic-xds/src/client/loadbalance/loadbalancer.rs b/tonic-xds/src/client/loadbalance/loadbalancer.rs index 3a1a0171f..61bc6681a 100644 --- a/tonic-xds/src/client/loadbalance/loadbalancer.rs +++ b/tonic-xds/src/client/loadbalance/loadbalancer.rs @@ -14,7 +14,7 @@ use tower::Service; use tower::discover::{Change, Discover}; use crate::client::endpoint::{Connector, EndpointAddress}; -use crate::client::loadbalance::channel_state::{IdleChannel, ReadyChannel}; +use crate::client::loadbalance::channel_state::{IdleChannel, OutlierChannelState, ReadyChannel}; use crate::client::loadbalance::errors::LbError; use crate::client::loadbalance::keyed_futures::KeyedFutures; use crate::client::loadbalance::pickers::ChannelPicker; @@ -58,7 +58,10 @@ pub(crate) struct LoadBalancer { /// Connector for creating connections from idle channels. connector: Arc, /// In-flight connection attempts, keyed by endpoint address. - connecting: KeyedFutures>, + /// `ConnectingChannel` resolves to the bare service; the LB wraps + /// it into a `ReadyChannel` with an outlier state when it + /// transitions to ready. + connecting: KeyedFutures, /// Ready-to-serve channels, keyed by endpoint address. ready: IndexMap>, /// Channel picker for load balancing. @@ -117,9 +120,15 @@ where } } - /// Drain completed connection futures into the ready set. + /// Drain completed connection futures into the ready set. Wraps + /// the bare service into a `ReadyChannel` with a fresh + /// `OutlierChannelState`. The outlier-detection PR will replace + /// the fresh state with one looked up from the + /// `OutlierStatsRegistry`. 
fn poll_connecting(&mut self, cx: &mut Context<'_>) { - while let Poll::Ready(Some((addr, ready))) = self.connecting.poll_next(cx) { + while let Poll::Ready(Some((addr, svc))) = self.connecting.poll_next(cx) { + let outlier = Arc::new(OutlierChannelState::new()); + let ready = ReadyChannel::new(addr.clone(), svc, outlier); self.ready.insert(addr, ready); } } From 5e835e76ba7e7ff4b50acca4aac54b81412640b5 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Mon, 11 May 2026 11:05:33 -0700 Subject: [PATCH 22/33] feat(tonic-xds): integrate outlier detection with LoadBalancer Wire the outlier-detection registry into `LoadBalancer` end-to-end: - `LoadBalancer::with_outlier(discovery, connector, picker, Some(registry))` constructs an LB that participates in outlier detection. The plain `new(...)` constructor is a thin alias that passes `None` (no outlier detection); existing tests are unchanged. - At construction, the housekeeping actor is spawned via `spawn_actor(registry)`; the returned `AbortOnDrop` is stored on the LB so the actor stops when the LB is dropped. - `poll_discover` now also unhooks the registry entry, the ejection signal stream, and any ejected slot when an address is removed or re-inserted. - `poll_connecting` registers the new channel with the registry (`registry.add_channel(addr)`), subscribes to its ejection signal via `WatchStream::from_changes`, and inserts the stream into a `StreamMap<EndpointAddress, WatchStream<bool>>`. - A new `poll_ejection_signals` step in `poll_ready` drains the `StreamMap` in amortized O(1) per transition, moving channels between `ready: IndexMap` and a new `ejected: HashMap`. The picker continues to see only `ready`, so ejected endpoints are automatically excluded from selection. - `call` clones the picked channel's `OutlierChannelState` and, after the inner call completes, invokes `registry.record_outcome(state, result.is_ok())`. Per-RPC detection runs inline; the LB's critical path stays O(1) in the number of endpoints.
Other changes: - `OutlierStatsRegistry::add_channel` is idempotent: re-inserting an existing address returns the existing state so reconnect cycles preserve counters and ejection bookkeeping. - Cargo: `tokio-stream` gains the `sync` feature to expose `WatchStream`. Three integration tests cover: a failing endpoint gets ejected and removed from `ready`; a healthy cluster sees no ejections; endpoint removal cleans up the registry. --- tonic-xds/Cargo.toml | 2 +- .../src/client/loadbalance/loadbalancer.rs | 269 ++++++++++++++++-- .../client/loadbalance/outlier_detection.rs | 13 +- 3 files changed, 261 insertions(+), 23 deletions(-) diff --git a/tonic-xds/Cargo.toml b/tonic-xds/Cargo.toml index 8d94f3342..84e1246e0 100644 --- a/tonic-xds/Cargo.toml +++ b/tonic-xds/Cargo.toml @@ -45,7 +45,7 @@ tokio = { version = "1", features = ["sync", "time"] } fastrand = "2" indexmap = "2" tracing = "0.1" -tokio-stream = "0.1" +tokio-stream = { version = "0.1", features = ["sync"] } tokio-util = "0.7" backoff = "0.4" shared_http_body = "0.1" diff --git a/tonic-xds/src/client/loadbalance/loadbalancer.rs b/tonic-xds/src/client/loadbalance/loadbalancer.rs index 61bc6681a..d11c57f52 100644 --- a/tonic-xds/src/client/loadbalance/loadbalancer.rs +++ b/tonic-xds/src/client/loadbalance/loadbalancer.rs @@ -3,13 +3,23 @@ //! Receives endpoint updates via [`tower::discover::Discover`] (yielding //! [`IdleChannel`]s), manages the connection lifecycle via the channel state //! machine, and routes requests to ready endpoints via a [`ChannelPicker`]. - +//! +//! Outlier detection is integrated via an optional +//! [`OutlierStatsRegistry`]: ejection decisions are made on the data +//! path (per-RPC) and surfaced to `poll_ready` via per-channel +//! `watch::Receiver` streams aggregated in a `StreamMap`. The +//! LB then moves the corresponding [`ReadyChannel`] between its ready +//! and ejected maps in O(1) per transition. 
+ +use std::collections::HashMap; use std::future::Future; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll, ready}; use indexmap::IndexMap; +use tokio_stream::StreamMap; +use tokio_stream::wrappers::WatchStream; use tower::Service; use tower::discover::{Change, Discover}; @@ -17,7 +27,9 @@ use crate::client::endpoint::{Connector, EndpointAddress}; use crate::client::loadbalance::channel_state::{IdleChannel, OutlierChannelState, ReadyChannel}; use crate::client::loadbalance::errors::LbError; use crate::client::loadbalance::keyed_futures::KeyedFutures; +use crate::client::loadbalance::outlier_detection::{OutlierStatsRegistry, spawn_actor}; use crate::client::loadbalance::pickers::ChannelPicker; +use crate::common::async_util::AbortOnDrop; /// Future returned by [`LoadBalancer::call`]. /// @@ -64,6 +76,20 @@ pub(crate) struct LoadBalancer { connecting: KeyedFutures, /// Ready-to-serve channels, keyed by endpoint address. ready: IndexMap>, + /// Channels currently ejected by outlier detection. Their + /// underlying connections are kept alive so traffic can resume + /// without reconnecting after un-ejection. + ejected: HashMap>, + /// Per-channel ejection signal streams, aggregated for O(1) + /// observation in `poll_ready`. Present only when outlier + /// detection is enabled. + ejection_signals: StreamMap>, + /// Outlier-detection registry, shared with the spawned actor and + /// the data path. `None` disables outlier detection. + outlier: Option>, + /// Handle to the outlier-detection actor task; dropped when the + /// LB is dropped. + _outlier_actor: Option, /// Channel picker for load balancing. picker: Arc, Req> + Send + Sync>, } @@ -75,21 +101,51 @@ where C: Connector + Send + Sync + 'static, C::Service: Send + 'static, { - /// Create a new load balancer with the given picker. + /// Create a load balancer with no outlier detection. 
pub(crate) fn new( discovery: D, connector: Arc, picker: Arc, Req> + Send + Sync>, ) -> Self { + Self::with_outlier(discovery, connector, picker, None) + } + + /// Create a load balancer, optionally enabling outlier detection. + /// When `outlier` is `Some`, the registry's housekeeping actor is + /// spawned and its lifetime is bound to the load balancer. + pub(crate) fn with_outlier( + discovery: D, + connector: Arc, + picker: Arc, Req> + Send + Sync>, + outlier: Option>, + ) -> Self { + let _outlier_actor = outlier.as_ref().map(|reg| spawn_actor(reg.clone())); Self { discovery, connector, connecting: KeyedFutures::new(), ready: IndexMap::new(), + ejected: HashMap::new(), + ejection_signals: StreamMap::new(), + outlier, + _outlier_actor, picker, } } + /// Forget all per-endpoint state for `addr`: the connecting + /// future, the ready slot, the ejected slot, the ejection signal + /// stream, and the registry entry. + fn forget_endpoint(&mut self, addr: &EndpointAddress) { + let _ = self.connecting.cancel(addr); + self.ready.swap_remove(addr); + self.ejected.remove(addr); + self.ejection_signals.remove(addr); + if let Some(registry) = self.outlier.as_ref() { + registry.remove_channel(addr); + } + } + /// Drain pending discovery events. 
Either resolves to an error /// ([`LbError::DiscoverClosed`] or [`LbError::DiscoverError`]) or stays /// pending — there is no success outcome since the loop only exits on @@ -106,32 +162,56 @@ where Some(Err(e)) => return Poll::Ready(LbError::DiscoverError(e.into())), Some(Ok(Change::Insert(addr, idle))) => { tracing::trace!("discovery: insert {addr}"); - let _ = self.connecting.cancel(&addr); - self.ready.swap_remove(&addr); + self.forget_endpoint(&addr); let connecting = idle.connect(self.connector.clone()); let _ = self.connecting.add(addr, connecting); } Some(Ok(Change::Remove(addr))) => { tracing::trace!("discovery: remove {addr}"); - let _ = self.connecting.cancel(&addr); - self.ready.swap_remove(&addr); + self.forget_endpoint(&addr); } } } } /// Drain completed connection futures into the ready set. Wraps - /// the bare service into a `ReadyChannel` with a fresh - /// `OutlierChannelState`. The outlier-detection PR will replace - /// the fresh state with one looked up from the - /// `OutlierStatsRegistry`. + /// each bare service into a `ReadyChannel` using the outlier + /// state from the registry (or a fresh state if outlier detection + /// is disabled), and subscribes to the per-channel ejection + /// signal. fn poll_connecting(&mut self, cx: &mut Context<'_>) { while let Poll::Ready(Some((addr, svc))) = self.connecting.poll_next(cx) { - let outlier = Arc::new(OutlierChannelState::new()); - let ready = ReadyChannel::new(addr.clone(), svc, outlier); + let state = match self.outlier.as_ref() { + Some(registry) => registry.add_channel(addr.clone()), + None => Arc::new(OutlierChannelState::new()), + }; + if self.outlier.is_some() { + self.ejection_signals + .insert(addr.clone(), WatchStream::from_changes(state.subscribe())); + } + let ready = ReadyChannel::new(addr.clone(), svc, state); self.ready.insert(addr, ready); } } + + /// Drain ejection-signal transitions, moving channels between + /// `ready` and `ejected`. O(k) per call where k = ready signals. 
+ fn poll_ejection_signals(&mut self, cx: &mut Context<'_>) { + use futures_core::Stream; + while let Poll::Ready(Some((addr, ejected))) = + Pin::new(&mut self.ejection_signals).poll_next(cx) + { + if ejected { + if let Some(ch) = self.ready.swap_remove(&addr) { + tracing::debug!("outlier detection: eject {addr}"); + self.ejected.insert(addr, ch); + } + } else if let Some(ch) = self.ejected.remove(&addr) { + tracing::debug!("outlier detection: uneject {addr}"); + self.ready.insert(addr, ch); + } + } + } } impl Service for LoadBalancer @@ -152,6 +232,7 @@ where fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll> { let discover_result = self.poll_discover(cx); self.poll_connecting(cx); + self.poll_ejection_signals(cx); if !self.ready.is_empty() { return Poll::Ready(Ok(())); @@ -183,16 +264,26 @@ where let Some(picked) = self.picker.pick(&req, &self.ready) else { return LbFuture::Error(Some(LbError::Unavailable)); }; - // `picked` is a read-only borrow into `self.ready`. Clone to get an - // owned service we can drive in the async block. + // `picked` is a read-only borrow into `self.ready`. Clone to get + // an owned service and outlier handle for the async block; both + // are `Arc`-shared, so cloning is cheap. let mut svc = picked.clone(); + let outlier_state = picked.outlier().clone(); + let registry = self.outlier.clone(); LbFuture::Pending(Box::pin(async move { tower::ServiceExt::ready(&mut svc) .await .map_err(|e| LbError::LbChannelPollReadyError(e.into()))?; - svc.call(req) - .await - .map_err(|e| LbError::LbChannelCallError(e.into())) + let result = svc.call(req).await; + if let Some(registry) = registry.as_ref() { + // Per-RPC outlier detection: bump the channel's + // counter and (inside `record_outcome`) possibly + // eject if the failure-percentage threshold is + // crossed. Treat any `Err` outcome as a failure for + // outlier purposes. 
+ registry.record_outcome(&outlier_state, result.is_ok()); + } + result.map_err(|e| LbError::LbChannelCallError(e.into())) })) } } @@ -667,4 +758,148 @@ mod tests { "expected LbChannelCallError, got {result:?}" ); } + + // -- Outlier-detection integration tests -- + + use crate::client::loadbalance::outlier_detection::{OutlierStatsRegistry, Rng}; + use crate::xds::resource::outlier_detection::{ + FailurePercentageConfig, OutlierDetectionConfig, Percentage, + }; + use std::time::Duration; + + fn pct(v: u32) -> Percentage { + Percentage::new(v).unwrap() + } + + struct AlwaysFireRng; + impl Rng for AlwaysFireRng { + fn pct_roll(&self) -> u32 { + 0 + } + } + + fn fp_config( + threshold: u32, + request_volume: u32, + minimum_hosts: u32, + ) -> OutlierDetectionConfig { + OutlierDetectionConfig { + interval: Duration::from_secs(60), + base_ejection_time: Duration::from_secs(30), + max_ejection_time: Duration::from_secs(300), + max_ejection_percent: pct(100), + success_rate: None, + failure_percentage: Some(FailurePercentageConfig { + threshold: pct(threshold), + enforcing_failure_percentage: pct(100), + minimum_hosts, + request_volume, + }), + } + } + + /// Build an LB with outlier detection enabled. + fn make_lb_with_outlier( + discover: MockDiscover, + config: OutlierDetectionConfig, + ) -> (Lb, Arc, Arc) { + let connector = Arc::new(MockConnector::new()); + let picker: Arc, &'static str> + Send + Sync> = + Arc::new(P2cPicker); + let registry = OutlierStatsRegistry::with_rng(config, Box::new(AlwaysFireRng)); + let lb = + LoadBalancer::with_outlier(discover, connector.clone(), picker, Some(registry.clone())); + (lb, connector, registry) + } + + /// Issue `n` calls through the LB, asserting that each one succeeds. + async fn call_each(lb: &mut Lb, n: usize) { + for _ in 0..n { + lb.call("hello").await.unwrap(); + } + } + + #[tokio::test] + async fn test_outlier_detection_ejects_failing_endpoint() { + // 5 endpoints, all healthy except 8084.
Once 8084's failures + // cross the threshold, it should be moved out of `ready` and + // into `ejected`. + let (tx, discover) = new_discover(); + let (mut lb, connector, registry) = make_lb_with_outlier( + discover, + fp_config( + /*threshold*/ 50, /*request_volume*/ 5, /*minimum_hosts*/ 3, + ), + ); + + for port in 8080..=8084 { + tx.send(Ok(Change::Insert(addr(port), IdleChannel::new(addr(port))))) + .await + .unwrap(); + } + drive_to_ready(&mut lb, &connector).await; + assert_eq!(lb.ready.len(), 5); + + // Configure 8084 to always fail. Other endpoints stay healthy. + connector + .service(&addr(8084)) + .fail_call + .store(true, Ordering::Relaxed); + + // Drive enough calls to ensure 8084 reaches request_volume + // and its failure rate triggers ejection. With 5 endpoints + // and P2C picking, each gets ~k/5 calls; drive 100 to be safe. + for _ in 0..100 { + let _ = lb.call("hello").await; + } + + // poll_ready drains the ejection signal and moves 8084. + let _ = poll_ready_now(&mut lb); + assert!( + lb.ejected.contains_key(&addr(8084)), + "8084 should be ejected; ejected map: {:?}, ready keys: {:?}", + lb.ejected.keys().collect::>(), + lb.ready.keys().collect::>(), + ); + assert!(!lb.ready.contains_key(&addr(8084))); + // Ejection must not unregister the endpoint: the registry still tracks all five.
+ assert!(registry.len() == 5); + } + + #[tokio::test] + async fn test_outlier_detection_healthy_cluster_no_ejections() { + let (tx, discover) = new_discover(); + let (mut lb, connector, _registry) = make_lb_with_outlier(discover, fp_config(50, 5, 3)); + + for port in 8080..=8084 { + tx.send(Ok(Change::Insert(addr(port), IdleChannel::new(addr(port))))) + .await + .unwrap(); + } + drive_to_ready(&mut lb, &connector).await; + assert_eq!(lb.ready.len(), 5); + + call_each(&mut lb, 50).await; + + let _ = poll_ready_now(&mut lb); + assert_eq!(lb.ejected.len(), 0); + assert_eq!(lb.ready.len(), 5); + } + + #[tokio::test] + async fn test_outlier_detection_endpoint_removal_cleans_registry() { + let (tx, discover) = new_discover(); + let (mut lb, connector, registry) = make_lb_with_outlier(discover, fp_config(50, 5, 3)); + + tx.send(Ok(Change::Insert(addr(8080), IdleChannel::new(addr(8080))))) + .await + .unwrap(); + drive_to_ready(&mut lb, &connector).await; + assert_eq!(registry.len(), 1); + + tx.send(Ok(Change::Remove(addr(8080)))).await.unwrap(); + let _ = poll_ready_now(&mut lb); + assert_eq!(registry.len(), 0); + assert_eq!(lb.ready.len(), 0); + } } diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 5295a09c7..1cbfb6233 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -92,13 +92,16 @@ impl OutlierStatsRegistry { }) } - /// Register a new channel. Returns the `Arc` + /// Register a channel and return the `Arc` /// the load balancer wires into the channel; the same `Arc` is - /// retained in the registry so the actor can iterate it. + /// retained in the registry so the actor can iterate it. If a + /// state for this address already exists, returns it untouched — + /// state continuity across reconnect cycles is preserved. 
pub(crate) fn add_channel(&self, addr: EndpointAddress) -> Arc { - let state = Arc::new(OutlierChannelState::new()); - self.channels.insert(addr, state.clone()); - state + self.channels + .entry(addr) + .or_insert_with(|| Arc::new(OutlierChannelState::new())) + .clone() } /// Forget a channel. Drops the registry's reference; cluster-wide From b72040d3b023020a55cdbd8772643ba8e80d30a2 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Mon, 11 May 2026 14:19:07 -0700 Subject: [PATCH 23/33] refactor(tonic-xds): bundle outlier LB state into OutlierDetector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The four outlier-related fields on `LoadBalancer` (registry, ejected pool, ejection-signal streams, actor handle) always lived in lockstep — either all four were present (outlier detection enabled) or all four were absent. Bundle them into a single `OutlierDetector` struct stored as `Option>` so the type system enforces the invariant and the LB methods that touch outlier state become one-line delegations. --- .../src/client/loadbalance/loadbalancer.rs | 97 ++++++------------- .../client/loadbalance/outlier_detection.rs | 89 ++++++++++++++++- 2 files changed, 120 insertions(+), 66 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/loadbalancer.rs b/tonic-xds/src/client/loadbalance/loadbalancer.rs index d11c57f52..90effb476 100644 --- a/tonic-xds/src/client/loadbalance/loadbalancer.rs +++ b/tonic-xds/src/client/loadbalance/loadbalancer.rs @@ -5,21 +5,21 @@ //! machine, and routes requests to ready endpoints via a [`ChannelPicker`]. //! //! Outlier detection is integrated via an optional -//! [`OutlierStatsRegistry`]: ejection decisions are made on the data -//! path (per-RPC) and surfaced to `poll_ready` via per-channel -//! `watch::Receiver` streams aggregated in a `StreamMap`. The -//! LB then moves the corresponding [`ReadyChannel`] between its ready -//! 
and ejected maps in O(1) per transition. +//! [`OutlierDetector`], which bundles the shared +//! [`OutlierStatsRegistry`], the ejected-channel pool, the per-channel +//! ejection-signal streams, and the housekeeping actor handle. +//! Ejection decisions are made on the data path (per-RPC) and surfaced +//! to `poll_ready` via per-channel `watch::Receiver` streams +//! aggregated in a `StreamMap`. The LB then moves the corresponding +//! [`ReadyChannel`] between its `ready` map and the detector's ejected +//! pool in O(1) per transition. -use std::collections::HashMap; use std::future::Future; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll, ready}; use indexmap::IndexMap; -use tokio_stream::StreamMap; -use tokio_stream::wrappers::WatchStream; use tower::Service; use tower::discover::{Change, Discover}; @@ -27,9 +27,8 @@ use crate::client::endpoint::{Connector, EndpointAddress}; use crate::client::loadbalance::channel_state::{IdleChannel, OutlierChannelState, ReadyChannel}; use crate::client::loadbalance::errors::LbError; use crate::client::loadbalance::keyed_futures::KeyedFutures; -use crate::client::loadbalance::outlier_detection::{OutlierStatsRegistry, spawn_actor}; +use crate::client::loadbalance::outlier_detection::{OutlierDetector, OutlierStatsRegistry}; use crate::client::loadbalance::pickers::ChannelPicker; -use crate::common::async_util::AbortOnDrop; /// Future returned by [`LoadBalancer::call`]. /// @@ -76,20 +75,10 @@ pub(crate) struct LoadBalancer { connecting: KeyedFutures, /// Ready-to-serve channels, keyed by endpoint address. ready: IndexMap>, - /// Channels currently ejected by outlier detection. Their - /// underlying connections are kept alive so traffic can resume - /// without reconnecting after un-ejection. - ejected: HashMap>, - /// Per-channel ejection signal streams, aggregated for O(1) - /// observation in `poll_ready`. Present only when outlier - /// detection is enabled. 
- ejection_signals: StreamMap>, - /// Outlier-detection registry, shared with the spawned actor and - /// the data path. `None` disables outlier detection. - outlier: Option>, - /// Handle to the outlier-detection actor task; dropped when the - /// LB is dropped. - _outlier_actor: Option, + /// All per-LB outlier-detection state — the shared registry, the + /// ejected pool, the ejection-signal streams, and the + /// housekeeping actor handle. `None` disables outlier detection. + outlier: Option>, /// Channel picker for load balancing. picker: Arc, Req> + Send + Sync>, } @@ -119,30 +108,23 @@ where picker: Arc, Req> + Send + Sync>, outlier: Option>, ) -> Self { - let _outlier_actor = outlier.as_ref().map(|reg| spawn_actor(reg.clone())); Self { discovery, connector, connecting: KeyedFutures::new(), ready: IndexMap::new(), - ejected: HashMap::new(), - ejection_signals: StreamMap::new(), - outlier, - _outlier_actor, + outlier: outlier.map(OutlierDetector::new), picker, } } /// Forget all per-endpoint state for `addr`: the connecting - /// future, the ready slot, the ejected slot, the ejection signal - /// stream, and the registry entry. + /// future, the ready slot, and any outlier bookkeeping. fn forget_endpoint(&mut self, addr: &EndpointAddress) { let _ = self.connecting.cancel(addr); self.ready.swap_remove(addr); - self.ejected.remove(addr); - self.ejection_signals.remove(addr); - if let Some(registry) = self.outlier.as_ref() { - registry.remove_channel(addr); + if let Some(o) = self.outlier.as_mut() { + o.forget(addr); } } @@ -176,40 +158,24 @@ where /// Drain completed connection futures into the ready set. Wraps /// each bare service into a `ReadyChannel` using the outlier - /// state from the registry (or a fresh state if outlier detection - /// is disabled), and subscribes to the per-channel ejection - /// signal. + /// state from the detector (or a fresh state if outlier detection + /// is disabled). 
fn poll_connecting(&mut self, cx: &mut Context<'_>) { while let Poll::Ready(Some((addr, svc))) = self.connecting.poll_next(cx) { - let state = match self.outlier.as_ref() { - Some(registry) => registry.add_channel(addr.clone()), + let state = match self.outlier.as_mut() { + Some(o) => o.register(addr.clone()), None => Arc::new(OutlierChannelState::new()), }; - if self.outlier.is_some() { - self.ejection_signals - .insert(addr.clone(), WatchStream::from_changes(state.subscribe())); - } let ready = ReadyChannel::new(addr.clone(), svc, state); self.ready.insert(addr, ready); } } - /// Drain ejection-signal transitions, moving channels between - /// `ready` and `ejected`. O(k) per call where k = ready signals. - fn poll_ejection_signals(&mut self, cx: &mut Context<'_>) { - use futures_core::Stream; - while let Poll::Ready(Some((addr, ejected))) = - Pin::new(&mut self.ejection_signals).poll_next(cx) - { - if ejected { - if let Some(ch) = self.ready.swap_remove(&addr) { - tracing::debug!("outlier detection: eject {addr}"); - self.ejected.insert(addr, ch); - } - } else if let Some(ch) = self.ejected.remove(&addr) { - tracing::debug!("outlier detection: uneject {addr}"); - self.ready.insert(addr, ch); - } + /// Drain outlier ejection-signal transitions, moving channels + /// between `ready` and the detector's ejected pool. + fn poll_outlier(&mut self, cx: &mut Context<'_>) { + if let Some(o) = self.outlier.as_mut() { + o.poll_signals(cx, &mut self.ready); } } } @@ -232,7 +198,7 @@ where fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll> { let discover_result = self.poll_discover(cx); self.poll_connecting(cx); - self.poll_ejection_signals(cx); + self.poll_outlier(cx); if !self.ready.is_empty() { return Poll::Ready(Ok(())); @@ -269,7 +235,7 @@ where // are `Arc`-shared, so cloning is cheap. 
let mut svc = picked.clone(); let outlier_state = picked.outlier().clone(); - let registry = self.outlier.clone(); + let registry = self.outlier.as_ref().map(|o| o.registry().clone()); LbFuture::Pending(Box::pin(async move { tower::ServiceExt::ready(&mut svc) .await @@ -855,10 +821,11 @@ mod tests { // poll_ready drains the ejection signal and moves 8084. let _ = poll_ready_now(&mut lb); + let ejected = lb.outlier.as_ref().unwrap().ejected(); assert!( - lb.ejected.contains_key(&addr(8084)), + ejected.contains_key(&addr(8084)), "8084 should be ejected; ejected map: {:?}, ready keys: {:?}", - lb.ejected.keys().collect::>(), + ejected.keys().collect::>(), lb.ready.keys().collect::>(), ); assert!(!lb.ready.contains_key(&addr(8084))); @@ -882,7 +849,7 @@ mod tests { call_each(&mut lb, 50).await; let _ = poll_ready_now(&mut lb); - assert_eq!(lb.ejected.len(), 0); + assert_eq!(lb.outlier.as_ref().unwrap().ejected().len(), 0); assert_eq!(lb.ready.len(), 5); } diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 1cbfb6233..6ccad8d77 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -27,14 +27,20 @@ //! //! 
[gRFC A50]: https://github.com/grpc/proposal/blob/master/A50-xds-outlier-detection.md +use std::collections::HashMap; +use std::pin::Pin; use std::sync::Arc; use std::sync::atomic::{AtomicU64, Ordering}; +use std::task::{Context, Poll}; use std::time::Instant; use dashmap::DashMap; +use indexmap::IndexMap; +use tokio_stream::StreamMap; +use tokio_stream::wrappers::WatchStream; use crate::client::endpoint::EndpointAddress; -use crate::client::loadbalance::channel_state::OutlierChannelState; +use crate::client::loadbalance::channel_state::{OutlierChannelState, ReadyChannel}; use crate::common::async_util::AbortOnDrop; use crate::xds::resource::outlier_detection::OutlierDetectionConfig; @@ -236,6 +242,87 @@ pub(crate) fn spawn_actor(registry: Arc) -> AbortOnDrop { AbortOnDrop(task) } +/// All per-LB outlier-detection state: the shared registry, the pool +/// of currently-ejected channels (whose connections are kept alive +/// across ejection), the per-channel ejection-signal streams +/// aggregated for O(1) observation in `poll_ready`, and the handle to +/// the housekeeping actor (dropped with the LB). +/// +/// `LoadBalancer` holds this as `Option>`: `None` +/// when outlier detection is disabled, `Some` when enabled. +pub(crate) struct OutlierDetector { + registry: Arc, + ejected: HashMap>, + ejection_signals: StreamMap>, + _actor: AbortOnDrop, +} + +impl OutlierDetector { + /// Build from a registry, spawning the housekeeping actor. + pub(crate) fn new(registry: Arc) -> Self { + let _actor = spawn_actor(registry.clone()); + Self { + registry, + ejected: HashMap::new(), + ejection_signals: StreamMap::new(), + _actor, + } + } + + /// Shared registry handle — clone to hand to the data path. + pub(crate) fn registry(&self) -> &Arc { + &self.registry + } + + /// Register a newly-connected channel for tracking and subscribe + /// to its ejection signal. Returns the per-channel state for the + /// load balancer to wire into [`ReadyChannel`]. 
+ pub(crate) fn register(&mut self, addr: EndpointAddress) -> Arc { + let state = self.registry.add_channel(addr.clone()); + self.ejection_signals + .insert(addr, WatchStream::from_changes(state.subscribe())); + state + } + + /// Drop all bookkeeping for `addr`: ejection slot, signal stream, + /// registry entry. + pub(crate) fn forget(&mut self, addr: &EndpointAddress) { + self.ejected.remove(addr); + self.ejection_signals.remove(addr); + self.registry.remove_channel(addr); + } + + /// Drain ejection-signal transitions, moving channels between + /// `ready` and the internal ejected pool. O(k) per call where k is + /// the number of pending signal changes. + pub(crate) fn poll_signals( + &mut self, + cx: &mut Context<'_>, + ready: &mut IndexMap>, + ) { + use futures_core::Stream; + while let Poll::Ready(Some((addr, ejected))) = + Pin::new(&mut self.ejection_signals).poll_next(cx) + { + if ejected { + if let Some(ch) = ready.swap_remove(&addr) { + tracing::debug!("outlier detection: eject {addr}"); + self.ejected.insert(addr, ch); + } + } else if let Some(ch) = self.ejected.remove(&addr) { + tracing::debug!("outlier detection: uneject {addr}"); + ready.insert(addr, ch); + } + } + } + + /// Currently-ejected channels, keyed by endpoint address (test-only accessor). + #[cfg(test)] + pub(crate) fn ejected(&self) -> &HashMap> { + &self.ejected + } +} + /// Return true with probability `pct / 100` (clamped at 100 ⇒ always).
fn roll(rng: &dyn Rng, pct: u8) -> bool { if pct >= 100 { From be41f3fd1fa03365b6457509d4b628827531a111 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Mon, 11 May 2026 14:58:54 -0700 Subject: [PATCH 24/33] fix(tonic-xds): preserve outlier-detection state across re-insert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Match grpc-go (`internal/xds/balancer/outlierdetection`) and Envoy (`BaseDynamicClusterImpl::updateDynamicHostList` reusing existing `HostSharedPtr`s): outlier-detection state is keyed by stable endpoint identity and survives a transient discovery flap. Previously, every `Change::Insert` ran the same purge path as `Change::Remove`, wiping the registry entry along with the connecting / ready / ejected slots — a brief disappearance lost the channel's counters and ejection multiplier. Split the path: - `purge_endpoint` (Remove) — cancels connecting, clears ready, and drops all outlier bookkeeping including the registry entry. - `reset_active_slots` (Insert) — cancels connecting, clears ready, and drops the obsolete `ReadyChannel` from the detector's ejected pool, but leaves the registry entry and ejection-signal subscription intact. `OutlierDetector::register` now only inserts a new signal subscription when one is not already present, so a pending ejection transition is not dropped by a redundant resubscribe. `poll_connecting` checks the preserved `state.is_ejected()` and routes a re-discovered ejected channel directly into the ejected pool via the new `place_ejected`, avoiding any window where traffic could be routed to a logically ejected channel. Adds two regression tests: - `test_outlier_detection_reinsert_preserves_state` — counters survive Insert for an existing address; same `Arc` is returned. - `test_outlier_detection_reinsert_while_ejected_stays_ejected` — re-discovered ejected channel lands in the ejected pool, not `ready`. 
--- .../src/client/loadbalance/loadbalancer.rs | 160 +++++++++++++++++- .../client/loadbalance/outlier_detection.rs | 43 ++++- 2 files changed, 191 insertions(+), 12 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/loadbalancer.rs b/tonic-xds/src/client/loadbalance/loadbalancer.rs index 90effb476..7465ea997 100644 --- a/tonic-xds/src/client/loadbalance/loadbalancer.rs +++ b/tonic-xds/src/client/loadbalance/loadbalancer.rs @@ -118,9 +118,11 @@ where } } - /// Forget all per-endpoint state for `addr`: the connecting - /// future, the ready slot, and any outlier bookkeeping. - fn forget_endpoint(&mut self, addr: &EndpointAddress) { + /// Purge all per-endpoint state for `addr`: the connecting + /// future, the ready slot, and **all** outlier bookkeeping + /// (registry entry, ejection-signal subscription, ejected slot). + /// Used when discovery says the endpoint is gone from the cluster. + fn purge_endpoint(&mut self, addr: &EndpointAddress) { let _ = self.connecting.cancel(addr); self.ready.swap_remove(addr); if let Some(o) = self.outlier.as_mut() { @@ -128,6 +130,24 @@ where } } + /// Clear stale slots that held the old service (in-flight + /// connecting future, ready entry, ejected entry) but **preserve** + /// the outlier-detection registry entry — counters, ejection + /// multiplier, and ejection flag carry across the reconnect. + /// Used when discovery re-inserts an endpoint we already track. + /// + /// This matches grpc-go and Envoy: outlier state is keyed by + /// stable endpoint identity and survives a transient discovery + /// flap, so a brief disappearance does not wipe what we already + /// know about the endpoint's health. + fn reset_active_slots(&mut self, addr: &EndpointAddress) { + let _ = self.connecting.cancel(addr); + self.ready.swap_remove(addr); + if let Some(o) = self.outlier.as_mut() { + o.clear_active_slots(addr); + } + } + /// Drain pending discovery events. 
Either resolves to an error /// ([`LbError::DiscoverClosed`] or [`LbError::DiscoverError`]) or stays /// pending — there is no success outcome since the loop only exits on @@ -144,13 +164,13 @@ where Some(Err(e)) => return Poll::Ready(LbError::DiscoverError(e.into())), Some(Ok(Change::Insert(addr, idle))) => { tracing::trace!("discovery: insert {addr}"); - self.forget_endpoint(&addr); + self.reset_active_slots(&addr); let connecting = idle.connect(self.connector.clone()); let _ = self.connecting.add(addr, connecting); } Some(Ok(Change::Remove(addr))) => { tracing::trace!("discovery: remove {addr}"); - self.forget_endpoint(&addr); + self.purge_endpoint(&addr); } } } @@ -160,14 +180,25 @@ where /// each bare service into a `ReadyChannel` using the outlier /// state from the detector (or a fresh state if outlier detection /// is disabled). + /// + /// If the preserved outlier state for a re-discovered endpoint + /// says it is still ejected, the new channel goes directly into + /// the detector's ejected pool — not the ready set — so no + /// traffic is routed to it until the housekeeping actor un-ejects. fn poll_connecting(&mut self, cx: &mut Context<'_>) { while let Poll::Ready(Some((addr, svc))) = self.connecting.poll_next(cx) { let state = match self.outlier.as_mut() { Some(o) => o.register(addr.clone()), None => Arc::new(OutlierChannelState::new()), }; + let is_ejected = state.is_ejected(); let ready = ReadyChannel::new(addr.clone(), svc, state); - self.ready.insert(addr, ready); + match self.outlier.as_mut() { + Some(o) if is_ejected => o.place_ejected(addr, ready), + _ => { + self.ready.insert(addr, ready); + } + } } } @@ -869,4 +900,121 @@ mod tests { assert_eq!(registry.len(), 0); assert_eq!(lb.ready.len(), 0); } + + /// Re-discovering an endpoint (Insert for an address the LB + /// already tracks) must preserve its outlier-detection counters + /// and multiplier. Matches grpc-go / Envoy behavior. 
+ #[tokio::test] + async fn test_outlier_detection_reinsert_preserves_state() { + let (tx, discover) = new_discover(); + let (mut lb, connector, registry) = make_lb_with_outlier(discover, fp_config(50, 5, 3)); + + tx.send(Ok(Change::Insert(addr(8080), IdleChannel::new(addr(8080))))) + .await + .unwrap(); + drive_to_ready(&mut lb, &connector).await; + let state = registry.add_channel(addr(8080)); // idempotent — returns the existing state + // Drive some successes through the data path so the channel + // accumulates counter state worth preserving. + for _ in 0..3 { + lb.call("hello").await.unwrap(); + } + let (s_before, f_before) = state.counters(); + assert!( + s_before > 0, + "expected accumulated successes before re-insert" + ); + let registry_before = Arc::as_ptr(&state); + + // Re-insert the same address. State must survive. + tx.send(Ok(Change::Insert(addr(8080), IdleChannel::new(addr(8080))))) + .await + .unwrap(); + drive_to_ready(&mut lb, &connector).await; + + let state_after = registry.add_channel(addr(8080)); + assert_eq!( + Arc::as_ptr(&state_after), + registry_before, + "registry entry should be the same Arc — state continuity preserved", + ); + let (s_after, f_after) = state_after.counters(); + assert_eq!( + (s_after, f_after), + (s_before, f_before), + "counters must survive re-insert", + ); + assert_eq!(registry.len(), 1); + } + + /// A re-discovered endpoint whose preserved state says "ejected" + /// is placed directly into the ejected pool, not the ready set, so + /// no traffic is routed to it until the housekeeping actor + /// un-ejects it. + #[tokio::test] + async fn test_outlier_detection_reinsert_while_ejected_stays_ejected() { + let (tx, discover) = new_discover(); + let (mut lb, connector, registry) = make_lb_with_outlier(discover, fp_config(50, 5, 3)); + + // Bring up 5 endpoints; make 8084 fail enough to be ejected. 
+ for port in 8080..=8084 { + tx.send(Ok(Change::Insert(addr(port), IdleChannel::new(addr(port))))) + .await + .unwrap(); + } + drive_to_ready(&mut lb, &connector).await; + connector + .service(&addr(8084)) + .fail_call + .store(true, Ordering::Relaxed); + for _ in 0..100 { + let _ = lb.call("hello").await; + } + let _ = poll_ready_now(&mut lb); + let state_8084 = registry.add_channel(addr(8084)); + assert!( + state_8084.is_ejected(), + "8084 must be ejected before re-insert" + ); + assert!( + lb.outlier + .as_ref() + .unwrap() + .ejected() + .contains_key(&addr(8084)), + "8084 should be in the ejected pool" + ); + + // Re-insert 8084. The ejected slot's old ReadyChannel is + // dropped, but the registry entry (is_ejected=true) is + // preserved. The new channel should land in the ejected pool, + // not in `ready`. Drive the steps explicitly because + // `lb.ready` is non-empty throughout (8080..=8083), so + // `drive_to_ready` may return before the new 8084 connect + // resolves. + tx.send(Ok(Change::Insert(addr(8084), IdleChannel::new(addr(8084))))) + .await + .unwrap(); + // 1. Drain the Insert into `self.connecting`. + let _ = poll_ready_now(&mut lb); + // 2. Synchronously resolve the new connect future. + connector.resolve_all(); + // 3. Drain the now-ready connecting future; `poll_connecting` + // sees `state.is_ejected() == true` and calls `place_ejected`. 
+ let _ = poll_ready_now(&mut lb); + + assert!( + !lb.ready.contains_key(&addr(8084)), + "8084 must not be in ready while still logically ejected" + ); + assert!( + lb.outlier + .as_ref() + .unwrap() + .ejected() + .contains_key(&addr(8084)), + "8084 must remain in the ejected pool after re-insert" + ); + assert!(state_8084.is_ejected()); + } } diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 6ccad8d77..e4fd685d2 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -274,24 +274,55 @@ impl OutlierDetector { &self.registry } - /// Register a newly-connected channel for tracking and subscribe - /// to its ejection signal. Returns the per-channel state for the - /// load balancer to wire into [`ReadyChannel`]. + /// Register a newly-connected channel for tracking and (on first + /// registration only) subscribe to its ejection signal. Returns + /// the per-channel state for the load balancer to wire into + /// [`ReadyChannel`]. + /// + /// When an endpoint is re-discovered (Insert for an address whose + /// registry entry was preserved), the existing signal subscription + /// is left in place so any pending ejection transition is not + /// dropped. pub(crate) fn register(&mut self, addr: EndpointAddress) -> Arc { let state = self.registry.add_channel(addr.clone()); - self.ejection_signals - .insert(addr, WatchStream::from_changes(state.subscribe())); + if !self.ejection_signals.contains_key(&addr) { + self.ejection_signals + .insert(addr, WatchStream::from_changes(state.subscribe())); + } state } /// Drop all bookkeeping for `addr`: ejection slot, signal stream, - /// registry entry. + /// registry entry. Used when the endpoint is removed from the + /// cluster. 
pub(crate) fn forget(&mut self, addr: &EndpointAddress) { self.ejected.remove(addr); self.ejection_signals.remove(addr); self.registry.remove_channel(addr); } + /// Drop the ejected-pool entry for `addr` (which holds an obsolete + /// `ReadyChannel`) but preserve the registry entry — counters, + /// ejection multiplier, and ejection flag carry across the + /// reconnect. Used when an endpoint is re-discovered. + /// + /// Matches grpc-go (`internal/xds/balancer/outlierdetection`) and + /// Envoy (`BaseDynamicClusterImpl::updateDynamicHostList` reusing + /// existing `HostSharedPtr`s): outlier state is keyed by stable + /// endpoint identity and survives transient discovery flaps. + pub(crate) fn clear_active_slots(&mut self, addr: &EndpointAddress) { + self.ejected.remove(addr); + } + + /// Place a freshly-connected channel directly into the ejected + /// pool. Used by the load balancer when the preserved state for a + /// re-discovered endpoint says it is still ejected; this avoids a + /// brief window of routing traffic to a logically-ejected channel + /// until the housekeeping actor un-ejects it. + pub(crate) fn place_ejected(&mut self, addr: EndpointAddress, ch: ReadyChannel) { + self.ejected.insert(addr, ch); + } + /// Drain ejection-signal transitions, moving channels between /// `ready` and the internal ejected pool. O(k) per call where k is /// the number of pending signal changes. 
From 6d5324bff73e6776b5eb5382accc4b61c5d12d49 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Tue, 12 May 2026 10:54:07 -0700 Subject: [PATCH 25/33] refactor(tonic-xds): drive outlier ejection through the channel state machine MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the type-state machinery from `channel_state.rs` (`ReadyChannel::eject` → `EjectedChannel` → `UnejectedChannel`) as the primary mechanism for outlier-detection ejection, retiring the parallel `watch::Sender` + dual-map design. The compile-time invariant that ejected channels cannot be picked is now enforced by the type system: the picker takes `ReadyChannel`, ejected channels live in a `KeyedFutures<_, UnejectedChannel<_>>` mirroring the existing pattern for `ConnectingChannel`. This brings the outlier-detection LB integration in line with the project's existing idioms and gives the previously-unused channel state machine its first production caller. Architecture: - **Data path** still uses `OutlierStatsRegistry::record_outcome` to apply the failure-percentage algorithm per-RPC. On transition to ejected the registry sends the address through an mpsc `UnboundedSender` rather than flipping a watch flag. - **LoadBalancer** drains the mpsc in `poll_ready`, consumes the matching `ReadyChannel` via `.eject(EjectionConfig { timeout, .. })`, and tracks the resulting `EjectedChannel` in a second `KeyedFutures`. Each ejected channel's internal `Sleep` fires exactly at `base × multiplier` (capped at `max_ejection_time`), yielding `UnejectedChannel::Ready(svc)`; `poll_unejection` drains it on the next `poll_ready` and routes the channel back into `ready`. - **Housekeeping actor** simplifies: it resets counters and decrements multipliers on the `config.interval` boundary, but no longer un-ejects — un-ejection is timer-driven by `EjectedChannel`. 
`OutlierStatsRegistry` gains two methods: - `note_uneject(state)` — clears the `ejected_at_nanos` atomic on the channel state and decrements `ejected_count`. Called by the LB when an `EjectedChannel`'s timer fires. - `remaining_ejection(state, now)` — computes how much of the ejection window is left, capped by `max_ejection_time`. Used by the LB on initial ejection (full duration) and on re-discovery (remaining duration) to size the `EjectionConfig::timeout`. `OutlierChannelState` drops the `watch::Sender<bool>` field entirely; `is_ejected` / `try_eject` / `try_uneject` now use an atomic load, compare-exchange, and swap (respectively) on `ejected_at_nanos` as the single source of truth. The `OutlierDetector` struct simplifies to `{ registry, eject_rx, _actor }` — no generic parameter, no internal `ejected` map, no signal-stream aggregator. Re-discovery while ejected (Insert for an address whose preserved state says `is_ejected`) re-ejects the new channel with the `remaining_ejection` duration so the original backoff is honored rather than restarted; if the deadline has already passed, the channel is un-ejected immediately. Behavior matches grpc-go and Envoy. Adds tests: - `OutlierStatsRegistry::{remaining_ejection,note_uneject}` — five new unit tests covering full duration, cap, mid-eject subtraction, past-deadline, and not-ejected cases. - `ejection_dispatches_address_through_mpsc` — verifies the data path sends through the mpsc on transition. - `housekeeping_leaves_ejected_multipliers_alone` — guards the new invariant that the actor no longer touches ejected channels. - `test_outlier_detection_timer_driven_unejection` — end-to-end LB test that an ejected channel returns to `ready` after `base × multiplier` elapses (with `tokio::time::advance`). Adds `KeyedFutures::contains_key` for test access; no production caller depends on it. 
--- .../src/client/loadbalance/channel_state.rs | 95 ++-- .../src/client/loadbalance/keyed_futures.rs | 7 + .../src/client/loadbalance/loadbalancer.rs | 322 ++++++++++--- .../client/loadbalance/outlier_detection.rs | 445 ++++++++++-------- 4 files changed, 551 insertions(+), 318 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/channel_state.rs b/tonic-xds/src/client/loadbalance/channel_state.rs index b87414bc1..472ba95c8 100644 --- a/tonic-xds/src/client/loadbalance/channel_state.rs +++ b/tonic-xds/src/client/loadbalance/channel_state.rs @@ -31,7 +31,6 @@ use std::task::{Context, Poll}; use std::time::{Duration, Instant}; use pin_project_lite::pin_project; -use tokio::sync::watch; use tower::Service; use tower::load::Load; @@ -72,22 +71,21 @@ impl EndpointCounters { } /// Per-channel outlier-detection state, shared (via `Arc`) between -/// the data path (per-RPC outcome recording + threshold-based ejection) -/// and the outlier-detection actor (interval-based housekeeping). +/// the data path (per-RPC outcome recording + threshold-based ejection), +/// the outlier-detection actor (interval-based housekeeping), and the +/// load balancer (consults `is_ejected` / `ejected_duration` on +/// reconnect). /// -/// Ejection is edge-triggered: callers flip the flag via [`eject`] / -/// [`uneject`]; observers poll `Receiver::changed()` (typically inside -/// a `FuturesUnordered`) to react in O(1) on each transition. -/// -/// All fields are atomics or wrapped in lock-free primitives so the -/// data path can mutate them without locking. -/// -/// [`eject`]: Self::eject -/// [`uneject`]: Self::uneject +/// All fields are atomics so the data path can mutate them without +/// locking. Ejection state is encoded in [`Self::ejected_at_nanos`]: +/// zero means not ejected, non-zero is the nanos-since-epoch of the +/// ejection's start. 
[`Self::try_eject`] / [`Self::try_uneject`] use +/// CAS to flip the field atomically and report whether the transition +/// fired (so callers can update registry-level counters exactly once +/// per transition). #[derive(Debug)] pub(crate) struct OutlierChannelState { counters: EndpointCounters, - eject_tx: watch::Sender, /// Whether this channel currently contributes to the registry's /// `qualifying_count`. Set when `total` first reaches /// `request_volume` in the current interval; cleared on counter @@ -97,7 +95,8 @@ pub(crate) struct OutlierChannelState { /// ejection; decremented (saturating) on each healthy interval. ejection_multiplier: AtomicU32, /// `0` when not ejected. Otherwise nanos since [`Self::epoch`] of - /// the current ejection's start. + /// the current ejection's start. Single source of truth for + /// "is this channel ejected right now?". ejected_at_nanos: AtomicU64, /// Reference instant used as the origin for `ejected_at_nanos`. /// Established at construction and never changes. @@ -112,10 +111,8 @@ impl Default for OutlierChannelState { impl OutlierChannelState { pub(crate) fn new() -> Self { - let (eject_tx, _) = watch::channel(false); Self { counters: EndpointCounters::default(), - eject_tx, is_qualifying: AtomicBool::new(false), ejection_multiplier: AtomicU32::new(0), ejected_at_nanos: AtomicU64::new(0), @@ -158,53 +155,39 @@ impl OutlierChannelState { self.is_qualifying.swap(false, Ordering::AcqRel) } - /// Flip the ejection flag to `true`. Returns `true` if this call - /// performed the false → true transition (so callers can update - /// registry-level counters exactly once per ejection). - /// Records the ejection timestamp and bumps the multiplier. + /// Atomically mark this channel as ejected starting at `now`. + /// Returns `true` if this call performed the not-ejected → + /// ejected transition (so callers can update registry-level + /// counters exactly once per ejection). Bumps the multiplier on + /// transition. 
pub(crate) fn try_eject(&self, now: Instant) -> bool { - let won = self.eject_tx.send_if_modified(|state| { - if *state { - false - } else { - *state = true; - true - } - }); - if !won { - return false; - } let nanos = now .saturating_duration_since(self.epoch) .as_nanos() .min(u64::MAX as u128) as u64; - // Use 1 as a sentinel if the channel was created at exactly - // `now`, since 0 means "not ejected". - self.ejected_at_nanos.store(nanos.max(1), Ordering::Relaxed); + // 0 means "not ejected"; use 1 as a sentinel if the channel + // was created at exactly `now`. + let stamp = nanos.max(1); + if self + .ejected_at_nanos + .compare_exchange(0, stamp, Ordering::AcqRel, Ordering::Relaxed) + .is_err() + { + return false; + } self.ejection_multiplier.fetch_add(1, Ordering::Relaxed); true } - /// Flip the ejection flag back to `false`. Returns `true` if this - /// call performed the true → false transition. + /// Atomically clear the ejection. Returns `true` if this call + /// performed the ejected → not-ejected transition. pub(crate) fn try_uneject(&self) -> bool { - let won = self.eject_tx.send_if_modified(|state| { - if *state { - *state = false; - true - } else { - false - } - }); - if won { - self.ejected_at_nanos.store(0, Ordering::Relaxed); - } - won + self.ejected_at_nanos.swap(0, Ordering::AcqRel) != 0 } /// Current ejection state. pub(crate) fn is_ejected(&self) -> bool { - *self.eject_tx.borrow() + self.ejected_at_nanos.load(Ordering::Acquire) != 0 } /// Returns the elapsed time since this channel was ejected, or @@ -232,14 +215,6 @@ impl OutlierChannelState { } } - /// Subscribe to ejection-state changes. The returned receiver's - /// `changed()` future resolves on each transition; consumers - /// typically push it into a `FuturesUnordered`. - #[allow(dead_code)] // wired by the LoadBalancer in a follow-up PR. 
- pub(crate) fn subscribe(&self) -> watch::Receiver { - self.eject_tx.subscribe() - } - /// Test-only setter for the ejection multiplier; lets tests drive /// housekeeping behavior without going through `try_eject`. #[cfg(test)] @@ -360,11 +335,15 @@ impl ReadyChannel { } /// Per-channel outlier-detection state. Cloned cheaply via `Arc`. - #[allow(dead_code)] // consumed by the LoadBalancer in a follow-up PR. pub(crate) fn outlier(&self) -> &Arc { &self.outlier } + /// Endpoint address this channel was created for. + pub(crate) fn addr(&self) -> &EndpointAddress { + &self.addr + } + /// Eject this channel (e.g., due to outlier detection). Consumes /// self. The outlier state remains in the registry; only the /// service and address are passed into [`EjectedChannel`] (which diff --git a/tonic-xds/src/client/loadbalance/keyed_futures.rs b/tonic-xds/src/client/loadbalance/keyed_futures.rs index 74319c6f3..c7f48aeaf 100644 --- a/tonic-xds/src/client/loadbalance/keyed_futures.rs +++ b/tonic-xds/src/client/loadbalance/keyed_futures.rs @@ -89,6 +89,13 @@ where self.futures.len() } + /// Returns true if a future is currently tracked for `key`. + /// Cancelled-but-not-yet-drained futures still count, since their + /// cancellation token entry is removed eagerly by [`Self::cancel`]. + pub(crate) fn contains_key(&self, key: &K) -> bool { + self.cancellations.contains_key(key) + } + /// Advance the internal futures. Yields `(K, T)` when a future completes, /// skipping cancelled futures silently. /// diff --git a/tonic-xds/src/client/loadbalance/loadbalancer.rs b/tonic-xds/src/client/loadbalance/loadbalancer.rs index 7465ea997..4cedff7dd 100644 --- a/tonic-xds/src/client/loadbalance/loadbalancer.rs +++ b/tonic-xds/src/client/loadbalance/loadbalancer.rs @@ -4,27 +4,35 @@ //! [`IdleChannel`]s), manages the connection lifecycle via the channel state //! machine, and routes requests to ready endpoints via a [`ChannelPicker`]. //! -//! 
Outlier detection is integrated via an optional -//! [`OutlierDetector`], which bundles the shared -//! [`OutlierStatsRegistry`], the ejected-channel pool, the per-channel -//! ejection-signal streams, and the housekeeping actor handle. -//! Ejection decisions are made on the data path (per-RPC) and surfaced -//! to `poll_ready` via per-channel `watch::Receiver` streams -//! aggregated in a `StreamMap`. The LB then moves the corresponding -//! [`ReadyChannel`] between its `ready` map and the detector's ejected -//! pool in O(1) per transition. +//! Outlier detection is integrated via an optional [`OutlierDetector`]. +//! Ejection decisions originate on the data path (per-RPC) and are +//! signaled to the LB via an mpsc channel. The LB consumes the named +//! [`ReadyChannel`] via [`ReadyChannel::eject`], obtaining an +//! [`EjectedChannel`] whose internal sleep fires exactly at +//! `base × multiplier` (capped by `max_ejection_time`); ejected +//! channels live in a second [`KeyedFutures`] (mirroring the existing +//! pattern for `ConnectingChannel`) until their timer yields +//! [`UnejectedChannel`], at which point the channel is routed back +//! into `ready` (`UnejectedChannel::Ready`) or `connecting` +//! (`UnejectedChannel::Connecting`). +//! +//! [`EjectedChannel`]: crate::client::loadbalance::channel_state::EjectedChannel +//! 
[`UnejectedChannel`]: crate::client::loadbalance::channel_state::UnejectedChannel use std::future::Future; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll, ready}; +use std::time::{Duration, Instant}; use indexmap::IndexMap; use tower::Service; use tower::discover::{Change, Discover}; use crate::client::endpoint::{Connector, EndpointAddress}; -use crate::client::loadbalance::channel_state::{IdleChannel, OutlierChannelState, ReadyChannel}; +use crate::client::loadbalance::channel_state::{ + EjectionConfig, IdleChannel, OutlierChannelState, ReadyChannel, UnejectedChannel, +}; use crate::client::loadbalance::errors::LbError; use crate::client::loadbalance::keyed_futures::KeyedFutures; use crate::client::loadbalance::outlier_detection::{OutlierDetector, OutlierStatsRegistry}; @@ -75,10 +83,16 @@ pub(crate) struct LoadBalancer { connecting: KeyedFutures, /// Ready-to-serve channels, keyed by endpoint address. ready: IndexMap>, - /// All per-LB outlier-detection state — the shared registry, the - /// ejected pool, the ejection-signal streams, and the - /// housekeeping actor handle. `None` disables outlier detection. - outlier: Option>, + /// Channels currently ejected by outlier detection. Each entry is + /// an [`EjectedChannel`] whose `Sleep` fires when the ejection + /// window expires; the resolved [`UnejectedChannel`] is drained in + /// `poll_ready` and routed back into `ready` (or `connecting` if + /// the underlying connection needs replacing). + ejected: KeyedFutures>, + /// Outlier-detection plumbing: shared registry, eject-signal + /// receiver, and the housekeeping actor handle. `None` disables + /// outlier detection. + outlier: Option, /// Channel picker for load balancing. picker: Arc, Req> + Send + Sync>, } @@ -88,7 +102,7 @@ where D: Discover + Unpin, D::Error: Into, C: Connector + Send + Sync + 'static, - C::Service: Send + 'static, + C::Service: Clone + Send + 'static, { /// Create a load balancer with no outlier detection. 
pub(crate) fn new( @@ -113,28 +127,31 @@ where connector, connecting: KeyedFutures::new(), ready: IndexMap::new(), + ejected: KeyedFutures::new(), outlier: outlier.map(OutlierDetector::new), picker, } } /// Purge all per-endpoint state for `addr`: the connecting - /// future, the ready slot, and **all** outlier bookkeeping - /// (registry entry, ejection-signal subscription, ejected slot). - /// Used when discovery says the endpoint is gone from the cluster. + /// future, the ready slot, the ejected channel (if any), and the + /// outlier-detection registry entry. Used when discovery says the + /// endpoint is gone from the cluster. fn purge_endpoint(&mut self, addr: &EndpointAddress) { let _ = self.connecting.cancel(addr); self.ready.swap_remove(addr); - if let Some(o) = self.outlier.as_mut() { - o.forget(addr); + let _ = self.ejected.cancel(addr); + if let Some(o) = self.outlier.as_ref() { + o.registry().remove_channel(addr); } } /// Clear stale slots that held the old service (in-flight - /// connecting future, ready entry, ejected entry) but **preserve** - /// the outlier-detection registry entry — counters, ejection - /// multiplier, and ejection flag carry across the reconnect. - /// Used when discovery re-inserts an endpoint we already track. + /// connecting future, ready entry, ejected channel) but + /// **preserve** the outlier-detection registry entry — counters, + /// ejection multiplier, and ejection flag carry across the + /// reconnect. Used when discovery re-inserts an endpoint we + /// already track. /// /// This matches grpc-go and Envoy: outlier state is keyed by /// stable endpoint identity and survives a transient discovery @@ -143,9 +160,7 @@ where fn reset_active_slots(&mut self, addr: &EndpointAddress) { let _ = self.connecting.cancel(addr); self.ready.swap_remove(addr); - if let Some(o) = self.outlier.as_mut() { - o.clear_active_slots(addr); - } + let _ = self.ejected.cancel(addr); } /// Drain pending discovery events. 
Either resolves to an error @@ -176,37 +191,140 @@ where } } - /// Drain completed connection futures into the ready set. Wraps - /// each bare service into a `ReadyChannel` using the outlier - /// state from the detector (or a fresh state if outlier detection - /// is disabled). + /// Drain completed connection futures. Wraps each bare service + /// into a `ReadyChannel` using the outlier state from the + /// registry (or a fresh state if outlier detection is disabled). /// /// If the preserved outlier state for a re-discovered endpoint - /// says it is still ejected, the new channel goes directly into - /// the detector's ejected pool — not the ready set — so no - /// traffic is routed to it until the housekeeping actor un-ejects. + /// says it is still ejected, the new channel is re-ejected with + /// the *remaining* ejection time so the ongoing backoff is + /// honored. If the deadline has already passed, the channel is + /// un-ejected immediately and routed to `ready`. fn poll_connecting(&mut self, cx: &mut Context<'_>) { while let Poll::Ready(Some((addr, svc))) = self.connecting.poll_next(cx) { - let state = match self.outlier.as_mut() { - Some(o) => o.register(addr.clone()), + let state = match self.outlier.as_ref() { + Some(o) => o.registry().add_channel(addr.clone()), None => Arc::new(OutlierChannelState::new()), }; - let is_ejected = state.is_ejected(); - let ready = ReadyChannel::new(addr.clone(), svc, state); - match self.outlier.as_mut() { - Some(o) if is_ejected => o.place_ejected(addr, ready), - _ => { - self.ready.insert(addr, ready); + let ready = ReadyChannel::new(addr.clone(), svc, state.clone()); + let remaining = self + .outlier + .as_ref() + .and_then(|o| o.registry().remaining_ejection(&state, Instant::now())); + self.place_after_connect(addr, ready, remaining); + } + } + + /// Route a freshly-connected `ReadyChannel` into the right pool + /// based on the preserved outlier state's `remaining` ejection + /// duration. 
Factored out so `poll_connecting` stays terse and + /// the three cases (fresh, mid-eject, past-deadline) are visible. + fn place_after_connect( + &mut self, + addr: EndpointAddress, + ready: ReadyChannel, + remaining: Option, + ) { + match remaining { + None => { + self.ready.insert(addr, ready); + } + Some(d) if d.is_zero() => { + if let Some(o) = self.outlier.as_ref() { + o.registry().note_uneject(ready.outlier()); + } + self.ready.insert(addr, ready); + } + Some(d) => { + let ejected = ready.eject( + EjectionConfig { + timeout: d, + needs_reconnect: false, + }, + self.connector.clone(), + ); + tracing::debug!("outlier detection: re-eject {addr} for {d:?}"); + let _ = self.ejected.add(addr, ejected); + } + } + } + + /// Drain eject requests from the outlier detector's mpsc and + /// transition the named `ReadyChannel`s into ejected ones. The + /// per-channel ejection state has already been flipped by + /// `record_outcome`; this step is the visible transition on the + /// LB side. + fn poll_eject_requests(&mut self, cx: &mut Context<'_>) { + loop { + let Some(o) = self.outlier.as_mut() else { + return; + }; + let addr = match o.poll_eject_request(cx) { + Poll::Ready(Some(a)) => a, + _ => return, + }; + let registry = o.registry().clone(); + // The eject signal arrives once `try_eject` has flipped + // the channel's state and the cluster-wide + // `ejected_count`. If the channel is no longer in `ready` + // (e.g. discovery removed it), there's nothing to do. + let Some(ch) = self.ready.swap_remove(&addr) else { + continue; + }; + let state = ch.outlier().clone(); + match registry.remaining_ejection(&state, Instant::now()) { + Some(d) if !d.is_zero() => { + let ejected = ch.eject( + EjectionConfig { + timeout: d, + needs_reconnect: false, + }, + self.connector.clone(), + ); + tracing::debug!("outlier detection: eject {addr} for {d:?}"); + let _ = self.ejected.add(addr, ejected); + } + Some(_) => { + // Deadline already past — un-eject immediately. 
+ registry.note_uneject(&state); + self.ready.insert(addr, ch); + } + None => { + // State is no longer ejected (concurrent uneject?) — restore. + self.ready.insert(addr, ch); } } } } - /// Drain outlier ejection-signal transitions, moving channels - /// between `ready` and the detector's ejected pool. - fn poll_outlier(&mut self, cx: &mut Context<'_>) { - if let Some(o) = self.outlier.as_mut() { - o.poll_signals(cx, &mut self.ready); + /// Drain completed `EjectedChannel` timers. Each yields either an + /// `UnejectedChannel::Ready(svc)` (timer expired, reuse the + /// connection) or `UnejectedChannel::Connecting(future)` (timer + /// expired but a fresh connect was requested). The address's + /// outlier state is cleared and the channel is routed back into + /// `ready` or `connecting` accordingly. + fn poll_unejection(&mut self, cx: &mut Context<'_>) { + while let Poll::Ready(Some((addr, unejected))) = self.ejected.poll_next(cx) { + let state = match self.outlier.as_ref() { + Some(o) => o.registry().add_channel(addr.clone()), + None => Arc::new(OutlierChannelState::new()), + }; + if let Some(o) = self.outlier.as_ref() { + o.registry().note_uneject(&state); + } + match unejected { + UnejectedChannel::Ready(svc) => { + tracing::debug!("outlier detection: uneject {addr}"); + let ready = ReadyChannel::new(addr.clone(), svc, state); + self.ready.insert(addr, ready); + } + UnejectedChannel::Connecting(future) => { + // `needs_reconnect = false` for A50, so this arm + // is unused today; handle it for completeness in + // case a future policy sets it. + let _ = self.connecting.add(addr, future); + } + } } } } @@ -228,8 +346,13 @@ where fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll> { let discover_result = self.poll_discover(cx); + // Drain un-ejection completions BEFORE servicing eject requests + // so a freshly un-ejected channel can immediately serve traffic + // (and so cluster-wide `ejected_count` is current when the next + // eject is evaluated). 
+ self.poll_unejection(cx); self.poll_connecting(cx); - self.poll_outlier(cx); + self.poll_eject_requests(cx); if !self.ready.is_empty() { return Poll::Ready(Ok(())); @@ -265,6 +388,7 @@ where // an owned service and outlier handle for the async block; both // are `Arc`-shared, so cloning is cheap. let mut svc = picked.clone(); + let addr = picked.addr().clone(); let outlier_state = picked.outlier().clone(); let registry = self.outlier.as_ref().map(|o| o.registry().clone()); LbFuture::Pending(Box::pin(async move { @@ -275,10 +399,9 @@ where if let Some(registry) = registry.as_ref() { // Per-RPC outlier detection: bump the channel's // counter and (inside `record_outcome`) possibly - // eject if the failure-percentage threshold is - // crossed. Treat any `Err` outcome as a failure for - // outlier purposes. - registry.record_outcome(&outlier_state, result.is_ok()); + // dispatch an eject request to the LB. Treat any + // `Err` outcome as a failure for outlier purposes. + registry.record_outcome(&addr, &outlier_state, result.is_ok()); } result.map_err(|e| LbError::LbChannelCallError(e.into())) })) @@ -850,13 +973,13 @@ mod tests { let _ = lb.call("hello").await; } - // poll_ready drains the ejection signal and moves 8084. + // poll_ready drains the eject mpsc and transitions 8084 into + // `self.ejected` via `ReadyChannel::eject`. 
let _ = poll_ready_now(&mut lb); - let ejected = lb.outlier.as_ref().unwrap().ejected(); assert!( - ejected.contains_key(&addr(8084)), - "8084 should be ejected; ejected map: {:?}, ready keys: {:?}", - ejected.keys().collect::>(), + lb.ejected.contains_key(&addr(8084)), + "8084 should be ejected; ejected.len()={}, ready keys: {:?}", + lb.ejected.len(), lb.ready.keys().collect::>(), ); assert!(!lb.ready.contains_key(&addr(8084))); @@ -880,7 +1003,7 @@ mod tests { call_each(&mut lb, 50).await; let _ = poll_ready_now(&mut lb); - assert_eq!(lb.outlier.as_ref().unwrap().ejected().len(), 0); + assert_eq!(lb.ejected.len(), 0); assert_eq!(lb.ready.len(), 5); } @@ -977,21 +1100,17 @@ mod tests { "8084 must be ejected before re-insert" ); assert!( - lb.outlier - .as_ref() - .unwrap() - .ejected() - .contains_key(&addr(8084)), + lb.ejected.contains_key(&addr(8084)), "8084 should be in the ejected pool" ); - // Re-insert 8084. The ejected slot's old ReadyChannel is - // dropped, but the registry entry (is_ejected=true) is - // preserved. The new channel should land in the ejected pool, - // not in `ready`. Drive the steps explicitly because - // `lb.ready` is non-empty throughout (8080..=8083), so - // `drive_to_ready` may return before the new 8084 connect - // resolves. + // Re-insert 8084. The ejected slot's old EjectedChannel is + // cancelled, but the registry entry (is_ejected=true, + // ejected_at_nanos preserved) survives. The new channel + // should be re-ejected with the *remaining* ejection time. + // Drive the steps explicitly because `lb.ready` is non-empty + // throughout (8080..=8083), so `drive_to_ready` may return + // before the new 8084 connect resolves. tx.send(Ok(Change::Insert(addr(8084), IdleChannel::new(addr(8084))))) .await .unwrap(); @@ -1000,7 +1119,7 @@ mod tests { // 2. Synchronously resolve the new connect future. connector.resolve_all(); // 3. 
Drain the now-ready connecting future; `poll_connecting` - // sees `state.is_ejected() == true` and calls `place_ejected`. + // sees `state.is_ejected() == true` and re-ejects. let _ = poll_ready_now(&mut lb); assert!( @@ -1008,13 +1127,64 @@ mod tests { "8084 must not be in ready while still logically ejected" ); assert!( - lb.outlier - .as_ref() - .unwrap() - .ejected() - .contains_key(&addr(8084)), + lb.ejected.contains_key(&addr(8084)), "8084 must remain in the ejected pool after re-insert" ); assert!(state_8084.is_ejected()); } + + /// Once `base × multiplier` time elapses on an ejected channel, + /// the [`EjectedChannel`]'s timer fires and the LB's + /// `poll_unejection` should move the channel back to `ready`. + #[tokio::test(start_paused = true)] + async fn test_outlier_detection_timer_driven_unejection() { + let mut config = fp_config(50, 5, 3); + // Short base for fast test; multiplier is 1 on first eject. + config.base_ejection_time = Duration::from_secs(10); + config.max_ejection_time = Duration::from_secs(60); + + let (tx, discover) = new_discover(); + let (mut lb, connector, registry) = make_lb_with_outlier(discover, config); + + for port in 8080..=8084 { + tx.send(Ok(Change::Insert(addr(port), IdleChannel::new(addr(port))))) + .await + .unwrap(); + } + drive_to_ready(&mut lb, &connector).await; + connector + .service(&addr(8084)) + .fail_call + .store(true, Ordering::Relaxed); + for _ in 0..100 { + let _ = lb.call("hello").await; + } + let _ = poll_ready_now(&mut lb); + assert!( + lb.ejected.contains_key(&addr(8084)), + "8084 must be ejected before the timer fires" + ); + assert!(registry.add_channel(addr(8084)).is_ejected()); + + // Stop 8084 from failing so it can serve again, then advance + // past `base × multiplier = 10s`. 
+ connector + .service(&addr(8084)) + .fail_call + .store(false, Ordering::Relaxed); + tokio::time::advance(Duration::from_secs(11)).await; + // Drive poll_ready; `EjectedChannel`'s timer fires and + // `poll_unejection` routes 8084 back to ready. + let _ = poll_ready_now(&mut lb); + + assert!( + !lb.ejected.contains_key(&addr(8084)), + "8084 must leave the ejected pool once the timer fires" + ); + assert!( + lb.ready.contains_key(&addr(8084)), + "8084 must be back in ready after un-ejection" + ); + assert!(!registry.add_channel(addr(8084)).is_ejected()); + } } diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index e4fd685d2..80666cb1f 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -1,46 +1,51 @@ //! gRFC A50 outlier detection. //! -//! The algorithm is split between the data path and a spawned actor: +//! The algorithm is split between the data path, the load balancer, +//! and a spawned actor: //! //! - **Per-RPC detection** runs inline on each call completion via //! [`OutlierStatsRegistry::record_outcome`]. The wrapper records the //! outcome on the channel's [`OutlierChannelState`], evaluates the -//! failure-percentage threshold against the channel's local -//! counters, and ejects the channel directly by flipping its -//! `watch::Sender`. Cluster-wide gates (`minimum_hosts`, -//! `max_ejection_percent`) are enforced via two atomic counters on -//! the registry, kept in sync as channels cross thresholds. +//! failure-percentage threshold, and on transition to ejected sends +//! the address through an mpsc channel for the LB to consume. +//! Cluster-wide gates (`minimum_hosts`, `max_ejection_percent`) are +//! enforced via two atomic counters on the registry, kept in sync +//! as channels cross thresholds. +//! - **The load balancer** drains the eject mpsc in `poll_ready`, +//! 
consumes the matching [`ReadyChannel`] via +//! [`ReadyChannel::eject`], and tracks the resulting +//! [`EjectedChannel`] in a `KeyedFutures`. Each ejected channel's +//! internal sleep fires at exactly `base × multiplier` (capped by +//! `max_ejection_time`) after ejection, yielding +//! [`UnejectedChannel::Ready`]; the LB drains it on the next +//! `poll_ready` and routes the channel back to the ready set. //! - **Interval-based housekeeping** runs in a spawned actor (see //! [`spawn_actor`]). It resets per-channel counters at the -//! `config.interval` boundary, un-ejects channels whose -//! `base × multiplier` backoff has elapsed, and decrements -//! multipliers for non-ejected channels. The actor never makes -//! ejection decisions. -//! -//! `LoadBalancer::poll_ready` observes ejections in O(1) per -//! transition by polling a `FuturesUnordered` -//! over each channel's signal. +//! `config.interval` boundary and decrements multipliers for +//! non-ejected channels. Un-ejection is timer-driven by +//! [`EjectedChannel`] — the actor never un-ejects. //! //! Only the failure-percentage algorithm is dispatched. The //! success-rate algorithm (cross-endpoint mean/stdev) is left to a //! follow-up. //! //! [gRFC A50]: https://github.com/grpc/proposal/blob/master/A50-xds-outlier-detection.md +//! [`ReadyChannel`]: crate::client::loadbalance::channel_state::ReadyChannel +//! [`ReadyChannel::eject`]: crate::client::loadbalance::channel_state::ReadyChannel::eject +//! [`EjectedChannel`]: crate::client::loadbalance::channel_state::EjectedChannel +//! 
[`UnejectedChannel::Ready`]: crate::client::loadbalance::channel_state::UnejectedChannel::Ready -use std::collections::HashMap; -use std::pin::Pin; use std::sync::Arc; +use std::sync::Mutex; use std::sync::atomic::{AtomicU64, Ordering}; use std::task::{Context, Poll}; -use std::time::Instant; +use std::time::{Duration, Instant}; use dashmap::DashMap; -use indexmap::IndexMap; -use tokio_stream::StreamMap; -use tokio_stream::wrappers::WatchStream; +use tokio::sync::mpsc; use crate::client::endpoint::EndpointAddress; -use crate::client::loadbalance::channel_state::{OutlierChannelState, ReadyChannel}; +use crate::client::loadbalance::channel_state::OutlierChannelState; use crate::common::async_util::AbortOnDrop; use crate::xds::resource::outlier_detection::OutlierDetectionConfig; @@ -65,8 +70,9 @@ impl Rng for FastRandRng { /// [`Self::record_outcome`] after each RPC completion. /// - The spawned actor task, which calls [`Self::run_housekeeping`] /// on every `config.interval` tick. -/// - The load balancer's `poll_ready`, which subscribes to per-channel -/// ejection signals via [`OutlierChannelState::subscribe`]. +/// - The load balancer's `poll_ready`, which drains the eject mpsc +/// (via [`OutlierDetector::poll_eject_request`]) and calls +/// [`Self::note_uneject`] when an `EjectedChannel`'s timer fires. pub(crate) struct OutlierStatsRegistry { /// Per-endpoint state, keyed by address. Inserted by the LB on /// channel creation and removed on disconnect. @@ -79,6 +85,15 @@ pub(crate) struct OutlierStatsRegistry { ejected_count: AtomicU64, config: OutlierDetectionConfig, rng: Box, + /// Sender half of the eject signal. `record_outcome` pushes an + /// address through on transition to ejected; the LB's + /// [`OutlierDetector`] drains the receiver in `poll_ready` and + /// consumes the matching `ReadyChannel`. + eject_tx: mpsc::UnboundedSender, + /// Receiver half, handed to the LB at construction time. 
Wrapped + /// in a `Mutex>` so [`Self::take_eject_rx`] can move it + /// out exactly once. Outside that hand-off there is no contention. + eject_rx: Mutex>>, } impl OutlierStatsRegistry { @@ -89,15 +104,28 @@ impl OutlierStatsRegistry { /// Build a registry with a custom [`Rng`]. pub(crate) fn with_rng(config: OutlierDetectionConfig, rng: Box) -> Arc { + let (eject_tx, eject_rx) = mpsc::unbounded_channel(); Arc::new(Self { channels: DashMap::new(), qualifying_count: AtomicU64::new(0), ejected_count: AtomicU64::new(0), config, rng, + eject_tx, + eject_rx: Mutex::new(Some(eject_rx)), }) } + /// Take the eject-signal receiver. Called exactly once by + /// [`OutlierDetector::new`]. + fn take_eject_rx(&self) -> mpsc::UnboundedReceiver { + self.eject_rx + .lock() + .expect("eject_rx mutex poisoned") + .take() + .expect("OutlierStatsRegistry::take_eject_rx called more than once") + } + /// Register a channel and return the `Arc` /// the load balancer wires into the channel; the same `Arc` is /// retained in the registry so the actor can iterate it. If a @@ -132,8 +160,15 @@ impl OutlierStatsRegistry { /// Per-RPC entry point. Called by the load balancer's call wrapper /// after each RPC completion. Increments the channel's success or /// failure counter and then evaluates the failure-percentage - /// threshold; if all gates pass, ejects the channel inline. - pub(crate) fn record_outcome(&self, state: &OutlierChannelState, success: bool) { + /// threshold; if all gates pass and the channel was not already + /// ejected, marks it ejected and sends the address through the + /// eject mpsc for the LB to consume. 
+ pub(crate) fn record_outcome( + &self, + addr: &EndpointAddress, + state: &OutlierChannelState, + success: bool, + ) { if success { state.record_success(); } else { @@ -179,20 +214,60 @@ impl OutlierStatsRegistry { if state.try_eject(Instant::now()) { self.ejected_count.fetch_add(1, Ordering::Relaxed); + // The LB drains this in `poll_ready` and consumes the + // `ReadyChannel` via `ReadyChannel::eject`. If the LB has + // dropped its receiver (shutdown), the send fails silently + // — the channel will be cleaned up by `forget`. + let _ = self.eject_tx.send(addr.clone()); + } + } + + /// Clear the ejection on `state` and decrement the cluster-wide + /// `ejected_count`. Returns whether the transition fired (so + /// callers can guard against double-counting). Called by the LB + /// when an `EjectedChannel`'s timer fires and yields + /// `UnejectedChannel::Ready`. + pub(crate) fn note_uneject(&self, state: &OutlierChannelState) -> bool { + if state.try_uneject() { + self.ejected_count.fetch_sub(1, Ordering::Relaxed); + true + } else { + false } } - /// Interval-boundary housekeeping. Called by the spawned actor on - /// each `config.interval` tick. Resets counters, un-ejects - /// channels whose backoff has elapsed, and decrements multipliers - /// for non-ejected channels. - pub(crate) fn run_housekeeping(&self, now: Instant) { - // Cap the un-ejection backoff at `max(base, max_ejection_time)`. + /// Compute how long `state` still has to remain ejected, or + /// `None` if it is not currently ejected. Returns + /// `Some(Duration::ZERO)` if the deadline has already passed + /// (caller should un-eject immediately rather than starting a + /// fresh sleep). Used by the LB on initial ejection and on + /// re-discovery to size the `EjectionConfig::timeout`. 
+ pub(crate) fn remaining_ejection( + &self, + state: &OutlierChannelState, + now: Instant, + ) -> Option { + let elapsed = state.ejected_duration(now)?; + let multiplier = state.ejection_multiplier(); let cap = self .config .base_ejection_time .max(self.config.max_ejection_time); + let target = self + .config + .base_ejection_time + .checked_mul(multiplier) + .unwrap_or(cap) + .min(cap); + Some(target.checked_sub(elapsed).unwrap_or_default()) + } + /// Interval-boundary housekeeping. Called by the spawned actor on + /// each `config.interval` tick. Resets counters and decrements + /// multipliers for non-ejected channels. Does **not** un-eject — + /// un-ejection is timer-driven by each `EjectedChannel` and + /// handled by the LB when the channel resolves. + pub(crate) fn run_housekeeping(&self) { for entry in self.channels.iter() { let state = entry.value(); @@ -203,16 +278,7 @@ impl OutlierStatsRegistry { self.qualifying_count.fetch_sub(1, Ordering::Relaxed); } - if state.is_ejected() { - let multiplier = state.ejection_multiplier(); - let elapsed = state.ejected_duration(now).unwrap_or_default(); - if let Some(scaled) = self.config.base_ejection_time.checked_mul(multiplier) - && elapsed >= scaled.min(cap) - && state.try_uneject() - { - self.ejected_count.fetch_sub(1, Ordering::Relaxed); - } - } else { + if !state.is_ejected() { state.decrement_multiplier(); } } @@ -236,121 +302,57 @@ pub(crate) fn spawn_actor(registry: Arc) -> AbortOnDrop { ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); loop { ticker.tick().await; - registry.run_housekeeping(Instant::now()); + registry.run_housekeeping(); } }); AbortOnDrop(task) } -/// All per-LB outlier-detection state: the shared registry, the pool -/// of currently-ejected channels (whose connections are kept alive -/// across ejection), the per-channel ejection-signal streams -/// aggregated for O(1) observation in `poll_ready`, and the handle to -/// the housekeeping actor (dropped with the LB). 
+/// Per-LB outlier-detection plumbing: the shared registry, the +/// receiver half of the eject signal mpsc, and the handle to the +/// housekeeping actor (dropped with the LB). /// -/// `LoadBalancer` holds this as `Option>`: `None` -/// when outlier detection is disabled, `Some` when enabled. -pub(crate) struct OutlierDetector { +/// `LoadBalancer` holds this as `Option`: `None` +/// when outlier detection is disabled, `Some` when enabled. The +/// pool of ejected channels themselves lives directly on the LB in a +/// `KeyedFutures<_, UnejectedChannel<_>>` — see the channel state +/// machine in [`channel_state`] for the type-state transitions. +/// +/// [`channel_state`]: crate::client::loadbalance::channel_state +pub(crate) struct OutlierDetector { registry: Arc, - ejected: HashMap>, - ejection_signals: StreamMap>, + eject_rx: mpsc::UnboundedReceiver, _actor: AbortOnDrop, } -impl OutlierDetector { - /// Build from a registry, spawning the housekeeping actor. +impl OutlierDetector { + /// Build from a registry, spawning the housekeeping actor and + /// taking ownership of the eject-signal receiver. pub(crate) fn new(registry: Arc) -> Self { + let eject_rx = registry.take_eject_rx(); let _actor = spawn_actor(registry.clone()); Self { registry, - ejected: HashMap::new(), - ejection_signals: StreamMap::new(), + eject_rx, _actor, } } - /// Shared registry handle — clone to hand to the data path. + /// Shared registry handle. pub(crate) fn registry(&self) -> &Arc { &self.registry } - /// Register a newly-connected channel for tracking and (on first - /// registration only) subscribe to its ejection signal. Returns - /// the per-channel state for the load balancer to wire into - /// [`ReadyChannel`]. - /// - /// When an endpoint is re-discovered (Insert for an address whose - /// registry entry was preserved), the existing signal subscription - /// is left in place so any pending ejection transition is not - /// dropped. 
- pub(crate) fn register(&mut self, addr: EndpointAddress) -> Arc { - let state = self.registry.add_channel(addr.clone()); - if !self.ejection_signals.contains_key(&addr) { - self.ejection_signals - .insert(addr, WatchStream::from_changes(state.subscribe())); - } - state - } - - /// Drop all bookkeeping for `addr`: ejection slot, signal stream, - /// registry entry. Used when the endpoint is removed from the - /// cluster. - pub(crate) fn forget(&mut self, addr: &EndpointAddress) { - self.ejected.remove(addr); - self.ejection_signals.remove(addr); - self.registry.remove_channel(addr); - } - - /// Drop the ejected-pool entry for `addr` (which holds an obsolete - /// `ReadyChannel`) but preserve the registry entry — counters, - /// ejection multiplier, and ejection flag carry across the - /// reconnect. Used when an endpoint is re-discovered. - /// - /// Matches grpc-go (`internal/xds/balancer/outlierdetection`) and - /// Envoy (`BaseDynamicClusterImpl::updateDynamicHostList` reusing - /// existing `HostSharedPtr`s): outlier state is keyed by stable - /// endpoint identity and survives transient discovery flaps. - pub(crate) fn clear_active_slots(&mut self, addr: &EndpointAddress) { - self.ejected.remove(addr); - } - - /// Place a freshly-connected channel directly into the ejected - /// pool. Used by the load balancer when the preserved state for a - /// re-discovered endpoint says it is still ejected; this avoids a - /// brief window of routing traffic to a logically-ejected channel - /// until the housekeeping actor un-ejects it. - pub(crate) fn place_ejected(&mut self, addr: EndpointAddress, ch: ReadyChannel) { - self.ejected.insert(addr, ch); - } - - /// Drain ejection-signal transitions, moving channels between - /// `ready` and the internal ejected pool. O(k) per call where k is - /// the number of pending signal changes. - pub(crate) fn poll_signals( + /// Poll for the next address whose data path has decided to + /// eject. 
Returns `Poll::Pending` when no eject decision is + /// queued; returns `Poll::Ready(None)` only if the registry has + /// been dropped (which can't happen while this detector holds an + /// `Arc`). + pub(crate) fn poll_eject_request( &mut self, cx: &mut Context<'_>, - ready: &mut IndexMap>, - ) { - use futures_core::Stream; - while let Poll::Ready(Some((addr, ejected))) = - Pin::new(&mut self.ejection_signals).poll_next(cx) - { - if ejected { - if let Some(ch) = ready.swap_remove(&addr) { - tracing::debug!("outlier detection: eject {addr}"); - self.ejected.insert(addr, ch); - } - } else if let Some(ch) = self.ejected.remove(&addr) { - tracing::debug!("outlier detection: uneject {addr}"); - ready.insert(addr, ch); - } - } - } - - /// Number of currently-ejected channels. - #[cfg(test)] - pub(crate) fn ejected(&self) -> &HashMap> { - &self.ejected + ) -> Poll> { + self.eject_rx.poll_recv(cx) } } @@ -426,15 +428,16 @@ mod tests { /// Drive `n` outcomes through `record_outcome` for one channel. fn drive( registry: &OutlierStatsRegistry, + a: &EndpointAddress, state: &OutlierChannelState, successes: u64, failures: u64, ) { for _ in 0..successes { - registry.record_outcome(state, true); + registry.record_outcome(a, state, true); } for _ in 0..failures { - registry.record_outcome(state, false); + registry.record_outcome(a, state, false); } } @@ -446,9 +449,9 @@ mod tests { let bad = registry.add_channel(addr(8084)); for port in 8080..=8083 { let s = registry.add_channel(addr(port)); - drive(®istry, &s, 100, 0); + drive(®istry, &addr(port), &s, 100, 0); } - drive(®istry, &bad, 10, 90); + drive(®istry, &addr(8084), &bad, 10, 90); assert!(bad.is_ejected()); assert_eq!(registry.ejected_count.load(Ordering::Relaxed), 1); } @@ -460,7 +463,7 @@ mod tests { for port in 8080..=8084 { let s = registry.add_channel(addr(port)); // 30% failure → below 50% threshold. 
- drive(®istry, &s, 70, 30); + drive(®istry, &addr(port), &s, 70, 30); all.push(s); } for s in &all { @@ -475,7 +478,7 @@ mod tests { let mut all = vec![]; for port in 8080..=8084 { let s = registry.add_channel(addr(port)); - drive(®istry, &s, 50, 50); + drive(®istry, &addr(port), &s, 50, 50); all.push(s); } for s in &all { @@ -490,7 +493,7 @@ mod tests { let mut all = vec![]; for port in 8080..=8081 { let s = registry.add_channel(addr(port)); - drive(®istry, &s, 0, 100); + drive(®istry, &addr(port), &s, 0, 100); all.push(s); } for s in &all { @@ -502,10 +505,10 @@ mod tests { fn request_volume_filters_low_traffic() { let registry = OutlierStatsRegistry::with_rng(fp_config(50, 100, 3), FixedRng::boxed(99)); let bad = registry.add_channel(addr(8080)); - drive(®istry, &bad, 0, 5); + drive(®istry, &addr(8080), &bad, 0, 5); for port in 8081..=8084 { let s = registry.add_channel(addr(port)); - drive(®istry, &s, 200, 0); + drive(®istry, &addr(port), &s, 200, 0); } assert!(!bad.is_ejected()); } @@ -522,7 +525,7 @@ mod tests { let mut all = vec![]; for port in 8080..=8084 { let s = registry.add_channel(addr(port)); - drive(®istry, &s, 0, 100); + drive(®istry, &addr(port), &s, 0, 100); all.push(s); } for s in &all { @@ -538,15 +541,16 @@ mod tests { let mut all = vec![]; for port in 8080..=8084 { - let s = registry.add_channel(addr(port)); - all.push(s); + let a = addr(port); + let s = registry.add_channel(a.clone()); + all.push((a, s)); } // Drive all hosts to bad state in parallel pseudo-order. - for s in &all { - drive(®istry, s, 0, 100); + for (a, s) in &all { + drive(®istry, a, s, 0, 100); } - let ejected = all.iter().filter(|s| s.is_ejected()).count(); + let ejected = all.iter().filter(|(_, s)| s.is_ejected()).count(); // 5 hosts × 20% = 1 max ejection. 
assert_eq!(ejected, 1); } @@ -557,11 +561,11 @@ mod tests { let mut all = vec![]; for port in 8080..=8083 { let s = registry.add_channel(addr(port)); - drive(®istry, &s, 100, 0); + drive(®istry, &addr(port), &s, 100, 0); all.push(s); } let bad = registry.add_channel(addr(8084)); - drive(®istry, &bad, 0, 100); + drive(®istry, &addr(8084), &bad, 0, 100); assert!(bad.is_ejected()); assert_eq!(registry.ejected_count.load(Ordering::Relaxed), 1); // Each healthy host crossed request_volume; bad too. So @@ -573,6 +577,25 @@ mod tests { assert_eq!(registry.qualifying_count.load(Ordering::Relaxed), 4); } + #[test] + fn ejection_dispatches_address_through_mpsc() { + let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); + let mut rx = registry.take_eject_rx(); + let bad = registry.add_channel(addr(8084)); + for port in 8080..=8083 { + let s = registry.add_channel(addr(port)); + drive(®istry, &addr(port), &s, 100, 0); + } + drive(®istry, &addr(8084), &bad, 10, 90); + + // Eject dispatched exactly once via the mpsc. 
+ assert_eq!(rx.try_recv(), Ok(addr(8084))); + assert!(matches!( + rx.try_recv(), + Err(mpsc::error::TryRecvError::Empty) + )); + } + // ----- Housekeeping ----- #[test] @@ -580,11 +603,11 @@ mod tests { let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); for port in 8080..=8083 { let s = registry.add_channel(addr(port)); - drive(®istry, &s, 100, 0); + drive(®istry, &addr(port), &s, 100, 0); } assert_eq!(registry.qualifying_count.load(Ordering::Relaxed), 4); - registry.run_housekeeping(Instant::now()); + registry.run_housekeeping(); assert_eq!(registry.qualifying_count.load(Ordering::Relaxed), 0); for port in 8080..=8083 { let s = registry.channels.get(&addr(port)).unwrap(); @@ -593,59 +616,113 @@ mod tests { } #[test] - fn housekeeping_unejects_after_base_time() { + fn housekeeping_decrements_multiplier_on_healthy_interval() { + let registry = OutlierStatsRegistry::with_rng(base_config(), FixedRng::boxed(99)); + let s = registry.add_channel(addr(8080)); + // Force multiplier to 3 directly (no traffic, no eject). + s.set_ejection_multiplier(3); + + registry.run_housekeeping(); + assert_eq!(s.ejection_multiplier(), 2); + } + + #[test] + fn housekeeping_leaves_ejected_multipliers_alone() { + let registry = OutlierStatsRegistry::with_rng(base_config(), FixedRng::boxed(99)); + let s = registry.add_channel(addr(8080)); + s.try_eject(Instant::now()); + s.set_ejection_multiplier(3); + + registry.run_housekeeping(); + // Ejected channels keep their multiplier; un-ejection is the + // LB's job (timer-driven via EjectedChannel). 
+ assert_eq!(s.ejection_multiplier(), 3); + assert!(s.is_ejected()); + } + + // ----- remaining_ejection / note_uneject ----- + + #[test] + fn remaining_ejection_returns_full_duration_for_fresh_eject() { let mut config = fp_config(50, 10, 3); config.base_ejection_time = Duration::from_secs(10); config.max_ejection_time = Duration::from_secs(60); let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); - - let bad = registry.add_channel(addr(8084)); - for port in 8080..=8083 { - let s = registry.add_channel(addr(port)); - drive(®istry, &s, 100, 0); - } - drive(®istry, &bad, 0, 100); - assert!(bad.is_ejected()); - - // Advance fewer than base_ejection_time ⇒ stays ejected. + let s = registry.add_channel(addr(8080)); let t0 = Instant::now(); - registry.run_housekeeping(t0 + Duration::from_secs(9)); - assert!(bad.is_ejected()); - - // After base_ejection_time × 1 elapsed ⇒ uneject. - registry.run_housekeeping(t0 + Duration::from_secs(20)); - assert!(!bad.is_ejected()); - assert_eq!(registry.ejected_count.load(Ordering::Relaxed), 0); + s.try_eject(t0); + // Multiplier is 1 after the first eject, so target = 10s. + let remaining = registry.remaining_ejection(&s, t0).unwrap(); + assert_eq!(remaining, Duration::from_secs(10)); } #[test] - fn housekeeping_decrements_multiplier_on_healthy_interval() { - let registry = OutlierStatsRegistry::with_rng(base_config(), FixedRng::boxed(99)); + fn remaining_ejection_capped_at_max_ejection_time() { + let mut config = fp_config(50, 10, 3); + config.base_ejection_time = Duration::from_secs(10); + config.max_ejection_time = Duration::from_secs(15); + let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); let s = registry.add_channel(addr(8080)); - // Force multiplier to 3 directly (no traffic, no eject). - s.set_ejection_multiplier(3); + let t0 = Instant::now(); + s.try_eject(t0); + s.set_ejection_multiplier(10); // base * 10 = 100s, but cap = 15s. 
+ let remaining = registry.remaining_ejection(&s, t0).unwrap(); + assert_eq!(remaining, Duration::from_secs(15)); + } - registry.run_housekeeping(Instant::now()); - assert_eq!(s.ejection_multiplier(), 2); + #[test] + fn remaining_ejection_subtracts_elapsed_for_re_discovery() { + let mut config = fp_config(50, 10, 3); + config.base_ejection_time = Duration::from_secs(30); + config.max_ejection_time = Duration::from_secs(60); + let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); + let s = registry.add_channel(addr(8080)); + let t0 = Instant::now(); + s.try_eject(t0); + // Re-discovered 10s into the ejection — should still have 20s left. + let remaining = registry + .remaining_ejection(&s, t0 + Duration::from_secs(10)) + .unwrap(); + assert_eq!(remaining, Duration::from_secs(20)); } #[test] - fn housekeeping_caps_ejection_at_max_ejection_time() { + fn remaining_ejection_zero_past_deadline() { let mut config = fp_config(50, 10, 3); config.base_ejection_time = Duration::from_secs(10); - config.max_ejection_time = Duration::from_secs(15); + config.max_ejection_time = Duration::from_secs(60); let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); + let s = registry.add_channel(addr(8080)); + let t0 = Instant::now(); + s.try_eject(t0); + // 60s have passed but target is 10s — caller should un-eject. 
+ let remaining = registry + .remaining_ejection(&s, t0 + Duration::from_secs(60)) + .unwrap(); + assert_eq!(remaining, Duration::ZERO); + } + #[test] + fn remaining_ejection_none_when_not_ejected() { + let registry = OutlierStatsRegistry::with_rng(base_config(), FixedRng::boxed(99)); + let s = registry.add_channel(addr(8080)); + assert!(registry.remaining_ejection(&s, Instant::now()).is_none()); + } + + #[test] + fn note_uneject_clears_state_and_decrements_counter() { + let registry = OutlierStatsRegistry::with_rng(base_config(), FixedRng::boxed(99)); let s = registry.add_channel(addr(8080)); - // Pretend 8080 was ejected long ago with a huge multiplier. s.try_eject(Instant::now()); - s.set_ejection_multiplier(10); - registry.ejected_count.fetch_add(0, Ordering::Relaxed); // try_eject already added 1 + registry.ejected_count.fetch_add(1, Ordering::Relaxed); + assert!(s.is_ejected()); - // base * multiplier = 100s, but cap = 15s. Sweep at 16s ⇒ uneject. - let t0 = Instant::now(); - registry.run_housekeeping(t0 + Duration::from_secs(16)); + assert!(registry.note_uneject(&s)); assert!(!s.is_ejected()); + assert_eq!(registry.ejected_count.load(Ordering::Relaxed), 0); + + // Second call is a no-op. 
+ assert!(!registry.note_uneject(&s)); } // ----- Spawned actor ----- @@ -687,7 +764,7 @@ mod tests { } #[test] - fn channel_state_try_eject_uneject_flips_signal() { + fn channel_state_try_eject_uneject_transitions_atomically() { let s = OutlierChannelState::new(); assert!(!s.is_ejected()); assert!(s.try_eject(Instant::now())); From 7cf9053857a0644f06bc02eb21b3a20041183970 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Tue, 12 May 2026 11:01:33 -0700 Subject: [PATCH 26/33] fix(tonic-xds): decrement multiplier on un-eject to match A50 step 6.b MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A50 step 6 runs once per interval and (a) un-ejects hosts whose backoff has elapsed, then (b) decrements the multiplier for every non-ejected host — in the same sweep. Envoy implements this exactly, so a host un-ejected at sweep N has its multiplier decremented at sweep N. In this PR's design un-ejection is timer-driven (each EjectedChannel holds its own Sleep), decoupled from the housekeeping sweep. With the previous note_uneject, the multiplier was only decremented at the next housekeeping interval — leaving a window where a re-eject during that window would see a stale (one-higher) multiplier and back off too aggressively relative to the spec. Apply the decrement inside note_uneject so it happens atomically with the transition. The actor's housekeeping decrement still runs at each interval; saturating arithmetic keeps the eventual decrement-to-zero correct. Adds a focused test (`re_eject_after_uneject_uses_fresh_multiplier`) verifying that a re-ejection immediately after un-ejection sizes the remaining-ejection duration with the fresh multiplier (base × 1), not the stale one (base × 2). 
--- .../client/loadbalance/outlier_detection.rs | 63 +++++++++++++++++-- 1 file changed, 57 insertions(+), 6 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 80666cb1f..bd19fc6bb 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -222,14 +222,25 @@ impl OutlierStatsRegistry { } } - /// Clear the ejection on `state` and decrement the cluster-wide - /// `ejected_count`. Returns whether the transition fired (so - /// callers can guard against double-counting). Called by the LB - /// when an `EjectedChannel`'s timer fires and yields - /// `UnejectedChannel::Ready`. + /// Clear the ejection on `state`, decrement the cluster-wide + /// `ejected_count`, and decrement the channel's ejection + /// multiplier (matching gRFC A50 step 6.b, which decrements + /// multiplier in the same sweep that un-ejects). Returns whether + /// the transition fired (so callers can guard against + /// double-counting). Called by the LB when an `EjectedChannel`'s + /// timer fires and yields `UnejectedChannel::Ready`. pub(crate) fn note_uneject(&self, state: &OutlierChannelState) -> bool { if state.try_uneject() { self.ejected_count.fetch_sub(1, Ordering::Relaxed); + // Per A50, the same sweep that un-ejects also decrements + // the multiplier. Since our un-ejection is timer-driven + // (decoupled from the housekeeping sweep), we apply the + // decrement here to avoid a window where a re-eject would + // see a stale (one-higher) multiplier and back off too + // aggressively. The actor's housekeeping decrement still + // runs at each interval; saturating arithmetic ensures + // the eventual decrement to zero stays correct. 
+ state.decrement_multiplier(); true } else { false @@ -713,16 +724,56 @@ mod tests { fn note_uneject_clears_state_and_decrements_counter() { let registry = OutlierStatsRegistry::with_rng(base_config(), FixedRng::boxed(99)); let s = registry.add_channel(addr(8080)); - s.try_eject(Instant::now()); + s.try_eject(Instant::now()); // bumps multiplier 0 → 1 registry.ejected_count.fetch_add(1, Ordering::Relaxed); assert!(s.is_ejected()); + assert_eq!(s.ejection_multiplier(), 1); assert!(registry.note_uneject(&s)); assert!(!s.is_ejected()); assert_eq!(registry.ejected_count.load(Ordering::Relaxed), 0); + // A50 step 6.b: same sweep that un-ejects also decrements + // the multiplier. + assert_eq!(s.ejection_multiplier(), 0); // Second call is a no-op. assert!(!registry.note_uneject(&s)); + assert_eq!(s.ejection_multiplier(), 0); + } + + /// Re-ejecting a channel immediately after un-ejection should + /// produce a backoff sized for multiplier=1, not multiplier=2 — + /// i.e. it should *not* punish the channel for the previous + /// ejection that has just finished serving its cooldown. This is + /// what gRFC A50 prescribes and what Envoy does (un-eject and + /// decrement happen at the same sweep). + #[test] + fn re_eject_after_uneject_uses_fresh_multiplier() { + let mut config = fp_config(50, 10, 3); + config.base_ejection_time = Duration::from_secs(10); + config.max_ejection_time = Duration::from_secs(300); + let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); + let s = registry.add_channel(addr(8080)); + + let t0 = Instant::now(); + s.try_eject(t0); // multiplier 0 → 1 + registry.ejected_count.fetch_add(1, Ordering::Relaxed); + assert_eq!(s.ejection_multiplier(), 1); + + // Backoff elapses; LB calls note_uneject. + registry.note_uneject(&s); + assert_eq!(s.ejection_multiplier(), 0); + + // Channel immediately misbehaves again and gets re-ejected. 
+ let t1 = t0 + Duration::from_secs(11); + s.try_eject(t1); // multiplier 0 → 1, not 1 → 2 + assert_eq!(s.ejection_multiplier(), 1); + // Remaining ejection duration should be `base * 1 = 10s`, + // not `base * 2 = 20s`. + assert_eq!( + registry.remaining_ejection(&s, t1).unwrap(), + Duration::from_secs(10), + ); } // ----- Spawned actor ----- From 3ef3748fcbcd14b60840a59e64a45db2561e43d4 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Tue, 12 May 2026 11:04:28 -0700 Subject: [PATCH 27/33] fix(tonic-xds): make decrement_multiplier atomic via fetch_update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The load-then-store implementation could lose decrements when: - actor housekeeping decrements concurrently with note_uneject (both call decrement_multiplier); - or either of those races a data-path try_eject (which does fetch_add on the same atomic). Swap to fetch_update with a saturating closure so the read-modify- write is atomic. Bias was bounded at ±1 before, so this is not a correctness fix per se — just closes a small race window cleanly. --- tonic-xds/src/client/loadbalance/channel_state.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/channel_state.rs b/tonic-xds/src/client/loadbalance/channel_state.rs index 472ba95c8..3e6354e8d 100644 --- a/tonic-xds/src/client/loadbalance/channel_state.rs +++ b/tonic-xds/src/client/loadbalance/channel_state.rs @@ -207,12 +207,15 @@ impl OutlierChannelState { } /// Decrement the multiplier saturating at zero. Called by the - /// actor on healthy intervals. + /// actor on healthy intervals and by `note_uneject` on un-ejection. + /// Uses `fetch_update` so the load-and-store is atomic against + /// concurrent `try_eject` (`fetch_add`) and other decrements. 
pub(crate) fn decrement_multiplier(&self) { - let prev = self.ejection_multiplier.load(Ordering::Relaxed); - if prev > 0 { - self.ejection_multiplier.store(prev - 1, Ordering::Relaxed); - } + let _ = self + .ejection_multiplier + .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |v| { + if v > 0 { Some(v - 1) } else { None } + }); } /// Test-only setter for the ejection multiplier; lets tests drive From 66d2d6e8e9e8d670a613845250b4bb0eec63d20d Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Tue, 12 May 2026 11:08:28 -0700 Subject: [PATCH 28/33] fix(tonic-xds): error instead of panic when an OutlierStatsRegistry is wired twice MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The registry's eject-signal mpsc receiver is one-shot — a registry can drive at most one LoadBalancer. The previous implementation panicked at runtime if a misuse handed the same registry to two `with_outlier` calls. Return a typed error (`RegistryAlreadyWired`) from `OutlierStatsRegistry::take_eject_rx`, propagated through `OutlierDetector::new` and `LoadBalancer::with_outlier`. `LoadBalancer::new` stays infallible because the `outlier=None` path does not invoke the registry hand-off. Adds `test_outlier_registry_cannot_be_wired_twice` to lock the contract. 
--- .../src/client/loadbalance/loadbalancer.rs | 54 ++++++++++++++++--- .../client/loadbalance/outlier_detection.rs | 33 ++++++++---- 2 files changed, 71 insertions(+), 16 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/loadbalancer.rs b/tonic-xds/src/client/loadbalance/loadbalancer.rs index 4cedff7dd..63c4fedaf 100644 --- a/tonic-xds/src/client/loadbalance/loadbalancer.rs +++ b/tonic-xds/src/client/loadbalance/loadbalancer.rs @@ -35,7 +35,9 @@ use crate::client::loadbalance::channel_state::{ }; use crate::client::loadbalance::errors::LbError; use crate::client::loadbalance::keyed_futures::KeyedFutures; -use crate::client::loadbalance::outlier_detection::{OutlierDetector, OutlierStatsRegistry}; +use crate::client::loadbalance::outlier_detection::{ + OutlierDetector, OutlierStatsRegistry, RegistryAlreadyWired, +}; use crate::client::loadbalance::pickers::ChannelPicker; /// Future returned by [`LoadBalancer::call`]. @@ -110,27 +112,36 @@ where connector: Arc, picker: Arc, Req> + Send + Sync>, ) -> Self { - Self::with_outlier(discovery, connector, picker, None) + // Infallible: `with_outlier(_, _, _, None)` never touches the + // outlier-detection construction path. + match Self::with_outlier(discovery, connector, picker, None) { + Ok(lb) => lb, + Err(_) => unreachable!("with_outlier(.., None) cannot wire a registry"), + } } /// Create a load balancer, optionally enabling outlier detection. /// When `outlier` is `Some`, the registry's housekeeping actor is /// spawned and its lifetime is bound to the load balancer. + /// Returns [`RegistryAlreadyWired`] if the provided registry has + /// already been wired to another load balancer — a registry's + /// eject-signal receiver is one-shot. 
pub(crate) fn with_outlier( discovery: D, connector: Arc, picker: Arc, Req> + Send + Sync>, outlier: Option>, - ) -> Self { - Self { + ) -> Result { + let outlier = outlier.map(OutlierDetector::new).transpose()?; + Ok(Self { discovery, connector, connecting: KeyedFutures::new(), ready: IndexMap::new(), ejected: KeyedFutures::new(), - outlier: outlier.map(OutlierDetector::new), + outlier, picker, - } + }) } /// Purge all per-endpoint state for `addr`: the connecting @@ -928,7 +939,8 @@ mod tests { Arc::new(P2cPicker); let registry = OutlierStatsRegistry::with_rng(config, Box::new(AlwaysFireRng)); let lb = - LoadBalancer::with_outlier(discover, connector.clone(), picker, Some(registry.clone())); + LoadBalancer::with_outlier(discover, connector.clone(), picker, Some(registry.clone())) + .expect("registry not yet wired"); (lb, connector, registry) } @@ -1187,4 +1199,32 @@ mod tests { ); assert!(!registry.add_channel(addr(8084)).is_ejected()); } + + /// Sharing one `OutlierStatsRegistry` across two `LoadBalancer`s is + /// not supported — the eject-signal receiver is one-shot. The + /// second `with_outlier` call must return an error rather than + /// panic. + #[tokio::test] + async fn test_outlier_registry_cannot_be_wired_twice() { + let (_tx1, discover1) = new_discover(); + let (_tx2, discover2) = new_discover(); + let connector = Arc::new(MockConnector::new()); + let picker: Arc, &'static str> + Send + Sync> = + Arc::new(P2cPicker); + let registry = OutlierStatsRegistry::with_rng(fp_config(50, 5, 3), Box::new(AlwaysFireRng)); + + // First wiring succeeds. + LoadBalancer::with_outlier( + discover1, + connector.clone(), + picker.clone(), + Some(registry.clone()), + ) + .expect("first wire"); + + // Second wiring of the same registry must error, not panic. 
+ let result = + LoadBalancer::with_outlier(discover2, connector, picker, Some(registry.clone())); + assert!(result.is_err()); + } } diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index bd19fc6bb..d3066b600 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -49,6 +49,14 @@ use crate::client::loadbalance::channel_state::OutlierChannelState; use crate::common::async_util::AbortOnDrop; use crate::xds::resource::outlier_detection::OutlierDetectionConfig; +/// Construction-time error returned when a single +/// [`OutlierStatsRegistry`] is wired to more than one load balancer. +/// The registry's eject-signal receiver is one-shot; reuse is not +/// supported. +#[derive(Debug, thiserror::Error)] +#[error("OutlierStatsRegistry is already wired to a LoadBalancer")] +pub(crate) struct RegistryAlreadyWired; + /// Probability source for `enforcing_*` rolls. pub(crate) trait Rng: Send + Sync + 'static { /// Return a uniform random `u32` in `0..100`. @@ -117,13 +125,17 @@ impl OutlierStatsRegistry { } /// Take the eject-signal receiver. Called exactly once by - /// [`OutlierDetector::new`]. - fn take_eject_rx(&self) -> mpsc::UnboundedReceiver { + /// [`OutlierDetector::new`]. Returns + /// [`RegistryAlreadyWired`] if a previous call has already taken + /// the receiver — a registry can drive at most one load balancer. + fn take_eject_rx( + &self, + ) -> Result, RegistryAlreadyWired> { self.eject_rx .lock() .expect("eject_rx mutex poisoned") .take() - .expect("OutlierStatsRegistry::take_eject_rx called more than once") + .ok_or(RegistryAlreadyWired) } /// Register a channel and return the `Arc` @@ -338,15 +350,18 @@ pub(crate) struct OutlierDetector { impl OutlierDetector { /// Build from a registry, spawning the housekeeping actor and - /// taking ownership of the eject-signal receiver. 
- pub(crate) fn new(registry: Arc) -> Self { - let eject_rx = registry.take_eject_rx(); + /// taking ownership of the eject-signal receiver. Returns + /// [`RegistryAlreadyWired`] if the registry's receiver has + /// already been taken (i.e. this registry is already driving + /// another load balancer); a registry can drive at most one LB. + pub(crate) fn new(registry: Arc) -> Result { + let eject_rx = registry.take_eject_rx()?; let _actor = spawn_actor(registry.clone()); - Self { + Ok(Self { registry, eject_rx, _actor, - } + }) } /// Shared registry handle. @@ -591,7 +606,7 @@ mod tests { #[test] fn ejection_dispatches_address_through_mpsc() { let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); - let mut rx = registry.take_eject_rx(); + let mut rx = registry.take_eject_rx().expect("receiver available"); let bad = registry.add_channel(addr(8084)); for port in 8080..=8083 { let s = registry.add_channel(addr(port)); From 48ae8985461226a32c9edbe3e2651f7837e49c0e Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Tue, 12 May 2026 11:14:23 -0700 Subject: [PATCH 29/33] refactor(tonic-xds): give OutlierChannelState its own address MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Store the endpoint address directly on `OutlierChannelState` at construction time and expose `state.addr()` so downstream callers don't need to thread `(addr, state)` pairs alongside the state. API impact: - `OutlierChannelState::new(addr)` now takes the address explicitly. - `OutlierStatsRegistry::record_outcome(state, success)` drops its `addr` parameter; the mpsc dispatch reads `state.addr()`. - `ReadyChannel::addr()` (added earlier this PR only for the `record_outcome` thread-through) is removed — no remaining caller. The data path now passes just the `Arc` to `record_outcome`, which is cleaner and removes the awkwardness of two parameters that always travel together. 
--- .../src/client/loadbalance/channel_state.rs | 44 +++++++------ .../src/client/loadbalance/loadbalancer.rs | 7 +- .../client/loadbalance/outlier_detection.rs | 65 +++++++++---------- 3 files changed, 59 insertions(+), 57 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/channel_state.rs b/tonic-xds/src/client/loadbalance/channel_state.rs index 3e6354e8d..29db913c8 100644 --- a/tonic-xds/src/client/loadbalance/channel_state.rs +++ b/tonic-xds/src/client/loadbalance/channel_state.rs @@ -76,15 +76,23 @@ impl EndpointCounters { /// load balancer (consults `is_ejected` / `ejected_duration` on /// reconnect). /// -/// All fields are atomics so the data path can mutate them without -/// locking. Ejection state is encoded in [`Self::ejected_at_nanos`]: -/// zero means not ejected, non-zero is the nanos-since-epoch of the -/// ejection's start. [`Self::try_eject`] / [`Self::try_uneject`] use -/// CAS to flip the field atomically and report whether the transition -/// fired (so callers can update registry-level counters exactly once -/// per transition). +/// All mutable fields are atomics so the data path can mutate them +/// without locking. Ejection state is encoded in +/// [`Self::ejected_at_nanos`]: zero means not ejected, non-zero is the +/// nanos-since-epoch of the ejection's start. [`Self::try_eject`] / +/// [`Self::try_uneject`] use CAS to flip the field atomically and +/// report whether the transition fired (so callers can update +/// registry-level counters exactly once per transition). +/// +/// The `addr` field is set at construction and never changes, so +/// downstream callers (the registry's eject-mpsc dispatch in +/// particular) can recover the address from the state alone — no +/// need to thread `(addr, state)` pairs through the data path. #[derive(Debug)] pub(crate) struct OutlierChannelState { + /// Endpoint address this state belongs to. Immutable for the + /// lifetime of the state object. 
+ addr: EndpointAddress, counters: EndpointCounters, /// Whether this channel currently contributes to the registry's /// `qualifying_count`. Set when `total` first reaches @@ -103,15 +111,10 @@ pub(crate) struct OutlierChannelState { epoch: Instant, } -impl Default for OutlierChannelState { - fn default() -> Self { - Self::new() - } -} - impl OutlierChannelState { - pub(crate) fn new() -> Self { + pub(crate) fn new(addr: EndpointAddress) -> Self { Self { + addr, counters: EndpointCounters::default(), is_qualifying: AtomicBool::new(false), ejection_multiplier: AtomicU32::new(0), @@ -120,6 +123,11 @@ impl OutlierChannelState { } } + /// Endpoint address this state belongs to. + pub(crate) fn addr(&self) -> &EndpointAddress { + &self.addr + } + pub(crate) fn record_success(&self) { self.counters.record_success(); } @@ -342,11 +350,6 @@ impl ReadyChannel { &self.outlier } - /// Endpoint address this channel was created for. - pub(crate) fn addr(&self) -> &EndpointAddress { - &self.addr - } - /// Eject this channel (e.g., due to outlier detection). Consumes /// self. 
The outlier state remains in the registry; only the /// service and address are passed into [`EjectedChannel`] (which @@ -515,7 +518,8 @@ mod tests { } fn wrap_ready(addr: EndpointAddress, svc: MockService) -> ReadyChannel { - ReadyChannel::new(addr, svc, Arc::new(OutlierChannelState::new())) + let state = Arc::new(OutlierChannelState::new(addr.clone())); + ReadyChannel::new(addr, svc, state) } #[tokio::test] diff --git a/tonic-xds/src/client/loadbalance/loadbalancer.rs b/tonic-xds/src/client/loadbalance/loadbalancer.rs index 63c4fedaf..f37218623 100644 --- a/tonic-xds/src/client/loadbalance/loadbalancer.rs +++ b/tonic-xds/src/client/loadbalance/loadbalancer.rs @@ -215,7 +215,7 @@ where while let Poll::Ready(Some((addr, svc))) = self.connecting.poll_next(cx) { let state = match self.outlier.as_ref() { Some(o) => o.registry().add_channel(addr.clone()), - None => Arc::new(OutlierChannelState::new()), + None => Arc::new(OutlierChannelState::new(addr.clone())), }; let ready = ReadyChannel::new(addr.clone(), svc, state.clone()); let remaining = self @@ -318,7 +318,7 @@ where while let Poll::Ready(Some((addr, unejected))) = self.ejected.poll_next(cx) { let state = match self.outlier.as_ref() { Some(o) => o.registry().add_channel(addr.clone()), - None => Arc::new(OutlierChannelState::new()), + None => Arc::new(OutlierChannelState::new(addr.clone())), }; if let Some(o) = self.outlier.as_ref() { o.registry().note_uneject(&state); @@ -399,7 +399,6 @@ where // an owned service and outlier handle for the async block; both // are `Arc`-shared, so cloning is cheap. let mut svc = picked.clone(); - let addr = picked.addr().clone(); let outlier_state = picked.outlier().clone(); let registry = self.outlier.as_ref().map(|o| o.registry().clone()); LbFuture::Pending(Box::pin(async move { @@ -412,7 +411,7 @@ where // counter and (inside `record_outcome`) possibly // dispatch an eject request to the LB. Treat any // `Err` outcome as a failure for outlier purposes. 
- registry.record_outcome(&addr, &outlier_state, result.is_ok()); + registry.record_outcome(&outlier_state, result.is_ok()); } result.map_err(|e| LbError::LbChannelCallError(e.into())) })) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index d3066b600..6e1bed53b 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -145,8 +145,8 @@ impl OutlierStatsRegistry { /// state continuity across reconnect cycles is preserved. pub(crate) fn add_channel(&self, addr: EndpointAddress) -> Arc { self.channels - .entry(addr) - .or_insert_with(|| Arc::new(OutlierChannelState::new())) + .entry(addr.clone()) + .or_insert_with(|| Arc::new(OutlierChannelState::new(addr))) .clone() } @@ -175,12 +175,7 @@ impl OutlierStatsRegistry { /// threshold; if all gates pass and the channel was not already /// ejected, marks it ejected and sends the address through the /// eject mpsc for the LB to consume. - pub(crate) fn record_outcome( - &self, - addr: &EndpointAddress, - state: &OutlierChannelState, - success: bool, - ) { + pub(crate) fn record_outcome(&self, state: &OutlierChannelState, success: bool) { if success { state.record_success(); } else { @@ -230,7 +225,7 @@ impl OutlierStatsRegistry { // `ReadyChannel` via `ReadyChannel::eject`. If the LB has // dropped its receiver (shutdown), the send fails silently // — the channel will be cleaned up by `forget`. - let _ = self.eject_tx.send(addr.clone()); + let _ = self.eject_tx.send(state.addr().clone()); } } @@ -454,16 +449,15 @@ mod tests { /// Drive `n` outcomes through `record_outcome` for one channel. 
fn drive( registry: &OutlierStatsRegistry, - a: &EndpointAddress, state: &OutlierChannelState, successes: u64, failures: u64, ) { for _ in 0..successes { - registry.record_outcome(a, state, true); + registry.record_outcome(state, true); } for _ in 0..failures { - registry.record_outcome(a, state, false); + registry.record_outcome(state, false); } } @@ -475,9 +469,9 @@ mod tests { let bad = registry.add_channel(addr(8084)); for port in 8080..=8083 { let s = registry.add_channel(addr(port)); - drive(&registry, &addr(port), &s, 100, 0); + drive(&registry, &s, 100, 0); } - drive(&registry, &addr(8084), &bad, 10, 90); + drive(&registry, &bad, 10, 90); assert!(bad.is_ejected()); assert_eq!(registry.ejected_count.load(Ordering::Relaxed), 1); } @@ -489,7 +483,7 @@ mod tests { for port in 8080..=8084 { let s = registry.add_channel(addr(port)); // 30% failure → below 50% threshold. - drive(&registry, &addr(port), &s, 70, 30); + drive(&registry, &s, 70, 30); all.push(s); } for s in &all { @@ -504,7 +498,7 @@ mod tests { let mut all = vec![]; for port in 8080..=8084 { let s = registry.add_channel(addr(port)); - drive(&registry, &addr(port), &s, 50, 50); + drive(&registry, &s, 50, 50); all.push(s); } for s in &all { @@ -519,7 +513,7 @@ mod tests { let mut all = vec![]; for port in 8080..=8081 { let s = registry.add_channel(addr(port)); - drive(&registry, &addr(port), &s, 0, 100); + drive(&registry, &s, 0, 100); all.push(s); } for s in &all { @@ -531,10 +525,10 @@ mod tests { fn request_volume_filters_low_traffic() { let registry = OutlierStatsRegistry::with_rng(fp_config(50, 100, 3), FixedRng::boxed(99)); let bad = registry.add_channel(addr(8080)); - drive(&registry, &addr(8080), &bad, 0, 5); + drive(&registry, &bad, 0, 5); for port in 8081..=8084 { let s = registry.add_channel(addr(port)); - drive(&registry, &addr(port), &s, 200, 0); + drive(&registry, &s, 200, 0); } assert!(!bad.is_ejected()); } @@ -551,7 +545,7 @@ mod tests { let mut all = vec![]; for port in 8080..=8084 { let s = registry.add_channel(addr(port)); - 
drive(&registry, &addr(port), &s, 0, 100); + drive(&registry, &s, 0, 100); all.push(s); } for s in &all { @@ -567,16 +561,15 @@ mod tests { let mut all = vec![]; for port in 8080..=8084 { - let a = addr(port); - let s = registry.add_channel(a.clone()); - all.push((a, s)); + let s = registry.add_channel(addr(port)); + all.push(s); } // Drive all hosts to bad state in parallel pseudo-order. - for (a, s) in &all { - drive(&registry, a, s, 0, 100); + for s in &all { + drive(&registry, s, 0, 100); } - let ejected = all.iter().filter(|(_, s)| s.is_ejected()).count(); + let ejected = all.iter().filter(|s| s.is_ejected()).count(); // 5 hosts × 20% = 1 max ejection. assert_eq!(ejected, 1); } @@ -587,11 +580,11 @@ mod tests { let mut all = vec![]; for port in 8080..=8083 { let s = registry.add_channel(addr(port)); - drive(&registry, &addr(port), &s, 100, 0); + drive(&registry, &s, 100, 0); all.push(s); } let bad = registry.add_channel(addr(8084)); - drive(&registry, &addr(8084), &bad, 0, 100); + drive(&registry, &bad, 0, 100); assert!(bad.is_ejected()); assert_eq!(registry.ejected_count.load(Ordering::Relaxed), 1); // Each healthy host crossed request_volume; bad too. So @@ -610,9 +603,9 @@ mod tests { let bad = registry.add_channel(addr(8084)); for port in 8080..=8083 { let s = registry.add_channel(addr(port)); - drive(&registry, &addr(port), &s, 100, 0); + drive(&registry, &s, 100, 0); } - drive(&registry, &addr(8084), &bad, 10, 90); + drive(&registry, &bad, 10, 90); // Eject dispatched exactly once via the mpsc. 
assert_eq!(rx.try_recv(), Ok(addr(8084))); @@ -629,7 +622,7 @@ mod tests { let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); for port in 8080..=8083 { let s = registry.add_channel(addr(port)); - drive(&registry, &addr(port), &s, 100, 0); + drive(&registry, &s, 100, 0); } assert_eq!(registry.qualifying_count.load(Ordering::Relaxed), 4); @@ -821,7 +814,7 @@ mod tests { #[test] fn channel_state_records_and_resets() { - let s = OutlierChannelState::new(); + let s = OutlierChannelState::new(addr(8080)); s.record_success(); s.record_success(); s.record_failure(); @@ -831,7 +824,7 @@ mod tests { #[test] fn channel_state_try_eject_uneject_transitions_atomically() { - let s = OutlierChannelState::new(); + let s = OutlierChannelState::new(addr(8080)); assert!(!s.is_ejected()); assert!(s.try_eject(Instant::now())); assert!(s.is_ejected()); @@ -841,4 +834,10 @@ mod tests { assert!(!s.is_ejected()); assert!(!s.try_uneject()); } + + #[test] + fn channel_state_remembers_its_address() { + let s = OutlierChannelState::new(addr(9090)); + assert_eq!(s.addr(), &addr(9090)); + } } From 7c903eb9b5936aaab1623d4e6d735ac200dab4bc Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Tue, 12 May 2026 11:36:23 -0700 Subject: [PATCH 30/33] docs(tonic-xds): trim outlier-detection doc comments Tighten doc comments across the outlier-detection module, the channel state machine, and the LB. Remove rationale-style narrative and references to past designs; keep API contracts, gRFC references, and non-obvious invariants. No code changes. 
--- .../src/client/loadbalance/channel_state.rs | 140 ++++-------- .../src/client/loadbalance/keyed_futures.rs | 4 +- .../src/client/loadbalance/loadbalancer.rs | 170 +++++---------- .../client/loadbalance/outlier_detection.rs | 205 ++++++------------ .../src/xds/resource/outlier_detection.rs | 32 +-- 5 files changed, 183 insertions(+), 368 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/channel_state.rs b/tonic-xds/src/client/loadbalance/channel_state.rs index 29db913c8..a4a57d4bb 100644 --- a/tonic-xds/src/client/loadbalance/channel_state.rs +++ b/tonic-xds/src/client/loadbalance/channel_state.rs @@ -60,9 +60,8 @@ impl EndpointCounters { } /// Read and zero both counters. The two swaps are not atomic against - /// each other — RPCs landing between them may bias the snapshot by - /// a small number of events, well below the precision of the - /// failure-percentage threshold. + /// each other; bias from in-flight RPCs is bounded and well below + /// the precision of the failure-percentage threshold. pub(crate) fn snapshot_and_reset(&self) -> (u64, u64) { let s = self.success.swap(0, Ordering::Relaxed); let f = self.failure.swap(0, Ordering::Relaxed); @@ -70,44 +69,29 @@ impl EndpointCounters { } } -/// Per-channel outlier-detection state, shared (via `Arc`) between -/// the data path (per-RPC outcome recording + threshold-based ejection), -/// the outlier-detection actor (interval-based housekeeping), and the -/// load balancer (consults `is_ejected` / `ejected_duration` on -/// reconnect). +/// Per-channel outlier-detection state, shared via `Arc` between the +/// data path (per-RPC outcome recording + threshold-based ejection), +/// the housekeeping actor, and the load balancer. /// -/// All mutable fields are atomics so the data path can mutate them -/// without locking. Ejection state is encoded in -/// [`Self::ejected_at_nanos`]: zero means not ejected, non-zero is the -/// nanos-since-epoch of the ejection's start. 
[`Self::try_eject`] / -/// [`Self::try_uneject`] use CAS to flip the field atomically and -/// report whether the transition fired (so callers can update -/// registry-level counters exactly once per transition). -/// -/// The `addr` field is set at construction and never changes, so -/// downstream callers (the registry's eject-mpsc dispatch in -/// particular) can recover the address from the state alone — no -/// need to thread `(addr, state)` pairs through the data path. +/// Ejection state is encoded in [`Self::ejected_at_nanos`]: zero means +/// not ejected, non-zero is the nanos-since-epoch of the ejection's +/// start. [`Self::try_eject`] / [`Self::try_uneject`] use CAS so callers +/// can update registry-level counters exactly once per transition. #[derive(Debug)] pub(crate) struct OutlierChannelState { - /// Endpoint address this state belongs to. Immutable for the - /// lifetime of the state object. addr: EndpointAddress, counters: EndpointCounters, - /// Whether this channel currently contributes to the registry's - /// `qualifying_count`. Set when `total` first reaches - /// `request_volume` in the current interval; cleared on counter - /// reset. + /// `true` while this channel is counted in the registry's + /// `qualifying_count` (i.e. has hit `request_volume` in the + /// current interval). is_qualifying: AtomicBool, - /// Number of times this channel has been ejected. Bumped on each - /// ejection; decremented (saturating) on each healthy interval. + /// Bumped on each ejection; decremented (saturating) on each + /// healthy interval. ejection_multiplier: AtomicU32, - /// `0` when not ejected. Otherwise nanos since [`Self::epoch`] of - /// the current ejection's start. Single source of truth for - /// "is this channel ejected right now?". + /// `0` when not ejected; otherwise nanos since [`Self::epoch`] of + /// the current ejection's start. ejected_at_nanos: AtomicU64, - /// Reference instant used as the origin for `ejected_at_nanos`. 
- /// Established at construction and never changes. + /// Origin for `ejected_at_nanos`. Set at construction. epoch: Instant, } @@ -136,10 +120,8 @@ impl OutlierChannelState { self.counters.record_failure(); } - /// Read the current counter values without resetting. Returns - /// `(success, failure)`. The two reads are not atomic against - /// each other but the difference is bounded by concurrent in-flight - /// RPCs and is below the precision of the failure-percentage check. + /// Returns `(success, failure)` without resetting. The two reads + /// are not atomic together; bias is bounded by in-flight RPCs. pub(crate) fn counters(&self) -> (u64, u64) { let s = self.counters.success.load(Ordering::Relaxed); let f = self.counters.failure.load(Ordering::Relaxed); @@ -151,9 +133,9 @@ impl OutlierChannelState { self.counters.snapshot_and_reset() } - /// Try to set `is_qualifying` to `true`. Returns `true` if this - /// call performed the false → true transition, so callers can - /// increment a registry-level counter exactly once per crossing. + /// Set `is_qualifying` to `true`. Returns `true` if this call + /// performed the false → true transition (so the caller can bump + /// the registry counter exactly once per crossing). pub(crate) fn mark_qualifying(&self) -> bool { !self.is_qualifying.swap(true, Ordering::AcqRel) } @@ -164,10 +146,8 @@ impl OutlierChannelState { } /// Atomically mark this channel as ejected starting at `now`. - /// Returns `true` if this call performed the not-ejected → - /// ejected transition (so callers can update registry-level - /// counters exactly once per ejection). Bumps the multiplier on - /// transition. + /// Returns `true` on the not-ejected → ejected transition and + /// bumps the multiplier; `false` if already ejected. pub(crate) fn try_eject(&self, now: Instant) -> bool { let nanos = now .saturating_duration_since(self.epoch) @@ -187,8 +167,8 @@ impl OutlierChannelState { true } - /// Atomically clear the ejection. 
Returns `true` if this call - /// performed the ejected → not-ejected transition. + /// Atomically clear the ejection. Returns `true` on the + /// ejected → not-ejected transition. pub(crate) fn try_uneject(&self) -> bool { self.ejected_at_nanos.swap(0, Ordering::AcqRel) != 0 } @@ -214,10 +194,8 @@ impl OutlierChannelState { self.ejection_multiplier.load(Ordering::Relaxed) } - /// Decrement the multiplier saturating at zero. Called by the - /// actor on healthy intervals and by `note_uneject` on un-ejection. - /// Uses `fetch_update` so the load-and-store is atomic against - /// concurrent `try_eject` (`fetch_add`) and other decrements. + /// Decrement the multiplier, saturating at zero. Atomic against + /// concurrent `try_eject` and other decrements. pub(crate) fn decrement_multiplier(&self) { let _ = self .ejection_multiplier @@ -226,8 +204,8 @@ impl OutlierChannelState { }); } - /// Test-only setter for the ejection multiplier; lets tests drive - /// housekeeping behavior without going through `try_eject`. + /// Test-only multiplier setter for driving housekeeping without + /// going through `try_eject`. #[cfg(test)] pub(crate) fn set_ejection_multiplier(&self, value: u32) { self.ejection_multiplier.store(value, Ordering::Relaxed); @@ -245,10 +223,8 @@ pub(crate) struct EjectionConfig { /// Result of an ejection expiring. pub(crate) enum UnejectedChannel { - /// The channel is ready to serve again (ejection expired, no - /// reconnect needed). The consumer wraps the bare service into a - /// [`ReadyChannel`] using the registry-supplied - /// [`OutlierChannelState`]. + /// Connection reused; the caller wraps the service back into a + /// [`ReadyChannel`]. Ready(S), /// A fresh connection has been started. Connecting(ConnectingChannel), @@ -284,14 +260,10 @@ impl IdleChannel { /// A channel that is in the process of connecting. /// -/// Implements [`Future`] -- resolves to the connected service `S` -/// when the connection completes. 
The consumer wraps that into a -/// [`ReadyChannel`] (attaching its [`OutlierChannelState`]). -/// Cancellation is handled externally via [`KeyedFutures::cancel`]. -/// -/// `ConnectingChannel` deliberately does not carry an -/// [`OutlierChannelState`]: it does not serve traffic, so it has -/// nothing to count or signal. +/// `impl Future` — resolves to the connected service when +/// the connection completes. The caller wraps the resolved service +/// into a [`ReadyChannel`]. Cancellation is handled externally via +/// [`KeyedFutures::cancel`]. /// /// [`KeyedFutures::cancel`]: crate::client::loadbalance::keyed_futures::KeyedFutures::cancel pub(crate) struct ConnectingChannel { @@ -299,9 +271,6 @@ pub(crate) struct ConnectingChannel { } impl ConnectingChannel { - /// Start a connection. The address is kept by the caller (it is - /// typically the key in a `KeyedFutures` map); only the future is - /// stored here. pub(crate) fn new(fut: BoxFuture, _addr: EndpointAddress) -> Self { Self { inner: fut } } @@ -322,11 +291,8 @@ impl Future for ConnectingChannel { /// A channel that is connected and ready to serve requests. /// /// Holds the raw service `S` and delegates [`Service`] calls directly, -/// preserving `S::Future` and `S::Error` with no wrapping or type -/// erasure. The `Arc` is shared with the outlier- -/// detection actor for stats accumulation and edge-triggered ejection; -/// because only `ReadyChannel` serves traffic, only `ReadyChannel` -/// carries this state. +/// preserving `S::Future` and `S::Error`. Shares +/// [`OutlierChannelState`] with the outlier-detection actor via `Arc`. #[derive(Clone)] pub(crate) struct ReadyChannel { addr: EndpointAddress, @@ -335,8 +301,6 @@ pub(crate) struct ReadyChannel { } impl ReadyChannel { - /// Wrap a connected service `S` into a [`ReadyChannel`] using the - /// caller-supplied outlier state. 
pub(crate) fn new(addr: EndpointAddress, inner: S, outlier: Arc) -> Self { Self { addr, @@ -350,10 +314,8 @@ impl ReadyChannel { &self.outlier } - /// Eject this channel (e.g., due to outlier detection). Consumes - /// self. The outlier state remains in the registry; only the - /// service and address are passed into [`EjectedChannel`] (which - /// just times the cooldown). + /// Eject this channel. Consumes self; the outlier state remains + /// in the registry. pub(crate) fn eject(self, config: EjectionConfig, connector: Arc) -> EjectedChannel where C: Connector + Send + Sync + 'static, @@ -368,9 +330,8 @@ impl ReadyChannel { } } - /// Start reconnecting. Consumes self, dropping the old connection. - /// The outlier state remains in the registry; the consumer - /// re-attaches it when the new [`ReadyChannel`] is constructed. + /// Drop the connection and start a fresh connect for the same + /// address. The outlier state remains in the registry. pub(crate) fn reconnect>( self, connector: Arc, @@ -412,18 +373,13 @@ impl Load for ReadyChannel { // --------------------------------------------------------------------------- pin_project! { - /// A channel that has been ejected and is cooling down. - /// - /// The underlying connection is kept alive but cannot serve - /// requests. Implements [`Future`] -- resolves once the ejection - /// timer expires to either: - /// - [`UnejectedChannel::Ready`] if no reconnect is needed - /// - [`UnejectedChannel::Connecting`] if a fresh connection is required + /// A channel that has been ejected and is cooling down. The + /// underlying connection is kept alive but cannot serve requests. /// - /// `EjectedChannel` deliberately does not carry an - /// [`OutlierChannelState`]: the state lives in the registry, keyed - /// by address, and the consumer re-attaches it when the channel - /// transitions back to [`ReadyChannel`]. 
+ /// `impl Future>` — resolves when + /// `config.timeout` elapses, to [`UnejectedChannel::Ready`] if + /// `needs_reconnect` is false, otherwise + /// [`UnejectedChannel::Connecting`]. pub(crate) struct EjectedChannel { addr: EndpointAddress, inner: S, diff --git a/tonic-xds/src/client/loadbalance/keyed_futures.rs b/tonic-xds/src/client/loadbalance/keyed_futures.rs index c7f48aeaf..701ff865f 100644 --- a/tonic-xds/src/client/loadbalance/keyed_futures.rs +++ b/tonic-xds/src/client/loadbalance/keyed_futures.rs @@ -89,9 +89,7 @@ where self.futures.len() } - /// Returns true if a future is currently tracked for `key`. - /// Cancelled-but-not-yet-drained futures still count, since their - /// cancellation token entry is removed eagerly by [`Self::cancel`]. + /// True if a live (non-cancelled) future is tracked for `key`. pub(crate) fn contains_key(&self, key: &K) -> bool { self.cancellations.contains_key(key) } diff --git a/tonic-xds/src/client/loadbalance/loadbalancer.rs b/tonic-xds/src/client/loadbalance/loadbalancer.rs index f37218623..75c6dffb0 100644 --- a/tonic-xds/src/client/loadbalance/loadbalancer.rs +++ b/tonic-xds/src/client/loadbalance/loadbalancer.rs @@ -1,20 +1,16 @@ //! Load balancer tower service. //! -//! Receives endpoint updates via [`tower::discover::Discover`] (yielding -//! [`IdleChannel`]s), manages the connection lifecycle via the channel state -//! machine, and routes requests to ready endpoints via a [`ChannelPicker`]. +//! Receives endpoint updates via [`tower::discover::Discover`], +//! manages the connection lifecycle via the channel state machine, +//! and routes requests to ready endpoints via a [`ChannelPicker`]. //! -//! Outlier detection is integrated via an optional [`OutlierDetector`]. -//! Ejection decisions originate on the data path (per-RPC) and are -//! signaled to the LB via an mpsc channel. The LB consumes the named -//! [`ReadyChannel`] via [`ReadyChannel::eject`], obtaining an -//! 
[`EjectedChannel`] whose internal sleep fires exactly at -//! `base × multiplier` (capped by `max_ejection_time`); ejected -//! channels live in a second [`KeyedFutures`] (mirroring the existing -//! pattern for `ConnectingChannel`) until their timer yields -//! [`UnejectedChannel`], at which point the channel is routed back -//! into `ready` (`UnejectedChannel::Ready`) or `connecting` -//! (`UnejectedChannel::Connecting`). +//! Outlier detection (gRFC A50) is integrated via an optional +//! [`OutlierDetector`]. Eject requests arrive on an mpsc channel from +//! the data path; the LB consumes the matching [`ReadyChannel`] via +//! [`ReadyChannel::eject`] and tracks the resulting +//! [`EjectedChannel`] in [`Self::ejected`]. When the timer fires, the +//! resolved [`UnejectedChannel`] is routed back into `ready` or +//! `connecting`. //! //! [`EjectedChannel`]: crate::client::loadbalance::channel_state::EjectedChannel //! [`UnejectedChannel`]: crate::client::loadbalance::channel_state::UnejectedChannel @@ -40,10 +36,8 @@ use crate::client::loadbalance::outlier_detection::{ }; use crate::client::loadbalance::pickers::ChannelPicker; -/// Future returned by [`LoadBalancer::call`]. -/// -/// Either resolves immediately with an [`LbError`], or drives `poll_ready` + -/// `call` on the selected channel asynchronously. +/// Future returned by [`LoadBalancer::call`]. Either resolves +/// immediately with an [`LbError`] or drives the selected channel. pub(crate) enum LbFuture { Error(Option), Pending(Pin> + Send>>), @@ -74,28 +68,18 @@ impl Future for LbFuture { /// `C::Service` is the underlying service type held in ready channels. /// - `Req`: The request type. pub(crate) struct LoadBalancer { - /// Discovery stream providing endpoint additions/removals. discovery: D, - /// Connector for creating connections from idle channels. connector: Arc, - /// In-flight connection attempts, keyed by endpoint address. 
- /// `ConnectingChannel` resolves to the bare service; the LB wraps - /// it into a `ReadyChannel` with an outlier state when it - /// transitions to ready. + /// In-flight connection attempts. connecting: KeyedFutures, - /// Ready-to-serve channels, keyed by endpoint address. + /// Ready-to-serve channels. ready: IndexMap>, - /// Channels currently ejected by outlier detection. Each entry is - /// an [`EjectedChannel`] whose `Sleep` fires when the ejection - /// window expires; the resolved [`UnejectedChannel`] is drained in - /// `poll_ready` and routed back into `ready` (or `connecting` if - /// the underlying connection needs replacing). + /// Currently-ejected channels. Each entry is an + /// [`EjectedChannel`] whose `Sleep` fires when the ejection + /// window expires. ejected: KeyedFutures>, - /// Outlier-detection plumbing: shared registry, eject-signal - /// receiver, and the housekeeping actor handle. `None` disables - /// outlier detection. + /// `None` disables outlier detection. outlier: Option, - /// Channel picker for load balancing. picker: Arc, Req> + Send + Sync>, } @@ -112,20 +96,16 @@ where connector: Arc, picker: Arc, Req> + Send + Sync>, ) -> Self { - // Infallible: `with_outlier(_, _, _, None)` never touches the - // outlier-detection construction path. - match Self::with_outlier(discovery, connector, picker, None) { - Ok(lb) => lb, - Err(_) => unreachable!("with_outlier(.., None) cannot wire a registry"), - } + // Infallible: `with_outlier(.., None)` never wires a registry. + Self::with_outlier(discovery, connector, picker, None) + .expect("with_outlier(.., None) is infallible") } /// Create a load balancer, optionally enabling outlier detection. /// When `outlier` is `Some`, the registry's housekeeping actor is - /// spawned and its lifetime is bound to the load balancer. 
- /// Returns [`RegistryAlreadyWired`] if the provided registry has - /// already been wired to another load balancer — a registry's - /// eject-signal receiver is one-shot. + /// spawned and bound to this LB. Returns + /// [`RegistryAlreadyWired`] if the registry already drives + /// another LB. pub(crate) fn with_outlier( discovery: D, connector: Arc, @@ -144,10 +124,8 @@ where }) } - /// Purge all per-endpoint state for `addr`: the connecting - /// future, the ready slot, the ejected channel (if any), and the - /// outlier-detection registry entry. Used when discovery says the - /// endpoint is gone from the cluster. + /// Purge all state for `addr`, including the outlier-detection + /// registry entry. Called on `Change::Remove`. fn purge_endpoint(&mut self, addr: &EndpointAddress) { let _ = self.connecting.cancel(addr); self.ready.swap_remove(addr); @@ -157,33 +135,23 @@ where } } - /// Clear stale slots that held the old service (in-flight - /// connecting future, ready entry, ejected channel) but - /// **preserve** the outlier-detection registry entry — counters, - /// ejection multiplier, and ejection flag carry across the - /// reconnect. Used when discovery re-inserts an endpoint we - /// already track. - /// - /// This matches grpc-go and Envoy: outlier state is keyed by - /// stable endpoint identity and survives a transient discovery - /// flap, so a brief disappearance does not wipe what we already - /// know about the endpoint's health. + /// Clear stale connecting/ready/ejected slots for `addr` but + /// preserve the outlier-detection registry entry. Called on + /// `Change::Insert` so transient discovery flaps don't lose + /// counters or ejection state, matching grpc-go and Envoy. fn reset_active_slots(&mut self, addr: &EndpointAddress) { let _ = self.connecting.cancel(addr); self.ready.swap_remove(addr); let _ = self.ejected.cancel(addr); } - /// Drain pending discovery events. 
Either resolves to an error - /// ([`LbError::DiscoverClosed`] or [`LbError::DiscoverError`]) or stays - /// pending — there is no success outcome since the loop only exits on - /// pending or error. + /// Drain pending discovery events. Resolves to an error + /// ([`LbError::DiscoverClosed`] or [`LbError::DiscoverError`]) + /// or stays pending — there is no success outcome. fn poll_discover(&mut self, cx: &mut Context<'_>) -> Poll { loop { match ready!(Pin::new(&mut self.discovery).poll_discover(cx)) { None => { - // tower::discover::Discover::poll_discover() returns Ready(None) when the - // discover object is closed, as indicated by Stream trait. tracing::error!("discover object is closed"); return Poll::Ready(LbError::DiscoverClosed); } @@ -202,15 +170,10 @@ where } } - /// Drain completed connection futures. Wraps each bare service - /// into a `ReadyChannel` using the outlier state from the - /// registry (or a fresh state if outlier detection is disabled). - /// - /// If the preserved outlier state for a re-discovered endpoint - /// says it is still ejected, the new channel is re-ejected with - /// the *remaining* ejection time so the ongoing backoff is - /// honored. If the deadline has already passed, the channel is - /// un-ejected immediately and routed to `ready`. + /// Drain completed connection futures. If the outlier state for + /// a re-discovered endpoint is still ejected, the new channel is + /// re-ejected for the *remaining* duration; if the deadline has + /// already passed, it is un-ejected and routed to `ready`. fn poll_connecting(&mut self, cx: &mut Context<'_>) { while let Poll::Ready(Some((addr, svc))) = self.connecting.poll_next(cx) { let state = match self.outlier.as_ref() { @@ -226,10 +189,9 @@ where } } - /// Route a freshly-connected `ReadyChannel` into the right pool - /// based on the preserved outlier state's `remaining` ejection - /// duration. 
Factored out so `poll_connecting` stays terse and - /// the three cases (fresh, mid-eject, past-deadline) are visible. + /// Route a freshly-connected `ReadyChannel` based on its + /// preserved outlier state: `None` → ready; `Some(0)` → un-eject + /// then ready; `Some(d)` → ejected for `d`. fn place_after_connect( &mut self, addr: EndpointAddress, @@ -261,10 +223,9 @@ where } /// Drain eject requests from the outlier detector's mpsc and - /// transition the named `ReadyChannel`s into ejected ones. The - /// per-channel ejection state has already been flipped by - /// `record_outcome`; this step is the visible transition on the - /// LB side. + /// move each named `ReadyChannel` into [`Self::ejected`]. The + /// per-channel ejection flag has already been set by + /// `record_outcome`. fn poll_eject_requests(&mut self, cx: &mut Context<'_>) { loop { let Some(o) = self.outlier.as_mut() else { @@ -275,10 +236,8 @@ where _ => return, }; let registry = o.registry().clone(); - // The eject signal arrives once `try_eject` has flipped - // the channel's state and the cluster-wide - // `ejected_count`. If the channel is no longer in `ready` - // (e.g. discovery removed it), there's nothing to do. + // Channel may have been removed by discovery in the + // meantime; if so, nothing to eject. let Some(ch) = self.ready.swap_remove(&addr) else { continue; }; @@ -296,24 +255,21 @@ where let _ = self.ejected.add(addr, ejected); } Some(_) => { - // Deadline already past — un-eject immediately. + // Deadline already past — un-eject. registry.note_uneject(&state); self.ready.insert(addr, ch); } None => { - // State is no longer ejected (concurrent uneject?) — restore. + // No longer ejected (raced with un-eject). self.ready.insert(addr, ch); } } } } - /// Drain completed `EjectedChannel` timers. 
Each yields either an - /// `UnejectedChannel::Ready(svc)` (timer expired, reuse the - /// connection) or `UnejectedChannel::Connecting(future)` (timer - /// expired but a fresh connect was requested). The address's - /// outlier state is cleared and the channel is routed back into - /// `ready` or `connecting` accordingly. + /// Drain completed `EjectedChannel` timers. Clears the + /// outlier state and routes the resolved channel back into + /// `ready` or `connecting`. fn poll_unejection(&mut self, cx: &mut Context<'_>) { while let Poll::Ready(Some((addr, unejected))) = self.ejected.poll_next(cx) { let state = match self.outlier.as_ref() { @@ -329,10 +285,9 @@ where let ready = ReadyChannel::new(addr.clone(), svc, state); self.ready.insert(addr, ready); } + // `needs_reconnect = false` for A50; this arm is + // reserved for future policies. UnejectedChannel::Connecting(future) => { - // `needs_reconnect = false` for A50, so this arm - // is unused today; handle it for completeness in - // case a future policy sets it. let _ = self.connecting.add(addr, future); } } @@ -357,10 +312,8 @@ where fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll> { let discover_result = self.poll_discover(cx); - // Drain un-ejection completions BEFORE servicing eject requests - // so a freshly un-ejected channel can immediately serve traffic - // (and so cluster-wide `ejected_count` is current when the next - // eject is evaluated). + // Un-ejections before ejections so `ejected_count` is current + // when the next eject is evaluated. self.poll_unejection(cx); self.poll_connecting(cx); self.poll_eject_requests(cx); @@ -369,15 +322,13 @@ where return Poll::Ready(Ok(())); } - // No ready endpoints. Check if we should fail fast. + // No ready endpoints. Fail fast iff discovery is closed and + // nothing else can produce one. 
match discover_result { Poll::Ready(LbError::DiscoverClosed) if self.connecting.len() == 0 => { - // Discovery is closed and nothing is connecting — no progress is possible. Poll::Ready(Err(LbError::Stagnation)) } Poll::Ready(e) => { - // Other discovery errors (or DiscoverClosed with connecting in flight) - // are non-fatal — log and stay pending. tracing::warn!("discovery yielded error: {e}"); Poll::Pending } @@ -395,9 +346,8 @@ where let Some(picked) = self.picker.pick(&req, &self.ready) else { return LbFuture::Error(Some(LbError::Unavailable)); }; - // `picked` is a read-only borrow into `self.ready`. Clone to get - // an owned service and outlier handle for the async block; both - // are `Arc`-shared, so cloning is cheap. + // Cheap clones (all Arc-shared internals) so the async block + // can take ownership without holding the picker borrow. let mut svc = picked.clone(); let outlier_state = picked.outlier().clone(); let registry = self.outlier.as_ref().map(|o| o.registry().clone()); @@ -407,10 +357,6 @@ where .map_err(|e| LbError::LbChannelPollReadyError(e.into()))?; let result = svc.call(req).await; if let Some(registry) = registry.as_ref() { - // Per-RPC outlier detection: bump the channel's - // counter and (inside `record_outcome`) possibly - // dispatch an eject request to the LB. Treat any - // `Err` outcome as a failure for outlier purposes. registry.record_outcome(&outlier_state, result.is_ok()); } result.map_err(|e| LbError::LbChannelCallError(e.into())) diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index 6e1bed53b..df3e78796 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -1,39 +1,31 @@ -//! gRFC A50 outlier detection. +//! [gRFC A50] outlier detection. //! -//! The algorithm is split between the data path, the load balancer, -//! and a spawned actor: +//! Work is split across three sites: //! 
-//! - **Per-RPC detection** runs inline on each call completion via -//! [`OutlierStatsRegistry::record_outcome`]. The wrapper records the -//! outcome on the channel's [`OutlierChannelState`], evaluates the -//! failure-percentage threshold, and on transition to ejected sends -//! the address through an mpsc channel for the LB to consume. -//! Cluster-wide gates (`minimum_hosts`, `max_ejection_percent`) are -//! enforced via two atomic counters on the registry, kept in sync -//! as channels cross thresholds. -//! - **The load balancer** drains the eject mpsc in `poll_ready`, +//! - **Data path** ([`OutlierStatsRegistry::record_outcome`]): runs +//! inline per RPC. Updates per-channel counters, applies the +//! failure-percentage gate, and on transition to ejected sends the +//! address through an mpsc channel. +//! - **Load balancer**: drains the eject mpsc in `poll_ready`, //! consumes the matching [`ReadyChannel`] via //! [`ReadyChannel::eject`], and tracks the resulting //! [`EjectedChannel`] in a `KeyedFutures`. Each ejected channel's -//! internal sleep fires at exactly `base × multiplier` (capped by -//! `max_ejection_time`) after ejection, yielding -//! [`UnejectedChannel::Ready`]; the LB drains it on the next -//! `poll_ready` and routes the channel back to the ready set. -//! - **Interval-based housekeeping** runs in a spawned actor (see -//! [`spawn_actor`]). It resets per-channel counters at the -//! `config.interval` boundary and decrements multipliers for -//! non-ejected channels. Un-ejection is timer-driven by -//! [`EjectedChannel`] — the actor never un-ejects. +//! sleep fires at `base × multiplier` (capped by +//! `max_ejection_time`); the LB then routes the resolved +//! [`UnejectedChannel`] back into the ready set. +//! - **Housekeeping actor** ([`spawn_actor`]): on each +//! `config.interval` tick, resets counters and decrements +//! multipliers for non-ejected channels. The actor never ejects or +//! un-ejects. //! -//! 
Only the failure-percentage algorithm is dispatched. The -//! success-rate algorithm (cross-endpoint mean/stdev) is left to a -//! follow-up. +//! Only the failure-percentage algorithm is implemented; success-rate +//! (cross-endpoint mean/stdev) is left to a follow-up. //! //! [gRFC A50]: https://github.com/grpc/proposal/blob/master/A50-xds-outlier-detection.md //! [`ReadyChannel`]: crate::client::loadbalance::channel_state::ReadyChannel //! [`ReadyChannel::eject`]: crate::client::loadbalance::channel_state::ReadyChannel::eject //! [`EjectedChannel`]: crate::client::loadbalance::channel_state::EjectedChannel -//! [`UnejectedChannel::Ready`]: crate::client::loadbalance::channel_state::UnejectedChannel::Ready +//! [`UnejectedChannel`]: crate::client::loadbalance::channel_state::UnejectedChannel use std::sync::Arc; use std::sync::Mutex; @@ -49,10 +41,8 @@ use crate::client::loadbalance::channel_state::OutlierChannelState; use crate::common::async_util::AbortOnDrop; use crate::xds::resource::outlier_detection::OutlierDetectionConfig; -/// Construction-time error returned when a single -/// [`OutlierStatsRegistry`] is wired to more than one load balancer. -/// The registry's eject-signal receiver is one-shot; reuse is not -/// supported. +/// Returned when an [`OutlierStatsRegistry`] is handed to a second +/// load balancer. The eject-signal receiver is one-shot. #[derive(Debug, thiserror::Error)] #[error("OutlierStatsRegistry is already wired to a LoadBalancer")] pub(crate) struct RegistryAlreadyWired; @@ -73,34 +63,23 @@ impl Rng for FastRandRng { } /// Shared outlier-detection state, owned by `Arc` and accessed -/// concurrently by: -/// - The load balancer's call wrapper, which calls -/// [`Self::record_outcome`] after each RPC completion. -/// - The spawned actor task, which calls [`Self::run_housekeeping`] -/// on every `config.interval` tick. 
-/// - The load balancer's `poll_ready`, which drains the eject mpsc -/// (via [`OutlierDetector::poll_eject_request`]) and calls -/// [`Self::note_uneject`] when an `EjectedChannel`'s timer fires. +/// concurrently by the data path ([`Self::record_outcome`]), the +/// housekeeping actor ([`Self::run_housekeeping`]), and the load +/// balancer ([`Self::note_uneject`], [`Self::remaining_ejection`]). pub(crate) struct OutlierStatsRegistry { - /// Per-endpoint state, keyed by address. Inserted by the LB on - /// channel creation and removed on disconnect. channels: DashMap>, - /// Number of channels currently with `total >= request_volume` in - /// the active interval. Drives the `minimum_hosts` gate. + /// Channels with `total >= request_volume` in the active + /// interval. Drives the `minimum_hosts` gate. qualifying_count: AtomicU64, - /// Number of channels currently ejected. Drives the + /// Channels currently ejected. Drives the /// `max_ejection_percent` cap. ejected_count: AtomicU64, config: OutlierDetectionConfig, rng: Box, - /// Sender half of the eject signal. `record_outcome` pushes an - /// address through on transition to ejected; the LB's - /// [`OutlierDetector`] drains the receiver in `poll_ready` and - /// consumes the matching `ReadyChannel`. + /// Sender half of the eject signal. The receiver is owned by the + /// LB's [`OutlierDetector`]. eject_tx: mpsc::UnboundedSender, - /// Receiver half, handed to the LB at construction time. Wrapped - /// in a `Mutex>` so [`Self::take_eject_rx`] can move it - /// out exactly once. Outside that hand-off there is no contention. + /// Receiver moved out exactly once by [`Self::take_eject_rx`]. eject_rx: Mutex>>, } @@ -124,10 +103,9 @@ impl OutlierStatsRegistry { }) } - /// Take the eject-signal receiver. Called exactly once by - /// [`OutlierDetector::new`]. Returns - /// [`RegistryAlreadyWired`] if a previous call has already taken - /// the receiver — a registry can drive at most one load balancer. 
+ /// Take the eject-signal receiver. Returns + /// [`RegistryAlreadyWired`] on a second call — a registry can + /// drive at most one load balancer. fn take_eject_rx( &self, ) -> Result, RegistryAlreadyWired> { @@ -138,11 +116,8 @@ impl OutlierStatsRegistry { .ok_or(RegistryAlreadyWired) } - /// Register a channel and return the `Arc` - /// the load balancer wires into the channel; the same `Arc` is - /// retained in the registry so the actor can iterate it. If a - /// state for this address already exists, returns it untouched — - /// state continuity across reconnect cycles is preserved. + /// Get or create the state for `addr`. Idempotent — existing + /// state is preserved across reconnect. pub(crate) fn add_channel(&self, addr: EndpointAddress) -> Arc { self.channels .entry(addr.clone()) @@ -150,9 +125,8 @@ impl OutlierStatsRegistry { .clone() } - /// Forget a channel. Drops the registry's reference; cluster-wide - /// counters are decremented if the channel was qualifying or - /// ejected. + /// Drop the state for `addr`, decrementing cluster-wide counters + /// (`qualifying_count`, `ejected_count`) if it was contributing. pub(crate) fn remove_channel(&self, addr: &EndpointAddress) { if let Some((_, state)) = self.channels.remove(addr) { if state.clear_qualifying() { @@ -169,12 +143,9 @@ impl OutlierStatsRegistry { self.channels.len() } - /// Per-RPC entry point. Called by the load balancer's call wrapper - /// after each RPC completion. Increments the channel's success or - /// failure counter and then evaluates the failure-percentage - /// threshold; if all gates pass and the channel was not already - /// ejected, marks it ejected and sends the address through the - /// eject mpsc for the LB to consume. + /// Per-RPC entry point. Records the outcome and, if all gates + /// pass, transitions the channel to ejected and dispatches the + /// address on the eject mpsc. 
pub(crate) fn record_outcome(&self, state: &OutlierChannelState, success: bool) { if success { state.record_success(); @@ -190,9 +161,8 @@ impl OutlierStatsRegistry { let total = s + f; let request_volume = u64::from(fp.request_volume); - // Track when each channel first qualifies in the current - // interval, so the `minimum_hosts` gate can be checked with a - // single atomic load. + // Bump `qualifying_count` exactly once per channel per + // interval so the `minimum_hosts` gate is a single atomic load. if total >= request_volume && state.mark_qualifying() { self.qualifying_count.fetch_add(1, Ordering::Relaxed); } @@ -221,32 +191,19 @@ impl OutlierStatsRegistry { if state.try_eject(Instant::now()) { self.ejected_count.fetch_add(1, Ordering::Relaxed); - // The LB drains this in `poll_ready` and consumes the - // `ReadyChannel` via `ReadyChannel::eject`. If the LB has - // dropped its receiver (shutdown), the send fails silently - // — the channel will be cleaned up by `forget`. + // Send failure (LB receiver dropped during shutdown) is + // ignored; the registry will be torn down momentarily. let _ = self.eject_tx.send(state.addr().clone()); } } - /// Clear the ejection on `state`, decrement the cluster-wide - /// `ejected_count`, and decrement the channel's ejection - /// multiplier (matching gRFC A50 step 6.b, which decrements - /// multiplier in the same sweep that un-ejects). Returns whether - /// the transition fired (so callers can guard against - /// double-counting). Called by the LB when an `EjectedChannel`'s - /// timer fires and yields `UnejectedChannel::Ready`. + /// Clear the ejection: flip the state, decrement + /// `ejected_count`, and decrement the multiplier (gRFC A50 + /// step 6.b: same sweep that un-ejects also decrements). Returns + /// `true` on the ejected → not-ejected transition. 
pub(crate) fn note_uneject(&self, state: &OutlierChannelState) -> bool { if state.try_uneject() { self.ejected_count.fetch_sub(1, Ordering::Relaxed); - // Per A50, the same sweep that un-ejects also decrements - // the multiplier. Since our un-ejection is timer-driven - // (decoupled from the housekeeping sweep), we apply the - // decrement here to avoid a window where a re-eject would - // see a stale (one-higher) multiplier and back off too - // aggressively. The actor's housekeeping decrement still - // runs at each interval; saturating arithmetic ensures - // the eventual decrement to zero stays correct. state.decrement_multiplier(); true } else { @@ -254,12 +211,10 @@ impl OutlierStatsRegistry { } } - /// Compute how long `state` still has to remain ejected, or - /// `None` if it is not currently ejected. Returns - /// `Some(Duration::ZERO)` if the deadline has already passed - /// (caller should un-eject immediately rather than starting a - /// fresh sleep). Used by the LB on initial ejection and on - /// re-discovery to size the `EjectionConfig::timeout`. + /// Time remaining on `state`'s ejection (capped by + /// `max_ejection_time`). `None` if not ejected; + /// `Some(Duration::ZERO)` if the deadline has passed (caller + /// should un-eject rather than start a fresh sleep). pub(crate) fn remaining_ejection( &self, state: &OutlierChannelState, @@ -280,39 +235,31 @@ impl OutlierStatsRegistry { Some(target.checked_sub(elapsed).unwrap_or_default()) } - /// Interval-boundary housekeeping. Called by the spawned actor on - /// each `config.interval` tick. Resets counters and decrements - /// multipliers for non-ejected channels. Does **not** un-eject — - /// un-ejection is timer-driven by each `EjectedChannel` and - /// handled by the LB when the channel resolves. + /// Interval-boundary housekeeping. Resets counters and + /// decrements multipliers for non-ejected channels. Does not + /// un-eject — that is driven by each `EjectedChannel`'s timer. 
pub(crate) fn run_housekeeping(&self) { for entry in self.channels.iter() { let state = entry.value(); - - // Reset counters; clear `is_qualifying` and adjust the - // registry-level counter in lockstep. state.snapshot_and_reset(); if state.clear_qualifying() { self.qualifying_count.fetch_sub(1, Ordering::Relaxed); } - if !state.is_ejected() { state.decrement_multiplier(); } } } - /// `max_ejection_percent` resolved against the current channel - /// count. Updated as channels come and go. + /// Resolve `max_ejection_percent` against the current channel count. fn max_ejections(&self) -> u64 { self.channels.len() as u64 * u64::from(self.config.max_ejection_percent.get()) / 100 } } -/// Spawn the housekeeping actor. The task ticks every -/// `config.interval` and calls -/// [`OutlierStatsRegistry::run_housekeeping`]. Dropping the returned -/// [`AbortOnDrop`] stops the task. +/// Spawn the housekeeping actor. Ticks every `config.interval` and +/// calls [`OutlierStatsRegistry::run_housekeeping`]. Dropping the +/// returned [`AbortOnDrop`] stops the task. pub(crate) fn spawn_actor(registry: Arc) -> AbortOnDrop { let interval = registry.config.interval; let task = tokio::spawn(async move { @@ -326,17 +273,9 @@ pub(crate) fn spawn_actor(registry: Arc) -> AbortOnDrop { AbortOnDrop(task) } -/// Per-LB outlier-detection plumbing: the shared registry, the -/// receiver half of the eject signal mpsc, and the handle to the -/// housekeeping actor (dropped with the LB). -/// -/// `LoadBalancer` holds this as `Option`: `None` -/// when outlier detection is disabled, `Some` when enabled. The -/// pool of ejected channels themselves lives directly on the LB in a -/// `KeyedFutures<_, UnejectedChannel<_>>` — see the channel state -/// machine in [`channel_state`] for the type-state transitions. 
-/// -/// [`channel_state`]: crate::client::loadbalance::channel_state +/// Per-LB outlier-detection plumbing: shared registry, eject-signal +/// receiver, and the housekeeping actor handle (aborted on drop). The +/// LB holds this as `Option`. pub(crate) struct OutlierDetector { registry: Arc, eject_rx: mpsc::UnboundedReceiver, @@ -344,11 +283,10 @@ pub(crate) struct OutlierDetector { } impl OutlierDetector { - /// Build from a registry, spawning the housekeeping actor and - /// taking ownership of the eject-signal receiver. Returns - /// [`RegistryAlreadyWired`] if the registry's receiver has - /// already been taken (i.e. this registry is already driving - /// another load balancer); a registry can drive at most one LB. + /// Take ownership of the registry's eject-signal receiver and + /// spawn the housekeeping actor. Returns + /// [`RegistryAlreadyWired`] if the registry is already wired to + /// another LB. pub(crate) fn new(registry: Arc) -> Result { let eject_rx = registry.take_eject_rx()?; let _actor = spawn_actor(registry.clone()); @@ -364,11 +302,7 @@ impl OutlierDetector { &self.registry } - /// Poll for the next address whose data path has decided to - /// eject. Returns `Poll::Pending` when no eject decision is - /// queued; returns `Poll::Ready(None)` only if the registry has - /// been dropped (which can't happen while this detector holds an - /// `Arc`). + /// Poll for the next address the data path has decided to eject. pub(crate) fn poll_eject_request( &mut self, cx: &mut Context<'_>, @@ -749,12 +683,9 @@ mod tests { assert_eq!(s.ejection_multiplier(), 0); } - /// Re-ejecting a channel immediately after un-ejection should - /// produce a backoff sized for multiplier=1, not multiplier=2 — - /// i.e. it should *not* punish the channel for the previous - /// ejection that has just finished serving its cooldown. This is - /// what gRFC A50 prescribes and what Envoy does (un-eject and - /// decrement happen at the same sweep). 
+ /// A50 step 6.b: un-eject and multiplier decrement happen at the + /// same sweep. Re-eject right after un-eject must size the + /// backoff with the *decremented* multiplier. #[test] fn re_eject_after_uneject_uses_fresh_multiplier() { let mut config = fp_config(50, 10, 3); diff --git a/tonic-xds/src/xds/resource/outlier_detection.rs b/tonic-xds/src/xds/resource/outlier_detection.rs index 159ff7735..970232bea 100644 --- a/tonic-xds/src/xds/resource/outlier_detection.rs +++ b/tonic-xds/src/xds/resource/outlier_detection.rs @@ -2,32 +2,19 @@ //! //! [`OutlierDetectionConfig`] is the input to the outlier-detection //! algorithm. The two sub-configs gate which ejection algorithms run. -//! -//! Note: A50 specifies outlier detection as a load-balancing policy -//! wrapping a `child_policy`. `tonic-xds` currently runs P2C as its -//! only load balancer, so there is no `child_policy` field here yet — -//! it will be added when more balancers are supported. Integration -//! with the data path is via an mpsc channel of ejection decisions -//! polled by the [`LoadBalancer`] tower service, which marks the -//! corresponding [`ReadyChannel`] as ejected via [`EjectedChannel`]. +//! The `child_policy` field from A50 is not modeled — `tonic-xds` +//! currently runs P2C as its only load balancer. //! //! [gRFC A50]: https://github.com/grpc/proposal/blob/master/A50-xds-outlier-detection.md -//! [`LoadBalancer`]: crate::client::loadbalance::loadbalancer::LoadBalancer -//! [`ReadyChannel`]: crate::client::loadbalance::channel_state::ReadyChannel -//! [`EjectedChannel`]: crate::client::loadbalance::channel_state::EjectedChannel use std::time::Duration; -/// A 0–100 percentage. Construction is fallible; once held, every -/// `Percentage` is guaranteed to be in range, so the algorithm never -/// has to re-validate. +/// A 0–100 percentage, validated at construction. 
#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub(crate) struct Percentage(u8); impl Percentage { /// Construct from a raw value, returning `Err` if it exceeds 100. - /// Accepts `u32` to match the proto wire type without forcing callers - /// to cast at every site. pub(crate) fn new(value: u32) -> Result { if value > 100 { Err(PercentageError(value)) @@ -73,9 +60,8 @@ pub(crate) struct SuccessRateConfig { /// An endpoint is a candidate for ejection when its success rate falls /// below `mean - stdev * (stdev_factor / 1000.0)`. pub stdev_factor: u32, - /// Probability that a flagged candidate is actually ejected — *not* - /// the success-rate threshold (which is derived from `stdev_factor`). - /// Set to 0 to disable enforcement while still computing statistics. + /// Probability that a flagged candidate is actually ejected. + /// Set to 0 to compute statistics without enforcing. pub enforcing_success_rate: Percentage, /// Minimum number of candidate endpoints required to run the algorithm. pub minimum_hosts: u32, @@ -90,9 +76,8 @@ pub(crate) struct FailurePercentageConfig { /// Failure rate at or above which an endpoint is a candidate for /// ejection. pub threshold: Percentage, - /// Probability that a flagged candidate is actually ejected — *not* - /// the failure-rate threshold (that is `threshold` above). Set to 0 - /// to disable enforcement while still computing statistics. + /// Probability that a flagged candidate is actually ejected. + /// Set to 0 to compute statistics without enforcing. pub enforcing_failure_percentage: Percentage, /// Minimum number of candidate endpoints required to run the algorithm. pub minimum_hosts: u32, @@ -102,8 +87,7 @@ pub(crate) struct FailurePercentageConfig { } impl OutlierDetectionConfig { - /// True when at least one ejection algorithm is enabled and the detector - /// should do work. If false, the cluster can skip instantiating detection. + /// True when at least one ejection algorithm is enabled. 
pub(crate) fn is_enabled(&self) -> bool { self.success_rate.is_some() || self.failure_percentage.is_some() } From f4e1e8c5ca12d77ebf6382f2db920a82d6e77c53 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Tue, 12 May 2026 13:49:37 -0700 Subject: [PATCH 31/33] refactor(tonic-xds): UnejectedChannel::Ready carries a full ReadyChannel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pass the outlier state through `ReadyChannel::eject` → `EjectedChannel` so its `Future::poll` can yield `UnejectedChannel::Ready(ReadyChannel)` with the state already reattached, instead of `Ready(S)` and asking the LB to rebuild. Symmetric ends for the `Ready ↔ Ejected` transition (both speak `ReadyChannel`), and `poll_unejection`'s ready arm drops its state-lookup + `ReadyChannel::new` rebuild — it just calls `note_uneject(ready.outlier())` and inserts. The `Connecting` arm stays asymmetric since the fresh connect produces a bare service. --- .../src/client/loadbalance/channel_state.rs | 20 ++++++++++++----- .../src/client/loadbalance/loadbalancer.rs | 22 +++++++++---------- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/channel_state.rs b/tonic-xds/src/client/loadbalance/channel_state.rs index a4a57d4bb..398ef13a4 100644 --- a/tonic-xds/src/client/loadbalance/channel_state.rs +++ b/tonic-xds/src/client/loadbalance/channel_state.rs @@ -223,9 +223,9 @@ pub(crate) struct EjectionConfig { /// Result of an ejection expiring. pub(crate) enum UnejectedChannel { - /// Connection reused; the caller wraps the service back into a - /// [`ReadyChannel`]. - Ready(S), + /// Cooldown elapsed; the original connection is reused with its + /// outlier state reattached. + Ready(ReadyChannel), /// A fresh connection has been started. Connecting(ConnectingChannel), } @@ -314,8 +314,9 @@ impl ReadyChannel { &self.outlier } - /// Eject this channel. 
Consumes self; the outlier state remains - /// in the registry. + /// Eject this channel. Consumes self; the outlier state is moved + /// into the [`EjectedChannel`] so it can be reattached to the + /// [`ReadyChannel`] produced when the cooldown elapses. pub(crate) fn eject(self, config: EjectionConfig, connector: Arc) -> EjectedChannel where C: Connector + Send + Sync + 'static, @@ -324,6 +325,7 @@ impl ReadyChannel { EjectedChannel { addr: self.addr, inner: self.inner, + outlier: self.outlier, config, connector, ejection_timer, @@ -383,6 +385,7 @@ pin_project! { pub(crate) struct EjectedChannel { addr: EndpointAddress, inner: S, + outlier: Arc, config: EjectionConfig, connector: Arc + Send + Sync>, #[pin] @@ -404,7 +407,12 @@ impl Future for EjectedChannel { this.addr.clone(), ))) } else { - Poll::Ready(UnejectedChannel::Ready(this.inner.clone())) + let ready = ReadyChannel::new( + this.addr.clone(), + this.inner.clone(), + this.outlier.clone(), + ); + Poll::Ready(UnejectedChannel::Ready(ready)) } } Poll::Pending => Poll::Pending, diff --git a/tonic-xds/src/client/loadbalance/loadbalancer.rs b/tonic-xds/src/client/loadbalance/loadbalancer.rs index 75c6dffb0..1b995cb74 100644 --- a/tonic-xds/src/client/loadbalance/loadbalancer.rs +++ b/tonic-xds/src/client/loadbalance/loadbalancer.rs @@ -268,26 +268,26 @@ where } /// Drain completed `EjectedChannel` timers. Clears the - /// outlier state and routes the resolved channel back into - /// `ready` or `connecting`. + /// registry-level ejection counter and routes the resolved + /// channel back into `ready` (with its outlier state already + /// reattached) or `connecting`. 
fn poll_unejection(&mut self, cx: &mut Context<'_>) { while let Poll::Ready(Some((addr, unejected))) = self.ejected.poll_next(cx) { - let state = match self.outlier.as_ref() { - Some(o) => o.registry().add_channel(addr.clone()), - None => Arc::new(OutlierChannelState::new(addr.clone())), - }; - if let Some(o) = self.outlier.as_ref() { - o.registry().note_uneject(&state); - } match unejected { - UnejectedChannel::Ready(svc) => { + UnejectedChannel::Ready(ready) => { + if let Some(o) = self.outlier.as_ref() { + o.registry().note_uneject(ready.outlier()); + } tracing::debug!("outlier detection: uneject {addr}"); - let ready = ReadyChannel::new(addr.clone(), svc, state); self.ready.insert(addr, ready); } // `needs_reconnect = false` for A50; this arm is // reserved for future policies. UnejectedChannel::Connecting(future) => { + if let Some(o) = self.outlier.as_ref() { + let state = o.registry().add_channel(addr.clone()); + o.registry().note_uneject(&state); + } let _ = self.connecting.add(addr, future); } } From fa12110612378959b4a251d4d8d01f1880184b93 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Tue, 12 May 2026 13:56:06 -0700 Subject: [PATCH 32/33] refactor(tonic-xds): drop unused addr parameter from ConnectingChannel::new `ConnectingChannel` stopped constructing `ReadyChannel` internally when outlier state was added (the registry-supplied `Arc` has to come from the LB, not the channel-state type), and the second parameter `_addr` has been ignored ever since. Drop it; callers already hand `KeyedFutures` the canonical address as the key. 
--- .../src/client/loadbalance/channel_state.rs | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/channel_state.rs b/tonic-xds/src/client/loadbalance/channel_state.rs index 398ef13a4..159838991 100644 --- a/tonic-xds/src/client/loadbalance/channel_state.rs +++ b/tonic-xds/src/client/loadbalance/channel_state.rs @@ -250,7 +250,7 @@ impl IdleChannel { where C::Service: Send + 'static, { - ConnectingChannel::new(connector.connect(&self.addr), self.addr) + ConnectingChannel::new(connector.connect(&self.addr)) } } @@ -271,7 +271,7 @@ pub(crate) struct ConnectingChannel { } impl ConnectingChannel { - pub(crate) fn new(fut: BoxFuture, _addr: EndpointAddress) -> Self { + pub(crate) fn new(fut: BoxFuture) -> Self { Self { inner: fut } } } @@ -341,7 +341,7 @@ impl ReadyChannel { where S: Send + 'static, { - ConnectingChannel::new(connector.connect(&self.addr), self.addr) + ConnectingChannel::new(connector.connect(&self.addr)) } } @@ -402,10 +402,7 @@ impl Future for EjectedChannel { Poll::Ready(()) => { if this.config.needs_reconnect { let fut = this.connector.connect(this.addr); - Poll::Ready(UnejectedChannel::Connecting(ConnectingChannel::new( - fut, - this.addr.clone(), - ))) + Poll::Ready(UnejectedChannel::Connecting(ConnectingChannel::new(fut))) } else { let ready = ReadyChannel::new( this.addr.clone(), @@ -519,8 +516,7 @@ mod tests { #[tokio::test] async fn test_connecting_in_keyed_futures() { let (tx, rx) = tokio::sync::oneshot::channel::(); - let connecting = - ConnectingChannel::new(Box::pin(async move { rx.await.unwrap() }), test_addr()); + let connecting = ConnectingChannel::new(Box::pin(async move { rx.await.unwrap() })); let mut set: KeyedFutures = KeyedFutures::new(); set.add(test_addr(), connecting).unwrap(); @@ -537,8 +533,7 @@ mod tests { #[tokio::test] async fn test_connecting_cancelled_via_keyed_futures() { - let connecting = - ConnectingChannel::new(Box::pin(future::pending::()), 
test_addr()); + let connecting = ConnectingChannel::new(Box::pin(future::pending::())); let mut set: KeyedFutures = KeyedFutures::new(); set.add(test_addr(), connecting).unwrap(); From b55d41969216ccf11cd9cb708972b8e18eb691c4 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Tue, 12 May 2026 14:04:51 -0700 Subject: [PATCH 33/33] refactor(tonic-xds): drop the Rng trait, call fastrand directly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `Rng` trait existed only as a test seam for the `enforcing_failure_percentage` probability roll, but every caller in both the algorithm tests and the LB integration tests uses `enforcing = 100` or `enforcing = 0` — values for which `roll` short- circuits without consulting the RNG. The trait, the `FastRandRng` default, and the test-side `FixedRng` / `AlwaysFireRng` impls were all bookkeeping for a code path none of them exercised. Inline the `fastrand::u32(0..100)` call into `roll` and remove the trait. `OutlierStatsRegistry::with_rng` collapses into `new`. 
--- .../src/client/loadbalance/loadbalancer.rs | 13 +-- .../client/loadbalance/outlier_detection.rs | 86 ++++++------------- 2 files changed, 27 insertions(+), 72 deletions(-) diff --git a/tonic-xds/src/client/loadbalance/loadbalancer.rs b/tonic-xds/src/client/loadbalance/loadbalancer.rs index 1b995cb74..ac58b6080 100644 --- a/tonic-xds/src/client/loadbalance/loadbalancer.rs +++ b/tonic-xds/src/client/loadbalance/loadbalancer.rs @@ -837,7 +837,7 @@ mod tests { // -- Outlier-detection integration tests -- - use crate::client::loadbalance::outlier_detection::{OutlierStatsRegistry, Rng}; + use crate::client::loadbalance::outlier_detection::OutlierStatsRegistry; use crate::xds::resource::outlier_detection::{ FailurePercentageConfig, OutlierDetectionConfig, Percentage, }; @@ -847,13 +847,6 @@ mod tests { Percentage::new(v).unwrap() } - struct AlwaysFireRng; - impl Rng for AlwaysFireRng { - fn pct_roll(&self) -> u32 { - 0 - } - } - fn fp_config( threshold: u32, request_volume: u32, @@ -882,7 +875,7 @@ mod tests { let connector = Arc::new(MockConnector::new()); let picker: Arc, &'static str> + Send + Sync> = Arc::new(P2cPicker); - let registry = OutlierStatsRegistry::with_rng(config, Box::new(AlwaysFireRng)); + let registry = OutlierStatsRegistry::new(config); let lb = LoadBalancer::with_outlier(discover, connector.clone(), picker, Some(registry.clone())) .expect("registry not yet wired"); @@ -1156,7 +1149,7 @@ mod tests { let connector = Arc::new(MockConnector::new()); let picker: Arc, &'static str> + Send + Sync> = Arc::new(P2cPicker); - let registry = OutlierStatsRegistry::with_rng(fp_config(50, 5, 3), Box::new(AlwaysFireRng)); + let registry = OutlierStatsRegistry::new(fp_config(50, 5, 3)); // First wiring succeeds. 
LoadBalancer::with_outlier( diff --git a/tonic-xds/src/client/loadbalance/outlier_detection.rs b/tonic-xds/src/client/loadbalance/outlier_detection.rs index df3e78796..8cff6dce9 100644 --- a/tonic-xds/src/client/loadbalance/outlier_detection.rs +++ b/tonic-xds/src/client/loadbalance/outlier_detection.rs @@ -47,21 +47,6 @@ use crate::xds::resource::outlier_detection::OutlierDetectionConfig; #[error("OutlierStatsRegistry is already wired to a LoadBalancer")] pub(crate) struct RegistryAlreadyWired; -/// Probability source for `enforcing_*` rolls. -pub(crate) trait Rng: Send + Sync + 'static { - /// Return a uniform random `u32` in `0..100`. - fn pct_roll(&self) -> u32; -} - -/// Default RNG backed by `fastrand`. -struct FastRandRng; - -impl Rng for FastRandRng { - fn pct_roll(&self) -> u32 { - fastrand::u32(0..100) - } -} - /// Shared outlier-detection state, owned by `Arc` and accessed /// concurrently by the data path ([`Self::record_outcome`]), the /// housekeeping actor ([`Self::run_housekeeping`]), and the load @@ -75,7 +60,6 @@ pub(crate) struct OutlierStatsRegistry { /// `max_ejection_percent` cap. ejected_count: AtomicU64, config: OutlierDetectionConfig, - rng: Box, /// Sender half of the eject signal. The receiver is owned by the /// LB's [`OutlierDetector`]. eject_tx: mpsc::UnboundedSender, @@ -84,20 +68,13 @@ pub(crate) struct OutlierStatsRegistry { } impl OutlierStatsRegistry { - /// Build a registry with the default RNG. pub(crate) fn new(config: OutlierDetectionConfig) -> Arc { - Self::with_rng(config, Box::new(FastRandRng)) - } - - /// Build a registry with a custom [`Rng`]. 
- pub(crate) fn with_rng(config: OutlierDetectionConfig, rng: Box) -> Arc { let (eject_tx, eject_rx) = mpsc::unbounded_channel(); Arc::new(Self { channels: DashMap::new(), qualifying_count: AtomicU64::new(0), ejected_count: AtomicU64::new(0), config, - rng, eject_tx, eject_rx: Mutex::new(Some(eject_rx)), }) @@ -185,7 +162,7 @@ impl OutlierStatsRegistry { if failure_pct <= u64::from(fp.threshold.get()) { return; } - if !roll(&*self.rng, fp.enforcing_failure_percentage.get()) { + if !roll(fp.enforcing_failure_percentage.get()) { return; } @@ -312,14 +289,14 @@ impl OutlierDetector { } /// Return true with probability `pct / 100` (clamped at 100 ⇒ always). -fn roll(rng: &dyn Rng, pct: u8) -> bool { +fn roll(pct: u8) -> bool { if pct >= 100 { return true; } if pct == 0 { return false; } - rng.pct_roll() < u32::from(pct) + fastrand::u32(0..100) < u32::from(pct) } #[cfg(test)] @@ -328,7 +305,7 @@ mod tests { use crate::xds::resource::outlier_detection::{ FailurePercentageConfig, OutlierDetectionConfig, Percentage, }; - use std::sync::atomic::{AtomicU32, Ordering}; + use std::sync::atomic::Ordering; use std::time::Duration; fn addr(port: u16) -> EndpointAddress { @@ -365,21 +342,6 @@ mod tests { c } - /// Deterministic RNG: `pct_roll()` returns a fixed value. - struct FixedRng(AtomicU32); - - impl FixedRng { - fn boxed(value: u32) -> Box { - Box::new(Self(AtomicU32::new(value))) - } - } - - impl Rng for FixedRng { - fn pct_roll(&self) -> u32 { - self.0.load(Ordering::Relaxed) - } - } - /// Drive `n` outcomes through `record_outcome` for one channel. 
fn drive( registry: &OutlierStatsRegistry, @@ -399,7 +361,7 @@ mod tests { #[test] fn ejects_above_threshold_inline() { - let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(fp_config(50, 10, 3)); let bad = registry.add_channel(addr(8084)); for port in 8080..=8083 { let s = registry.add_channel(addr(port)); @@ -412,7 +374,7 @@ mod tests { #[test] fn skips_below_threshold() { - let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(fp_config(50, 10, 3)); let mut all = vec![]; for port in 8080..=8084 { let s = registry.add_channel(addr(port)); @@ -428,7 +390,7 @@ mod tests { #[test] fn at_threshold_does_not_eject() { // A50 specifies a strict "greater than" comparison. - let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(0)); + let registry = OutlierStatsRegistry::new(fp_config(50, 10, 3)); let mut all = vec![]; for port in 8080..=8084 { let s = registry.add_channel(addr(port)); @@ -442,7 +404,7 @@ mod tests { #[test] fn minimum_hosts_gates_ejection() { - let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 5), FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(fp_config(50, 10, 5)); // Only 2 hosts have request_volume ≥ 10; minimum_hosts is 5 ⇒ skip. 
let mut all = vec![]; for port in 8080..=8081 { @@ -457,7 +419,7 @@ mod tests { #[test] fn request_volume_filters_low_traffic() { - let registry = OutlierStatsRegistry::with_rng(fp_config(50, 100, 3), FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(fp_config(50, 100, 3)); let bad = registry.add_channel(addr(8080)); drive(®istry, &bad, 0, 5); for port in 8081..=8084 { @@ -475,7 +437,7 @@ mod tests { .as_mut() .unwrap() .enforcing_failure_percentage = pct(0); - let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(0)); + let registry = OutlierStatsRegistry::new(config); let mut all = vec![]; for port in 8080..=8084 { let s = registry.add_channel(addr(port)); @@ -491,7 +453,7 @@ mod tests { fn max_ejection_percent_caps_concurrent_ejections() { let mut config = fp_config(50, 10, 3); config.max_ejection_percent = pct(20); - let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(config); let mut all = vec![]; for port in 8080..=8084 { @@ -510,7 +472,7 @@ mod tests { #[test] fn remove_channel_decrements_counters() { - let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(fp_config(50, 10, 3)); let mut all = vec![]; for port in 8080..=8083 { let s = registry.add_channel(addr(port)); @@ -532,7 +494,7 @@ mod tests { #[test] fn ejection_dispatches_address_through_mpsc() { - let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(fp_config(50, 10, 3)); let mut rx = registry.take_eject_rx().expect("receiver available"); let bad = registry.add_channel(addr(8084)); for port in 8080..=8083 { @@ -553,7 +515,7 @@ mod tests { #[test] fn housekeeping_resets_counters_and_qualifying() { - let registry = OutlierStatsRegistry::with_rng(fp_config(50, 10, 3), FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(fp_config(50, 10, 
3)); for port in 8080..=8083 { let s = registry.add_channel(addr(port)); drive(®istry, &s, 100, 0); @@ -570,7 +532,7 @@ mod tests { #[test] fn housekeeping_decrements_multiplier_on_healthy_interval() { - let registry = OutlierStatsRegistry::with_rng(base_config(), FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(base_config()); let s = registry.add_channel(addr(8080)); // Force multiplier to 3 directly (no traffic, no eject). s.set_ejection_multiplier(3); @@ -581,7 +543,7 @@ mod tests { #[test] fn housekeeping_leaves_ejected_multipliers_alone() { - let registry = OutlierStatsRegistry::with_rng(base_config(), FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(base_config()); let s = registry.add_channel(addr(8080)); s.try_eject(Instant::now()); s.set_ejection_multiplier(3); @@ -600,7 +562,7 @@ mod tests { let mut config = fp_config(50, 10, 3); config.base_ejection_time = Duration::from_secs(10); config.max_ejection_time = Duration::from_secs(60); - let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(config); let s = registry.add_channel(addr(8080)); let t0 = Instant::now(); s.try_eject(t0); @@ -614,7 +576,7 @@ mod tests { let mut config = fp_config(50, 10, 3); config.base_ejection_time = Duration::from_secs(10); config.max_ejection_time = Duration::from_secs(15); - let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(config); let s = registry.add_channel(addr(8080)); let t0 = Instant::now(); s.try_eject(t0); @@ -628,7 +590,7 @@ mod tests { let mut config = fp_config(50, 10, 3); config.base_ejection_time = Duration::from_secs(30); config.max_ejection_time = Duration::from_secs(60); - let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(config); let s = registry.add_channel(addr(8080)); let t0 = Instant::now(); s.try_eject(t0); @@ -644,7 
+606,7 @@ mod tests { let mut config = fp_config(50, 10, 3); config.base_ejection_time = Duration::from_secs(10); config.max_ejection_time = Duration::from_secs(60); - let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(config); let s = registry.add_channel(addr(8080)); let t0 = Instant::now(); s.try_eject(t0); @@ -657,14 +619,14 @@ mod tests { #[test] fn remaining_ejection_none_when_not_ejected() { - let registry = OutlierStatsRegistry::with_rng(base_config(), FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(base_config()); let s = registry.add_channel(addr(8080)); assert!(registry.remaining_ejection(&s, Instant::now()).is_none()); } #[test] fn note_uneject_clears_state_and_decrements_counter() { - let registry = OutlierStatsRegistry::with_rng(base_config(), FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(base_config()); let s = registry.add_channel(addr(8080)); s.try_eject(Instant::now()); // bumps multiplier 0 → 1 registry.ejected_count.fetch_add(1, Ordering::Relaxed); @@ -691,7 +653,7 @@ mod tests { let mut config = fp_config(50, 10, 3); config.base_ejection_time = Duration::from_secs(10); config.max_ejection_time = Duration::from_secs(300); - let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(config); let s = registry.add_channel(addr(8080)); let t0 = Instant::now(); @@ -725,7 +687,7 @@ mod tests { async fn dropping_abort_stops_actor() { let mut config = base_config(); config.interval = Duration::from_millis(50); - let registry = OutlierStatsRegistry::with_rng(config, FixedRng::boxed(99)); + let registry = OutlierStatsRegistry::new(config); let s = registry.add_channel(addr(8080)); s.set_ejection_multiplier(5);