From 36ebd848f5eb80a67014295510eb5aefd0fd3810 Mon Sep 17 00:00:00 2001 From: Ethan Rodkin Date: Mon, 4 May 2026 13:46:46 -0400 Subject: [PATCH] fix mdns connection failures --- Cargo.lock | 20 ++++- Cargo.toml | 7 ++ src/rpc/dial.rs | 205 +++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 203 insertions(+), 29 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4bf4a13..ba3c480 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1513,6 +1513,22 @@ dependencies = [ "want", ] +[[package]] +name = "hyper-rustls" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" +dependencies = [ + "futures-util", + "http", + "hyper", + "log", + "rustls 0.21.12", + "rustls-native-certs", + "tokio", + "tokio-rustls 0.24.1", +] + [[package]] name = "hyper-timeout" version = "0.4.1" @@ -3819,8 +3835,6 @@ checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" [[package]] name = "viam-mdns" version = "3.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed628096380455931a36c47c97ac663b97f80a7174ed2b26a96bae3dc01282f6" dependencies = [ "async-std", "async-stream 0.2.1", @@ -3855,6 +3869,7 @@ dependencies = [ "http", "http-body", "hyper", + "hyper-rustls", "interceptor 0.12.0", "libc", "local-ip-address", @@ -3864,6 +3879,7 @@ dependencies = [ "prost", "prost-types", "rand 0.8.5", + "rustls 0.21.12", "serde", "serde_json", "tokio", diff --git a/Cargo.toml b/Cargo.toml index 4f50329..e2b34ca 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,6 +38,8 @@ futures-util = "0.3" http = "0.2.7" http-body = {version = "0.4.4"} hyper = { version = "0.14.20", features = ["full"] } +hyper-rustls = { version = "0.24", features = ["http2"] } +rustls = { version = "0.21", features = ["dangerous_configuration"] } interceptor = "0.12.0" libc = {version = "0.2"} local-ip-address = "0.5.5" @@ -61,6 +63,11 @@ viam-mdns = "3.0.1" webpki-roots = "0.21.1" webrtc = "0.12.0" +# Override viam-mdns with the local development version while the macOS fix is +# not yet published. Remove this section before releasing a new crate version. +[patch.crates-io] +viam-mdns = { path = "../mdns" } + [dev-dependencies] async-stream = "0.3.3" env_logger = "0.9.0" diff --git a/src/rpc/dial.rs b/src/rpc/dial.rs index 4508943..5774266 100644 --- a/src/rpc/dial.rs +++ b/src/rpc/dial.rs @@ -21,7 +21,7 @@ use ::http::{ uri::{Authority, Parts, PathAndQuery, Scheme}, HeaderValue, Version, }; -use ::viam_mdns::{discover, Response}; +use ::viam_mdns::{discover, RecordKind, Response}; use ::webrtc::ice_transport::{ ice_candidate::{RTCIceCandidate, RTCIceCandidateInit}, ice_connection_state::RTCIceConnectionState, @@ -43,9 +43,59 @@ use std::{ time::{Duration, Instant}, }; use tokio::sync::{mpsc, watch}; +use tonic::body::BoxBody; use tonic::codegen::BoxFuture; use tonic::transport::{Body, Channel, Uri}; -use tonic::{body::BoxBody, transport::ClientTlsConfig}; + +/// TLS certificate verifier that accepts any certificate, used for mDNS connections +/// where the local device's cert is signed by a public CA (Let's Encrypt or Google) +/// but previously only the leaf cert was sent in the TLS handshake. Without the +/// intermediate CA cert in the chain, clients cannot build the path to the root even +/// though the root is in the system trust store, so verification fails. +/// +/// TODO(RSDK-13879): app commit d83137fc added `Bundle: true` to ACME certificate +/// issuance (both Let's Encrypt and Google CA paths), which causes the full chain +/// (leaf + intermediate) to be stored and sent during TLS handshakes. A follow-up +/// migration (RSDK-13799) redistributes renewal dates so that all existing robot +/// certificates are re-issued with the bundled chain by May 22, 2026. After that +/// date, `NoCertVerification` can be replaced with tonic's standard TLS: +/// 1. Remove this struct entirely. +/// 2. Remove the `hyper-rustls` and `rustls` dependencies from Cargo.toml. +/// 3. Restore the `domain: &str` parameter to `create_channel` (it was removed +/// along with this workaround). The domain is the robot's canonical hostname +/// from the original URI (e.g. `my-robot.abcdefg.local.viam.cloud`), NOT the +/// discovered IP — it is needed for SNI and SAN verification since the cert's +/// SANs are hostnames, not IP addresses. +/// 4. In `create_channel` (non-loopback mDNS path below), replace the entire +/// custom-connector block with: +/// let tls_config = ClientTlsConfig::new().domain_name(domain); +/// let mut parts = uri.into_parts(); +/// parts.scheme = Some(Scheme::HTTPS); +/// let uri = Uri::from_parts(parts)?; +/// return Channel::builder(uri.clone()) +/// .tls_config(tls_config)? +/// .connect() +/// .await +/// .with_context(|| format!("Connecting to {:?}", uri)); +/// `ClientTlsConfig::new()` uses the webpki root CA bundle (tonic is built +/// with `tls-webpki-roots`), which already contains the Let's Encrypt and +/// Google CA roots — no custom cert pinning required. +#[derive(Debug)] +struct NoCertVerification; + +impl rustls::client::ServerCertVerifier for NoCertVerification { + fn verify_server_cert( + &self, + _end_entity: &rustls::Certificate, + _intermediates: &[rustls::Certificate], + _server_name: &rustls::ServerName, + _scts: &mut dyn Iterator, + _ocsp_response: &[u8], + _now: std::time::SystemTime, + ) -> Result { + Ok(rustls::client::ServerCertVerified::assertion()) + } +} use tower::{Service, ServiceBuilder}; use tower_http::auth::AddAuthorization; use tower_http::auth::AddAuthorizationLayer; @@ -316,6 +366,7 @@ impl DialBuilder { async fn get_addr_from_interface( iface: (&str, Vec<&IpAddr>), candidates: &Vec, + local_ipv4s: &std::collections::HashSet, ) -> Option { let addresses: Vec = iface .1 @@ -329,12 +380,17 @@ impl DialBuilder { let mut resp: Option = None; for ipv4 in addresses { for candidate in candidates { - let discovery = discover::interface_with_loopback( + let discovery = match discover::interface_with_loopback( VIAM_MDNS_SERVICE_NAME, Duration::from_millis(250), ipv4, - ) - .ok()?; + ) { + Ok(d) => d, + Err(e) => { + log::debug!("mDNS socket error on {ipv4}: {e}"); + continue; + } + }; let stream = discovery.listen(); pin_mut!(stream); while let Some(Ok(response)) = stream.next().await { @@ -347,10 +403,19 @@ impl DialBuilder { // candidates based on the actual "my-cool-robot" name without being opinionated // on whether the candidate is locally named or not. let local_agnostic_candidate = candidate.as_str().split("viam").next()?; + log::debug!( + "mDNS response on {ipv4}: hostname={hostname:?}, candidate={candidate:?}, local_agnostic={local_agnostic_candidate:?}, matches={}", + hostname.contains(local_agnostic_candidate) + ); if hostname.contains(local_agnostic_candidate) { resp = Some(response); break; } + } else { + log::debug!( + "mDNS response on {ipv4}: no hostname (no PTR record); answers={:?}", + response.answers + ); } if resp.is_some() { break; @@ -367,10 +432,33 @@ impl DialBuilder { has_webrtc = has_webrtc || field.contains("webrtc"); } - let ip_addr = match resp.ip_addr() { - Some(std::net::IpAddr::V4(ip_v4)) => Some(ip_v4), - Some(std::net::IpAddr::V6(_)) | None => None, - }; + // Log all records in the response for diagnostics. + log::debug!( + "mDNS matched response records: {:?}", + resp.records().collect::>() + ); + + // Prefer a non-loopback IPv4 address that is currently present on one of our own + // network interfaces. viam-server may advertise a stale IP (e.g. a WiFi address + // from when it was started that is now unreachable because WiFi was turned off). + // Accepting only IPs we can actually route to avoids ENETUNREACH at connect time. + // If no reachable non-loopback IP is found, fall back to any advertised IPv4 + // (including loopback) so that offline-only (loopback-only) machines still work. + let ip_addr = resp + .records() + .filter_map(|r| match r.kind { + RecordKind::A(addr) if !addr.is_loopback() && local_ipv4s.contains(&addr) => { + Some(addr) + } + _ => None, + }) + .next() + .or_else(|| { + resp.records().find_map(|r| match r.kind { + RecordKind::A(addr) => Some(addr), + _ => None, + }) + }); if !(has_grpc || has_webrtc) || ip_addr.is_none() { return None; @@ -378,6 +466,7 @@ impl DialBuilder { let mut local_addr = ip_addr?.to_string(); local_addr.push(':'); local_addr.push_str(&resp.port()?.to_string()); + log::debug!("mDNS resolved address: {local_addr}"); Some(local_addr) } @@ -401,6 +490,18 @@ impl DialBuilder { let ifaces = list_afinet_netifas().ok()?; + // Collect all local IPv4 addresses so we can validate mDNS response IPs. + // viam-server may advertise a stale IP (e.g. a WiFi address from when it started, + // now unreachable because WiFi is off). By only accepting non-loopback IPs that are + // currently present on a local interface, we avoid picking an unreachable address. + let local_ipv4s: std::collections::HashSet = ifaces + .iter() + .filter_map(|(_, ip)| match ip { + IpAddr::V4(v4) => Some(*v4), + _ => None, + }) + .collect(); + let ifaces: HashMap<&str, Vec<&IpAddr>> = ifaces.iter().fold(HashMap::new(), |mut map, (k, v)| { map.entry(k).or_default().push(v); @@ -409,7 +510,11 @@ impl DialBuilder { let mut iface_futures = FuturesUnordered::new(); for iface in ifaces { - iface_futures.push(Self::get_addr_from_interface(iface, &candidates)); + iface_futures.push(Self::get_addr_from_interface( + iface, + &candidates, + &local_ipv4s, + )); } let mut local_addr: Option = None; @@ -437,18 +542,61 @@ impl DialBuilder { Some(uri) } - async fn create_channel( - allow_downgrade: bool, - domain: &str, - uri: Uri, - for_mdns: bool, - ) -> Result { - let mut chan = Channel::builder(uri.clone()); + // TODO(RSDK-13879) step 3: restore `domain: &str` as a second parameter (after + // `allow_downgrade`) and thread it through all four call sites below. See the + // `NoCertVerification` doc comment for the full migration instructions. + async fn create_channel(allow_downgrade: bool, uri: Uri, for_mdns: bool) -> Result { if for_mdns { - let tls_config = ClientTlsConfig::new().domain_name(domain); - chan = chan.tls_config(tls_config)?; + let host = uri.host().unwrap_or(""); + let is_loopback = host + .parse::() + .map(|ip| ip.is_loopback()) + .unwrap_or(false); + + if is_loopback { + // viam-server serves plain gRPC (no TLS) on the loopback port it advertises + // via mDNS. The server only accepts HTTP/2 (gRPC requires it), so we must use + // h2c (HTTP/2 cleartext Prior Knowledge). tonic's own connect() sets + // http2_only=true which sends the h2c connection preface directly over TCP. + // We intentionally do NOT use connect_with_connector here: hyper-rustls's + // plain-TCP path sends HTTP/1.1, which the server rejects with an empty reply + // (BrokenPipe from our side). + log::debug!("mDNS create_channel: loopback {host}, using tonic h2c connect"); + return Channel::builder(uri.clone()) + .connect() + .await + .with_context(|| format!("Connecting to {:?}", uri)); + } + + // Non-loopback LAN address: viam-server uses TLS but currently only sends the leaf + // cert (no intermediate), so standard verification fails. Use a custom connector + // that skips cert verification for now. + // TODO(RSDK-13879) step 4: replace the block below with standard tonic TLS once all + // robot certs include the full chain (after May 22, 2026). See the + // `NoCertVerification` doc comment for the exact replacement snippet. + log::debug!("mDNS create_channel: LAN address {host}, using no-cert-verify TLS"); + let tls_config = rustls::ClientConfig::builder() + .with_safe_defaults() + .with_custom_certificate_verifier(Arc::new(NoCertVerification)) + .with_no_client_auth(); + + let connector = hyper_rustls::HttpsConnectorBuilder::new() + .with_tls_config(tls_config) + .https_or_http() + .enable_http2() + .build(); + + let mut parts = uri.into_parts(); + parts.scheme = Some(Scheme::HTTPS); + let uri = Uri::from_parts(parts)?; + log::debug!("mDNS create_channel: connecting to {uri:?}"); + return Channel::builder(uri.clone()) + .connect_with_connector(connector) + .await + .with_context(|| format!("Connecting to {:?}", uri)); } - let chan = match chan + + let chan = match Channel::builder(uri.clone()) .connect() .await .with_context(|| format!("Connecting to {:?}", uri.clone())) @@ -512,7 +660,8 @@ impl DialBuilder { } let channel = match mdns_uri { - Some(uri) => Self::create_channel(self.config.allow_downgrade, domain, uri, true).await, + // TODO(RSDK-13879) step 3: pass `domain` as second arg once NoCertVerification is removed. + Some(uri) => Self::create_channel(self.config.allow_downgrade, uri, true).await, // not actually an error necessarily, but we want to ensure that a channel is still // created with the default uri None => Err(anyhow::anyhow!("")), @@ -526,11 +675,11 @@ impl DialBuilder { Err(e) => { if attempting_mdns { log::debug!( - "Unable to connect via mDNS; falling back to robot URI. Error: {e}" + "Unable to connect via mDNS; falling back to robot URI. Error: {e:#}" ); } - Self::create_channel(self.config.allow_downgrade, domain, uri.clone(), false) - .await? + // TODO(RSDK-13879) step 3: pass `domain` as second arg once NoCertVerification is removed. + Self::create_channel(self.config.allow_downgrade, uri.clone(), false).await? } }; // TODO (RSDK-517) make maybe_connect_via_webrtc take a more generic type so we don't @@ -686,7 +835,8 @@ impl DialBuilder { log::debug!("Attempting to connect"); } let channel = match mdns_uri { - Some(uri) => Self::create_channel(allow_downgrade, &domain, uri, true).await, + // TODO(RSDK-13879) step 3: pass `domain` as second arg once NoCertVerification is removed. + Some(uri) => Self::create_channel(allow_downgrade, uri, true).await, // not actually an error necessarily, but we want to ensure that a channel is still // created with the default uri None => Err(anyhow::anyhow!("")), @@ -699,10 +849,11 @@ impl DialBuilder { Err(e) => { if attempting_mdns { log::debug!( - "Unable to connect via mDNS; falling back to robot URI. Error: {e}" + "Unable to connect via mDNS; falling back to robot URI. Error: {e:#}" ); } - Self::create_channel(allow_downgrade, &domain, uri_for_auth, false).await? + // TODO(RSDK-13879) step 3: pass `domain` as second arg once NoCertVerification is removed. + Self::create_channel(allow_downgrade, uri_for_auth, false).await? } };