Skip to content

Commit 429c0ca

Browse files
committed
fix(ssh): harden keepalives and tunnel shutdown
1 parent 1d92e08 commit 429c0ca

File tree

12 files changed

+216
-47
lines changed

12 files changed

+216
-47
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ pin-project-lite = "0.2"
7878
tokio-stream = "0.1"
7979
protobuf-src = "1.1.0"
8080
url = "2"
81+
socket2 = "0.6"
8182

8283
# Database
8384
sqlx = { version = "0.8", features = ["runtime-tokio-rustls", "postgres", "sqlite", "migrate"] }

architecture/sandbox-connect.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,11 +138,22 @@ sequenceDiagram
138138
5. When SSH starts, it spawns the `ssh-proxy` subprocess as its `ProxyCommand`.
139139
6. `crates/navigator-cli/src/ssh.rs` -- `sandbox_ssh_proxy()`:
140140
- Parses the gateway URL, connects via TCP (plain) or TLS (mTLS)
141+
- Enables TCP keepalive on the gateway socket
141142
- Sends a raw HTTP CONNECT request with `X-Sandbox-Id` and `X-Sandbox-Token` headers
142143
- Reads the response status line; proceeds if 200
143144
- Spawns two `tokio::spawn` tasks for bidirectional copy between stdin/stdout and the gateway stream
144145
- When the remote-to-stdout direction completes, aborts the stdin-to-remote task (SSH has all the data it needs)
145146

147+
### Connection stability
148+
149+
Recent SSH stability hardening is split across the client, gateway, sandbox, and edge tunnel paths:
150+
151+
- **OpenSSH keepalives**: the CLI now sets `ServerAliveInterval=30` and `ServerAliveCountMax=3` on every SSH invocation so idle sessions still emit SSH traffic.
152+
- **TCP keepalive**: the CLI-to-gateway and gateway-to-sandbox TCP sockets enable 30-second keepalive probes to reduce drops from NAT, load balancers, and other idle-sensitive middleboxes.
153+
- **Sandbox SSH daemon**: the embedded `russh` server disables its default 10-minute inactivity timeout and instead sends protocol keepalives every 30 seconds. This prevents quiet shells from being garbage-collected while still detecting dead peers.
154+
- **Edge WebSocket tunnel**: the WebSocket bridge now lets both copy directions observe shutdown instead of aborting the peer task immediately, which reduces abrupt closes and truncated tail data.
155+
- **Limit diagnostics**: when the gateway rejects a connection because the per-session or per-sandbox cap is reached, it now logs the active count and configured limit to make 429s easier to diagnose.
156+
146157
### Command Execution (CLI)
147158

148159
The `sandbox exec` path is identical to interactive connect except:

crates/navigator-cli/src/edge_tunnel.rs

Lines changed: 12 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -126,24 +126,18 @@ async fn handle_connection(tcp_stream: TcpStream, config: &TunnelConfig) -> Resu
126126
let (ws_sink, ws_source) = ws_stream.split();
127127
let (tcp_read, tcp_write) = tokio::io::split(tcp_stream);
128128

129-
// Two tasks: TCP->WS and WS->TCP. Abort the peer task once either
130-
// direction finishes so the connection tears down promptly.
131-
let mut tcp_to_ws = tokio::spawn(copy_tcp_to_ws(tcp_read, ws_sink));
132-
let mut ws_to_tcp = tokio::spawn(copy_ws_to_tcp(ws_source, tcp_write));
133-
134-
tokio::select! {
135-
res = &mut tcp_to_ws => {
136-
if let Err(e) = res {
137-
debug!(error = %e, "tcp->ws task panicked");
138-
}
139-
ws_to_tcp.abort();
140-
}
141-
res = &mut ws_to_tcp => {
142-
if let Err(e) = res {
143-
debug!(error = %e, "ws->tcp task panicked");
144-
}
145-
tcp_to_ws.abort();
146-
}
129+
// Keep both directions alive until each side has observed shutdown. This
130+
// avoids cutting off trailing bytes when one half closes slightly earlier
131+
// than the other.
132+
let tcp_to_ws = tokio::spawn(copy_tcp_to_ws(tcp_read, ws_sink));
133+
let ws_to_tcp = tokio::spawn(copy_ws_to_tcp(ws_source, tcp_write));
134+
135+
let (tcp_to_ws_res, ws_to_tcp_res) = tokio::join!(tcp_to_ws, ws_to_tcp);
136+
if let Err(e) = tcp_to_ws_res {
137+
debug!(error = %e, "tcp->ws task panicked");
138+
}
139+
if let Err(e) = ws_to_tcp_res {
140+
debug!(error = %e, "ws->tcp task panicked");
147141
}
148142

149143
Ok(())

crates/navigator-cli/src/ssh.rs

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ use miette::{IntoDiagnostic, Result, WrapErr};
88
use navigator_core::forward::{
99
find_ssh_forward_pid, resolve_ssh_gateway, shell_escape, write_forward_pid,
1010
};
11+
use navigator_core::net::enable_tcp_keepalive;
1112
use navigator_core::proto::{CreateSshSessionRequest, GetSandboxRequest};
1213
use owo_colors::OwoColorize;
1314
use rustls::pki_types::ServerName;
@@ -24,6 +25,8 @@ use tokio_rustls::TlsConnector;
2425

2526
const SSH_PROXY_CONNECT_TIMEOUT: Duration = Duration::from_secs(10);
2627
const SSH_PROXY_STATUS_TIMEOUT: Duration = Duration::from_secs(10);
28+
const SSH_SERVER_ALIVE_INTERVAL_SECS: u64 = 30;
29+
const SSH_SERVER_ALIVE_COUNT_MAX: u64 = 3;
2730

2831
struct SshSessionConfig {
2932
proxy_command: String,
@@ -113,6 +116,12 @@ fn ssh_base_command(proxy_command: &str) -> Command {
113116
.arg("-o")
114117
.arg("GlobalKnownHostsFile=/dev/null")
115118
.arg("-o")
119+
.arg(format!(
120+
"ServerAliveInterval={SSH_SERVER_ALIVE_INTERVAL_SECS}"
121+
))
122+
.arg("-o")
123+
.arg(format!("ServerAliveCountMax={SSH_SERVER_ALIVE_COUNT_MAX}"))
124+
.arg("-o")
116125
.arg("LogLevel=ERROR");
117126
command
118127
}
@@ -541,8 +550,13 @@ pub async fn sandbox_ssh_proxy(
541550
.await
542551
.map_err(|_| miette::miette!("timed out waiting for gateway CONNECT response"))??;
543552
if status != 200 {
553+
let reason = match status {
554+
401 => " (SSH session expired, was revoked, or is invalid)",
555+
429 => " (too many concurrent SSH connections for this session or sandbox)",
556+
_ => "",
557+
};
544558
return Err(miette::miette!(
545-
"gateway CONNECT failed with status {status}"
559+
"gateway CONNECT failed with status {status}{reason}"
546560
));
547561
}
548562

@@ -603,6 +617,8 @@ pub fn print_ssh_config(gateway: &str, name: &str) {
603617
println!(" StrictHostKeyChecking no");
604618
println!(" UserKnownHostsFile /dev/null");
605619
println!(" GlobalKnownHostsFile /dev/null");
620+
println!(" ServerAliveInterval {SSH_SERVER_ALIVE_INTERVAL_SECS}");
621+
println!(" ServerAliveCountMax {SSH_SERVER_ALIVE_COUNT_MAX}");
606622
println!(" LogLevel ERROR");
607623
println!(" ProxyCommand {proxy_cmd}");
608624
}
@@ -645,6 +661,7 @@ async fn connect_gateway(
645661
.map_err(|_| miette::miette!("timed out connecting to edge tunnel proxy"))?
646662
.into_diagnostic()?;
647663
tcp.set_nodelay(true).into_diagnostic()?;
664+
let _ = enable_tcp_keepalive(&tcp);
648665
return Ok(Box::new(tcp));
649666
}
650667

@@ -653,6 +670,7 @@ async fn connect_gateway(
653670
.map_err(|_| miette::miette!("timed out connecting to SSH gateway"))?
654671
.into_diagnostic()?;
655672
tcp.set_nodelay(true).into_diagnostic()?;
673+
let _ = enable_tcp_keepalive(&tcp);
656674
if scheme.eq_ignore_ascii_case("https") {
657675
let materials = require_tls_materials(&format!("https://{host}:{port}"), tls)?;
658676
let config = build_rustls_config(&materials)?;
@@ -707,3 +725,22 @@ async fn read_connect_status<R: AsyncRead + Unpin>(stream: &mut R) -> Result<u16
707725
trait ProxyStream: AsyncRead + AsyncWrite + Unpin + Send {}
708726

709727
impl<T> ProxyStream for T where T: AsyncRead + AsyncWrite + Unpin + Send {}
728+
729+
#[cfg(test)]
730+
mod tests {
731+
use super::*;
732+
733+
#[test]
734+
fn ssh_base_command_enables_server_keepalives() {
735+
let command = ssh_base_command("openshell ssh-proxy");
736+
let args = command
737+
.get_args()
738+
.map(|arg| arg.to_string_lossy().into_owned())
739+
.collect::<Vec<_>>();
740+
741+
assert!(args.contains(&format!(
742+
"ServerAliveInterval={SSH_SERVER_ALIVE_INTERVAL_SECS}"
743+
)));
744+
assert!(args.contains(&format!("ServerAliveCountMax={SSH_SERVER_ALIVE_COUNT_MAX}")));
745+
}
746+
}

crates/navigator-core/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ miette = { workspace = true }
1919
serde = { workspace = true }
2020
serde_json = { workspace = true }
2121
url = { workspace = true }
22+
socket2 = { workspace = true }
2223

2324
[build-dependencies]
2425
tonic-build = { workspace = true }

crates/navigator-core/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ pub mod config;
1212
pub mod error;
1313
pub mod forward;
1414
pub mod inference;
15+
pub mod net;
1516
pub mod proto;
1617

1718
pub use config::{Config, TlsConfig};

crates/navigator-core/src/net.rs

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
//! Shared socket configuration helpers.
5+
6+
use socket2::{SockRef, TcpKeepalive};
7+
use std::io;
8+
use std::time::Duration;
9+
10+
/// Idle time before TCP keepalive probes start.
11+
pub const TCP_KEEPALIVE_IDLE: Duration = Duration::from_secs(30);
12+
13+
/// Interval between TCP keepalive probes on supported platforms.
14+
pub const TCP_KEEPALIVE_INTERVAL: Duration = Duration::from_secs(30);
15+
16+
fn default_keepalive() -> TcpKeepalive {
17+
let keepalive = TcpKeepalive::new().with_time(TCP_KEEPALIVE_IDLE);
18+
#[cfg(any(
19+
target_os = "android",
20+
target_os = "dragonfly",
21+
target_os = "freebsd",
22+
target_os = "fuchsia",
23+
target_os = "illumos",
24+
target_os = "ios",
25+
target_os = "visionos",
26+
target_os = "linux",
27+
target_os = "macos",
28+
target_os = "netbsd",
29+
target_os = "tvos",
30+
target_os = "watchos",
31+
target_os = "windows",
32+
target_os = "cygwin",
33+
))]
34+
let keepalive = keepalive.with_interval(TCP_KEEPALIVE_INTERVAL);
35+
keepalive
36+
}
37+
38+
/// Enable aggressive TCP keepalive on a socket.
39+
#[cfg(unix)]
40+
pub fn enable_tcp_keepalive<S>(socket: &S) -> io::Result<()>
41+
where
42+
S: std::os::fd::AsFd,
43+
{
44+
SockRef::from(socket).set_tcp_keepalive(&default_keepalive())
45+
}
46+
47+
/// Enable aggressive TCP keepalive on a socket.
48+
#[cfg(windows)]
49+
pub fn enable_tcp_keepalive<S>(socket: &S) -> io::Result<()>
50+
where
51+
S: std::os::windows::io::AsSocket,
52+
{
53+
SockRef::from(socket).set_tcp_keepalive(&default_keepalive())
54+
}
55+
56+
/// Enable aggressive TCP keepalive on a socket.
57+
#[cfg(not(any(unix, windows)))]
58+
pub fn enable_tcp_keepalive<S>(_socket: &S) -> io::Result<()> {
59+
Ok(())
60+
}

crates/navigator-sandbox/src/ssh.rs

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ use crate::sandbox;
99
#[cfg(target_os = "linux")]
1010
use crate::{register_managed_child, unregister_managed_child};
1111
use miette::{IntoDiagnostic, Result};
12+
use navigator_core::net::enable_tcp_keepalive;
1213
use nix::pty::{Winsize, openpty};
1314
use nix::unistd::setsid;
1415
use rand_core::OsRng;
@@ -29,6 +30,20 @@ use tracing::{info, warn};
2930

3031
const PREFACE_MAGIC: &str = "NSSH1";
3132
const SSH_PREFACE_TIMEOUT: Duration = Duration::from_secs(10);
33+
const SSH_KEEPALIVE_INTERVAL: Duration = Duration::from_secs(30);
34+
const SSH_KEEPALIVE_MAX_MISSES: usize = 3;
35+
36+
fn build_ssh_server_config(host_key: PrivateKey) -> russh::server::Config {
37+
russh::server::Config {
38+
auth_rejection_time: Duration::from_secs(1),
39+
inactivity_timeout: None,
40+
keepalive_interval: Some(SSH_KEEPALIVE_INTERVAL),
41+
keepalive_max: SSH_KEEPALIVE_MAX_MISSES,
42+
nodelay: true,
43+
keys: vec![host_key],
44+
..Default::default()
45+
}
46+
}
3247

3348
/// A time-bounded set of nonces used to detect replayed NSSH1 handshakes.
3449
/// Each entry records the `Instant` it was inserted; a background reaper task
@@ -48,14 +63,7 @@ async fn ssh_server_init(
4863
)> {
4964
let mut rng = OsRng;
5065
let host_key = PrivateKey::random(&mut rng, Algorithm::Ed25519).into_diagnostic()?;
51-
52-
let mut config = russh::server::Config {
53-
auth_rejection_time: Duration::from_secs(1),
54-
..Default::default()
55-
};
56-
config.keys.push(host_key);
57-
58-
let config = Arc::new(config);
66+
let config = Arc::new(build_ssh_server_config(host_key));
5967
let ca_paths = ca_file_paths.as_ref().map(|p| Arc::new(p.clone()));
6068
let listener = TcpListener::bind(listen_addr).await.into_diagnostic()?;
6169
info!(addr = %listen_addr, "SSH server listening");
@@ -159,6 +167,9 @@ async fn handle_connection(
159167
nonce_cache: &NonceCache,
160168
) -> Result<()> {
161169
info!(peer = %peer, "SSH connection: reading handshake preface");
170+
if let Err(err) = enable_tcp_keepalive(&stream) {
171+
warn!(peer = %peer, error = %err, "SSH connection: failed to enable TCP keepalive");
172+
}
162173
let mut line = String::new();
163174
tokio::time::timeout(SSH_PREFACE_TIMEOUT, read_line(&mut stream, &mut line))
164175
.await
@@ -1419,4 +1430,17 @@ mod tests {
14191430
assert!(verify_preface(&line1, secret, 300, &cache).unwrap());
14201431
assert!(verify_preface(&line2, secret, 300, &cache).unwrap());
14211432
}
1433+
1434+
#[test]
1435+
fn ssh_server_config_keeps_idle_sessions_alive() {
1436+
let mut rng = OsRng;
1437+
let host_key = PrivateKey::random(&mut rng, Algorithm::Ed25519).unwrap();
1438+
let config = build_ssh_server_config(host_key);
1439+
1440+
assert_eq!(config.inactivity_timeout, None);
1441+
assert_eq!(config.keepalive_interval, Some(SSH_KEEPALIVE_INTERVAL));
1442+
assert_eq!(config.keepalive_max, SSH_KEEPALIVE_MAX_MISSES);
1443+
assert!(config.nodelay);
1444+
assert_eq!(config.keys.len(), 1);
1445+
}
14221446
}

crates/navigator-server/src/ssh_tunnel.rs

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ use http::StatusCode;
88
use hyper::Request;
99
use hyper::upgrade::Upgraded;
1010
use hyper_util::rt::TokioIo;
11+
use navigator_core::net::enable_tcp_keepalive;
1112
use navigator_core::proto::{Sandbox, SandboxPhase, SshSession};
1213
use prost::Message;
1314
use std::net::SocketAddr;
@@ -118,7 +119,7 @@ async fn ssh_connect(
118119
let mut counts = state.ssh_connections_by_token.lock().unwrap();
119120
let count = counts.entry(token.clone()).or_insert(0);
120121
if *count >= MAX_CONNECTIONS_PER_TOKEN {
121-
warn!(token = %token, "SSH tunnel: per-token connection limit reached");
122+
warn!(sandbox_id = %sandbox_id, active_connections = *count, limit = MAX_CONNECTIONS_PER_TOKEN, "SSH tunnel: per-token connection limit reached");
122123
return StatusCode::TOO_MANY_REQUESTS.into_response();
123124
}
124125
*count += 1;
@@ -137,7 +138,7 @@ async fn ssh_connect(
137138
token_counts.remove(&token);
138139
}
139140
}
140-
warn!(sandbox_id = %sandbox_id, "SSH tunnel: per-sandbox connection limit reached");
141+
warn!(sandbox_id = %sandbox_id, active_connections = *count, limit = MAX_CONNECTIONS_PER_SANDBOX, "SSH tunnel: per-sandbox connection limit reached");
141142
return StatusCode::TOO_MANY_REQUESTS.into_response();
142143
}
143144
*count += 1;
@@ -265,6 +266,9 @@ async fn establish_upstream_with_timeouts(
265266
upstream
266267
.set_nodelay(true)
267268
.map_err(|err| TunnelSetupError::Other(err.to_string()))?;
269+
if let Err(err) = enable_tcp_keepalive(&upstream) {
270+
warn!(sandbox_id = %sandbox_id, error = %err, "SSH tunnel: failed to enable upstream TCP keepalive");
271+
}
268272
info!(sandbox_id = %sandbox_id, "SSH tunnel: sending NSSH1 handshake preface");
269273
let preface =
270274
build_preface(token, secret).map_err(|err| TunnelSetupError::Other(err.to_string()))?;
@@ -296,9 +300,26 @@ async fn bridge_tunnel(
296300
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
297301
info!(sandbox_id = %sandbox_id, "SSH tunnel established");
298302
let mut upgraded = TokioIo::new(upgraded);
299-
// Discard the result entirely – connection-close errors are expected when
300-
// the SSH session ends and do not represent a failure worth propagating.
301-
let _ = tokio::io::copy_bidirectional(&mut upgraded, &mut upstream).await;
303+
match tokio::io::copy_bidirectional(&mut upgraded, &mut upstream).await {
304+
Ok((client_to_sandbox, sandbox_to_client)) => {
305+
info!(sandbox_id = %sandbox_id, client_to_sandbox_bytes = client_to_sandbox, sandbox_to_client_bytes = sandbox_to_client, "SSH tunnel closed");
306+
}
307+
Err(err)
308+
if matches!(
309+
err.kind(),
310+
std::io::ErrorKind::BrokenPipe
311+
| std::io::ErrorKind::ConnectionAborted
312+
| std::io::ErrorKind::ConnectionReset
313+
| std::io::ErrorKind::NotConnected
314+
| std::io::ErrorKind::UnexpectedEof
315+
) =>
316+
{
317+
info!(sandbox_id = %sandbox_id, error = %err, "SSH tunnel closed during shutdown");
318+
}
319+
Err(err) => {
320+
warn!(sandbox_id = %sandbox_id, error = %err, "SSH tunnel I/O failure");
321+
}
322+
}
302323
// Gracefully shut down the write-half of the upgraded connection so the
303324
// client receives a clean EOF instead of a TCP RST. This gives SSH time
304325
// to read any remaining protocol data (e.g. exit-status) from its buffer.

0 commit comments

Comments
 (0)