Skip to content

Commit fbf375b

Browse files
committed
fix(ssh): harden keepalives and tunnel shutdown
1 parent 1b44995 commit fbf375b

File tree

12 files changed

+211
-48
lines changed

12 files changed

+211
-48
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ pin-project-lite = "0.2"
8282
tokio-stream = "0.1"
8383
protobuf-src = "1.1.0"
8484
url = "2"
85+
socket2 = "0.6"
8586

8687
# Database
8788
sqlx = { version = "0.8", features = ["runtime-tokio-rustls", "postgres", "sqlite", "migrate"] }

architecture/sandbox-connect.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,11 +141,22 @@ sequenceDiagram
141141
5. When SSH starts, it spawns the `ssh-proxy` subprocess as its `ProxyCommand`.
142142
6. `crates/openshell-cli/src/ssh.rs` -- `sandbox_ssh_proxy()`:
143143
- Parses the gateway URL, connects via TCP (plain) or TLS (mTLS)
144+
- Enables TCP keepalive on the gateway socket
144145
- Sends a raw HTTP CONNECT request with `X-Sandbox-Id` and `X-Sandbox-Token` headers
145146
- Reads the response status line; proceeds if 200
146147
- Spawns two `tokio::spawn` tasks for bidirectional copy between stdin/stdout and the gateway stream
147148
- When the remote-to-stdout direction completes, aborts the stdin-to-remote task (SSH has all the data it needs)
148149

150+
### Connection stability
151+
152+
Recent SSH stability hardening is split across the client, gateway, sandbox, and edge tunnel paths:
153+
154+
- **OpenSSH keepalives**: the CLI now sets `ServerAliveInterval=30` and `ServerAliveCountMax=3` on every SSH invocation so idle sessions still emit SSH traffic.
155+
- **TCP keepalive**: the CLI-to-gateway and gateway-to-sandbox TCP sockets enable 30-second keepalive probes to reduce drops from NAT, load balancers, and other idle-sensitive middleboxes.
156+
- **Sandbox SSH daemon**: the embedded `russh` server disables its default 10-minute inactivity timeout and instead sends protocol keepalives every 30 seconds. This prevents quiet shells from being garbage-collected while still detecting dead peers.
157+
- **Edge WebSocket tunnel**: the WebSocket bridge now lets both copy directions observe shutdown instead of aborting the peer task immediately, which reduces abrupt closes and truncated tail data.
158+
- **Limit diagnostics**: when the gateway rejects a connection because the per-session or per-sandbox cap is reached, it now logs the active count and configured limit to make 429s easier to diagnose.
159+
149160
### Command Execution (CLI)
150161

151162
The `sandbox exec` path is identical to interactive connect except:

crates/openshell-cli/src/edge_tunnel.rs

Lines changed: 12 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -126,24 +126,18 @@ async fn handle_connection(tcp_stream: TcpStream, config: &TunnelConfig) -> Resu
126126
let (ws_sink, ws_source) = ws_stream.split();
127127
let (tcp_read, tcp_write) = tokio::io::split(tcp_stream);
128128

129-
// Two tasks: TCP->WS and WS->TCP. Abort the peer task once either
130-
// direction finishes so the connection tears down promptly.
131-
let mut tcp_to_ws = tokio::spawn(copy_tcp_to_ws(tcp_read, ws_sink));
132-
let mut ws_to_tcp = tokio::spawn(copy_ws_to_tcp(ws_source, tcp_write));
133-
134-
tokio::select! {
135-
res = &mut tcp_to_ws => {
136-
if let Err(e) = res {
137-
debug!(error = %e, "tcp->ws task panicked");
138-
}
139-
ws_to_tcp.abort();
140-
}
141-
res = &mut ws_to_tcp => {
142-
if let Err(e) = res {
143-
debug!(error = %e, "ws->tcp task panicked");
144-
}
145-
tcp_to_ws.abort();
146-
}
129+
// Keep both directions alive until each side has observed shutdown. This
130+
// avoids cutting off trailing bytes when one half closes slightly earlier
131+
// than the other.
132+
let tcp_to_ws = tokio::spawn(copy_tcp_to_ws(tcp_read, ws_sink));
133+
let ws_to_tcp = tokio::spawn(copy_ws_to_tcp(ws_source, tcp_write));
134+
135+
let (tcp_to_ws_res, ws_to_tcp_res) = tokio::join!(tcp_to_ws, ws_to_tcp);
136+
if let Err(e) = tcp_to_ws_res {
137+
debug!(error = %e, "tcp->ws task panicked");
138+
}
139+
if let Err(e) = ws_to_tcp_res {
140+
debug!(error = %e, "ws->tcp task panicked");
147141
}
148142

149143
Ok(())

crates/openshell-cli/src/ssh.rs

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ use nix::sys::signal::{SaFlags, SigAction, SigHandler, SigSet, Signal, sigaction
1010
use openshell_core::forward::{
1111
find_ssh_forward_pid, resolve_ssh_gateway, shell_escape, write_forward_pid,
1212
};
13+
use openshell_core::net::enable_tcp_keepalive;
1314
use openshell_core::proto::{CreateSshSessionRequest, GetSandboxRequest};
1415
use owo_colors::OwoColorize;
1516
use rustls::pki_types::ServerName;
@@ -56,6 +57,8 @@ impl Editor {
5657

5758
const SSH_PROXY_CONNECT_TIMEOUT: Duration = Duration::from_secs(10);
5859
const SSH_PROXY_STATUS_TIMEOUT: Duration = Duration::from_secs(10);
60+
const SSH_SERVER_ALIVE_INTERVAL_SECS: u64 = 30;
61+
const SSH_SERVER_ALIVE_COUNT_MAX: u64 = 3;
5962

6063
struct SshSessionConfig {
6164
proxy_command: String,
@@ -145,6 +148,12 @@ fn ssh_base_command(proxy_command: &str) -> Command {
145148
.arg("-o")
146149
.arg("GlobalKnownHostsFile=/dev/null")
147150
.arg("-o")
151+
.arg(format!(
152+
"ServerAliveInterval={SSH_SERVER_ALIVE_INTERVAL_SECS}"
153+
))
154+
.arg("-o")
155+
.arg(format!("ServerAliveCountMax={SSH_SERVER_ALIVE_COUNT_MAX}"))
156+
.arg("-o")
148157
.arg("LogLevel=ERROR");
149158
command
150159
}
@@ -722,8 +731,13 @@ pub async fn sandbox_ssh_proxy(
722731
.await
723732
.map_err(|_| miette::miette!("timed out waiting for gateway CONNECT response"))??;
724733
if status != 200 {
734+
let reason = match status {
735+
401 => " (SSH session expired, was revoked, or is invalid)",
736+
429 => " (too many concurrent SSH connections for this session or sandbox)",
737+
_ => "",
738+
};
725739
return Err(miette::miette!(
726-
"gateway CONNECT failed with status {status}"
740+
"gateway CONNECT failed with status {status}{reason}"
727741
));
728742
}
729743

@@ -776,7 +790,7 @@ fn render_ssh_config(gateway: &str, name: &str) -> String {
776790
let proxy_cmd = format!("{exe} ssh-proxy --gateway-name {gateway} --name {name}");
777791
let host_alias = host_alias(name);
778792
format!(
779-
"Host {host_alias}\n User sandbox\n StrictHostKeyChecking no\n UserKnownHostsFile /dev/null\n GlobalKnownHostsFile /dev/null\n LogLevel ERROR\n ProxyCommand {proxy_cmd}\n"
793+
"Host {host_alias}\n User sandbox\n StrictHostKeyChecking no\n UserKnownHostsFile /dev/null\n GlobalKnownHostsFile /dev/null\n ServerAliveInterval {SSH_SERVER_ALIVE_INTERVAL_SECS}\n ServerAliveCountMax {SSH_SERVER_ALIVE_COUNT_MAX}\n LogLevel ERROR\n ProxyCommand {proxy_cmd}\n"
780794
)
781795
}
782796

@@ -1000,6 +1014,7 @@ async fn connect_gateway(
10001014
.map_err(|_| miette::miette!("timed out connecting to edge tunnel proxy"))?
10011015
.into_diagnostic()?;
10021016
tcp.set_nodelay(true).into_diagnostic()?;
1017+
let _ = enable_tcp_keepalive(&tcp);
10031018
return Ok(Box::new(tcp));
10041019
}
10051020

@@ -1008,6 +1023,7 @@ async fn connect_gateway(
10081023
.map_err(|_| miette::miette!("timed out connecting to SSH gateway"))?
10091024
.into_diagnostic()?;
10101025
tcp.set_nodelay(true).into_diagnostic()?;
1026+
let _ = enable_tcp_keepalive(&tcp);
10111027
if scheme.eq_ignore_ascii_case("https") {
10121028
let materials = require_tls_materials(&format!("https://{host}:{port}"), tls)?;
10131029
let config = build_rustls_config(&materials)?;
@@ -1167,4 +1183,19 @@ mod tests {
11671183
assert!(message.contains("Forwarding port 3000 to sandbox demo"));
11681184
assert!(message.contains("Access at: http://localhost:3000/"));
11691185
}
1186+
1187+
#[test]
1188+
fn ssh_base_command_enables_server_keepalives() {
1189+
let command = ssh_base_command("openshell ssh-proxy");
1190+
let args = command
1191+
.get_args()
1192+
.map(|arg| arg.to_string_lossy().into_owned())
1193+
.collect::<Vec<_>>();
1194+
1195+
assert!(args.contains(&format!(
1196+
"ServerAliveInterval={SSH_SERVER_ALIVE_INTERVAL_SECS}"
1197+
)));
1198+
assert!(args.contains(&format!("ServerAliveCountMax={SSH_SERVER_ALIVE_COUNT_MAX}")));
1199+
}
1200+
}
11701201
}

crates/openshell-core/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ miette = { workspace = true }
1919
serde = { workspace = true }
2020
serde_json = { workspace = true }
2121
url = { workspace = true }
22+
socket2 = { workspace = true }
2223

2324
[build-dependencies]
2425
tonic-build = { workspace = true }

crates/openshell-core/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ pub mod config;
1313
pub mod error;
1414
pub mod forward;
1515
pub mod inference;
16+
pub mod net;
1617
pub mod paths;
1718
pub mod proto;
1819

crates/openshell-core/src/net.rs

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
//! Shared socket configuration helpers.
5+
6+
use socket2::{SockRef, TcpKeepalive};
7+
use std::io;
8+
use std::time::Duration;
9+
10+
/// Idle time before TCP keepalive probes start.
11+
pub const TCP_KEEPALIVE_IDLE: Duration = Duration::from_secs(30);
12+
13+
/// Interval between TCP keepalive probes on supported platforms.
14+
pub const TCP_KEEPALIVE_INTERVAL: Duration = Duration::from_secs(30);
15+
16+
fn default_keepalive() -> TcpKeepalive {
17+
let keepalive = TcpKeepalive::new().with_time(TCP_KEEPALIVE_IDLE);
18+
#[cfg(any(
19+
target_os = "android",
20+
target_os = "dragonfly",
21+
target_os = "freebsd",
22+
target_os = "fuchsia",
23+
target_os = "illumos",
24+
target_os = "ios",
25+
target_os = "visionos",
26+
target_os = "linux",
27+
target_os = "macos",
28+
target_os = "netbsd",
29+
target_os = "tvos",
30+
target_os = "watchos",
31+
target_os = "windows",
32+
target_os = "cygwin",
33+
))]
34+
let keepalive = keepalive.with_interval(TCP_KEEPALIVE_INTERVAL);
35+
keepalive
36+
}
37+
38+
/// Enable aggressive TCP keepalive on a socket.
39+
#[cfg(unix)]
40+
pub fn enable_tcp_keepalive<S>(socket: &S) -> io::Result<()>
41+
where
42+
S: std::os::fd::AsFd,
43+
{
44+
SockRef::from(socket).set_tcp_keepalive(&default_keepalive())
45+
}
46+
47+
/// Enable aggressive TCP keepalive on a socket.
48+
#[cfg(windows)]
49+
pub fn enable_tcp_keepalive<S>(socket: &S) -> io::Result<()>
50+
where
51+
S: std::os::windows::io::AsSocket,
52+
{
53+
SockRef::from(socket).set_tcp_keepalive(&default_keepalive())
54+
}
55+
56+
/// Enable aggressive TCP keepalive on a socket.
57+
#[cfg(not(any(unix, windows)))]
58+
pub fn enable_tcp_keepalive<S>(_socket: &S) -> io::Result<()> {
59+
Ok(())
60+
}

crates/openshell-sandbox/src/ssh.rs

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ use crate::sandbox;
1010
#[cfg(target_os = "linux")]
1111
use crate::{register_managed_child, unregister_managed_child};
1212
use miette::{IntoDiagnostic, Result};
13+
use openshell_core::net::enable_tcp_keepalive;
1314
use nix::pty::{Winsize, openpty};
1415
use nix::unistd::setsid;
1516
use rand_core::OsRng;
@@ -33,6 +34,20 @@ const PREFACE_MAGIC: &str = "NSSH1";
3334
const SSH_HANDSHAKE_SECRET_ENV: &str = "OPENSHELL_SSH_HANDSHAKE_SECRET";
3435

3536
const SSH_PREFACE_TIMEOUT: Duration = Duration::from_secs(10);
37+
const SSH_KEEPALIVE_INTERVAL: Duration = Duration::from_secs(30);
38+
const SSH_KEEPALIVE_MAX_MISSES: usize = 3;
39+
40+
fn build_ssh_server_config(host_key: PrivateKey) -> russh::server::Config {
41+
russh::server::Config {
42+
auth_rejection_time: Duration::from_secs(1),
43+
inactivity_timeout: None,
44+
keepalive_interval: Some(SSH_KEEPALIVE_INTERVAL),
45+
keepalive_max: SSH_KEEPALIVE_MAX_MISSES,
46+
nodelay: true,
47+
keys: vec![host_key],
48+
..Default::default()
49+
}
50+
}
3651

3752
/// A time-bounded set of nonces used to detect replayed NSSH1 handshakes.
3853
/// Each entry records the `Instant` it was inserted; a background reaper task
@@ -52,14 +67,7 @@ async fn ssh_server_init(
5267
)> {
5368
let mut rng = OsRng;
5469
let host_key = PrivateKey::random(&mut rng, Algorithm::Ed25519).into_diagnostic()?;
55-
56-
let mut config = russh::server::Config {
57-
auth_rejection_time: Duration::from_secs(1),
58-
..Default::default()
59-
};
60-
config.keys.push(host_key);
61-
62-
let config = Arc::new(config);
70+
let config = Arc::new(build_ssh_server_config(host_key));
6371
let ca_paths = ca_file_paths.as_ref().map(|p| Arc::new(p.clone()));
6472
let listener = TcpListener::bind(listen_addr).await.into_diagnostic()?;
6573
info!(addr = %listen_addr, "SSH server listening");
@@ -163,6 +171,9 @@ async fn handle_connection(
163171
nonce_cache: &NonceCache,
164172
) -> Result<()> {
165173
info!(peer = %peer, "SSH connection: reading handshake preface");
174+
if let Err(err) = enable_tcp_keepalive(&stream) {
175+
warn!(peer = %peer, error = %err, "SSH connection: failed to enable TCP keepalive");
176+
}
166177
let mut line = String::new();
167178
tokio::time::timeout(SSH_PREFACE_TIMEOUT, read_line(&mut stream, &mut line))
168179
.await
@@ -1466,4 +1477,17 @@ mod tests {
14661477
assert!(!stdout.contains(SSH_HANDSHAKE_SECRET_ENV));
14671478
assert!(stdout.contains("ANTHROPIC_API_KEY=openshell:resolve:env:ANTHROPIC_API_KEY"));
14681479
}
1480+
1481+
#[test]
1482+
fn ssh_server_config_keeps_idle_sessions_alive() {
1483+
let mut rng = OsRng;
1484+
let host_key = PrivateKey::random(&mut rng, Algorithm::Ed25519).unwrap();
1485+
let config = build_ssh_server_config(host_key);
1486+
1487+
assert_eq!(config.inactivity_timeout, None);
1488+
assert_eq!(config.keepalive_interval, Some(SSH_KEEPALIVE_INTERVAL));
1489+
assert_eq!(config.keepalive_max, SSH_KEEPALIVE_MAX_MISSES);
1490+
assert!(config.nodelay);
1491+
assert_eq!(config.keys.len(), 1);
1492+
}
14691493
}

crates/openshell-server/src/ssh_tunnel.rs

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ use http::StatusCode;
88
use hyper::Request;
99
use hyper::upgrade::Upgraded;
1010
use hyper_util::rt::TokioIo;
11+
use openshell_core::net::enable_tcp_keepalive;
1112
use openshell_core::proto::{Sandbox, SandboxPhase, SshSession};
1213
use prost::Message;
1314
use std::net::SocketAddr;
@@ -118,7 +119,7 @@ async fn ssh_connect(
118119
let mut counts = state.ssh_connections_by_token.lock().unwrap();
119120
let count = counts.entry(token.clone()).or_insert(0);
120121
if *count >= MAX_CONNECTIONS_PER_TOKEN {
121-
warn!(token = %token, "SSH tunnel: per-token connection limit reached");
122+
warn!(sandbox_id = %sandbox_id, active_connections = *count, limit = MAX_CONNECTIONS_PER_TOKEN, "SSH tunnel: per-token connection limit reached");
122123
return StatusCode::TOO_MANY_REQUESTS.into_response();
123124
}
124125
*count += 1;
@@ -137,7 +138,7 @@ async fn ssh_connect(
137138
token_counts.remove(&token);
138139
}
139140
}
140-
warn!(sandbox_id = %sandbox_id, "SSH tunnel: per-sandbox connection limit reached");
141+
warn!(sandbox_id = %sandbox_id, active_connections = *count, limit = MAX_CONNECTIONS_PER_SANDBOX, "SSH tunnel: per-sandbox connection limit reached");
141142
return StatusCode::TOO_MANY_REQUESTS.into_response();
142143
}
143144
*count += 1;
@@ -265,6 +266,9 @@ async fn establish_upstream_with_timeouts(
265266
upstream
266267
.set_nodelay(true)
267268
.map_err(|err| TunnelSetupError::Other(err.to_string()))?;
269+
if let Err(err) = enable_tcp_keepalive(&upstream) {
270+
warn!(sandbox_id = %sandbox_id, error = %err, "SSH tunnel: failed to enable upstream TCP keepalive");
271+
}
268272
info!(sandbox_id = %sandbox_id, "SSH tunnel: sending NSSH1 handshake preface");
269273
let preface =
270274
build_preface(token, secret).map_err(|err| TunnelSetupError::Other(err.to_string()))?;
@@ -296,9 +300,26 @@ async fn bridge_tunnel(
296300
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
297301
info!(sandbox_id = %sandbox_id, "SSH tunnel established");
298302
let mut upgraded = TokioIo::new(upgraded);
299-
// Discard the result entirely – connection-close errors are expected when
300-
// the SSH session ends and do not represent a failure worth propagating.
301-
let _ = tokio::io::copy_bidirectional(&mut upgraded, &mut upstream).await;
303+
match tokio::io::copy_bidirectional(&mut upgraded, &mut upstream).await {
304+
Ok((client_to_sandbox, sandbox_to_client)) => {
305+
info!(sandbox_id = %sandbox_id, client_to_sandbox_bytes = client_to_sandbox, sandbox_to_client_bytes = sandbox_to_client, "SSH tunnel closed");
306+
}
307+
Err(err)
308+
if matches!(
309+
err.kind(),
310+
std::io::ErrorKind::BrokenPipe
311+
| std::io::ErrorKind::ConnectionAborted
312+
| std::io::ErrorKind::ConnectionReset
313+
| std::io::ErrorKind::NotConnected
314+
| std::io::ErrorKind::UnexpectedEof
315+
) =>
316+
{
317+
info!(sandbox_id = %sandbox_id, error = %err, "SSH tunnel closed during shutdown");
318+
}
319+
Err(err) => {
320+
warn!(sandbox_id = %sandbox_id, error = %err, "SSH tunnel I/O failure");
321+
}
322+
}
302323
// Gracefully shut down the write-half of the upgraded connection so the
303324
// client receives a clean EOF instead of a TCP RST. This gives SSH time
304325
// to read any remaining protocol data (e.g. exit-status) from its buffer.

0 commit comments

Comments
 (0)