Skip to content

Commit 2175d02

Browse files
committed
feat(bootstrap): add container memory limit with auto-detection
Add a `--memory` flag to `openshell gateway start` that caps the gateway container's memory via Docker HostConfig. When the flag is unset, the limit is auto-detected as 80% of the memory reported by the Docker daemon (`docker info`), which correctly reflects the Docker Desktop VM's allocated memory on macOS and Windows rather than the full host RAM. With a limit in place, Docker OOM-kills the container instead of letting runaway sandbox growth trigger the host kernel OOM killer.

- parse_memory_limit(): human-readable sizes (80g, 4096m, bytes)
- detect_memory_limit(): async, queries Docker daemon MemTotal
- memory_swap = memory (disables swap inside container)
- OPENSHELL_MEMORY_LIMIT env var supported

Signed-off-by: Brian Taylor <brian.taylor818@gmail.com>
1 parent 5a66c9b commit 2175d02

File tree

5 files changed

+207
-0
lines changed

5 files changed

+207
-0
lines changed

crates/openshell-bootstrap/src/docker.rs

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,76 @@ fn env_bool(key: &str) -> Option<bool> {
4040
})
4141
}
4242

43+
/// Parse a human-readable memory size string into bytes.
44+
///
45+
/// Accepts integers (bytes) or values with `k`/`m`/`g`/`t` suffixes
46+
/// (case-insensitive, with or without a trailing `b`). Binary units
47+
/// (`ki`/`mi`/`gi`/`ti`) are also accepted. Examples: `80g`, `4096m`,
48+
/// `0.5g`, `1073741824`.
49+
///
50+
/// Returns an error if the value is empty, uses an unknown suffix, overflows
51+
/// `i64`, or is below the 4 MiB minimum required by Docker.
52+
pub fn parse_memory_limit(s: &str) -> Result<i64> {
53+
let s = s.trim().to_ascii_lowercase();
54+
if s.is_empty() {
55+
miette::bail!("empty memory limit string");
56+
}
57+
58+
// Split into numeric part and optional suffix.
59+
let (num_str, suffix) = match s.find(|c: char| !c.is_ascii_digit() && c != '.') {
60+
Some(idx) => (&s[..idx], s[idx..].trim_end_matches('b')),
61+
None => (s.as_str(), ""),
62+
};
63+
64+
let value: f64 = num_str
65+
.parse()
66+
.into_diagnostic()
67+
.wrap_err_with(|| format!("invalid numeric part in memory limit: {num_str}"))?;
68+
69+
let multiplier: f64 = match suffix {
70+
"" => 1.0,
71+
"k" | "ki" => 1024.0,
72+
"m" | "mi" => 1024.0 * 1024.0,
73+
"g" | "gi" => 1024.0 * 1024.0 * 1024.0,
74+
"t" | "ti" => 1024.0 * 1024.0 * 1024.0 * 1024.0,
75+
other => miette::bail!("unknown memory suffix: {other}"),
76+
};
77+
78+
let raw = value * multiplier;
79+
if raw > i64::MAX as f64 {
80+
miette::bail!("memory limit too large (exceeds i64::MAX): {s}");
81+
}
82+
#[allow(clippy::cast_possible_truncation)]
83+
let bytes = raw as i64;
84+
85+
// Docker requires at least ~6 MiB; enforce a 4 MiB floor so users get a
86+
// clear error instead of an opaque Docker API rejection.
87+
const MIN_MEMORY_BYTES: i64 = 4 * 1024 * 1024;
88+
if bytes < MIN_MEMORY_BYTES {
89+
miette::bail!("memory limit must be at least 4 MiB, got: {s} ({bytes} bytes)");
90+
}
91+
Ok(bytes)
92+
}
93+
94+
/// Detect a safe memory limit for the gateway container.
95+
///
96+
/// Queries the Docker daemon for `MemTotal` (via `docker info`) and returns
97+
/// 80% of that value. On macOS and Windows the daemon runs inside a Linux VM
98+
/// (Docker Desktop, colima, WSL2), so the reported total reflects the VM's
99+
/// allocated memory rather than the full host RAM.
100+
///
101+
/// Returns `None` if the daemon does not report memory information.
102+
pub async fn detect_memory_limit(docker: &Docker) -> Option<i64> {
103+
let info = docker.info().await.ok()?;
104+
let total_bytes = info.mem_total?;
105+
if total_bytes <= 0 {
106+
return None;
107+
}
108+
#[allow(clippy::cast_possible_truncation)]
109+
let limit = (total_bytes as f64 * 0.8) as i64;
110+
Some(limit)
111+
}
112+
43113
/// Platform information for a Docker daemon host.
44114
#[derive(Debug, Clone)]
45115
pub struct HostPlatform {
@@ -512,6 +582,7 @@ pub async fn ensure_container(
512582
registry_token: Option<&str>,
513583
gpu: bool,
514584
is_remote: bool,
585+
memory_limit: Option<i64>,
515586
) -> Result<()> {
516587
let container_name = container_name(name);
517588

@@ -616,6 +687,15 @@ pub async fn ensure_container(
616687
}]);
617688
}
618689

690+
// Apply memory limit. When set, Docker OOM-kills the container instead of
691+
// letting unchecked sandbox growth trigger the host kernel OOM killer.
692+
// Setting memory_swap equal to memory disables swap inside the container.
693+
if let Some(mem) = memory_limit {
694+
host_config.memory = Some(mem);
695+
host_config.memory_swap = Some(mem);
696+
tracing::info!("Container memory limit: {} MiB", mem / (1024 * 1024),);
697+
}
698+
619699
let mut cmd = vec![
620700
"server".to_string(),
621701
"--disable=traefik".to_string(),
@@ -1352,4 +1432,77 @@ mod tests {
13521432
let input = "nameserver 8.8.8.8\r\nnameserver 1.1.1.1\r\n";
13531433
assert_eq!(parse_resolv_conf(input), vec!["8.8.8.8", "1.1.1.1"]);
13541434
}
1435+
1436+
#[test]
fn parse_memory_limit_gigabytes() {
    // Case and an optional trailing `b` must not affect the result.
    let eighty_gib: i64 = 80 * 1024 * 1024 * 1024;
    for spelling in ["80g", "80G", "80gb"] {
        assert_eq!(parse_memory_limit(spelling).unwrap(), eighty_gib);
    }
}
1442+
1443+
#[test]
fn parse_memory_limit_megabytes() {
    // The `m` unit is case-insensitive.
    let four_gib_in_mib: i64 = 4096 * 1024 * 1024;
    for spelling in ["4096m", "4096M"] {
        assert_eq!(parse_memory_limit(spelling).unwrap(), four_gib_in_mib);
    }
}
1448+
1449+
#[test]
fn parse_memory_limit_bare_bytes() {
    // A number without a unit is taken as a byte count.
    let parsed = parse_memory_limit("1073741824").unwrap();
    assert_eq!(parsed, 1_073_741_824);
}
1453+
1454+
#[test]
fn parse_memory_limit_binary_suffixes() {
    // `gi` and `gib` are synonyms for one gibibyte.
    let one_gib: i64 = 1024 * 1024 * 1024;
    for spelling in ["1gi", "1gib"] {
        assert_eq!(parse_memory_limit(spelling).unwrap(), one_gib);
    }
}
1459+
1460+
#[test]
fn parse_memory_limit_rejects_empty() {
    // An empty string carries no size and must be an error.
    let outcome = parse_memory_limit("");
    assert!(outcome.is_err());
}
1464+
1465+
#[test]
fn parse_memory_limit_rejects_unknown_suffix() {
    // `x` is not a recognised unit.
    let outcome = parse_memory_limit("10x");
    assert!(outcome.is_err());
}
1469+
1470+
#[test]
fn parse_memory_limit_fractional() {
    // Half a gibibyte is 512 MiB.
    let half_gib: i64 = 512 * 1024 * 1024;
    let parsed = parse_memory_limit("0.5g").unwrap();
    assert_eq!(parsed, half_gib);
}
1475+
1476+
#[test]
fn parse_memory_limit_rejects_zero() {
    // Zero bytes falls below the enforced minimum.
    let outcome = parse_memory_limit("0g");
    assert!(outcome.is_err());
}
1480+
1481+
#[test]
fn parse_memory_limit_rejects_negative() {
    // A leading minus sign is not part of a valid size.
    let outcome = parse_memory_limit("-1g");
    assert!(outcome.is_err());
}
1485+
1486+
#[test]
fn parse_memory_limit_rejects_below_minimum() {
    // One KiB is far under the 4 MiB floor.
    let outcome = parse_memory_limit("1k");
    assert!(outcome.is_err());
}
1491+
1492+
#[test]
fn parse_memory_limit_rejects_overflow() {
    // 99999999 TiB is beyond i64::MAX (roughly 9.2 exabytes).
    let outcome = parse_memory_limit("99999999t");
    assert!(outcome.is_err());
}
1497+
1498+
#[test]
fn parse_memory_limit_whitespace() {
    // Leading and trailing spaces are trimmed before parsing.
    let eighty_gib: i64 = 80 * 1024 * 1024 * 1024;
    let parsed = parse_memory_limit(" 80g ").unwrap();
    assert_eq!(parsed, eighty_gib);
}
1505+
1506+
// detect_memory_limit is async and requires a Docker daemon connection,
1507+
// so it is tested via integration / e2e tests rather than unit tests.
13551508
}

crates/openshell-bootstrap/src/lib.rs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ use crate::runtime::{
4646
pub use crate::constants::container_name;
4747
pub use crate::docker::{
4848
DockerPreflight, ExistingGatewayInfo, check_docker_available, create_ssh_docker_client,
49+
detect_memory_limit, parse_memory_limit,
4950
};
5051
pub use crate::metadata::{
5152
GatewayMetadata, clear_active_gateway, extract_host_from_ssh_destination, get_gateway_metadata,
@@ -119,6 +120,11 @@ pub struct DeployOptions {
119120
/// When false, an existing gateway is left as-is and deployment is
120121
/// skipped (the caller is responsible for prompting the user first).
121122
pub recreate: bool,
123+
/// Memory limit for the gateway container in bytes. When set, Docker
124+
/// enforces the ceiling and OOM-kills the container instead of the host
125+
/// kernel OOM-killing unrelated processes. When `None`, auto-detected
126+
/// as 80% of available memory via the Docker daemon.
127+
pub memory_limit: Option<i64>,
122128
}
123129

124130
impl DeployOptions {
@@ -135,6 +141,7 @@ impl DeployOptions {
135141
registry_token: None,
136142
gpu: false,
137143
recreate: false,
144+
memory_limit: None,
138145
}
139146
}
140147

@@ -200,6 +207,13 @@ impl DeployOptions {
200207
self.recreate = recreate;
201208
self
202209
}
210+
211+
/// Set the memory limit for the gateway container in bytes.
212+
#[must_use]
213+
pub fn with_memory_limit(mut self, limit: i64) -> Self {
214+
self.memory_limit = Some(limit);
215+
self
216+
}
203217
}
204218

205219
#[derive(Debug, Clone)]
@@ -264,6 +278,7 @@ where
264278
let registry_token = options.registry_token;
265279
let gpu = options.gpu;
266280
let recreate = options.recreate;
281+
let explicit_memory_limit = options.memory_limit;
267282

268283
// Wrap on_log in Arc<Mutex<>> so we can share it with pull_remote_image
269284
// which needs a 'static callback for the bollard streaming pull.
@@ -288,6 +303,14 @@ where
288303
(preflight.docker, None)
289304
};
290305

306+
// Resolve memory limit: explicit value from CLI, or auto-detect from the
307+
// Docker daemon. On macOS / Windows this correctly reports the Docker
308+
// Desktop VM's memory, not the full host RAM.
309+
let memory_limit = match explicit_memory_limit {
310+
Some(limit) => Some(limit),
311+
None => detect_memory_limit(&target_docker).await,
312+
};
313+
291314
// If an existing gateway is found, either tear it down (when recreate is
292315
// requested) or bail out so the caller can prompt the user / reuse it.
293316
if let Some(existing) = check_existing_gateway(&target_docker, &name).await? {
@@ -418,6 +441,7 @@ where
418441
registry_token.as_deref(),
419442
gpu,
420443
remote_opts.is_some(),
444+
memory_limit,
421445
)
422446
.await?;
423447
start_container(&target_docker, &name).await?;

crates/openshell-cli/src/bootstrap.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,15 @@ pub async fn run_bootstrap(
179179
options = options.with_gateway_host(host);
180180
}
181181
options = options.with_gpu(gpu);
182+
// Read memory limit override from environment. The explicit `--memory`
183+
// flag is only on `gateway start`; this env var covers the auto-bootstrap
184+
// path triggered by `sandbox create`.
185+
if let Ok(mem_str) = std::env::var("OPENSHELL_MEMORY_LIMIT")
186+
&& !mem_str.trim().is_empty()
187+
{
188+
let limit = openshell_bootstrap::parse_memory_limit(&mem_str)?;
189+
options = options.with_memory_limit(limit);
190+
}
182191

183192
let handle = deploy_gateway_with_panel(options, &gateway_name, location).await?;
184193
let server = handle.gateway_endpoint().to_string();

crates/openshell-cli/src/main.rs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -809,6 +809,17 @@ enum GatewayCommands {
809809
/// NVIDIA Container Toolkit on the host.
810810
#[arg(long)]
811811
gpu: bool,
812+
813+
/// Memory limit for the gateway container.
814+
///
815+
/// Accepts human-readable sizes: `80g`, `4096m`, `1073741824` (bytes).
816+
/// When unset, defaults to 80% of available memory (auto-detected via
817+
/// the Docker daemon). On macOS and Windows this reflects the Docker
818+
/// Desktop VM's allocated memory, not the full host RAM. Docker
819+
/// OOM-kills the container if it exceeds this limit, preventing
820+
/// runaway sandbox growth from triggering the host kernel OOM killer.
821+
#[arg(long, env = "OPENSHELL_MEMORY_LIMIT")]
822+
memory: Option<String>,
812823
},
813824

814825
/// Stop the gateway (preserves state).
@@ -1561,7 +1572,12 @@ async fn main() -> Result<()> {
15611572
registry_username,
15621573
registry_token,
15631574
gpu,
1575+
memory,
15641576
} => {
1577+
let memory_limit = memory
1578+
.as_deref()
1579+
.map(openshell_bootstrap::parse_memory_limit)
1580+
.transpose()?;
15651581
run::gateway_admin_deploy(
15661582
&name,
15671583
remote.as_deref(),
@@ -1574,6 +1590,7 @@ async fn main() -> Result<()> {
15741590
registry_username.as_deref(),
15751591
registry_token.as_deref(),
15761592
gpu,
1593+
memory_limit,
15771594
)
15781595
.await?;
15791596
}

crates/openshell-cli/src/run.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1356,6 +1356,7 @@ pub async fn gateway_admin_deploy(
13561356
registry_username: Option<&str>,
13571357
registry_token: Option<&str>,
13581358
gpu: bool,
1359+
memory_limit: Option<i64>,
13591360
) -> Result<()> {
13601361
let location = if remote.is_some() { "remote" } else { "local" };
13611362

@@ -1421,6 +1422,9 @@ pub async fn gateway_admin_deploy(
14211422
.with_disable_gateway_auth(disable_gateway_auth)
14221423
.with_gpu(gpu)
14231424
.with_recreate(should_recreate);
1425+
if let Some(mem) = memory_limit {
1426+
options = options.with_memory_limit(mem);
1427+
}
14241428
if let Some(opts) = remote_opts {
14251429
options = options.with_remote(opts);
14261430
}

0 commit comments

Comments
 (0)