From 3ba3412d2fd16494273f78a2e345b8444717c709 Mon Sep 17 00:00:00 2001 From: Wayland Yang Date: Thu, 28 May 2026 09:33:39 +0800 Subject: [PATCH 1/2] feat(controller): expose branch_concurrency via CLI + env var (#177) DEFAULT_BRANCH_CONCURRENCY = 4 was a recompile-only knob. The right value depends entirely on the operator's disk budget and snapshot-size distribution: on a 2 TB NVMe with 512 MiB snapshots, 4 is wildly conservative; on a 50 GiB EBS with 8 GiB snapshots, 4 is enough to fill the disk. This change plumbs the cap through DaemonConfig: - New `branch_concurrency: Option` field on DaemonConfig. `None` falls back to the existing DEFAULT_BRANCH_CONCURRENCY (4), so no behaviour change for existing users. - New `--branch-concurrency ` CLI flag on `forkd-controller serve`, also readable from the `FORKD_BRANCH_CONCURRENCY` env var (clap pattern matching the existing FORKD_* env-var family). - Validation: `branch_concurrency = 0` bails at daemon startup rather than constructing a zero-permit Semaphore (would deadlock the first BRANCH request). Non-default values are logged at INFO at startup so operators see the override took effect. Reported by @cortsdine in #177. Closes #177. The bonus metrics suggestion (`forkd_branches_in_flight` gauge on /metrics) is left for a follow-up PR to keep this diff focused on the config plumbing. --- crates/forkd-controller/src/lib.rs | 26 +++++++++++++++++++++++--- crates/forkd-controller/src/main.rs | 9 +++++++++ 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/crates/forkd-controller/src/lib.rs b/crates/forkd-controller/src/lib.rs index acb7f47..31c1789 100644 --- a/crates/forkd-controller/src/lib.rs +++ b/crates/forkd-controller/src/lib.rs @@ -52,6 +52,11 @@ pub struct DaemonConfig { /// real disk. Must have enough free space to hold one /// guest-RAM-sized file per concurrent prewarmed child. pub prewarm_scratch_dir: PathBuf, + /// Maximum concurrent BRANCH operations the daemon will admit. Each + /// BRANCH writes a full `memory.bin` (typically 256 MiB - 8 GiB), so + /// the cap bounds peak transient disk usage during fan-outs. `None` + /// falls back to [`http::DEFAULT_BRANCH_CONCURRENCY`]. + pub branch_concurrency: Option, } impl Default for DaemonConfig { @@ -65,6 +70,7 @@ impl Default for DaemonConfig { tls_cert: None, tls_key: None, prewarm_scratch_dir: PathBuf::from("/dev/shm/forkd-prewarm"), + branch_concurrency: None, } } } @@ -104,14 +110,28 @@ pub async fn run_daemon(cfg: DaemonConfig) -> Result<()> { } }; + let branch_concurrency = cfg + .branch_concurrency + .unwrap_or(http::DEFAULT_BRANCH_CONCURRENCY); + if branch_concurrency == 0 { + anyhow::bail!( + "branch_concurrency must be > 0; got 0 (use the default {} if unsure)", + http::DEFAULT_BRANCH_CONCURRENCY + ); + } + if branch_concurrency != http::DEFAULT_BRANCH_CONCURRENCY { + tracing::info!( + branch_concurrency, + default = http::DEFAULT_BRANCH_CONCURRENCY, + "branch concurrency cap overridden" + ); + } let app_state = Arc::new(AppState { registry, live_vms: Mutex::new(HashMap::new()), snapshot_root: cfg.snapshot_root.clone(), branch_in_flight: Mutex::new(std::collections::HashSet::new()), - branch_sem: std::sync::Arc::new(tokio::sync::Semaphore::new( - http::DEFAULT_BRANCH_CONCURRENCY, - )), + branch_sem: std::sync::Arc::new(tokio::sync::Semaphore::new(branch_concurrency)), prewarm_scratch_dir: cfg.prewarm_scratch_dir.clone(), }); diff --git a/crates/forkd-controller/src/main.rs b/crates/forkd-controller/src/main.rs index 4560e17..2938351 100644 --- a/crates/forkd-controller/src/main.rs +++ b/crates/forkd-controller/src/main.rs @@ -69,6 +69,13 @@ enum Cmd { default_value = "/dev/shm/forkd-prewarm" )] prewarm_scratch_dir: PathBuf, + /// Maximum concurrent BRANCH operations the daemon will admit. + /// Each BRANCH writes a full memory.bin (typically 256 MiB - 8 GiB), + /// so this cap bounds peak transient disk usage during fan-outs. + /// Tune higher on hosts with fast NVMe + small snapshots; tune lower + /// on hosts with constrained disk. Defaults to 4. + #[arg(long, env = "FORKD_BRANCH_CONCURRENCY")] + branch_concurrency: Option, }, } @@ -92,6 +99,7 @@ async fn main() -> Result<()> { tls_cert, tls_key, prewarm_scratch_dir, + branch_concurrency, } => { let defaults = DaemonConfig::default(); run_daemon(DaemonConfig { @@ -103,6 +111,7 @@ async fn main() -> Result<()> { tls_cert, tls_key, prewarm_scratch_dir, + branch_concurrency, }) .await } From 66285486748092b73c3fc7fcc2936b0789541b12 Mon Sep 17 00:00:00 2001 From: Wayland Yang Date: Thu, 28 May 2026 09:36:33 +0800 Subject: [PATCH 2/2] test: add branch_concurrency: None to http_integration fixtures --- crates/forkd-controller/tests/http_integration.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/forkd-controller/tests/http_integration.rs b/crates/forkd-controller/tests/http_integration.rs index d6473ed..c4eaec6 100644 --- a/crates/forkd-controller/tests/http_integration.rs +++ b/crates/forkd-controller/tests/http_integration.rs @@ -57,6 +57,7 @@ impl TestDaemon { tls_cert: None, tls_key: None, prewarm_scratch_dir: td.path().join("prewarm"), + branch_concurrency: None, }; // We don't have a clean shutdown hook from outside (run_daemon @@ -196,6 +197,7 @@ impl TlsTestDaemon { tls_cert: Some(cert_path), tls_key: Some(key_path), prewarm_scratch_dir: td.path().join("prewarm"), + branch_concurrency: None, }; let (tx, rx) = tokio::sync::oneshot::channel::<()>();