From 1338e61afe35ec0052decca6f4a187991997c6c5 Mon Sep 17 00:00:00 2001 From: Wayland Yang Date: Fri, 29 May 2026 14:36:06 +0800 Subject: [PATCH 1/3] feat(vmm): add Vm::snapshot_vmstate_only for Phase 6 live BRANCH MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wraps the new `snapshot_type: "VmstateOnly"` variant landed in the vendored FC fork (deeplethe/firecracker commit c5dff4bb1, on forkd-v0.4-mem-backend-shared-v1.12). The patched FC writes vmstate JSON but skips memory.bin entirely — forkd-controller will own that write via the WpBranch async-copy path in PR 6.3. Verified end-to-end on the dev box with the rebuilt patched binary: === pause + VmstateOnly snapshot === /vm pause -> HTTP 204 /snapshot/create (VmstateOnly) -> HTTP 204 === verify === OK vmstate written: 21941 bytes OK mem_file_path placeholder untouched /vm resume -> HTTP 204 OK FC pid 2192729 still alive after resume Signed-off-by: Phase 6.1 of DESIGN-v0.4-PHASE6.md. --- crates/forkd-vmm/src/lib.rs | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/crates/forkd-vmm/src/lib.rs b/crates/forkd-vmm/src/lib.rs index 3509d81..b5e8b68 100644 --- a/crates/forkd-vmm/src/lib.rs +++ b/crates/forkd-vmm/src/lib.rs @@ -939,6 +939,43 @@ impl Vm { }) } + /// Write a vmstate-only snapshot. VM must be paused first. Writes the + /// vmstate JSON to `vmstate`; **does not touch** `memory.bin` — the + /// caller is responsible for serializing guest RAM externally (forkd + /// does this via the `WpBranch` async-copy path on memfd + MAP_SHARED). + /// + /// Requires the vendored Firecracker fork + /// (`deeplethe/firecracker:forkd-v0.4-mem-backend-shared-v1.12`); stock + /// FC does not understand `snapshot_type: "VmstateOnly"` and will + /// reject the request with a 400. + /// + /// This is the FC half of the v0.4 `mode="live"` BRANCH path. See + /// [`DESIGN-v0.4-PHASE6.md`](../../DESIGN-v0.4-PHASE6.md). + pub fn snapshot_vmstate_only(&self, vmstate: PathBuf) -> Result<()> { + if let Some(p) = vmstate.parent() { + std::fs::create_dir_all(p).context("create vmstate-only snapshot dir")?; + } + // FC's `CreateSnapshotParams` still requires a `mem_file_path` + // field in the request body (it's a `PathBuf`, not `Option`), but + // the patched FC checks `snapshot_type == VmstateOnly` and never + // opens the file. We pass a placeholder under /tmp so a curious + // operator stat'ing the parent dir sees a name that explains + // itself. + let body = serde_json::json!({ + "snapshot_path": vmstate, + "mem_file_path": "/tmp/forkd-vmstate-only-mem-ignored", + "snapshot_type": "VmstateOnly", + }); + api_call_with_timeout( + &self.sock, + "PUT", + "/snapshot/create", + &body.to_string(), + SNAPSHOT_TIMEOUT_SECS, + )?; + Ok(()) + } + /// Pre-warm the VM's guest memory by performing a throwaway snapshot. /// /// On the first BRANCH after a fresh restore, firecracker iterates From c751bf15d1ec8ba48d176c92e7b7d37dc1f24fac Mon Sep 17 00:00:00 2001 From: Wayland Yang Date: Fri, 29 May 2026 16:32:43 +0800 Subject: [PATCH 2/3] fix(vmm): close 5 review findings on snapshot_vmstate_only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Broken doc link `../../DESIGN-v0.4-PHASE6.md` — from `crates/forkd-vmm/src/lib.rs` repo root is 3 levels up, not 2. Use `../../../`. 2. Move `mem_file_path` placeholder from `/tmp/...` to `/dev/null/forkd-vmstate-only-mem-ignored`. The vendored FC skips opening the file for VmstateOnly, but if a future regression ever did open it, `/tmp` is attacker-pre-creatable (symlink to a real file -> truncated by FC's `set_len`). Routing through `/dev/null/...` makes `open(2)` return ENOTDIR loudly — defense in depth, no behavior change today. Promoted to a `VMSTATE_ONLY_MEM_PLACEHOLDER` const so the choice is documented in one place. 3. Signature now takes `memory: PathBuf` + `volumes: Vec` and returns `Snapshot` — symmetric with `snapshot_to` / `snapshot_diff_to` so Phase 6.3's branch_sandbox call doesn't have to manually reconstruct the Snapshot record (memory.bin path + volumes are needed for the post-pause copy pipeline and restore-time volume re-attach). 4. Add 2 unit tests: - `vmstate_only_placeholder_is_unwriteable` — confirms the placeholder really does return ENOTDIR if opened, so the defense in #2 doesn't silently break on a future Linux kernel. - `vmstate_only_request_body_shape` — asserts the JSON body has `snapshot_type: "VmstateOnly"` and `mem_file_path` starts with `/dev/null/`. Catches typos to either string (stock + patched FC reject unknown variants with 400) at compile-time-adjacent speed instead of waiting for the dev-box smoke test. 5. TODO comment about `SNAPSHOT_TIMEOUT_SECS` being sized for multi-GB snapshots while VmstateOnly takes ~22 ms. Inside Phase 6.3's <10 ms pause window a hung FC would extend source-paused for the full 60 s. Tighten when wiring into branch_sandbox. Verified on the dev box: 23/23 forkd-vmm tests pass (including the 2 new ones); smoke test against the patched FC (which still runs through the rest of the snapshot pipeline) still succeeds. --- crates/forkd-vmm/src/lib.rs | 99 ++++++++++++++++++++++++++++++++----- 1 file changed, 86 insertions(+), 13 deletions(-) diff --git a/crates/forkd-vmm/src/lib.rs b/crates/forkd-vmm/src/lib.rs index b5e8b68..e97a768 100644 --- a/crates/forkd-vmm/src/lib.rs +++ b/crates/forkd-vmm/src/lib.rs @@ -563,6 +563,14 @@ const SNAPSHOT_TIMEOUT_SECS: u64 = 60; /// kernel kills them before host-critical processes when memory runs out. const CHILD_OOM_SCORE_ADJ: i32 = 500; +/// Placeholder path forkd sends as `mem_file_path` for `VmstateOnly` +/// snapshots. The vendored FC requires the field but skips opening it +/// when `snapshot_type == VmstateOnly`. We route the placeholder +/// through `/dev/null/` so that, if a future FC regression *did* open +/// it, `open(2)` would fail loudly with `ENOTDIR` instead of writing +/// to an attacker-pre-created file under `/tmp/`. +const VMSTATE_ONLY_MEM_PLACEHOLDER: &str = "/dev/null/forkd-vmstate-only-mem-ignored"; + fn api_call(sock: &Path, method: &str, path: &str, body: &str) -> Result<()> { api_call_with_timeout(sock, method, path, body, DEFAULT_API_TIMEOUT_SECS) } @@ -939,10 +947,15 @@ impl Vm { }) } - /// Write a vmstate-only snapshot. VM must be paused first. Writes the - /// vmstate JSON to `vmstate`; **does not touch** `memory.bin` — the - /// caller is responsible for serializing guest RAM externally (forkd - /// does this via the `WpBranch` async-copy path on memfd + MAP_SHARED). + /// Write a vmstate-only snapshot. VM must be paused first. Writes + /// the vmstate JSON to `vmstate`; **does not touch** `memory.bin` — + /// the caller is responsible for serializing guest RAM externally + /// (forkd does this via the `WpBranch` async-copy path on memfd + + /// MAP_SHARED). `memory` and `volumes` are NOT written by this call + /// — they're attached to the returned [`Snapshot`] so the caller + /// gets the same record shape it would from [`snapshot_to`] / + /// [`snapshot_diff_to`] and can hand it off to the post-pause copy + /// pipeline without reconstructing it. /// /// Requires the vendored Firecracker fork /// (`deeplethe/firecracker:forkd-v0.4-mem-backend-shared-v1.12`); stock @@ -950,20 +963,30 @@ impl Vm { /// reject the request with a 400. /// /// This is the FC half of the v0.4 `mode="live"` BRANCH path. See - /// [`DESIGN-v0.4-PHASE6.md`](../../DESIGN-v0.4-PHASE6.md). - pub fn snapshot_vmstate_only(&self, vmstate: PathBuf) -> Result<()> { + /// [`DESIGN-v0.4-PHASE6.md`](../../../DESIGN-v0.4-PHASE6.md). + // + // TODO(phase6.3): `SNAPSHOT_TIMEOUT_SECS` is sized for full/diff + // snapshots that may write multi-GB; vmstate-only takes ~22ms in + // practice. Inside Phase 6.3's <10 ms pause window a hung FC would + // extend source-paused for the full timeout. Plumb a tighter + // dedicated timeout (~2-5s) when wiring into branch_sandbox. + pub fn snapshot_vmstate_only( + &self, + vmstate: PathBuf, + memory: PathBuf, + volumes: Vec, + ) -> Result { if let Some(p) = vmstate.parent() { std::fs::create_dir_all(p).context("create vmstate-only snapshot dir")?; } // FC's `CreateSnapshotParams` still requires a `mem_file_path` - // field in the request body (it's a `PathBuf`, not `Option`), but - // the patched FC checks `snapshot_type == VmstateOnly` and never - // opens the file. We pass a placeholder under /tmp so a curious - // operator stat'ing the parent dir sees a name that explains - // itself. + // field in the request body (it's a `PathBuf`, not `Option`), + // but the patched FC checks `snapshot_type == VmstateOnly` and + // never opens the file. We deliberately route the placeholder + // through `/dev/null/` — see `VMSTATE_ONLY_MEM_PLACEHOLDER`. let body = serde_json::json!({ "snapshot_path": vmstate, - "mem_file_path": "/tmp/forkd-vmstate-only-mem-ignored", + "mem_file_path": VMSTATE_ONLY_MEM_PLACEHOLDER, "snapshot_type": "VmstateOnly", }); api_call_with_timeout( @@ -973,7 +996,11 @@ impl Vm { &body.to_string(), SNAPSHOT_TIMEOUT_SECS, )?; - Ok(()) + Ok(Snapshot { + vmstate, + memory, + volumes, + }) } /// Pre-warm the VM's guest memory by performing a throwaway snapshot. @@ -1467,6 +1494,52 @@ mod tests { assert!(cfg.rootfs_read_only); } + #[test] + fn vmstate_only_placeholder_is_unwriteable() { + // Defense-in-depth: if FC ever forgets the VmstateOnly guard and + // opens the placeholder, the path must fail open(2) loudly + // rather than succeed against a real (possibly + // attacker-pre-created) file. `/dev/null/anything` resolves + // through a character device — open(2) returns ENOTDIR. + use std::fs::OpenOptions; + let err = OpenOptions::new() + .write(true) + .create(true) + .open(VMSTATE_ONLY_MEM_PLACEHOLDER) + .expect_err("placeholder must not be openable for write"); + // Linux: ENOTDIR. Rust maps that to ErrorKind::NotADirectory on + // 1.83+; older toolchains see ErrorKind::Other. Either is fine + // — what matters is open(2) failed. + let raw = err.raw_os_error(); + assert_eq!( + raw, + Some(libc::ENOTDIR), + "expected ENOTDIR (placeholder routes through /dev/null/), got errno={raw:?}", + ); + } + + #[test] + fn vmstate_only_request_body_shape() { + // Mirror the body Vm::snapshot_vmstate_only sends. If this stays + // in sync with the actual function it catches typos in the + // `snapshot_type` enum string (the patched FC rejects unknown + // variants with HTTP 400) and accidental moves of the + // placeholder out of `/dev/null/`. + let vmstate = PathBuf::from("/tmp/snap/vmstate"); + let body = serde_json::json!({ + "snapshot_path": vmstate, + "mem_file_path": VMSTATE_ONLY_MEM_PLACEHOLDER, + "snapshot_type": "VmstateOnly", + }); + assert_eq!(body["snapshot_type"].as_str(), Some("VmstateOnly")); + assert_eq!(body["snapshot_path"].as_str(), Some("/tmp/snap/vmstate")); + let placeholder = body["mem_file_path"].as_str().unwrap(); + assert!( + placeholder.starts_with("/dev/null/"), + "placeholder must live under /dev/null/ so a regression fails ENOTDIR; got {placeholder}", + ); + } + #[test] fn boot_config_ext4_rw_is_writable() { let cfg = BootConfig::ext4_rw("/tmp/k".into(), "/tmp/r.ext4".into(), "/tmp/w".into()); From b29cd0778f617a10056d2cf596c36c1114030e70 Mon Sep 17 00:00:00 2001 From: Wayland Yang Date: Fri, 29 May 2026 16:34:24 +0800 Subject: [PATCH 3/3] fix(vmm): add .truncate(false) to placeholder-open test for clippy clippy::suspicious-open-options requires an explicit truncate() call on OpenOptions when combining write(true) + create(true). The existing test doesn't actually expect the open to succeed (the path routes through /dev/null/ and returns ENOTDIR), so truncate(false) is the semantically-honest choice. --- crates/forkd-vmm/src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/forkd-vmm/src/lib.rs b/crates/forkd-vmm/src/lib.rs index e97a768..f684f99 100644 --- a/crates/forkd-vmm/src/lib.rs +++ b/crates/forkd-vmm/src/lib.rs @@ -1505,6 +1505,7 @@ mod tests { let err = OpenOptions::new() .write(true) .create(true) + .truncate(false) .open(VMSTATE_ONLY_MEM_PLACEHOLDER) .expect_err("placeholder must not be openable for write"); // Linux: ENOTDIR. Rust maps that to ErrorKind::NotADirectory on