From 0e70c4efd96e8953e93cc41ee428dfc71e4acc12 Mon Sep 17 00:00:00 2001 From: Wayland Yang Date: Fri, 29 May 2026 11:45:29 +0800 Subject: [PATCH 1/3] =?UTF-8?q?feat(vmm):=20v0.4=20Phase=205b=20=E2=80=94?= =?UTF-8?q?=20MemoryBackend::MemfdShared=20via=20restore=5Fmany=5Fwith?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires the memfd helper from #186 into the existing parallel-restore path. When ForkOpts::memory_backend == MemoryBackend::MemfdShared, each restored child gets its own memfd (created from the snapshot's memory.bin via forkd_vmm::memfd::create_and_populate) and the JSON PUT to /snapshot/load uses a /proc/self/fd/N backend path with mem_backend.shared: true. The patched Firecracker at deeplethe/firecracker:forkd-v0.4-mem-backend-shared honours the new shared flag and mmaps with MAP_SHARED; forkd-controller can then arm UFFDIO_WRITEPROTECT on the same backing in Phase 6 to capture dirty pages asynchronously. Changes: - MemoryBackend gains a MemfdShared variant. Docstring spells out the patched-FC dependency loudly enough that an operator running vanilla FC + setting MemfdShared knows they're on the wrong path. - Vm gains a public memfd field: an Option wrapping the region returned by memfd::create_and_populate. Held for the VM's lifetime; Drop closes the fd and the kernel reclaims pages. - restore_many_with's pre-flight check now permits both File and MemfdShared; Userfault still bails (still v0.3 scaffolding). - After spawn, a Phase 1.5 loop creates per-child memfds when MemfdShared. Failure surfaces with the child index + the source path in the error chain — same Drop-on-error guarantee as the helper module. - The single shared JSON body is replaced with a per-child Vec<String> so each child's PUT can reference its own /proc/self/fd/N path without sharing state. Not in this PR: - Fresh-boot (non-snapshot) memfd backing in Vm::boot. v0.4's live-fork only needs the restore path; that's the only one wired. - `forkd doctor` check that the runtime FC binary supports the shared flag (Phase 8). 
Refs deeplethe/forkd#101, deeplethe/firecracker#1. --- crates/forkd-vmm/src/lib.rs | 98 ++++++++++++++++++++++++++++++------- 1 file changed, 80 insertions(+), 18 deletions(-) diff --git a/crates/forkd-vmm/src/lib.rs b/crates/forkd-vmm/src/lib.rs index 2917372..b24bfea 100644 --- a/crates/forkd-vmm/src/lib.rs +++ b/crates/forkd-vmm/src/lib.rs @@ -228,6 +228,12 @@ pub struct Vm { /// cgroup v2 directory this VM's Firecracker process was placed in, /// if `ForkOpts::memory_limit_mib` was set. Removed on Drop. pub cgroup: Option, + /// memfd holding this VM's restored guest RAM, when spawned under + /// `MemoryBackend::MemfdShared`. Held for the VM's lifetime so the + /// kernel keeps the backing pages alive; closed automatically on + /// Drop. Phase 6's UFFD_WP arming will dup this fd to register a + /// `userfaultfd` against the same VMA. + pub memfd: Option, } impl Vm { @@ -304,6 +310,20 @@ pub enum MemoryBackend { Userfault { handler_sock: PathBuf, }, + /// v0.4 live-fork backing. Each restored child gets its own memfd + /// populated from the snapshot's memory.bin; the patched + /// Firecracker (deeplethe/firecracker:forkd-v0.4-mem-backend-shared, + /// see `docs/VENDORED-FIRECRACKER.md`) mmaps the memfd with + /// `MAP_SHARED` so the controller can later arm + /// `UFFDIO_WRITEPROTECT` on the same backing and capture dirty + /// pages asynchronously — the actual live-fork primitive lands in + /// Phase 6. + /// + /// Requires the patched FC binary at runtime; an unpatched FC + /// silently falls back to `MAP_PRIVATE`, breaking the WP-capture + /// invariant. `forkd doctor` (Phase 8) will check for the patched + /// binary at daemon start. + MemfdShared, } /// Options controlling a fork-many operation. @@ -1044,16 +1064,15 @@ impl Snapshot { /// Same as `restore_many` but with explicit options. 
pub fn restore_many_with(&self, opts: ForkOpts, work_dir: &Path) -> Result { - // v0.3 scaffolding: the Userfault arm is reserved for the live-fork - // design in docs/design/userfaultfd.md but isn't wired up yet. Fail - // loudly so callers know not to rely on it; falling back to File - // would silently give them v0.2 semantics with the wrong perf - // expectations. - if !matches!(opts.memory_backend, MemoryBackend::File) { - bail!( + // v0.3 Userfault scaffolding is intentionally not wired up yet. + // v0.4 MemfdShared (Phase 5b) IS wired below. Anything else + // fails loudly so callers don't silently get File semantics. + match opts.memory_backend { + MemoryBackend::File | MemoryBackend::MemfdShared => {} + MemoryBackend::Userfault { .. } => bail!( "MemoryBackend::Userfault is v0.3 scaffolding and not yet \ implemented — see docs/design/userfaultfd.md for status" - ); + ), } let n = opts.n; std::fs::create_dir_all(work_dir).context("create fork work_dir")?; @@ -1094,6 +1113,7 @@ impl Snapshot { console, netns, cgroup: None, + memfd: None, }); } for c in &children { @@ -1126,21 +1146,63 @@ impl Snapshot { } } + // Phase 1.5 (v0.4 MemfdShared): create one memfd per child, + // populated from the snapshot's memory.bin, before any restore + // request goes out. The memfd holds the FC-visible RAM pages; + // forkd-controller keeps an mmap on the same memfd so Phase 6 + // can arm UFFDIO_WRITEPROTECT on the shared VMA. + if matches!(opts.memory_backend, MemoryBackend::MemfdShared) { + for (i, child) in children.iter_mut().enumerate() { + let region = memfd::create_and_populate( + &self.memory, + &format!("forkd-source-mem-{}", opts.netns_offset + i + 1), + ) + .with_context(|| { + format!( + "create_and_populate memfd from {} for child #{}", + self.memory.display(), + i + 1 + ) + })?; + child.memfd = Some(region); + } + } + // Phase 2: parallel restore via threads. Each thread issues one - // /snapshot/load PUT to its child's API socket. 
+ // /snapshot/load PUT to its child's API socket. Body varies per + // child only under MemfdShared (each child has its own memfd + // path); for the File path, all children share the same JSON. let restore_start = Instant::now(); - let body = serde_json::json!({ - "snapshot_path": &self.vmstate, - "mem_backend": {"backend_path": &self.memory, "backend_type": "File"}, - "enable_diff_snapshots": opts.enable_diff_snapshots, - "resume_vm": true, - }) - .to_string(); + let bodies: Vec = children + .iter() + .map(|c| match &c.memfd { + Some(region) => serde_json::json!({ + "snapshot_path": &self.vmstate, + "mem_backend": { + "backend_path": region.backend_path(), + "backend_type": "File", + "shared": true, + }, + "enable_diff_snapshots": opts.enable_diff_snapshots, + "resume_vm": true, + }) + .to_string(), + None => serde_json::json!({ + "snapshot_path": &self.vmstate, + "mem_backend": { + "backend_path": &self.memory, + "backend_type": "File", + }, + "enable_diff_snapshots": opts.enable_diff_snapshots, + "resume_vm": true, + }) + .to_string(), + }) + .collect(); let mut handles = Vec::with_capacity(n); - for c in &children { + for (c, body) in children.iter().zip(bodies.into_iter()) { let sock = c.sock.clone(); - let body = body.clone(); handles.push(thread::spawn(move || -> Result<()> { api_call(&sock, "PUT", "/snapshot/load", &body) })); From c64059b9836f33fbc2aa3159ce68c468eb9d997b Mon Sep 17 00:00:00 2001 From: Wayland Yang Date: Fri, 29 May 2026 11:52:25 +0800 Subject: [PATCH 2/3] fix(vmm): add memfd: None to Vm::boot constructor (missed in 5b) --- crates/forkd-vmm/src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/forkd-vmm/src/lib.rs b/crates/forkd-vmm/src/lib.rs index b24bfea..79d24a9 100644 --- a/crates/forkd-vmm/src/lib.rs +++ b/crates/forkd-vmm/src/lib.rs @@ -841,6 +841,7 @@ impl Vm { console, netns: None, cgroup: None, + memfd: None, }) } From eddfb876e9d653ff26702b7faa702f688156f816 Mon Sep 17 00:00:00 2001 From: Wayland Yang Date: Fri, 
29 May 2026 11:57:28 +0800 Subject: [PATCH 3/3] style: drop redundant .into_iter() (clippy::useless_conversion) --- crates/forkd-vmm/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/forkd-vmm/src/lib.rs b/crates/forkd-vmm/src/lib.rs index 79d24a9..3509d81 100644 --- a/crates/forkd-vmm/src/lib.rs +++ b/crates/forkd-vmm/src/lib.rs @@ -1202,7 +1202,7 @@ impl Snapshot { .collect(); let mut handles = Vec::with_capacity(n); - for (c, body) in children.iter().zip(bodies.into_iter()) { + for (c, body) in children.iter().zip(bodies) { let sock = c.sock.clone(); handles.push(thread::spawn(move || -> Result<()> { api_call(&sock, "PUT", "/snapshot/load", &body)