Skip to content

Commit 6d85d59

Browse files
authored
perf: speed up firecracker standby with diff snapshot reuse (#146)
* fix: skip unsupported graceful shutdown wait
  Avoid spending standby time waiting for hypervisors without a shutdown API to exit gracefully when the process will need an immediate kill anyway.
  Made-with: Cursor
* perf: reuse firecracker diff snapshot bases
  Keep a hidden Firecracker diff snapshot base across restores so standby only writes changed pages while preserving the existing standby snapshot semantics exposed to users.
  Made-with: Cursor
* refactor: model snapshot base reuse as a capability
  Express retained snapshot-base behavior through hypervisor capabilities and generic snapshot-base paths so the diff snapshot optimization does not leak Firecracker-specific semantics.
  Made-with: Cursor
* refactor: register hypervisor capabilities by type
  Move static hypervisor capabilities into a type-level registry so snapshot base reuse checks do not depend on starter instances or zero-value client construction.
  Made-with: Cursor
* refactor: model graceful VMM shutdown as a capability
  Use explicit hypervisor capabilities to decide whether standby should attempt a graceful VMM shutdown, keeping unsupported shutdown semantics out of the core control flow.
  Made-with: Cursor
* fix: discard promoted snapshot bases after snapshot errors
  Avoid reusing partially written retained snapshot bases after standby snapshot failures, and lock in the rollback behavior with a regression test.
  Made-with: Cursor
* refactor: simplify standby shutdown fallback handling
  Skip the graceful-exit wait when a hypervisor cannot shut down its VMM cleanly, and log best-effort resume failures so standby recovery errors are visible.
  Made-with: Cursor
* refactor: clarify standby shutdown control flow
  Restore the structural shutdown comments and split the graceful-exit decision into explicit branches so unsupported shutdown paths are easier to follow.
  Made-with: Cursor
* fix: fall back to full firecracker snapshots without a base
  Use full Firecracker snapshots when the retained memory base is missing so first-time standby and post-failure retries do not produce incomplete diff snapshots.
  Made-with: Cursor
* test: cover firecracker full and diff standby cycles
  Exercise Firecracker standby/restore with guest file persistence across the initial full snapshot cycle and a subsequent retained-base diff snapshot cycle, and log the observed timings in the integration test.
  Made-with: Cursor
* refactor: align retained base cleanup and kill helpers
  Rename the standby-only SIGKILL helper to avoid colliding with the stop-path implementation, and only clean up retained snapshot bases on stop for hypervisors that actually support snapshot base reuse.
  Made-with: Cursor
* test: log firecracker standby timings without strict thresholds
  Keep the full-versus-diff standby timing data in the Firecracker integration test while avoiding hard CI assertions on runner-dependent latency.
  Made-with: Cursor
* test: relax TAP operstate assertions
  Keep the network integration checks focused on TAP existence, bridge attachment, and end-to-end connectivity instead of requiring a specific oper state that varies on the shared runner.
  Made-with: Cursor
* fix: clear stale snapshot targets before retrying standby
  Discard leftover snapshot-latest scratch state before promoting a retained base so failed Firecracker standby attempts can retry cleanly instead of getting stuck on a stale diff snapshot target.
  Made-with: Cursor
1 parent c95fad2 commit 6d85d59

19 files changed

Lines changed: 405 additions & 75 deletions

lib/hypervisor/cloudhypervisor/cloudhypervisor.go

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,19 @@ var _ hypervisor.Hypervisor = (*CloudHypervisor)(nil)
3232

3333
// Capabilities returns the features supported by Cloud Hypervisor.
// It delegates to the package-level capabilities helper and reads no state
// from the receiver.
3434
func (c *CloudHypervisor) Capabilities() hypervisor.Capabilities {
35+
return capabilities()
36+
}
37+
38+
// capabilities returns the static feature set for Cloud Hypervisor.
// It takes no receiver, so it can be called without a client instance; the
// Capabilities method returns its result unchanged. This commit adds two
// fields: graceful VMM shutdown is reported as supported and snapshot base
// reuse as unsupported.
func capabilities() hypervisor.Capabilities {
3539
return hypervisor.Capabilities{
36-
SupportsSnapshot: true,
37-
SupportsHotplugMemory: true,
38-
SupportsPause: true,
39-
SupportsVsock: true,
40-
SupportsGPUPassthrough: true,
41-
SupportsDiskIOLimit: true,
40+
SupportsSnapshot: true,
41+
SupportsHotplugMemory: true,
42+
SupportsPause: true,
43+
SupportsVsock: true,
44+
SupportsGPUPassthrough: true,
45+
SupportsDiskIOLimit: true,
46+
SupportsGracefulVMMShutdown: true,
47+
SupportsSnapshotBaseReuse: false,
4248
}
4349
}
4450

lib/hypervisor/cloudhypervisor/process.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515

1616
func init() {
1717
hypervisor.RegisterSocketName(hypervisor.TypeCloudHypervisor, "ch.sock")
18+
hypervisor.RegisterCapabilities(hypervisor.TypeCloudHypervisor, capabilities())
1819
hypervisor.RegisterClientFactory(hypervisor.TypeCloudHypervisor, func(socketPath string) (hypervisor.Hypervisor, error) {
1920
return New(socketPath)
2021
})

lib/hypervisor/firecracker/config.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,10 +200,15 @@ func toRateLimiter(limit int64, burst int64) *rateLimiter {
200200
}
201201

202202
// toSnapshotCreateParams builds the snapshot-create request for snapshotDir.
// The memory and state paths are derived from snapshotDir via
// snapshotMemoryPath and snapshotStatePath. The snapshot type defaults to
// "Full" and becomes "Diff" when a stat of the memory path succeeds.
//
// NOTE(review): this function touches the filesystem, so its result depends
// on what exists under snapshotDir at call time.
func toSnapshotCreateParams(snapshotDir string) snapshotCreateParams {
203+
snapshotType := "Full"
204+
// Any stat error (not only "does not exist") keeps the "Full" default.
// NOTE(review): any successful stat selects "Diff", including a directory
// or an empty file at the memory path — confirm callers guarantee that only
// a valid retained base can be present here.
if _, err := os.Stat(snapshotMemoryPath(snapshotDir)); err == nil {
205+
snapshotType = "Diff"
206+
}
207+
203208
return snapshotCreateParams{
204209
MemFilePath: snapshotMemoryPath(snapshotDir),
205210
SnapshotPath: snapshotStatePath(snapshotDir),
206-
SnapshotType: "Full",
211+
SnapshotType: snapshotType,
207212
}
208213
}
209214

lib/hypervisor/firecracker/config_test.go

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
package firecracker
22

33
import (
4+
"os"
5+
"path/filepath"
46
"testing"
57

68
"github.com/kernel/hypeman/lib/hypervisor"
@@ -59,10 +61,24 @@ func TestToNetworkInterfaces(t *testing.T) {
5961
}
6062

6163
func TestSnapshotParamPaths(t *testing.T) {
62-
create := toSnapshotCreateParams("/tmp/snapshot-latest")
63-
assert.Equal(t, "/tmp/snapshot-latest/state", create.SnapshotPath)
64-
assert.Equal(t, "/tmp/snapshot-latest/memory", create.MemFilePath)
65-
assert.Equal(t, "Full", create.SnapshotType)
64+
t.Run("uses full snapshots when no retained base exists", func(t *testing.T) {
65+
snapshotDir := filepath.Join(t.TempDir(), "snapshot-latest")
66+
create := toSnapshotCreateParams(snapshotDir)
67+
assert.Equal(t, filepath.Join(snapshotDir, "state"), create.SnapshotPath)
68+
assert.Equal(t, filepath.Join(snapshotDir, "memory"), create.MemFilePath)
69+
assert.Equal(t, "Full", create.SnapshotType)
70+
})
71+
72+
t.Run("uses diff snapshots when retained base memory exists", func(t *testing.T) {
73+
snapshotDir := filepath.Join(t.TempDir(), "snapshot-latest")
74+
require.NoError(t, os.MkdirAll(snapshotDir, 0755))
75+
require.NoError(t, os.WriteFile(filepath.Join(snapshotDir, "memory"), []byte("base"), 0644))
76+
77+
create := toSnapshotCreateParams(snapshotDir)
78+
assert.Equal(t, filepath.Join(snapshotDir, "state"), create.SnapshotPath)
79+
assert.Equal(t, filepath.Join(snapshotDir, "memory"), create.MemFilePath)
80+
assert.Equal(t, "Diff", create.SnapshotType)
81+
})
6682

6783
load := toSnapshotLoadParams("/tmp/snapshot-latest", []networkOverride{
6884
{IfaceID: "eth0", HostDevName: "hype-abc123"},

lib/hypervisor/firecracker/firecracker.go

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -47,13 +47,19 @@ func New(socketPath string) (*Firecracker, error) {
4747
var _ hypervisor.Hypervisor = (*Firecracker)(nil)
4848

4949
// Capabilities returns the features supported by Firecracker.
// It delegates to the package-level capabilities helper and reads no state
// from the receiver.
func (f *Firecracker) Capabilities() hypervisor.Capabilities {
50+
return capabilities()
51+
}
52+
53+
// capabilities returns the static feature set for Firecracker.
// It takes no receiver, so it can be called without a client instance; the
// Capabilities method returns its result unchanged. This commit adds two
// fields: graceful VMM shutdown is reported as unsupported and snapshot base
// reuse as supported.
func capabilities() hypervisor.Capabilities {
5054
return hypervisor.Capabilities{
51-
SupportsSnapshot: true,
52-
SupportsHotplugMemory: false,
53-
SupportsPause: true,
54-
SupportsVsock: true,
55-
SupportsGPUPassthrough: false,
56-
SupportsDiskIOLimit: true,
55+
SupportsSnapshot: true,
56+
SupportsHotplugMemory: false,
57+
SupportsPause: true,
58+
SupportsVsock: true,
59+
SupportsGPUPassthrough: false,
60+
SupportsDiskIOLimit: true,
61+
SupportsGracefulVMMShutdown: false,
62+
SupportsSnapshotBaseReuse: true,
5763
}
5864
}
5965

lib/hypervisor/firecracker/process.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ const (
2525

2626
func init() {
2727
hypervisor.RegisterSocketName(hypervisor.TypeFirecracker, "fc.sock")
28+
hypervisor.RegisterCapabilities(hypervisor.TypeFirecracker, capabilities())
2829
hypervisor.RegisterClientFactory(hypervisor.TypeFirecracker, func(socketPath string) (hypervisor.Hypervisor, error) {
2930
return New(socketPath)
3031
})

lib/hypervisor/hypervisor.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ var socketNames = make(map[Type]string)
4545
// Registered by hypervisor packages when they use socket-based vsock routing.
4646
var vsockSocketNames = make(map[Type]string)
4747

48+
// capabilitiesByType maps hypervisor types to their static capabilities.
49+
// Registered by each hypervisor package's init() function.
50+
var capabilitiesByType = make(map[Type]Capabilities)
51+
4852
// RegisterSocketName registers the socket filename for a hypervisor type.
4953
// Called by each hypervisor implementation's init() function.
5054
func RegisterSocketName(t Type, name string) {
@@ -74,6 +78,17 @@ func VsockSocketNameForType(t Type) string {
7478
return "vsock.sock"
7579
}
7680

81+
// RegisterCapabilities registers static capabilities for a hypervisor type.
// It stores caps in the package-level capabilitiesByType map under t; a later
// call with the same type overwrites the earlier entry.
// NOTE(review): the map write is unguarded — presumably only called from
// package init() functions; confirm no concurrent callers exist.
82+
func RegisterCapabilities(t Type, caps Capabilities) {
83+
capabilitiesByType[t] = caps
84+
}
85+
86+
// CapabilitiesForType returns static capabilities for a hypervisor type.
// The boolean reports whether capabilities were registered for t; when it is
// false, the returned Capabilities is the zero value (every field false).
87+
func CapabilitiesForType(t Type) (Capabilities, bool) {
88+
caps, ok := capabilitiesByType[t]
89+
return caps, ok
90+
}
91+
7792
// VMStarter handles the full VM startup sequence.
7893
// Each hypervisor implements its own startup flow:
7994
// - Cloud Hypervisor: starts process, configures via HTTP API, boots via HTTP API
@@ -197,6 +212,14 @@ type Capabilities struct {
197212

198213
// SupportsDiskIOLimit indicates if disk I/O rate limiting is available
199214
SupportsDiskIOLimit bool
215+
216+
// SupportsGracefulVMMShutdown indicates the hypervisor exposes an API to
217+
// ask the VMM process itself to exit cleanly.
218+
SupportsGracefulVMMShutdown bool
219+
220+
// SupportsSnapshotBaseReuse indicates snapshots can safely reuse a retained
221+
// on-disk base across restore/standby cycles.
222+
SupportsSnapshotBaseReuse bool
200223
}
201224

202225
// VsockDialer provides vsock connectivity to a guest VM.

lib/hypervisor/qemu/process.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ const (
4343

4444
func init() {
4545
hypervisor.RegisterSocketName(hypervisor.TypeQEMU, "qemu.sock")
46+
hypervisor.RegisterCapabilities(hypervisor.TypeQEMU, capabilities())
4647
hypervisor.RegisterClientFactory(hypervisor.TypeQEMU, func(socketPath string) (hypervisor.Hypervisor, error) {
4748
return New(socketPath)
4849
})

lib/hypervisor/qemu/qemu.go

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,13 +37,19 @@ var _ hypervisor.Hypervisor = (*QEMU)(nil)
3737

3838
// Capabilities returns the features supported by QEMU.
// It delegates to the package-level capabilities helper and reads no state
// from the receiver.
3939
func (q *QEMU) Capabilities() hypervisor.Capabilities {
40+
return capabilities()
41+
}
42+
43+
// capabilities returns the static feature set for QEMU.
// It takes no receiver, so it can be called without a client instance; the
// Capabilities method returns its result unchanged. This commit adds two
// fields: graceful VMM shutdown is reported as supported and snapshot base
// reuse as unsupported.
func capabilities() hypervisor.Capabilities {
4044
return hypervisor.Capabilities{
41-
SupportsSnapshot: true, // Uses QMP migrate file:// for snapshot
42-
SupportsHotplugMemory: false, // Not implemented - balloon not configured
43-
SupportsPause: true,
44-
SupportsVsock: true,
45-
SupportsGPUPassthrough: true,
46-
SupportsDiskIOLimit: true,
45+
SupportsSnapshot: true, // Uses QMP migrate file:// for snapshot
46+
SupportsHotplugMemory: false, // Not implemented - balloon not configured
47+
SupportsPause: true,
48+
SupportsVsock: true,
49+
SupportsGPUPassthrough: true,
50+
SupportsDiskIOLimit: true,
51+
SupportsGracefulVMMShutdown: true,
52+
SupportsSnapshotBaseReuse: false,
4753
}
4854
}
4955

lib/hypervisor/vz/client.go

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -71,13 +71,19 @@ type snapshotRequest struct {
7171
}
7272

7373
// Capabilities returns the features reported by the vz client.
// It delegates to the package-level capabilities helper and reads no state
// from the receiver.
func (c *Client) Capabilities() hypervisor.Capabilities {
74+
return capabilities()
75+
}
76+
77+
// capabilities returns the static feature set for the vz client.
// It takes no receiver, so it can be called without a client instance; the
// Capabilities method returns its result unchanged. Snapshot support is
// reported only when runtime.GOARCH is "arm64". This commit adds two fields:
// graceful VMM shutdown is reported as supported and snapshot base reuse as
// unsupported.
func capabilities() hypervisor.Capabilities {
7478
return hypervisor.Capabilities{
75-
SupportsSnapshot: runtime.GOARCH == "arm64",
76-
SupportsHotplugMemory: false,
77-
SupportsPause: true,
78-
SupportsVsock: true,
79-
SupportsGPUPassthrough: false,
80-
SupportsDiskIOLimit: false,
79+
SupportsSnapshot: runtime.GOARCH == "arm64",
80+
SupportsHotplugMemory: false,
81+
SupportsPause: true,
82+
SupportsVsock: true,
83+
SupportsGPUPassthrough: false,
84+
SupportsDiskIOLimit: false,
85+
SupportsGracefulVMMShutdown: true,
86+
SupportsSnapshotBaseReuse: false,
8187
}
8288
}
8389

0 commit comments

Comments
 (0)