From af398a1b2b5d1755ed3a423ed0aa3fd77fc8ef0c Mon Sep 17 00:00:00 2001 From: Panos Mavrikos Date: Thu, 22 Jan 2026 13:27:55 +0000 Subject: [PATCH 1/2] feat(cgroups): add cgroup v2 support with pre-move pattern Implement cgroup v2 management for urunc containers using a split-policy architecture that separates vCPU (workload) threads from I/O (overhead) threads. Key changes: - Introduce pkg/cgroup/manager.go with a Manager handling cgroup lifecycle - Initialize cgroups during container creation via setupCgroups() in create.go - Move the runtime process to the overhead cgroup before exec - Move vCPU threads to the workload cgroup after a 200ms delay in start.go - Add cgroup configuration support in urunc_config.go - Add unit tests for cgroup manager behavior Signed-off-by: Panos Mavrikos --- cmd/urunc/create.go | 56 +++ cmd/urunc/start.go | 12 + go.mod | 1 - pkg/cgroup/manager.go | 531 +++++++++++++++++++++++++ pkg/cgroup/manager_test.go | 327 +++++++++++++++ pkg/unikontainers/unikontainers.go | 24 +- pkg/unikontainers/urunc_config.go | 29 ++ pkg/unikontainers/urunc_config_test.go | 16 +- 8 files changed, 984 insertions(+), 12 deletions(-) create mode 100644 pkg/cgroup/manager.go create mode 100644 pkg/cgroup/manager_test.go diff --git a/cmd/urunc/create.go b/cmd/urunc/create.go index 0124a755..cb81e5aa 100644 --- a/cmd/urunc/create.go +++ b/cmd/urunc/create.go @@ -29,6 +29,7 @@ import ( "github.com/sirupsen/logrus" "github.com/urfave/cli/v3" m "github.com/urunc-dev/urunc/internal/metrics" + "github.com/urunc-dev/urunc/pkg/cgroup" "github.com/urunc-dev/urunc/pkg/unikontainers" "golang.org/x/sys/unix" ) @@ -260,6 +261,16 @@ func createUnikontainer(cmd *cli.Command, uruncCfg *unikontainers.UruncConfig) ( return err } + // Setup cgroups + err = setupCgroups(cmd, unikontainer, containerPid) + if err != nil { + // Clean up on cgroup creation failure + if unikontainer.CgroupMgr != nil { + _ = unikontainer.CgroupMgr.Delete() + } + return fmt.Errorf("failed to setup cgroups: %w", err) + } + // execute CreateRuntime hooks err = unikontainer.ExecuteHooks("CreateRuntime") if err != nil { @@ -279,6 +290,51 @@ func createUnikontainer(cmd *cli.Command, uruncCfg *unikontainers.UruncConfig) ( return err } +// setupCgroups creates and configures cgroups for the container +func setupCgroups(cmd *cli.Command, u *unikontainers.Unikontainer, pid int) error { + // Check if cgroups are disabled + if u.Spec.Linux == nil || u.Spec.Linux.CgroupsPath == "" { + logrus.Debug("Cgroups disabled or no cgroup path specified") + return nil + } + + // Check if systemd cgroup driver is enabled + useSystemd := cmd.Bool("systemd-cgroup") + + // Create cgroup manager config + cgroupCfg := cgroup.Config{ + CgroupPath: u.Spec.Linux.CgroupsPath, + ContainerID: u.State.ID, + Resources: u.Spec.Linux.Resources, + SandboxCgroupOnly: u.UruncCfg.Cgroup.SandboxCgroupOnly, + OverheadPath: u.UruncCfg.Cgroup.OverheadPath, + UseSystemd: useSystemd, + } + + // Create cgroup manager + cgroupMgr, err := cgroup.NewManager(cgroupCfg) + if err != nil { + return fmt.Errorf("failed to create cgroup manager: %w", err) + } + + // Create cgroups and add reexec process + if err := cgroupMgr.Create(context.Background(), u.Spec.Linux.Resources, pid, useSystemd); err != nil { + return fmt.Errorf("failed to create cgroups: %w", err) + } + + // Store manager in unikontainer + u.CgroupMgr = cgroupMgr + + logrus.WithFields(logrus.Fields{ + "cgroup_path": u.Spec.Linux.CgroupsPath, + "sandbox_cgroup_only": u.UruncCfg.Cgroup.SandboxCgroupOnly, + "use_systemd": useSystemd, + "pid": pid, + }).Info("Cgroups created successfully") + + return nil +} + func createReexecCmd(initSock *os.File, logPipe *os.File) *exec.Cmd { selfPath := "/proc/self/exe" reexecCommand := &exec.Cmd{ diff --git a/cmd/urunc/start.go b/cmd/urunc/start.go index a020b547..d82a04e1 100644 --- a/cmd/urunc/start.go +++ b/cmd/urunc/start.go @@ -19,6 +19,7 @@ import ( "errors" "fmt" "os" + "time" "github.com/sirupsen/logrus" "github.com/urfave/cli/v3" @@ -111,5 +112,16 @@ func startUnikontainer(cmd *cli.Command) error { return err } + if unikontainer.CgroupMgr != nil && unikontainer.CgroupMgr.UsingSplitPolicy() { + vmmPid := unikontainer.State.Pid + time.Sleep(200 * time.Millisecond) + + if err := unikontainer.CgroupMgr.MoveVCPUThreads(vmmPid); err != nil { + logrus.WithError(err).Warn("Failed to move vCPU threads to sandbox cgroup") + } else { + logrus.Info("Successfully moved vCPU threads to sandbox cgroup") + } + } + return unikontainer.ExecuteHooks("Poststart") } diff --git a/go.mod b/go.mod index b3923b21..44033047 100644 --- a/go.mod +++ b/go.mod @@ -33,7 +33,6 @@ require ( github.com/Microsoft/go-winio v0.6.2 // indirect github.com/Microsoft/hcsshim v0.13.0 // indirect github.com/cilium/ebpf v0.20.0 // indirect - github.com/containerd/cgroups/v3 v3.1.0 // indirect github.com/containerd/console v1.0.5 // indirect github.com/containerd/containerd/api v1.10.0 // indirect github.com/containerd/continuity v0.4.5 // indirect diff --git a/pkg/cgroup/manager.go b/pkg/cgroup/manager.go new file mode 100644 index 00000000..46152adf --- /dev/null +++ b/pkg/cgroup/manager.go @@ -0,0 +1,531 @@ +// Copyright (c) 2023-2025, Nubificus LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cgroup + +import ( + "context" + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + "time" + + cgroupsv2 "github.com/containerd/cgroups/v3/cgroup2" + "github.com/containerd/cgroups/v3/cgroup2/stats" + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" +) + +var cgroupLog = logrus.WithField("subsystem", "cgroup") + +// Manager handles cgroup lifecycle for urunc containers +type Manager struct { + sandboxCgroup *cgroupsv2.Manager + overheadCgroup *cgroupsv2.Manager + cgroupPath string + overheadPath string + splitPolicy bool + containerID string +} + +// Config holds configuration for cgroup creation +type Config struct { + CgroupPath string + ContainerID string + Resources *specs.LinuxResources + SandboxCgroupOnly bool + OverheadPath string + UseSystemd bool // Whether to use systemd cgroup driver +} + +// NewManager creates a new cgroup manager +func NewManager(cfg Config) (*Manager, error) { + if cfg.CgroupPath == "" { + return nil, fmt.Errorf("cgroup path cannot be empty") + } + + cgroupPath := normalizeCgroupPath(cfg.CgroupPath, cfg.ContainerID) + + m := &Manager{ + cgroupPath: cgroupPath, + overheadPath: cfg.OverheadPath, + splitPolicy: !cfg.SandboxCgroupOnly, + containerID: cfg.ContainerID, + } + + cgroupLog.WithFields(logrus.Fields{ + "cgroup_path": cgroupPath, + "split_policy": m.splitPolicy, + "container_id": cfg.ContainerID, + }).Debug("Creating cgroup manager") + + return m, nil +} + +// Create creates the necessary cgroups +func (m *Manager) Create(ctx context.Context, resources *specs.LinuxResources, pid int, useSystemd bool) error { + // Convert OCI resources to cgroup v2 resources + cgroupResources, err := specToCgroupResources(resources) + if err != nil { + return fmt.Errorf("failed to convert resources: %w", err) + } + + var sandboxMgr *cgroupsv2.Manager + + // Auto-detect systemd path format or use explicit flag + useSystemdDriver := useSystemd || isSystemdPath(m.cgroupPath) + + // Create sandbox cgroup using appropriate method + if useSystemdDriver && isSystemdPath(m.cgroupPath) { + // Parse systemd path format: slice:prefix:name + slice, group, err := parseSystemdPath(m.cgroupPath) + if err != nil { + return fmt.Errorf("failed to parse systemd cgroup path %s: %w", m.cgroupPath, err) + } + + cgroupLog.WithFields(logrus.Fields{ + "slice": slice, + "group": group, + "pid": pid, + }).Debug("Creating systemd cgroup") + + sandboxMgr, err = cgroupsv2.NewSystemd(slice, group, pid, cgroupResources) + if err != nil { + return fmt.Errorf("failed to create systemd sandbox cgroup %s:%s: %w", slice, group, err) + } + } else { + // Use filesystem-based cgroup manager + sandboxMgr, err = cgroupsv2.NewManager( + "/sys/fs/cgroup", + m.cgroupPath, + cgroupResources, + ) + if err != nil { + return fmt.Errorf("failed to create sandbox cgroup at %s: %w", m.cgroupPath, err) + } + + // Add process to sandbox cgroup (NewSystemd already does this) + if err := sandboxMgr.AddProc(uint64(pid)); err != nil { + _ = sandboxMgr.Delete() + return fmt.Errorf("failed to add pid %d to cgroup: %w", pid, err) + } + } + + m.sandboxCgroup = sandboxMgr + + cgroupLog.WithFields(logrus.Fields{ + "path": m.cgroupPath, + "pid": pid, + }).Info("Created sandbox cgroup and added process") + + // If split policy, create overhead cgroup + // Note: We always use filesystem-based cgroup for overhead, even with systemd, + // because it's an internal urunc feature and doesn't need systemd integration + if m.splitPolicy { + overheadPath := filepath.Join(m.overheadPath, m.containerID) + + // Overhead gets minimal resources (no limits) + overheadMgr, err := cgroupsv2.NewManager( + "/sys/fs/cgroup", + overheadPath, + &cgroupsv2.Resources{}, + ) + if err != nil { + _ = m.sandboxCgroup.Delete() + return fmt.Errorf("failed to create overhead cgroup at %s: %w", overheadPath, err) + } + m.overheadCgroup = overheadMgr + + cgroupLog.WithField("path", overheadPath).Info("Created overhead cgroup") + } + + return nil +} + +// MoveVCPUThreads identifies vCPU threads and moves them FROM overhead TO sandbox cgroup. +// This assumes the VMM process is already in the overhead cgroup (moved before exec via MoveToOverhead). +// All threads initially inherit the overhead cgroup, and we selectively move only vCPU threads +// to the sandbox (workload) cgroup. I/O threads stay in overhead. +func (m *Manager) MoveVCPUThreads(vmmPid int) error { + if !m.splitPolicy { + // In sandbox_cgroup_only mode, all threads stay in sandbox cgroup + return nil + } + + cgroupLog.WithField("vmm_pid", vmmPid).Debug("Identifying and moving vCPU threads from overhead to sandbox cgroup") + + // Retry thread detection with exponential backoff + // VMM may not spawn all vCPU threads immediately + var vcpuThreads, ioThreads []int + maxAttempts := 5 + var lastThreadCount int + + for attempt := 0; attempt < maxAttempts; attempt++ { + // Read all threads of the VMM process + threadIDs, err := getProcessThreads(vmmPid) + if err != nil { + cgroupLog.WithError(err).WithField("attempt", attempt+1).Warn("Failed to get threads") + time.Sleep(50 * time.Millisecond * time.Duration(1< 0 || (attempt > 0 && len(threadIDs) == lastThreadCount) { + break + } + + lastThreadCount = len(threadIDs) + + // Wait before retry with exponential backoff + sleepDuration := 50 * time.Millisecond * time.Duration(1< 0 { + return fmt.Errorf("cgroup deletion errors: %v", errs) + } + + return nil +} + +// GetStats returns cgroup statistics +func (m *Manager) GetStats() (*stats.Metrics, error) { + if m.sandboxCgroup == nil { + return nil, fmt.Errorf("sandbox cgroup not initialized") + } + + return m.sandboxCgroup.Stat() +} + +// normalizeCgroupPath handles OCI cgroup path formats +func normalizeCgroupPath(cgroupPath, containerID string) string { + if cgroupPath == "" { + return containerID + } + + // If it starts with /, it's an absolute path + if strings.HasPrefix(cgroupPath, "/") { + return cgroupPath + } + + // Otherwise, it's a relative path + return cgroupPath +} + +// isSystemdPath checks if a cgroup path is in systemd format (slice:prefix:name) +func isSystemdPath(path string) bool { + return strings.Contains(path, ":") +} + +// parseSystemdPath parses a systemd cgroup path format +// Input: "slice:prefix:name" (e.g., "system.slice:docker:containerID") +// Output: slice ("system.slice"), group ("docker-containerID.scope") +func parseSystemdPath(path string) (string, string, error) { + parts := strings.Split(path, ":") + if len(parts) < 2 { + return "", "", fmt.Errorf("invalid systemd path format: %s", path) + } + + slice := parts[0] + + // Construct group name from remaining parts + // For "system.slice:docker:containerID" -> "docker-containerID.scope" + group := strings.Join(parts[1:], "-") + if !strings.HasSuffix(group, ".scope") { + group = group + ".scope" + } + + cgroupLog.WithFields(logrus.Fields{ + "input": path, + "slice": slice, + "group": group, + }).Debug("Parsed systemd cgroup path") + + return slice, group, nil +} + +// getProcessThreads returns all thread IDs for a process +func getProcessThreads(pid int) ([]int, error) { + taskDir := fmt.Sprintf("/proc/%d/task", pid) + entries, err := os.ReadDir(taskDir) + if err != nil { + return nil, err + } + + threads := make([]int, 0, len(entries)) + for _, entry := range entries { + if !entry.IsDir() { + continue + } + + tid, err := strconv.Atoi(entry.Name()) + if err != nil { + continue + } + threads = append(threads, tid) + } + + return threads, nil +} + +// getThreadName reads the thread name from /proc//task//comm +func getThreadName(pid, tid int) (string, error) { + commPath := fmt.Sprintf("/proc/%d/task/%d/comm", pid, tid) + data, err := os.ReadFile(commPath) + if err != nil { + return "", err + } + + return strings.TrimSpace(string(data)), nil +} + +// isVCPUThread determines if a thread is a vCPU thread based on its name +func isVCPUThread(name string) bool { + // QEMU vCPU threads + if strings.HasPrefix(name, "CPU ") || strings.Contains(name, "/KVM") { + return true + } + // Firecracker vCPU threads + if strings.HasPrefix(name, "fc_vcpu") { + return true + } + // Generic vcpu naming + if strings.HasPrefix(name, "vcpu") { + return true + } + + return false +} + +// specToCgroupResources converts OCI resources to cgroup v2 resources +func specToCgroupResources(spec *specs.LinuxResources) (*cgroupsv2.Resources, error) { + if spec == nil { + return &cgroupsv2.Resources{}, nil + } + + res := &cgroupsv2.Resources{} + + // CPU resources + if spec.CPU != nil { + res.CPU = &cgroupsv2.CPU{} + + if spec.CPU.Shares != nil { + weight := sharesToWeight(*spec.CPU.Shares) + res.CPU.Weight = &weight + } + + if spec.CPU.Quota != nil && spec.CPU.Period != nil { + res.CPU.Max = cgroupsv2.NewCPUMax(spec.CPU.Quota, spec.CPU.Period) + } + + if spec.CPU.Cpus != "" { + res.CPU.Cpus = spec.CPU.Cpus + } + + if spec.CPU.Mems != "" { + res.CPU.Mems = spec.CPU.Mems + } + } + + // Memory resources + if spec.Memory != nil { + res.Memory = &cgroupsv2.Memory{} + + if spec.Memory.Limit != nil { + res.Memory.Max = spec.Memory.Limit + } + + if spec.Memory.Swap != nil { + res.Memory.Swap = spec.Memory.Swap + } + + if spec.Memory.Reservation != nil { + res.Memory.Low = spec.Memory.Reservation + } + } + + // I/O resources + if spec.BlockIO != nil { + res.IO = &cgroupsv2.IO{} + + if spec.BlockIO.Weight != nil { + res.IO.BFQ.Weight = uint16(*spec.BlockIO.Weight) + } + } + + // PID resources + if spec.Pids != nil { + res.Pids = &cgroupsv2.Pids{} + + if spec.Pids.Limit > 0 { + res.Pids.Max = spec.Pids.Limit + } + } + + return res, nil +} + +// sharesToWeight converts CPU shares (OCI) to CPU weight (cgroup v2) +// OCI shares range: 2-262144, default 1024 +// cgroup v2 weight range: 1-10000, default 100 +func sharesToWeight(shares uint64) uint64 { + if shares == 0 { + return 100 // default weight + } + + // Convert shares to weight + // Formula: weight = (shares * 100) / 1024 + weight := (shares * 100) / 1024 + + if weight < 1 { + weight = 1 + } + if weight > 10000 { + weight = 10000 + } + + return weight +} diff --git a/pkg/cgroup/manager_test.go b/pkg/cgroup/manager_test.go new file mode 100644 index 00000000..d5e81e09 --- /dev/null +++ b/pkg/cgroup/manager_test.go @@ -0,0 +1,327 @@ +// Copyright (c) 2023-2025, Nubificus LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cgroup + +import ( + "testing" + + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +func TestNewManager(t *testing.T) { + tests := []struct { + name string + cfg Config + wantErr bool + }{ + { + name: "valid config - sandbox only", + cfg: Config{ + CgroupPath: "/test/cgroup", + ContainerID: "test123", + SandboxCgroupOnly: true, + OverheadPath: "/urunc_overhead", + }, + wantErr: false, + }, + { + name: "valid config - split policy", + cfg: Config{ + CgroupPath: "/test/cgroup", + ContainerID: "test456", + SandboxCgroupOnly: false, + OverheadPath: "/urunc_overhead", + }, + wantErr: false, + }, + { + name: "empty cgroup path", + cfg: Config{ + CgroupPath: "", + ContainerID: "test789", + }, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + mgr, err := NewManager(tt.cfg) + if (err != nil) != tt.wantErr { + t.Errorf("NewManager() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !tt.wantErr && mgr == nil { + t.Error("NewManager() returned nil manager") + } + }) + } +} + +func TestSharesToWeight(t *testing.T) { + tests := []struct { + name string + shares uint64 + want uint64 + }{ + { + name: "default shares (1024)", + shares: 1024, + want: 100, + }, + { + name: "minimum shares (2)", + shares: 2, + want: 1, // (2 * 100) / 1024 = 0.195 -> clamped to 1 + }, + { + name: "maximum shares (262144)", + shares: 262144, + want: 10000, // (262144 * 100) / 1024 = 25600 -> clamped to 10000 + }, + { + name: "zero shares", + shares: 0, + want: 100, // default + }, + { + name: "half default (512)", + shares: 512, + want: 50, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := sharesToWeight(tt.shares) + if got != tt.want { + t.Errorf("sharesToWeight(%d) = %d, want %d", tt.shares, got, tt.want) + } + }) + } +} + +func TestNormalizeCgroupPath(t *testing.T) { + tests := []struct { + name string + cgroupPath string + containerID string + want string + }{ + { + name: "absolute path", + cgroupPath: "/kubepods/pod123/container456", + containerID: "container456", + want: "/kubepods/pod123/container456", + }, + { + name: "relative path", + cgroupPath: "kubepods/pod123/container456", + containerID: "container456", + want: "kubepods/pod123/container456", + }, + { + name: "empty path uses container ID", + cgroupPath: "", + containerID: "container789", + want: "container789", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := normalizeCgroupPath(tt.cgroupPath, tt.containerID) + if got != tt.want { + t.Errorf("normalizeCgroupPath(%q, %q) = %q, want %q", + tt.cgroupPath, tt.containerID, got, tt.want) + } + }) + } +} + +func TestIsVCPUThread(t *testing.T) { + tests := []struct { + name string + threadName string + want bool + }{ + { + name: "QEMU vCPU thread with KVM", + threadName: "CPU 0/KVM", + want: true, + }, + { + name: "QEMU vCPU thread simple", + threadName: "CPU 1/KVM", + want: true, + }, + { + name: "generic vcpu thread", + threadName: "vcpu0", + want: true, + }, + { + name: "Firecracker vCPU thread", + threadName: "fc_vcpu0", + want: true, + }, + { + name: "Firecracker vCPU thread 2", + threadName: "fc_vcpu1", + want: true, + }, + { + name: "I/O thread", + threadName: "IO 0", + want: false, + }, + { + name: "main thread", + threadName: "qemu-system-x86", + want: false, + }, + { + name: "worker thread", + threadName: "worker0", + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := isVCPUThread(tt.threadName) + if got != tt.want { + t.Errorf("isVCPUThread(%q) = %v, want %v", tt.threadName, got, tt.want) + } + }) + } +} + +func TestSpecToCgroupResources(t *testing.T) { + // Test CPU shares conversion + shares := uint64(2048) + quota := int64(50000) + period := uint64(100000) + + spec := &specs.LinuxResources{ + CPU: &specs.LinuxCPU{ + Shares: &shares, + Quota: "a, + Period: &period, + Cpus: "0-1", + Mems: "0", + }, + } + + res, err := specToCgroupResources(spec) + if err != nil { + t.Fatalf("specToCgroupResources() error = %v", err) + } + + if res.CPU == nil { + t.Fatal("CPU resources not set") + } + + if res.CPU.Weight == nil { + t.Fatal("CPU weight not set") + } + + expectedWeight := sharesToWeight(shares) + if *res.CPU.Weight != expectedWeight { + t.Errorf("CPU weight = %d, want %d", *res.CPU.Weight, expectedWeight) + } + + if res.CPU.Cpus != "0-1" { + t.Errorf("CPU cpus = %q, want %q", res.CPU.Cpus, "0-1") + } + + if res.CPU.Mems != "0" { + t.Errorf("CPU mems = %q, want %q", res.CPU.Mems, "0") + } +} + +func TestSpecToCgroupResources_Memory(t *testing.T) { + limit := int64(536870912) // 512MB + swap := int64(1073741824) // 1GB + reservation := int64(268435456) // 256MB + + spec := &specs.LinuxResources{ + Memory: &specs.LinuxMemory{ + Limit: &limit, + Swap: &swap, + Reservation: &reservation, + }, + } + + res, err := specToCgroupResources(spec) + if err != nil { + t.Fatalf("specToCgroupResources() error = %v", err) + } + + if res.Memory == nil { + t.Fatal("Memory resources not set") + } + + if res.Memory.Max == nil || *res.Memory.Max != limit { + t.Errorf("Memory max = %v, want %d", res.Memory.Max, limit) + } + + if res.Memory.Swap == nil || *res.Memory.Swap != swap { + t.Errorf("Memory swap = %v, want %d", res.Memory.Swap, swap) + } + + if res.Memory.Low == nil || *res.Memory.Low != reservation { + t.Errorf("Memory low = %v, want %d", res.Memory.Low, reservation) + } +} + +func TestSpecToCgroupResources_Pids(t *testing.T) { + pidsLimit := int64(1024) + + spec := &specs.LinuxResources{ + Pids: &specs.LinuxPids{ + Limit: pidsLimit, + }, + } + + res, err := specToCgroupResources(spec) + if err != nil { + t.Fatalf("specToCgroupResources() error = %v", err) + } + + if res.Pids == nil { + t.Fatal("Pids resources not set") + } + + if res.Pids.Max != pidsLimit { + t.Errorf("Pids max = %d, want %d", res.Pids.Max, pidsLimit) + } +} + +func TestSpecToCgroupResources_NilResources(t *testing.T) { + res, err := specToCgroupResources(nil) + if err != nil { + t.Fatalf("specToCgroupResources(nil) error = %v", err) + } + + if res == nil { + t.Fatal("Expected non-nil result for nil input") + } + + // All fields should be nil/empty + if res.CPU != nil || res.Memory != nil || res.Pids != nil || res.IO != nil { + t.Error("Expected all resource fields to be nil for nil input") + } +} diff --git a/pkg/unikontainers/unikontainers.go b/pkg/unikontainers/unikontainers.go index 72006710..5941343b 100644 --- a/pkg/unikontainers/unikontainers.go +++ b/pkg/unikontainers/unikontainers.go @@ -29,6 +29,7 @@ import ( "sync" "syscall" + "github.com/urunc-dev/urunc/pkg/cgroup" "github.com/urunc-dev/urunc/pkg/network" "github.com/urunc-dev/urunc/pkg/unikontainers/hypervisors" "github.com/urunc-dev/urunc/pkg/unikontainers/initrd" @@ -55,13 +56,14 @@ var ErrNotExistingNS = errors.New("the namespace does not exist") // Unikontainer holds the data necessary to create, manage and delete unikernel containers type Unikontainer struct { - State *specs.State - Spec *specs.Spec - BaseDir string - RootDir string - UruncCfg *UruncConfig - Listener *net.UnixListener - Conn *net.UnixConn + State *specs.State + Spec *specs.Spec + BaseDir string + RootDir string + UruncCfg *UruncConfig + Listener *net.UnixListener + Conn *net.UnixConn + CgroupMgr *cgroup.Manager } // New parses the bundle and creates a new Unikontainer object @@ -603,6 +605,14 @@ func (u *Unikontainer) Delete() error { return fmt.Errorf("cannot delete running container: %s", u.State.ID) } + // Delete cgroups + if u.CgroupMgr != nil { + if err := u.CgroupMgr.Delete(); err != nil { + uniklog.WithError(err).Error("Failed to delete cgroups") + // Don't fail delete - just log + } + } + // get a monitor instance of the running monitor vmmType := u.State.Annotations[annotHypervisor] vmm, err := hypervisors.NewVMM(hypervisors.VmmType(vmmType), u.UruncCfg.Monitors) diff --git a/pkg/unikontainers/urunc_config.go b/pkg/unikontainers/urunc_config.go index 87164504..dfd41a54 100644 --- a/pkg/unikontainers/urunc_config.go +++ b/pkg/unikontainers/urunc_config.go @@ -34,9 +34,15 @@ type UruncTimestamps struct { Destination string `toml:"destination"` // Used to specify a file for timestamps } +type UruncCgroup struct { + SandboxCgroupOnly bool `toml:"sandbox_cgroup_only"` + OverheadPath string `toml:"overhead_path"` +} + type UruncConfig struct { Log UruncLog `toml:"log"` Timestamps UruncTimestamps `toml:"timestamps"` + Cgroup UruncCgroup `toml:"cgroup"` Monitors map[string]types.MonitorConfig `toml:"monitors"` ExtraBins map[string]types.ExtraBinConfig `toml:"extra_binaries"` } @@ -78,6 +84,13 @@ func defaultTimestampsConfig() UruncTimestamps { } } +func defaultCgroupConfig() UruncCgroup { + return UruncCgroup{ + SandboxCgroupOnly: true, + OverheadPath: "/urunc_overhead", + } +} + func defaultMonitorsConfig() map[string]types.MonitorConfig { return map[string]types.MonitorConfig{ "qemu": {DefaultMemoryMB: 256, DefaultVCPUs: 1}, @@ -98,6 +111,7 @@ func defaultUruncConfig() *UruncConfig { return &UruncConfig{ Log: defaultLogConfig(), Timestamps: defaultTimestampsConfig(), + Cgroup: defaultCgroupConfig(), Monitors: defaultMonitorsConfig(), ExtraBins: defaultExtraBinConfig(), } @@ -120,6 +134,10 @@ func (p *UruncConfig) Map() map[string]string { // them to this map. this map will be used to save the rest of the urunc config to state.json cfgMap := make(map[string]string) + // Cgroup config + cfgMap["urunc_config.cgroup.sandbox_cgroup_only"] = strconv.FormatBool(p.Cgroup.SandboxCgroupOnly) + cfgMap["urunc_config.cgroup.overhead_path"] = p.Cgroup.OverheadPath + for hv, hvCfg := range p.Monitors { prefix := "urunc_config.monitors." + hv + "." cfgMap[prefix+"default_memory_mb"] = strconv.FormatUint(uint64(hvCfg.DefaultMemoryMB), 10) @@ -140,10 +158,21 @@ func UruncConfigFromMap(cfgMap map[string]string) *UruncConfig { // since log and timestamps are loaded at the start of urunc, we will not be reading // them from this map. this map will be used to parse the rest of the urunc config from state.json cfg := &UruncConfig{ + Cgroup: defaultCgroupConfig(), Monitors: defaultMonitorsConfig(), ExtraBins: defaultExtraBinConfig(), } + // Parse cgroup config + if val, ok := cfgMap["urunc_config.cgroup.sandbox_cgroup_only"]; ok { + if boolVal, err := strconv.ParseBool(val); err == nil { + cfg.Cgroup.SandboxCgroupOnly = boolVal + } + } + if val, ok := cfgMap["urunc_config.cgroup.overhead_path"]; ok { + cfg.Cgroup.OverheadPath = val + } + for key, val := range cfgMap { if !strings.HasPrefix(key, "urunc_config.monitors.") { continue diff --git a/pkg/unikontainers/urunc_config_test.go b/pkg/unikontainers/urunc_config_test.go index 89328847..86164c8b 100644 --- a/pkg/unikontainers/urunc_config_test.go +++ b/pkg/unikontainers/urunc_config_test.go @@ -423,28 +423,36 @@ func TestUruncConfigMap(t *testing.T) { assert.Equal(t, config.ExtraBins["custom"].Options, cfgMap["urunc_config.extra_binaries.custom.options"]) }) - t.Run("empty monitors map produces empty result", func(t *testing.T) { + t.Run("empty monitors map produces only cgroup config", func(t *testing.T) { t.Parallel() config := &UruncConfig{ + Cgroup: defaultCgroupConfig(), Monitors: map[string]types.MonitorConfig{}, } cfgMap := config.Map() assert.NotNil(t, cfgMap) - assert.Empty(t, cfgMap) + // Only cgroup fields should be present + assert.Len(t, cfgMap, 2) + assert.Contains(t, cfgMap, "urunc_config.cgroup.sandbox_cgroup_only") + assert.Contains(t, cfgMap, "urunc_config.cgroup.overhead_path") }) - t.Run("empty extra binaries map produces empty result", func(t *testing.T) { + t.Run("empty extra binaries map produces only cgroup config", func(t *testing.T) { t.Parallel() config := &UruncConfig{ + Cgroup: defaultCgroupConfig(), ExtraBins: map[string]types.ExtraBinConfig{}, } cfgMap := config.Map() assert.NotNil(t, cfgMap) - assert.Empty(t, cfgMap) + // Only cgroup fields should be present + assert.Len(t, cfgMap, 2) + assert.Contains(t, cfgMap, "urunc_config.cgroup.sandbox_cgroup_only") + assert.Contains(t, cfgMap, "urunc_config.cgroup.overhead_path") }) t.Run("vhost true is serialized correctly", func(t *testing.T) { From 352f367cec320bad11b6032c8eda1d76c30761d6 Mon Sep 17 00:00:00 2001 From: Panos Mavrikos Date: Thu, 26 Feb 2026 23:34:01 +0000 Subject: [PATCH 2/2] feat(cgroups): add cgroup v2 support --- cmd/urunc/create.go | 21 +- cmd/urunc/start.go | 12 - go.mod | 3 +- pkg/cgroup/manager.go | 314 +++---------------------- pkg/cgroup/manager_test.go | 76 +----- pkg/unikontainers/urunc_config.go | 27 +-- pkg/unikontainers/urunc_config_test.go | 14 +- 7 files changed, 57 insertions(+), 410 deletions(-) diff --git a/cmd/urunc/create.go b/cmd/urunc/create.go index cb81e5aa..c79d120b 100644 --- a/cmd/urunc/create.go +++ b/cmd/urunc/create.go @@ -290,7 +290,9 @@ func createUnikontainer(cmd *cli.Command, uruncCfg *unikontainers.UruncConfig) ( return err } -// setupCgroups creates and configures cgroups for the container +// setupCgroups creates and configures cgroups for the container. +// Following Kata Containers' sandbox_cgroup_only approach: +// all processes (VMM, vCPU, I/O) run under the container's cgroup. func setupCgroups(cmd *cli.Command, u *unikontainers.Unikontainer, pid int) error { // Check if cgroups are disabled if u.Spec.Linux == nil || u.Spec.Linux.CgroupsPath == "" { @@ -303,12 +305,10 @@ func setupCgroups(cmd *cli.Command, u *unikontainers.Unikontainer, pid int) erro // Create cgroup manager config cgroupCfg := cgroup.Config{ - CgroupPath: u.Spec.Linux.CgroupsPath, - ContainerID: u.State.ID, - Resources: u.Spec.Linux.Resources, - SandboxCgroupOnly: u.UruncCfg.Cgroup.SandboxCgroupOnly, - OverheadPath: u.UruncCfg.Cgroup.OverheadPath, - UseSystemd: useSystemd, + CgroupPath: u.Spec.Linux.CgroupsPath, + ContainerID: u.State.ID, + Resources: u.Spec.Linux.Resources, + UseSystemd: useSystemd, } // Create cgroup manager @@ -326,10 +326,9 @@ func setupCgroups(cmd *cli.Command, u *unikontainers.Unikontainer, pid int) erro u.CgroupMgr = cgroupMgr logrus.WithFields(logrus.Fields{ - "cgroup_path": u.Spec.Linux.CgroupsPath, - "sandbox_cgroup_only": u.UruncCfg.Cgroup.SandboxCgroupOnly, - "use_systemd": useSystemd, - "pid": pid, + "cgroup_path": u.Spec.Linux.CgroupsPath, + "use_systemd": useSystemd, + "pid": pid, }).Info("Cgroups created successfully") return nil diff --git a/cmd/urunc/start.go b/cmd/urunc/start.go index d82a04e1..a020b547 100644 --- a/cmd/urunc/start.go +++ b/cmd/urunc/start.go @@ -19,7 +19,6 @@ import ( "errors" "fmt" "os" - "time" "github.com/sirupsen/logrus" "github.com/urfave/cli/v3" @@ -112,16 +111,5 @@ func startUnikontainer(cmd *cli.Command) error { return err } - if unikontainer.CgroupMgr != nil && unikontainer.CgroupMgr.UsingSplitPolicy() { - vmmPid := unikontainer.State.Pid - time.Sleep(200 * time.Millisecond) - - if err := unikontainer.CgroupMgr.MoveVCPUThreads(vmmPid); err != nil { - logrus.WithError(err).Warn("Failed to move vCPU threads to sandbox cgroup") - } else { - logrus.Info("Successfully moved vCPU threads to sandbox cgroup") - } - } - return unikontainer.ExecuteHooks("Poststart") } diff --git a/go.mod b/go.mod index 44033047..33d283dc 100644 --- a/go.mod +++ b/go.mod @@ -6,12 +6,14 @@ require ( github.com/BurntSushi/toml v1.6.0 github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 github.com/cavaliergopher/cpio v1.0.1 + github.com/containerd/cgroups/v3 v3.1.0 github.com/containerd/containerd v1.7.30 github.com/creack/pty v1.1.24 github.com/elastic/go-seccomp-bpf v1.6.0 github.com/hashicorp/go-version v1.8.0 github.com/jackpal/gateway v1.1.1 github.com/moby/sys/mount v0.3.4 + github.com/moby/sys/userns v0.1.0 github.com/nubificus/hedge_cli v0.0.3 github.com/onsi/ginkgo/v2 v2.28.1 github.com/onsi/gomega v1.39.1 @@ -63,7 +65,6 @@ require ( github.com/moby/sys/mountinfo v0.7.2 // indirect github.com/moby/sys/sequential v0.6.0 // indirect github.com/moby/sys/user v0.4.0 // indirect - github.com/moby/sys/userns v0.1.0 // indirect github.com/opencontainers/go-digest v1.0.0 // indirect github.com/opencontainers/image-spec v1.1.1 // indirect github.com/pkg/errors v0.9.1 // indirect diff --git a/pkg/cgroup/manager.go b/pkg/cgroup/manager.go index 46152adf..4079516a 100644 --- a/pkg/cgroup/manager.go +++ b/pkg/cgroup/manager.go @@ -17,11 +17,7 @@ package cgroup import ( "context" "fmt" - "os" - "path/filepath" - "strconv" "strings" - "time" cgroupsv2 "github.com/containerd/cgroups/v3/cgroup2" "github.com/containerd/cgroups/v3/cgroup2/stats" @@ -31,24 +27,21 @@ import ( var cgroupLog = logrus.WithField("subsystem", "cgroup") -// Manager handles cgroup lifecycle for urunc containers +// Manager handles cgroup lifecycle for urunc containers. +// Following Kata Containers' sandbox_cgroup_only approach: +// all processes (VMM, vCPU, I/O) run under the container's cgroup. type Manager struct { - sandboxCgroup *cgroupsv2.Manager - overheadCgroup *cgroupsv2.Manager - cgroupPath string - overheadPath string - splitPolicy bool - containerID string + cgroupMgr *cgroupsv2.Manager + cgroupPath string + containerID string } // Config holds configuration for cgroup creation type Config struct { - CgroupPath string - ContainerID string - Resources *specs.LinuxResources - SandboxCgroupOnly bool - OverheadPath string - UseSystemd bool // Whether to use systemd cgroup driver + CgroupPath string + ContainerID string + Resources *specs.LinuxResources + UseSystemd bool } // NewManager creates a new cgroup manager @@ -60,37 +53,29 @@ func NewManager(cfg Config) (*Manager, error) { cgroupPath := normalizeCgroupPath(cfg.CgroupPath, cfg.ContainerID) m := &Manager{ - cgroupPath: cgroupPath, - overheadPath: cfg.OverheadPath, - splitPolicy: !cfg.SandboxCgroupOnly, - containerID: cfg.ContainerID, + cgroupPath: cgroupPath, + containerID: cfg.ContainerID, } cgroupLog.WithFields(logrus.Fields{ "cgroup_path": cgroupPath, - "split_policy": m.splitPolicy, "container_id": cfg.ContainerID, }).Debug("Creating cgroup manager") return m, nil } -// Create creates the necessary cgroups +// Create creates the cgroup and adds the process to it func (m *Manager) Create(ctx context.Context, resources *specs.LinuxResources, pid int, useSystemd bool) error { - // Convert OCI resources to cgroup v2 resources cgroupResources, err := specToCgroupResources(resources) if err != nil { return fmt.Errorf("failed to convert resources: %w", err) } - var sandboxMgr *cgroupsv2.Manager - // Auto-detect systemd path format or use explicit flag useSystemdDriver := useSystemd || isSystemdPath(m.cgroupPath) - // Create sandbox cgroup using appropriate method if useSystemdDriver && isSystemdPath(m.cgroupPath) { - // Parse systemd path format: slice:prefix:name slice, group, err := parseSystemdPath(m.cgroupPath) if err != nil { return fmt.Errorf("failed to parse systemd cgroup path %s: %w", m.cgroupPath, err) @@ -102,200 +87,38 @@ func (m *Manager) Create(ctx context.Context, resources *specs.LinuxResources, p "pid": pid, }).Debug("Creating systemd cgroup") - sandboxMgr, err = cgroupsv2.NewSystemd(slice, group, pid, cgroupResources) + m.cgroupMgr, err = cgroupsv2.NewSystemd(slice, group, pid, cgroupResources) if err != nil { - return fmt.Errorf("failed to create systemd sandbox cgroup %s:%s: %w", slice, group, err) + return fmt.Errorf("failed to create systemd cgroup %s:%s: %w", slice, group, err) } } else { - // Use filesystem-based cgroup manager - sandboxMgr, err = cgroupsv2.NewManager( + m.cgroupMgr, err = cgroupsv2.NewManager( "/sys/fs/cgroup", m.cgroupPath, cgroupResources, ) if err != nil { - return fmt.Errorf("failed to create sandbox cgroup at %s: %w", m.cgroupPath, err) + return fmt.Errorf("failed to create cgroup at %s: %w", m.cgroupPath, err) } - // Add process to sandbox cgroup (NewSystemd already does this) - if err := sandboxMgr.AddProc(uint64(pid)); err != nil { - _ = sandboxMgr.Delete() + if err := m.cgroupMgr.AddProc(uint64(pid)); err != nil { + _ = m.cgroupMgr.Delete() return fmt.Errorf("failed to add pid %d to cgroup: %w", pid, err) } } - m.sandboxCgroup = sandboxMgr - cgroupLog.WithFields(logrus.Fields{ "path": m.cgroupPath, "pid": pid, - }).Info("Created sandbox cgroup and added process") - - // If split policy, create overhead cgroup - // Note: We always use filesystem-based cgroup for overhead, even with systemd, - // because it's an internal urunc feature and doesn't need systemd integration - if m.splitPolicy { - overheadPath := filepath.Join(m.overheadPath, m.containerID) - - // Overhead gets minimal resources (no limits) - overheadMgr, err := cgroupsv2.NewManager( - "/sys/fs/cgroup", - overheadPath, - &cgroupsv2.Resources{}, - ) - if err != nil { - _ = m.sandboxCgroup.Delete() - return fmt.Errorf("failed to create overhead cgroup at %s: %w", overheadPath, err) - } - m.overheadCgroup = overheadMgr - - cgroupLog.WithField("path", overheadPath).Info("Created overhead cgroup") - } - - return nil -} - -// MoveVCPUThreads identifies vCPU threads and moves them FROM overhead TO sandbox cgroup. -// This assumes the VMM process is already in the overhead cgroup (moved before exec via MoveToOverhead). -// All threads initially inherit the overhead cgroup, and we selectively move only vCPU threads -// to the sandbox (workload) cgroup. I/O threads stay in overhead. -func (m *Manager) MoveVCPUThreads(vmmPid int) error { - if !m.splitPolicy { - // In sandbox_cgroup_only mode, all threads stay in sandbox cgroup - return nil - } - - cgroupLog.WithField("vmm_pid", vmmPid).Debug("Identifying and moving vCPU threads from overhead to sandbox cgroup") - - // Retry thread detection with exponential backoff - // VMM may not spawn all vCPU threads immediately - var vcpuThreads, ioThreads []int - maxAttempts := 5 - var lastThreadCount int - - for attempt := 0; attempt < maxAttempts; attempt++ { - // Read all threads of the VMM process - threadIDs, err := getProcessThreads(vmmPid) - if err != nil { - cgroupLog.WithError(err).WithField("attempt", attempt+1).Warn("Failed to get threads") - time.Sleep(50 * time.Millisecond * time.Duration(1< 0 || (attempt > 0 && len(threadIDs) == lastThreadCount) { - break - } - - lastThreadCount = len(threadIDs) - - // Wait before retry with exponential backoff - sleepDuration := 50 * time.Millisecond * time.Duration(1< 0 { - return fmt.Errorf("cgroup deletion errors: %v", errs) + if err := m.cgroupMgr.Delete(); err != nil { + cgroupLog.WithError(err).Error("Failed to delete cgroup") + return fmt.Errorf("cgroup delete: %w", err) } return nil @@ -333,11 +145,11 @@ func (m *Manager) Delete() error { // GetStats returns cgroup statistics func (m *Manager) GetStats() (*stats.Metrics, error) { - if m.sandboxCgroup == nil { - return nil, fmt.Errorf("sandbox cgroup not initialized") + if m.cgroupMgr == nil { + return nil, fmt.Errorf("cgroup not initialized") } - return m.sandboxCgroup.Stat() + return m.cgroupMgr.Stat() } // normalizeCgroupPath handles OCI cgroup path formats @@ -346,12 +158,10 @@ func normalizeCgroupPath(cgroupPath, containerID string) string { return containerID } - // If it starts with /, it's an absolute path if strings.HasPrefix(cgroupPath, "/") { return cgroupPath } - // Otherwise, it's a relative path return cgroupPath } @@ -370,9 +180,6 @@ func parseSystemdPath(path string) (string, string, error) { } slice := parts[0] - - // Construct group name from remaining parts - // For "system.slice:docker:containerID" -> "docker-containerID.scope" group := strings.Join(parts[1:], "-") if !strings.HasSuffix(group, ".scope") { group = group + ".scope" @@ -387,59 +194,6 @@ func parseSystemdPath(path string) (string, string, error) { return slice, group, nil } -// getProcessThreads returns all thread IDs for a process -func getProcessThreads(pid int) ([]int, error) { - taskDir := fmt.Sprintf("/proc/%d/task", pid) - entries, err := os.ReadDir(taskDir) - if err != nil { - return nil, err - } - - threads := make([]int, 0, len(entries)) - for _, entry := range entries { - if !entry.IsDir() { - continue - } - - tid, err := strconv.Atoi(entry.Name()) - if err != nil { - continue - } - threads = append(threads, tid) - } - - return threads, nil -} - -// getThreadName reads the thread name from /proc//task//comm -func getThreadName(pid, tid int) (string, error) { - commPath := fmt.Sprintf("/proc/%d/task/%d/comm", pid, tid) - data, err := os.ReadFile(commPath) - if err != nil { - return "", err - } - - return strings.TrimSpace(string(data)), nil -} - -// isVCPUThread determines if a thread is a vCPU thread based on its name -func isVCPUThread(name string) bool { - // QEMU vCPU threads - if strings.HasPrefix(name, "CPU ") || strings.Contains(name, "/KVM") { - return true - } - // Firecracker vCPU threads - if strings.HasPrefix(name, "fc_vcpu") { - return true - } - // Generic vcpu naming - if strings.HasPrefix(name, "vcpu") { - return true - } - - return false -} - // specToCgroupResources converts OCI resources to cgroup v2 resources func specToCgroupResources(spec *specs.LinuxResources) (*cgroupsv2.Resources, error) { if spec == nil { @@ -513,11 +267,9 @@ func specToCgroupResources(spec *specs.LinuxResources) (*cgroupsv2.Resources, er // cgroup v2 weight range: 1-10000, default 100 func sharesToWeight(shares uint64) uint64 { if shares == 0 { - return 100 // default weight + return 100 } - // Convert shares to weight - // Formula: weight = (shares * 100) / 1024 weight := (shares * 100) / 1024 if weight < 1 { diff --git a/pkg/cgroup/manager_test.go b/pkg/cgroup/manager_test.go index d5e81e09..76766e58 100644 --- a/pkg/cgroup/manager_test.go +++ b/pkg/cgroup/manager_test.go @@ -27,22 +27,10 @@ func TestNewManager(t *testing.T) { wantErr bool }{ { - name: "valid config - sandbox only", + name: "valid config", cfg: Config{ - CgroupPath: "/test/cgroup", - ContainerID: "test123", - SandboxCgroupOnly: true, - OverheadPath: "/urunc_overhead", - }, - wantErr: false, - }, - { - name: "valid config - split policy", - cfg: Config{ - CgroupPath: "/test/cgroup", - ContainerID: "test456", - SandboxCgroupOnly: false, - OverheadPath: "/urunc_overhead", + CgroupPath: "/test/cgroup", + ContainerID: "test123", }, wantErr: false, }, @@ -151,64 +139,6 @@ func TestNormalizeCgroupPath(t *testing.T) { } } -func TestIsVCPUThread(t *testing.T) { - tests := []struct { - name string - threadName string - want bool - }{ - { - name: "QEMU vCPU thread with KVM", - threadName: "CPU 0/KVM", - want: true, - }, - { - name: "QEMU vCPU thread simple", - threadName: "CPU 1/KVM", - want: true, - }, - { - name: "generic vcpu thread", - threadName: "vcpu0", - want: true, - }, - { - name: "Firecracker vCPU thread", - threadName: "fc_vcpu0", - want: true, - }, - { - name: "Firecracker vCPU thread 2", - threadName: "fc_vcpu1", - want: true, - }, - { - name: "I/O thread", - threadName: "IO 0", - want: false, - }, - { - name: "main thread", - threadName: "qemu-system-x86", - want: false, - }, - { - name: "worker thread", - threadName: "worker0", - want: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got := isVCPUThread(tt.threadName) - if got != tt.want { - t.Errorf("isVCPUThread(%q) = %v, want %v", tt.threadName, got, tt.want) - } - }) - } -} - func TestSpecToCgroupResources(t *testing.T) { // Test CPU shares conversion shares := uint64(2048) diff --git a/pkg/unikontainers/urunc_config.go b/pkg/unikontainers/urunc_config.go index dfd41a54..0bae5c57 100644 --- a/pkg/unikontainers/urunc_config.go +++ b/pkg/unikontainers/urunc_config.go @@ -34,10 +34,10 @@ type UruncTimestamps struct { Destination string `toml:"destination"` // Used to specify a file for timestamps } -type UruncCgroup struct { - SandboxCgroupOnly bool `toml:"sandbox_cgroup_only"` - OverheadPath string `toml:"overhead_path"` -} +// UruncCgroup is kept for TOML compatibility but has no configurable fields. +// urunc follows Kata Containers' sandbox_cgroup_only approach: all processes +// (VMM, vCPU, I/O) run under the container's cgroup with no thread classification. +type UruncCgroup struct{} type UruncConfig struct { Log UruncLog `toml:"log"` @@ -85,10 +85,7 @@ func defaultTimestampsConfig() UruncTimestamps { } func defaultCgroupConfig() UruncCgroup { - return UruncCgroup{ - SandboxCgroupOnly: true, - OverheadPath: "/urunc_overhead", - } + return UruncCgroup{} } func defaultMonitorsConfig() map[string]types.MonitorConfig { @@ -134,10 +131,6 @@ func (p *UruncConfig) Map() map[string]string { // them to this map. this map will be used to save the rest of the urunc config to state.json cfgMap := make(map[string]string) - // Cgroup config - cfgMap["urunc_config.cgroup.sandbox_cgroup_only"] = strconv.FormatBool(p.Cgroup.SandboxCgroupOnly) - cfgMap["urunc_config.cgroup.overhead_path"] = p.Cgroup.OverheadPath - for hv, hvCfg := range p.Monitors { prefix := "urunc_config.monitors." + hv + "." cfgMap[prefix+"default_memory_mb"] = strconv.FormatUint(uint64(hvCfg.DefaultMemoryMB), 10) @@ -163,16 +156,6 @@ func UruncConfigFromMap(cfgMap map[string]string) *UruncConfig { ExtraBins: defaultExtraBinConfig(), } - // Parse cgroup config - if val, ok := cfgMap["urunc_config.cgroup.sandbox_cgroup_only"]; ok { - if boolVal, err := strconv.ParseBool(val); err == nil { - cfg.Cgroup.SandboxCgroupOnly = boolVal - } - } - if val, ok := cfgMap["urunc_config.cgroup.overhead_path"]; ok { - cfg.Cgroup.OverheadPath = val - } - for key, val := range cfgMap { if !strings.HasPrefix(key, "urunc_config.monitors.") { continue diff --git a/pkg/unikontainers/urunc_config_test.go b/pkg/unikontainers/urunc_config_test.go index 86164c8b..7bb7719c 100644 --- a/pkg/unikontainers/urunc_config_test.go +++ b/pkg/unikontainers/urunc_config_test.go @@ -423,7 +423,7 @@ func TestUruncConfigMap(t *testing.T) { assert.Equal(t, config.ExtraBins["custom"].Options, cfgMap["urunc_config.extra_binaries.custom.options"]) }) - t.Run("empty monitors map produces only cgroup config", func(t *testing.T) { + t.Run("empty monitors map produces empty map", func(t *testing.T) { t.Parallel() config := &UruncConfig{ Cgroup: defaultCgroupConfig(), @@ -433,13 +433,10 @@ func TestUruncConfigMap(t *testing.T) { cfgMap := config.Map() assert.NotNil(t, cfgMap) - // Only cgroup fields should be present - assert.Len(t, cfgMap, 2) - assert.Contains(t, cfgMap, "urunc_config.cgroup.sandbox_cgroup_only") - assert.Contains(t, cfgMap, "urunc_config.cgroup.overhead_path") + assert.Len(t, cfgMap, 0) }) - t.Run("empty extra binaries map produces only cgroup config", func(t *testing.T) { + t.Run("empty extra binaries map produces empty map", func(t *testing.T) { t.Parallel() config := &UruncConfig{ Cgroup: defaultCgroupConfig(), @@ -449,10 +446,7 @@ func TestUruncConfigMap(t *testing.T) { cfgMap := config.Map() assert.NotNil(t, cfgMap) - // Only cgroup fields should be present - assert.Len(t, cfgMap, 2) - assert.Contains(t, cfgMap, "urunc_config.cgroup.sandbox_cgroup_only") - assert.Contains(t, cfgMap, "urunc_config.cgroup.overhead_path") + assert.Len(t, cfgMap, 0) }) t.Run("vhost true is serialized correctly", func(t *testing.T) {