diff --git a/cmd/urunc/create.go b/cmd/urunc/create.go index 0124a755..c79d120b 100644 --- a/cmd/urunc/create.go +++ b/cmd/urunc/create.go @@ -29,6 +29,7 @@ import ( "github.com/sirupsen/logrus" "github.com/urfave/cli/v3" m "github.com/urunc-dev/urunc/internal/metrics" + "github.com/urunc-dev/urunc/pkg/cgroup" "github.com/urunc-dev/urunc/pkg/unikontainers" "golang.org/x/sys/unix" ) @@ -260,6 +261,16 @@ func createUnikontainer(cmd *cli.Command, uruncCfg *unikontainers.UruncConfig) ( return err } + // Setup cgroups + err = setupCgroups(cmd, unikontainer, containerPid) + if err != nil { + // Clean up on cgroup creation failure + if unikontainer.CgroupMgr != nil { + _ = unikontainer.CgroupMgr.Delete() + } + return fmt.Errorf("failed to setup cgroups: %w", err) + } + // execute CreateRuntime hooks err = unikontainer.ExecuteHooks("CreateRuntime") if err != nil { @@ -279,6 +290,50 @@ func createUnikontainer(cmd *cli.Command, uruncCfg *unikontainers.UruncConfig) ( return err } +// setupCgroups creates and configures cgroups for the container. +// Following Kata Containers' sandbox_cgroup_only approach: +// all processes (VMM, vCPU, I/O) run under the container's cgroup. 
+func setupCgroups(cmd *cli.Command, u *unikontainers.Unikontainer, pid int) error { + // Check if cgroups are disabled + if u.Spec.Linux == nil || u.Spec.Linux.CgroupsPath == "" { + logrus.Debug("Cgroups disabled or no cgroup path specified") + return nil + } + + // Check if systemd cgroup driver is enabled + useSystemd := cmd.Bool("systemd-cgroup") + + // Create cgroup manager config + cgroupCfg := cgroup.Config{ + CgroupPath: u.Spec.Linux.CgroupsPath, + ContainerID: u.State.ID, + Resources: u.Spec.Linux.Resources, + UseSystemd: useSystemd, + } + + // Create cgroup manager + cgroupMgr, err := cgroup.NewManager(cgroupCfg) + if err != nil { + return fmt.Errorf("failed to create cgroup manager: %w", err) + } + + // Create cgroups and add reexec process + if err := cgroupMgr.Create(context.Background(), u.Spec.Linux.Resources, pid, useSystemd); err != nil { + return fmt.Errorf("failed to create cgroups: %w", err) + } + + // Store manager in unikontainer + u.CgroupMgr = cgroupMgr + + logrus.WithFields(logrus.Fields{ + "cgroup_path": u.Spec.Linux.CgroupsPath, + "use_systemd": useSystemd, + "pid": pid, + }).Info("Cgroups created successfully") + + return nil +} + func createReexecCmd(initSock *os.File, logPipe *os.File) *exec.Cmd { selfPath := "/proc/self/exe" reexecCommand := &exec.Cmd{ diff --git a/go.mod b/go.mod index b3923b21..33d283dc 100644 --- a/go.mod +++ b/go.mod @@ -6,12 +6,14 @@ require ( github.com/BurntSushi/toml v1.6.0 github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 github.com/cavaliergopher/cpio v1.0.1 + github.com/containerd/cgroups/v3 v3.1.0 github.com/containerd/containerd v1.7.30 github.com/creack/pty v1.1.24 github.com/elastic/go-seccomp-bpf v1.6.0 github.com/hashicorp/go-version v1.8.0 github.com/jackpal/gateway v1.1.1 github.com/moby/sys/mount v0.3.4 + github.com/moby/sys/userns v0.1.0 github.com/nubificus/hedge_cli v0.0.3 github.com/onsi/ginkgo/v2 v2.28.1 github.com/onsi/gomega v1.39.1 @@ -33,7 +35,6 @@ require ( 
github.com/Microsoft/go-winio v0.6.2 // indirect github.com/Microsoft/hcsshim v0.13.0 // indirect github.com/cilium/ebpf v0.20.0 // indirect - github.com/containerd/cgroups/v3 v3.1.0 // indirect github.com/containerd/console v1.0.5 // indirect github.com/containerd/containerd/api v1.10.0 // indirect github.com/containerd/continuity v0.4.5 // indirect @@ -64,7 +65,6 @@ require ( github.com/moby/sys/mountinfo v0.7.2 // indirect github.com/moby/sys/sequential v0.6.0 // indirect github.com/moby/sys/user v0.4.0 // indirect - github.com/moby/sys/userns v0.1.0 // indirect github.com/opencontainers/go-digest v1.0.0 // indirect github.com/opencontainers/image-spec v1.1.1 // indirect github.com/pkg/errors v0.9.1 // indirect diff --git a/pkg/cgroup/manager.go b/pkg/cgroup/manager.go new file mode 100644 index 00000000..4079516a --- /dev/null +++ b/pkg/cgroup/manager.go @@ -0,0 +1,283 @@ +// Copyright (c) 2023-2025, Nubificus LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cgroup + +import ( + "context" + "fmt" + "strings" + + cgroupsv2 "github.com/containerd/cgroups/v3/cgroup2" + "github.com/containerd/cgroups/v3/cgroup2/stats" + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" +) + +var cgroupLog = logrus.WithField("subsystem", "cgroup") + +// Manager handles cgroup lifecycle for urunc containers. 
+// Following Kata Containers' sandbox_cgroup_only approach: +// all processes (VMM, vCPU, I/O) run under the container's cgroup. +type Manager struct { + cgroupMgr *cgroupsv2.Manager + cgroupPath string + containerID string +} + +// Config holds configuration for cgroup creation +type Config struct { + CgroupPath string + ContainerID string + Resources *specs.LinuxResources + UseSystemd bool +} + +// NewManager creates a new cgroup manager +func NewManager(cfg Config) (*Manager, error) { + if cfg.CgroupPath == "" { + return nil, fmt.Errorf("cgroup path cannot be empty") + } + + cgroupPath := normalizeCgroupPath(cfg.CgroupPath, cfg.ContainerID) + + m := &Manager{ + cgroupPath: cgroupPath, + containerID: cfg.ContainerID, + } + + cgroupLog.WithFields(logrus.Fields{ + "cgroup_path": cgroupPath, + "container_id": cfg.ContainerID, + }).Debug("Creating cgroup manager") + + return m, nil +} + +// Create creates the cgroup and adds the process to it +func (m *Manager) Create(ctx context.Context, resources *specs.LinuxResources, pid int, useSystemd bool) error { + cgroupResources, err := specToCgroupResources(resources) + if err != nil { + return fmt.Errorf("failed to convert resources: %w", err) + } + + // Auto-detect systemd path format or use explicit flag + useSystemdDriver := useSystemd || isSystemdPath(m.cgroupPath) + + if useSystemdDriver && isSystemdPath(m.cgroupPath) { + slice, group, err := parseSystemdPath(m.cgroupPath) + if err != nil { + return fmt.Errorf("failed to parse systemd cgroup path %s: %w", m.cgroupPath, err) + } + + cgroupLog.WithFields(logrus.Fields{ + "slice": slice, + "group": group, + "pid": pid, + }).Debug("Creating systemd cgroup") + + m.cgroupMgr, err = cgroupsv2.NewSystemd(slice, group, pid, cgroupResources) + if err != nil { + return fmt.Errorf("failed to create systemd cgroup %s:%s: %w", slice, group, err) + } + } else { + m.cgroupMgr, err = cgroupsv2.NewManager( + "/sys/fs/cgroup", + m.cgroupPath, + cgroupResources, + ) + if err != nil { + 
return fmt.Errorf("failed to create cgroup at %s: %w", m.cgroupPath, err) + } + + if err := m.cgroupMgr.AddProc(uint64(pid)); err != nil { + _ = m.cgroupMgr.Delete() + return fmt.Errorf("failed to add pid %d to cgroup: %w", pid, err) + } + } + + cgroupLog.WithFields(logrus.Fields{ + "path": m.cgroupPath, + "pid": pid, + }).Info("Created cgroup and added process") + + return nil +} + +// Update updates cgroup resource limits +func (m *Manager) Update(resources *specs.LinuxResources) error { + if m.cgroupMgr == nil { + return fmt.Errorf("cgroup not initialized") + } + + cgroupResources, err := specToCgroupResources(resources) + if err != nil { + return err + } + + return m.cgroupMgr.Update(cgroupResources) +} + +// Delete removes the cgroup +func (m *Manager) Delete() error { + if m.cgroupMgr == nil { + return nil + } + + if err := m.cgroupMgr.Delete(); err != nil { + cgroupLog.WithError(err).Error("Failed to delete cgroup") + return fmt.Errorf("cgroup delete: %w", err) + } + + return nil +} + +// GetStats returns cgroup statistics +func (m *Manager) GetStats() (*stats.Metrics, error) { + if m.cgroupMgr == nil { + return nil, fmt.Errorf("cgroup not initialized") + } + + return m.cgroupMgr.Stat() +} + +// normalizeCgroupPath handles OCI cgroup path formats +func normalizeCgroupPath(cgroupPath, containerID string) string { + if cgroupPath == "" { + return containerID + } + + if strings.HasPrefix(cgroupPath, "/") { + return cgroupPath + } + + return cgroupPath +} + +// isSystemdPath checks if a cgroup path is in systemd format (slice:prefix:name) +func isSystemdPath(path string) bool { + return strings.Contains(path, ":") +} + +// parseSystemdPath parses a systemd cgroup path format +// Input: "slice:prefix:name" (e.g., "system.slice:docker:containerID") +// Output: slice ("system.slice"), group ("docker-containerID.scope") +func parseSystemdPath(path string) (string, string, error) { + parts := strings.Split(path, ":") + if len(parts) < 2 { + return "", "", 
fmt.Errorf("invalid systemd path format: %s", path) + } + + slice := parts[0] + group := strings.Join(parts[1:], "-") + if !strings.HasSuffix(group, ".scope") { + group = group + ".scope" + } + + cgroupLog.WithFields(logrus.Fields{ + "input": path, + "slice": slice, + "group": group, + }).Debug("Parsed systemd cgroup path") + + return slice, group, nil +} + +// specToCgroupResources converts OCI resources to cgroup v2 resources +func specToCgroupResources(spec *specs.LinuxResources) (*cgroupsv2.Resources, error) { + if spec == nil { + return &cgroupsv2.Resources{}, nil + } + + res := &cgroupsv2.Resources{} + + // CPU resources + if spec.CPU != nil { + res.CPU = &cgroupsv2.CPU{} + + if spec.CPU.Shares != nil { + weight := sharesToWeight(*spec.CPU.Shares) + res.CPU.Weight = &weight + } + + if spec.CPU.Quota != nil && spec.CPU.Period != nil { + res.CPU.Max = cgroupsv2.NewCPUMax(spec.CPU.Quota, spec.CPU.Period) + } + + if spec.CPU.Cpus != "" { + res.CPU.Cpus = spec.CPU.Cpus + } + + if spec.CPU.Mems != "" { + res.CPU.Mems = spec.CPU.Mems + } + } + + // Memory resources + if spec.Memory != nil { + res.Memory = &cgroupsv2.Memory{} + + if spec.Memory.Limit != nil { + res.Memory.Max = spec.Memory.Limit + } + + if spec.Memory.Swap != nil { + res.Memory.Swap = spec.Memory.Swap + } + + if spec.Memory.Reservation != nil { + res.Memory.Low = spec.Memory.Reservation + } + } + + // I/O resources + if spec.BlockIO != nil { + res.IO = &cgroupsv2.IO{} + + if spec.BlockIO.Weight != nil { + res.IO.BFQ.Weight = uint16(*spec.BlockIO.Weight) + } + } + + // PID resources + if spec.Pids != nil { + res.Pids = &cgroupsv2.Pids{} + + if spec.Pids.Limit > 0 { + res.Pids.Max = spec.Pids.Limit + } + } + + return res, nil +} + +// sharesToWeight converts CPU shares (OCI) to CPU weight (cgroup v2) +// OCI shares range: 2-262144, default 1024 +// cgroup v2 weight range: 1-10000, default 100 +func sharesToWeight(shares uint64) uint64 { + if shares == 0 { + return 100 + } + + weight := (shares * 100) / 
1024 + + if weight < 1 { + weight = 1 + } + if weight > 10000 { + weight = 10000 + } + + return weight +} diff --git a/pkg/cgroup/manager_test.go b/pkg/cgroup/manager_test.go new file mode 100644 index 00000000..76766e58 --- /dev/null +++ b/pkg/cgroup/manager_test.go @@ -0,0 +1,257 @@ +// Copyright (c) 2023-2025, Nubificus LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cgroup + +import ( + "testing" + + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +func TestNewManager(t *testing.T) { + tests := []struct { + name string + cfg Config + wantErr bool + }{ + { + name: "valid config", + cfg: Config{ + CgroupPath: "/test/cgroup", + ContainerID: "test123", + }, + wantErr: false, + }, + { + name: "empty cgroup path", + cfg: Config{ + CgroupPath: "", + ContainerID: "test789", + }, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + mgr, err := NewManager(tt.cfg) + if (err != nil) != tt.wantErr { + t.Errorf("NewManager() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !tt.wantErr && mgr == nil { + t.Error("NewManager() returned nil manager") + } + }) + } +} + +func TestSharesToWeight(t *testing.T) { + tests := []struct { + name string + shares uint64 + want uint64 + }{ + { + name: "default shares (1024)", + shares: 1024, + want: 100, + }, + { + name: "minimum shares (2)", + shares: 2, + want: 1, // (2 * 100) / 1024 = 0.195 -> clamped to 1 + }, + { + name: "maximum shares 
(262144)", + shares: 262144, + want: 10000, // (262144 * 100) / 1024 = 25600 -> clamped to 10000 + }, + { + name: "zero shares", + shares: 0, + want: 100, // default + }, + { + name: "half default (512)", + shares: 512, + want: 50, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := sharesToWeight(tt.shares) + if got != tt.want { + t.Errorf("sharesToWeight(%d) = %d, want %d", tt.shares, got, tt.want) + } + }) + } +} + +func TestNormalizeCgroupPath(t *testing.T) { + tests := []struct { + name string + cgroupPath string + containerID string + want string + }{ + { + name: "absolute path", + cgroupPath: "/kubepods/pod123/container456", + containerID: "container456", + want: "/kubepods/pod123/container456", + }, + { + name: "relative path", + cgroupPath: "kubepods/pod123/container456", + containerID: "container456", + want: "kubepods/pod123/container456", + }, + { + name: "empty path uses container ID", + cgroupPath: "", + containerID: "container789", + want: "container789", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := normalizeCgroupPath(tt.cgroupPath, tt.containerID) + if got != tt.want { + t.Errorf("normalizeCgroupPath(%q, %q) = %q, want %q", + tt.cgroupPath, tt.containerID, got, tt.want) + } + }) + } +} + +func TestSpecToCgroupResources(t *testing.T) { + // Test CPU shares conversion + shares := uint64(2048) + quota := int64(50000) + period := uint64(100000) + + spec := &specs.LinuxResources{ + CPU: &specs.LinuxCPU{ + Shares: &shares, + Quota: &quota, + Period: &period, + Cpus: "0-1", + Mems: "0", + }, + } + + res, err := specToCgroupResources(spec) + if err != nil { + t.Fatalf("specToCgroupResources() error = %v", err) + } + + if res.CPU == nil { + t.Fatal("CPU resources not set") + } + + if res.CPU.Weight == nil { + t.Fatal("CPU weight not set") + } + + expectedWeight := sharesToWeight(shares) + if *res.CPU.Weight != expectedWeight { + t.Errorf("CPU weight = %d, want %d", *res.CPU.Weight, 
expectedWeight) + } + + if res.CPU.Cpus != "0-1" { + t.Errorf("CPU cpus = %q, want %q", res.CPU.Cpus, "0-1") + } + + if res.CPU.Mems != "0" { + t.Errorf("CPU mems = %q, want %q", res.CPU.Mems, "0") + } +} + +func TestSpecToCgroupResources_Memory(t *testing.T) { + limit := int64(536870912) // 512MB + swap := int64(1073741824) // 1GB + reservation := int64(268435456) // 256MB + + spec := &specs.LinuxResources{ + Memory: &specs.LinuxMemory{ + Limit: &limit, + Swap: &swap, + Reservation: &reservation, + }, + } + + res, err := specToCgroupResources(spec) + if err != nil { + t.Fatalf("specToCgroupResources() error = %v", err) + } + + if res.Memory == nil { + t.Fatal("Memory resources not set") + } + + if res.Memory.Max == nil || *res.Memory.Max != limit { + t.Errorf("Memory max = %v, want %d", res.Memory.Max, limit) + } + + if res.Memory.Swap == nil || *res.Memory.Swap != swap { + t.Errorf("Memory swap = %v, want %d", res.Memory.Swap, swap) + } + + if res.Memory.Low == nil || *res.Memory.Low != reservation { + t.Errorf("Memory low = %v, want %d", res.Memory.Low, reservation) + } +} + +func TestSpecToCgroupResources_Pids(t *testing.T) { + pidsLimit := int64(1024) + + spec := &specs.LinuxResources{ + Pids: &specs.LinuxPids{ + Limit: pidsLimit, + }, + } + + res, err := specToCgroupResources(spec) + if err != nil { + t.Fatalf("specToCgroupResources() error = %v", err) + } + + if res.Pids == nil { + t.Fatal("Pids resources not set") + } + + if res.Pids.Max != pidsLimit { + t.Errorf("Pids max = %d, want %d", res.Pids.Max, pidsLimit) + } +} + +func TestSpecToCgroupResources_NilResources(t *testing.T) { + res, err := specToCgroupResources(nil) + if err != nil { + t.Fatalf("specToCgroupResources(nil) error = %v", err) + } + + if res == nil { + t.Fatal("Expected non-nil result for nil input") + } + + // All fields should be nil/empty + if res.CPU != nil || res.Memory != nil || res.Pids != nil || res.IO != nil { + t.Error("Expected all resource fields to be nil for nil input") + } 
+} diff --git a/pkg/unikontainers/unikontainers.go b/pkg/unikontainers/unikontainers.go index 72006710..5941343b 100644 --- a/pkg/unikontainers/unikontainers.go +++ b/pkg/unikontainers/unikontainers.go @@ -29,6 +29,7 @@ import ( "sync" "syscall" + "github.com/urunc-dev/urunc/pkg/cgroup" "github.com/urunc-dev/urunc/pkg/network" "github.com/urunc-dev/urunc/pkg/unikontainers/hypervisors" "github.com/urunc-dev/urunc/pkg/unikontainers/initrd" @@ -55,13 +56,14 @@ var ErrNotExistingNS = errors.New("the namespace does not exist") // Unikontainer holds the data necessary to create, manage and delete unikernel containers type Unikontainer struct { - State *specs.State - Spec *specs.Spec - BaseDir string - RootDir string - UruncCfg *UruncConfig - Listener *net.UnixListener - Conn *net.UnixConn + State *specs.State + Spec *specs.Spec + BaseDir string + RootDir string + UruncCfg *UruncConfig + Listener *net.UnixListener + Conn *net.UnixConn + CgroupMgr *cgroup.Manager } // New parses the bundle and creates a new Unikontainer object @@ -603,6 +605,14 @@ func (u *Unikontainer) Delete() error { return fmt.Errorf("cannot delete running container: %s", u.State.ID) } + // Delete cgroups + if u.CgroupMgr != nil { + if err := u.CgroupMgr.Delete(); err != nil { + uniklog.WithError(err).Error("Failed to delete cgroups") + // Don't fail delete - just log + } + } + // get a monitor instance of the running monitor vmmType := u.State.Annotations[annotHypervisor] vmm, err := hypervisors.NewVMM(hypervisors.VmmType(vmmType), u.UruncCfg.Monitors) diff --git a/pkg/unikontainers/urunc_config.go b/pkg/unikontainers/urunc_config.go index 87164504..0bae5c57 100644 --- a/pkg/unikontainers/urunc_config.go +++ b/pkg/unikontainers/urunc_config.go @@ -34,9 +34,15 @@ type UruncTimestamps struct { Destination string `toml:"destination"` // Used to specify a file for timestamps } +// UruncCgroup is kept for TOML compatibility but has no configurable fields. 
+// urunc follows Kata Containers' sandbox_cgroup_only approach: all processes +// (VMM, vCPU, I/O) run under the container's cgroup with no thread classification. +type UruncCgroup struct{} + type UruncConfig struct { Log UruncLog `toml:"log"` Timestamps UruncTimestamps `toml:"timestamps"` + Cgroup UruncCgroup `toml:"cgroup"` Monitors map[string]types.MonitorConfig `toml:"monitors"` ExtraBins map[string]types.ExtraBinConfig `toml:"extra_binaries"` } @@ -78,6 +84,10 @@ func defaultTimestampsConfig() UruncTimestamps { } } +func defaultCgroupConfig() UruncCgroup { + return UruncCgroup{} +} + func defaultMonitorsConfig() map[string]types.MonitorConfig { return map[string]types.MonitorConfig{ "qemu": {DefaultMemoryMB: 256, DefaultVCPUs: 1}, @@ -98,6 +108,7 @@ func defaultUruncConfig() *UruncConfig { return &UruncConfig{ Log: defaultLogConfig(), Timestamps: defaultTimestampsConfig(), + Cgroup: defaultCgroupConfig(), Monitors: defaultMonitorsConfig(), ExtraBins: defaultExtraBinConfig(), } @@ -140,6 +151,7 @@ func UruncConfigFromMap(cfgMap map[string]string) *UruncConfig { // since log and timestamps are loaded at the start of urunc, we will not be reading // them from this map. 
this map will be used to parse the rest of the urunc config from state.json cfg := &UruncConfig{ + Cgroup: defaultCgroupConfig(), Monitors: defaultMonitorsConfig(), ExtraBins: defaultExtraBinConfig(), } diff --git a/pkg/unikontainers/urunc_config_test.go b/pkg/unikontainers/urunc_config_test.go index 89328847..7bb7719c 100644 --- a/pkg/unikontainers/urunc_config_test.go +++ b/pkg/unikontainers/urunc_config_test.go @@ -423,28 +423,30 @@ func TestUruncConfigMap(t *testing.T) { assert.Equal(t, config.ExtraBins["custom"].Options, cfgMap["urunc_config.extra_binaries.custom.options"]) }) - t.Run("empty monitors map produces empty result", func(t *testing.T) { + t.Run("empty monitors map produces empty map", func(t *testing.T) { t.Parallel() config := &UruncConfig{ + Cgroup: defaultCgroupConfig(), Monitors: map[string]types.MonitorConfig{}, } cfgMap := config.Map() assert.NotNil(t, cfgMap) - assert.Empty(t, cfgMap) + assert.Len(t, cfgMap, 0) }) - t.Run("empty extra binaries map produces empty result", func(t *testing.T) { + t.Run("empty extra binaries map produces empty map", func(t *testing.T) { t.Parallel() config := &UruncConfig{ + Cgroup: defaultCgroupConfig(), ExtraBins: map[string]types.ExtraBinConfig{}, } cfgMap := config.Map() assert.NotNil(t, cfgMap) - assert.Empty(t, cfgMap) + assert.Len(t, cfgMap, 0) }) t.Run("vhost true is serialized correctly", func(t *testing.T) {