diff --git a/cgroupv2.go b/cgroupv2.go index a9ae957..40e18fd 100644 --- a/cgroupv2.go +++ b/cgroupv2.go @@ -25,6 +25,11 @@ const ( // Other memory stats - we are interested in total_inactive_file cgroupV2MemoryStat = "memory.stat" + // Default period for cpu.max as documented in the kernel docs. + // The default period is 100000 microseconds (100ms). + // Ref: https://docs.kernel.org/6.17/admin-guide/cgroup-v2.html#cpu-interface-files + cgroupV2DefaultPeriodUs = 100000 + // What is the maximum cgroup depth we support? // We only expect to see a depth of around 3-4 at max, but we // allow 10 to give us some headroom. If this limit is reached @@ -66,18 +71,17 @@ func newCgroupV2Statter(fs afero.Fs, path string, depth int) (*cgroupV2Statter, func (s cgroupV2Statter) cpuUsed() (used float64, err error) { cpuStatPath := filepath.Join(s.path, cgroupV2CPUStat) - cpuMaxPath := filepath.Join(s.path, cgroupV2CPUMax) usageUs, err := readInt64Prefix(s.fs, cpuStatPath, "usage_usec") if err != nil { return 0, xerrors.Errorf("get cgroupv2 cpu used: %w", err) } - periodUs, err := readInt64SepIdx(s.fs, cpuMaxPath, " ", 1) + periodUs, err := s.cpuPeriod() if err != nil { return 0, xerrors.Errorf("get cpu period: %w", err) } - return float64(usageUs) / float64(periodUs), nil + return float64(usageUs) / periodUs, nil } func (s cgroupV2Statter) cpuQuota() (float64, error) { @@ -106,10 +110,35 @@ func (s cgroupV2Statter) cpuQuota() (float64, error) { return float64(quotaUs), nil } -func (s cgroupV2Statter) cpuTotal() (total float64, err error) { +func (s cgroupV2Statter) cpuPeriod() (float64, error) { cpuMaxPath := filepath.Join(s.path, cgroupV2CPUMax) periodUs, err := readInt64SepIdx(s.fs, cpuMaxPath, " ", 1) + if err != nil { + if !errors.Is(err, strconv.ErrSyntax) && !errors.Is(err, fs.ErrNotExist) { + return 0, xerrors.Errorf("get cpu period: %w", err) + } + + // If the value is not a valid integer or the cpu.max file does + // not exist, we call the parent to find its period. This can happen + // in system-level cgroups like init.scope where cpu.max may not exist. + if s.parent != nil { + period, err := s.parent.cpuPeriod() + if err != nil { + return 0, xerrors.Errorf("get parent cpu period: %w", err) + } + return period, nil + } + + // No parent and no period found in the cgroup hierarchy. + return cgroupV2DefaultPeriodUs, nil + } + + return float64(periodUs), nil +} + +func (s cgroupV2Statter) cpuTotal() (total float64, err error) { + periodUs, err := s.cpuPeriod() if err != nil { return 0, xerrors.Errorf("get cpu period: %w", err) } @@ -119,7 +148,7 @@ func (s cgroupV2Statter) cpuTotal() (total float64, err error) { return 0, xerrors.Errorf("get cpu quota: %w", err) } - return float64(quotaUs) / float64(periodUs), nil + return quotaUs / periodUs, nil } func (s cgroupV2Statter) memoryMaxBytes() (*float64, error) { diff --git a/stat_internal_test.go b/stat_internal_test.go index 8bc9a82..745fa29 100644 --- a/stat_internal_test.go +++ b/stat_internal_test.go @@ -396,6 +396,50 @@ func TestStatter(t *testing.T) { assert.Equal(t, "cores", cpu.Unit) }) + t.Run("CPU/InitScopeFallback", func(t *testing.T) { + t.Parallel() + + // Test RKE2/sysbox scenario where /init.scope cgroup doesn't have + // cpu.max but the root cgroup does. The period should be read from + // the parent (root) cgroup. + fs := initFS(t, fsContainerCgroupV2InitScope) + fakeWait := func(time.Duration) { + mungeFS(t, fs, filepath.Join(cgroupRootPath, "init.scope", cgroupV2CPUStat), "usage_usec 100000") + } + s, err := New(WithFS(fs), withWait(fakeWait), withIsCgroupV2(true)) + require.NoError(t, err) + + cpu, err := s.ContainerCPU() + require.NoError(t, err) + + require.NotNil(t, cpu) + assert.Equal(t, 1.0, cpu.Used) + require.Nil(t, cpu.Total) // quota is "max" so no limit + assert.Equal(t, "cores", cpu.Unit) + }) + + t.Run("CPU/InitScopeDefaultPeriod", func(t *testing.T) { + t.Parallel() + + // Test scenario where cpu.max doesn't exist at any level in the + // hierarchy. Per kernel docs, the default period is 100000us (100ms). + fs := initFS(t, fsContainerCgroupV2InitScopeNoCPUMax) + fakeWait := func(time.Duration) { + mungeFS(t, fs, filepath.Join(cgroupRootPath, "init.scope", cgroupV2CPUStat), "usage_usec 100000") + } + s, err := New(WithFS(fs), withWait(fakeWait), withIsCgroupV2(true)) + require.NoError(t, err) + + cpu, err := s.ContainerCPU() + require.NoError(t, err) + + require.NotNil(t, cpu) + // With default period of 100000us, usage_usec 100000 = 1.0 core + assert.Equal(t, 1.0, cpu.Used) + require.Nil(t, cpu.Total) // no limit anywhere + assert.Equal(t, "cores", cpu.Unit) + }) + t.Run("Memory/Limit", func(t *testing.T) { t.Parallel() @@ -727,6 +771,40 @@ proc /proc/sys proc ro,nosuid,nodev,noexec,relatime 0 0`, filepath.Join(cgroupRootPath, fsContainerCgroupV2KubernetesPath, cgroupV2MemoryStat): "inactive_file 268435456", filepath.Join(cgroupRootPath, fsContainerCgroupV2KubernetesPath, cgroupV2MemoryUsageBytes): "536870912", } + // fsContainerCgroupV2InitScope simulates RKE2/sysbox environment where + // the cgroup path is /init.scope and cpu.max does not exist at that level + // but does exist at the root cgroup. This tests the parent fallback logic. + fsContainerCgroupV2InitScope = map[string]string{ + procOneCgroup: "0::/", + procSelfCgroup: "0::/init.scope", + procMounts: `overlay / overlay rw,relatime,lowerdir=/some/path:/some/path,upperdir=/some/path:/some/path,workdir=/some/path:/some/path 0 0 +proc /proc/sys proc ro,nosuid,nodev,noexec,relatime 0 0 +sysboxfs /proc/sys sysboxfs rw,nosuid,nodev,noexec,relatime 0 0`, + sysCgroupType: "domain", + + // cpu.max purposefully missing at /init.scope level + filepath.Join(cgroupRootPath, cgroupV2CPUMax): "max 100000", + filepath.Join(cgroupRootPath, "init.scope", cgroupV2CPUStat): "usage_usec 0", + filepath.Join(cgroupRootPath, "init.scope", cgroupV2MemoryMaxBytes): "max", + filepath.Join(cgroupRootPath, "init.scope", cgroupV2MemoryStat): "inactive_file 268435456", + filepath.Join(cgroupRootPath, "init.scope", cgroupV2MemoryUsageBytes): "536870912", + } + // fsContainerCgroupV2InitScopeNoCPUMax simulates a scenario where cpu.max + // doesn't exist at any level in the hierarchy. Tests the default period fallback. + fsContainerCgroupV2InitScopeNoCPUMax = map[string]string{ + procOneCgroup: "0::/", + procSelfCgroup: "0::/init.scope", + procMounts: `overlay / overlay rw,relatime,lowerdir=/some/path:/some/path,upperdir=/some/path:/some/path,workdir=/some/path:/some/path 0 0 +proc /proc/sys proc ro,nosuid,nodev,noexec,relatime 0 0 +sysboxfs /proc/sys sysboxfs rw,nosuid,nodev,noexec,relatime 0 0`, + sysCgroupType: "domain", + + // cpu.max purposefully missing at all levels to test default period + filepath.Join(cgroupRootPath, "init.scope", cgroupV2CPUStat): "usage_usec 0", + filepath.Join(cgroupRootPath, "init.scope", cgroupV2MemoryMaxBytes): "max", + filepath.Join(cgroupRootPath, "init.scope", cgroupV2MemoryStat): "inactive_file 268435456", + filepath.Join(cgroupRootPath, "init.scope", cgroupV2MemoryUsageBytes): "536870912", + } fsContainerCgroupV1 = map[string]string{ procOneCgroup: "0::/docker/aa86ac98959eeedeae0ecb6e0c9ddd8ae8b97a9d0fdccccf7ea7a474f4e0bb1f", procSelfCgroup: "0::/docker/aa86ac98959eeedeae0ecb6e0c9ddd8ae8b97a9d0fdccccf7ea7a474f4e0bb1f",