From 52390d68040637dfc77f9fda6bbe70952423d380 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Mon, 8 Mar 2021 11:26:27 -0800 Subject: [PATCH] Ignore kernel memory settings This is somewhat radical approach to deal with kernel memory. Per-cgroup kernel memory limiting was always problematic. A few examples: - older kernels had bugs and were even oopsing sometimes (best example is RHEL7 kernel); - kernel is unable to reclaim the kernel memory so once the limit is hit a cgroup is toasted; - some kernel memory allocations don't allow failing. In addition to that, - users don't have a clue about how to set kernel memory limits (as the concept is much more complicated than e.g. [user] memory); - different kernels might have different kernel memory usage, which is sort of unexpected; - cgroup v2 do not have a [dedicated] kmem limit knob, and thus runc silently ignores kernel memory limits for v2; - kernel v5.4 made cgroup v1 kmem.limit obsoleted (see https://github.com/torvalds/linux/commit/0158115f702b). In view of all this, and as the runtime-spec lists memory.kernel and memory.kernelTCP as OPTIONAL, let's ignore kernel memory limits (for cgroup v1, same as we're already doing for v2). This should result in less bugs and better user experience. The only bad side effect from it might be that stat can show kernel memory usage as 0 (since the accounting is not enabled). [v2: add a warning in specconv that limits are ignored] Signed-off-by: Kir Kolyshkin --- README.md | 6 +-- contrib/completions/bash/runc | 2 - libcontainer/cgroups/fs/kmem.go | 56 ------------------------ libcontainer/cgroups/fs/kmem_disabled.go | 15 ------- libcontainer/cgroups/fs/memory.go | 38 +--------------- libcontainer/cgroups/fs/memory_test.go | 56 ------------------------ libcontainer/cgroups/systemd/v1.go | 33 -------------- libcontainer/configs/cgroup_linux.go | 6 --- libcontainer/integration/exec_test.go | 35 --------------- libcontainer/specconv/spec_linux.go | 8 ++-- libcontainer/specconv/spec_linux_test.go | 6 --- man/runc-update.8.md | 2 - tests/integration/cgroups.bats | 49 --------------------- tests/integration/helpers.bash | 6 --- update.go | 16 ++++--- 15 files changed, 16 insertions(+), 318 deletions(-) delete mode 100644 libcontainer/cgroups/fs/kmem.go delete mode 100644 libcontainer/cgroups/fs/kmem_disabled.go diff --git a/README.md b/README.md index 71337036456..2940a4f0ba3 100644 --- a/README.md +++ b/README.md @@ -61,18 +61,18 @@ sudo make install with some of them enabled by default (see `BUILDTAGS` in top-level `Makefile`). To change build tags from the default, set the `BUILDTAGS` variable for make, -e.g. +e.g. to disable seccomp: ```bash -make BUILDTAGS='seccomp' +make BUILDTAGS="" ``` | Build Tag | Feature | Enabled by default | Dependency | |-----------|------------------------------------|--------------------|------------| | seccomp | Syscall filtering | yes | libseccomp | -| nokmem | disable kernel memory accounting | no | | The following build tags were used earlier, but are now obsoleted: + - **nokmem** (since runc v1.0.0-rc94 kernel memory settings are ignored) - **apparmor** (since runc v1.0.0-rc93 the feature is always enabled) - **selinux** (since runc v1.0.0-rc93 the feature is always enabled) diff --git a/contrib/completions/bash/runc b/contrib/completions/bash/runc index 8b7c02a807a..165405e91d9 100644 --- a/contrib/completions/bash/runc +++ b/contrib/completions/bash/runc @@ -737,8 +737,6 @@ _runc_update() { --cpu-share --cpuset-cpus --cpuset-mems - --kernel-memory - --kernel-memory-tcp --memory --memory-reservation --memory-swap diff --git a/libcontainer/cgroups/fs/kmem.go b/libcontainer/cgroups/fs/kmem.go deleted file mode 100644 index 86858f9083e..00000000000 --- a/libcontainer/cgroups/fs/kmem.go +++ /dev/null @@ -1,56 +0,0 @@ -// +build linux,!nokmem - -package fs - -import ( - "errors" - "fmt" - "path/filepath" - "strconv" - - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" - "golang.org/x/sys/unix" -) - -const cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes" - -func EnableKernelMemoryAccounting(path string) error { - // Ensure that kernel memory is available in this kernel build. If it - // isn't, we just ignore it because EnableKernelMemoryAccounting is - // automatically called for all memory limits. - if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) { - return nil - } - // We have to limit the kernel memory here as it won't be accounted at all - // until a limit is set on the cgroup and limit cannot be set once the - // cgroup has children, or if there are already tasks in the cgroup. - for _, i := range []int64{1, -1} { - if err := setKernelMemory(path, i); err != nil { - return err - } - } - return nil -} - -func setKernelMemory(path string, kernelMemoryLimit int64) error { - if path == "" { - return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit) - } - if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) { - // We have specifically been asked to set a kmem limit. If the kernel - // doesn't support it we *must* error out. - return errors.New("kernel memory accounting not supported by this kernel") - } - if err := fscommon.WriteFile(path, cgroupKernelMemoryLimit, strconv.FormatInt(kernelMemoryLimit, 10)); err != nil { - // Check if the error number returned by the syscall is "EBUSY" - // The EBUSY signal is returned on attempts to write to the - // memory.kmem.limit_in_bytes file if the cgroup has children or - // once tasks have been attached to the cgroup - if errors.Is(err, unix.EBUSY) { - return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit) - } - return err - } - return nil -} diff --git a/libcontainer/cgroups/fs/kmem_disabled.go b/libcontainer/cgroups/fs/kmem_disabled.go deleted file mode 100644 index ac290fd7a02..00000000000 --- a/libcontainer/cgroups/fs/kmem_disabled.go +++ /dev/null @@ -1,15 +0,0 @@ -// +build linux,nokmem - -package fs - -import ( - "errors" -) - -func EnableKernelMemoryAccounting(path string) error { - return nil -} - -func setKernelMemory(path string, kernelMemoryLimit int64) error { - return errors.New("kernel memory accounting disabled in this runc build") -} diff --git a/libcontainer/cgroups/fs/memory.go b/libcontainer/cgroups/fs/memory.go index 5d52d325793..981757b9477 100644 --- a/libcontainer/cgroups/fs/memory.go +++ b/libcontainer/cgroups/fs/memory.go @@ -33,31 +33,6 @@ func (s *MemoryGroup) Name() string { } func (s *MemoryGroup) Apply(path string, d *cgroupData) (err error) { - if path == "" { - return nil - } - if memoryAssigned(d.config) { - if _, err := os.Stat(path); os.IsNotExist(err) { - if err := os.MkdirAll(path, 0755); err != nil { - return err - } - // Only enable kernel memory accouting when this cgroup - // is created by libcontainer, otherwise we might get - // error when people use `cgroupsPath` to join an existed - // cgroup whose kernel memory is not initialized. - if err := EnableKernelMemoryAccounting(path); err != nil { - return err - } - } - } - defer func() { - if err != nil { - os.RemoveAll(path) - } - }() - - // We need to join memory cgroup after set memory limits, because - // kmem.limit_in_bytes can only be set when the cgroup is empty. return join(path, d.pid) } @@ -140,11 +115,7 @@ func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error { return err } - if cgroup.Resources.KernelMemory != 0 { - if err := setKernelMemory(path, cgroup.Resources.KernelMemory); err != nil { - return err - } - } + // ignore KernelMemory and KernelMemoryTCP if cgroup.Resources.MemoryReservation != 0 { if err := fscommon.WriteFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil { @@ -152,11 +123,6 @@ func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error { } } - if cgroup.Resources.KernelMemoryTCP != 0 { - if err := fscommon.WriteFile(path, "memory.kmem.tcp.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemoryTCP, 10)); err != nil { - return err - } - } if cgroup.Resources.OomKillDisable { if err := fscommon.WriteFile(path, "memory.oom_control", "1"); err != nil { return err @@ -238,8 +204,6 @@ func memoryAssigned(cgroup *configs.Cgroup) bool { return cgroup.Resources.Memory != 0 || cgroup.Resources.MemoryReservation != 0 || cgroup.Resources.MemorySwap > 0 || - cgroup.Resources.KernelMemory > 0 || - cgroup.Resources.KernelMemoryTCP > 0 || cgroup.Resources.OomKillDisable || (cgroup.Resources.MemorySwappiness != nil && int64(*cgroup.Resources.MemorySwappiness) != -1) } diff --git a/libcontainer/cgroups/fs/memory_test.go b/libcontainer/cgroups/fs/memory_test.go index 44f1c9a9175..701c99da4a8 100644 --- a/libcontainer/cgroups/fs/memory_test.go +++ b/libcontainer/cgroups/fs/memory_test.go @@ -190,62 +190,6 @@ func TestMemorySetSwapSmallerThanMemory(t *testing.T) { } } -func TestMemorySetKernelMemory(t *testing.T) { - helper := NewCgroupTestUtil("memory", t) - defer helper.cleanup() - - const ( - kernelMemoryBefore = 314572800 // 300M - kernelMemoryAfter = 524288000 // 500M - ) - - helper.writeFileContents(map[string]string{ - "memory.kmem.limit_in_bytes": strconv.Itoa(kernelMemoryBefore), - }) - - helper.CgroupData.config.Resources.KernelMemory = kernelMemoryAfter - memory := &MemoryGroup{} - if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { - t.Fatal(err) - } - - value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.kmem.limit_in_bytes") - if err != nil { - t.Fatalf("Failed to parse memory.kmem.limit_in_bytes - %s", err) - } - if value != kernelMemoryAfter { - t.Fatal("Got the wrong value, set memory.kmem.limit_in_bytes failed.") - } -} - -func TestMemorySetKernelMemoryTCP(t *testing.T) { - helper := NewCgroupTestUtil("memory", t) - defer helper.cleanup() - - const ( - kernelMemoryTCPBefore = 314572800 // 300M - kernelMemoryTCPAfter = 524288000 // 500M - ) - - helper.writeFileContents(map[string]string{ - "memory.kmem.tcp.limit_in_bytes": strconv.Itoa(kernelMemoryTCPBefore), - }) - - helper.CgroupData.config.Resources.KernelMemoryTCP = kernelMemoryTCPAfter - memory := &MemoryGroup{} - if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { - t.Fatal(err) - } - - value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.kmem.tcp.limit_in_bytes") - if err != nil { - t.Fatalf("Failed to parse memory.kmem.tcp.limit_in_bytes - %s", err) - } - if value != kernelMemoryTCPAfter { - t.Fatal("Got the wrong value, set memory.kmem.tcp.limit_in_bytes failed.") - } -} - func TestMemorySetMemorySwappinessDefault(t *testing.T) { helper := NewCgroupTestUtil("memory", t) defer helper.cleanup() diff --git a/libcontainer/cgroups/systemd/v1.go b/libcontainer/cgroups/systemd/v1.go index 59a8b1e727e..bc901c9e107 100644 --- a/libcontainer/cgroups/systemd/v1.go +++ b/libcontainer/cgroups/systemd/v1.go @@ -12,7 +12,6 @@ import ( systemdDbus "github.com/coreos/go-systemd/v22/dbus" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fs" - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" "github.com/sirupsen/logrus" ) @@ -170,14 +169,6 @@ func (m *legacyManager) Apply(pid int) error { } properties = append(properties, c.SystemdProps...) - // We have to set kernel memory here, as we can't change it once - // processes have been attached to the cgroup. - if c.Resources.KernelMemory != 0 { - if err := enableKmem(c); err != nil { - return err - } - } - if err := startUnit(dbusConnection, unitName, properties); err != nil { return err } @@ -404,30 +395,6 @@ func (m *legacyManager) Set(container *configs.Config) error { return nil } -func enableKmem(c *configs.Cgroup) error { - path, err := getSubsystemPath(c, "memory") - if err != nil { - if cgroups.IsNotFound(err) { - return nil - } - return err - } - - if err := os.MkdirAll(path, 0755); err != nil { - return err - } - // do not try to enable the kernel memory if we already have - // tasks in the cgroup. - content, err := fscommon.ReadFile(path, "tasks") - if err != nil { - return err - } - if len(content) > 0 { - return nil - } - return fs.EnableKernelMemoryAccounting(path) -} - func (m *legacyManager) GetPaths() map[string]string { m.mu.Lock() defer m.mu.Unlock() diff --git a/libcontainer/configs/cgroup_linux.go b/libcontainer/configs/cgroup_linux.go index aada5d62f19..87d0da84288 100644 --- a/libcontainer/configs/cgroup_linux.go +++ b/libcontainer/configs/cgroup_linux.go @@ -54,12 +54,6 @@ type Resources struct { // Total memory usage (memory + swap); set `-1` to enable unlimited swap MemorySwap int64 `json:"memory_swap"` - // Kernel memory limit (in bytes) - KernelMemory int64 `json:"kernel_memory"` - - // Kernel memory limit for TCP use (in bytes) - KernelMemoryTCP int64 `json:"kernel_memory_tcp"` - // CPU shares (relative weight vs. other containers) CpuShares uint64 `json:"cpu_shares"` diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go index 783af0efb34..4a9b52b6ae0 100644 --- a/libcontainer/integration/exec_test.go +++ b/libcontainer/integration/exec_test.go @@ -667,41 +667,6 @@ func testPids(t *testing.T, systemd bool) { // As such, we don't test that case. YMMV. } -func TestRunWithKernelMemory(t *testing.T) { - testRunWithKernelMemory(t, false) -} - -func TestRunWithKernelMemorySystemd(t *testing.T) { - if !systemd.IsRunningSystemd() { - t.Skip("Systemd is unsupported") - } - testRunWithKernelMemory(t, true) -} - -func testRunWithKernelMemory(t *testing.T, systemd bool) { - if testing.Short() { - return - } - if cgroups.IsCgroup2UnifiedMode() { - t.Skip("cgroup v2 does not support kernel memory limit") - } - - rootfs, err := newRootfs() - ok(t, err) - defer remove(rootfs) - - config := newTemplateConfig(&tParam{ - rootfs: rootfs, - systemd: systemd, - }) - config.Cgroups.Resources.KernelMemory = 52428800 - - _, _, err = runContainer(config, "", "ps") - if err != nil { - t.Fatalf("runContainer failed with kernel memory limit: %v", err) - } -} - func TestCgroupResourcesUnifiedErrorOnV1(t *testing.T) { testCgroupResourcesUnifiedErrorOnV1(t, false) } diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index 6d4042d04f4..cf9fa6cbc90 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -21,6 +21,7 @@ import ( "github.com/opencontainers/runc/libcontainer/seccomp" libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) @@ -510,11 +511,8 @@ func CreateCgroupConfig(opts *CreateOpts, defaultDevs []*devices.Device) (*confi if r.Memory.Swap != nil { c.Resources.MemorySwap = *r.Memory.Swap } - if r.Memory.Kernel != nil { - c.Resources.KernelMemory = *r.Memory.Kernel - } - if r.Memory.KernelTCP != nil { - c.Resources.KernelMemoryTCP = *r.Memory.KernelTCP + if r.Memory.Kernel != nil || r.Memory.KernelTCP != nil { + logrus.Warn("Kernel memory settings are ignored and will be removed") } if r.Memory.Swappiness != nil { c.Resources.MemorySwappiness = r.Memory.Swappiness diff --git a/libcontainer/specconv/spec_linux_test.go b/libcontainer/specconv/spec_linux_test.go index dc880f32c33..f3c543a5822 100644 --- a/libcontainer/specconv/spec_linux_test.go +++ b/libcontainer/specconv/spec_linux_test.go @@ -276,12 +276,6 @@ func TestLinuxCgroupWithMemoryResource(t *testing.T) { if cgroup.Resources.MemorySwap != swap { t.Errorf("Expected to have %d as swap, got %d", swap, cgroup.Resources.MemorySwap) } - if cgroup.Resources.KernelMemory != kernel { - t.Errorf("Expected to have %d as Kernel Memory, got %d", kernel, cgroup.Resources.KernelMemory) - } - if cgroup.Resources.KernelMemoryTCP != kernelTCP { - t.Errorf("Expected to have %d as TCP Kernel Memory, got %d", kernelTCP, cgroup.Resources.KernelMemoryTCP) - } if cgroup.Resources.MemorySwappiness != swappinessPtr { t.Errorf("Expected to have %d as memory swappiness, got %d", swappinessPtr, cgroup.Resources.MemorySwappiness) } diff --git a/man/runc-update.8.md b/man/runc-update.8.md index fa269d62b20..5c02e451cb8 100644 --- a/man/runc-update.8.md +++ b/man/runc-update.8.md @@ -45,8 +45,6 @@ other options are ignored. --cpu-share value CPU shares (relative weight vs. other containers) --cpuset-cpus value CPU(s) to use --cpuset-mems value Memory node(s) to use - --kernel-memory value Kernel memory limit (in bytes) - --kernel-memory-tcp value Kernel memory limit (in bytes) for tcp buffer --memory value Memory limit (in bytes) --memory-reservation value Memory reservation or soft_limit (in bytes) --memory-swap value Total memory usage (memory + swap); set '-1' to enable unlimited swap diff --git a/tests/integration/cgroups.bats b/tests/integration/cgroups.bats index a3f19c037e5..5a7efebc60b 100644 --- a/tests/integration/cgroups.bats +++ b/tests/integration/cgroups.bats @@ -10,55 +10,6 @@ function setup() { setup_busybox } -@test "runc update --kernel-memory{,-tcp} (initialized)" { - [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup - requires cgroups_kmem - - set_cgroups_path - - # Set some initial known values - update_config '.linux.resources.memory |= {"kernel": 16777216, "kernelTCP": 11534336}' - - # run a detached busybox to work with - runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroups_kmem - [ "$status" -eq 0 ] - - check_cgroup_value "memory.kmem.limit_in_bytes" 16777216 - check_cgroup_value "memory.kmem.tcp.limit_in_bytes" 11534336 - - # update kernel memory limit - runc update test_cgroups_kmem --kernel-memory 50331648 - [ "$status" -eq 0 ] - check_cgroup_value "memory.kmem.limit_in_bytes" 50331648 - - # update kernel memory tcp limit - runc update test_cgroups_kmem --kernel-memory-tcp 41943040 - [ "$status" -eq 0 ] - check_cgroup_value "memory.kmem.tcp.limit_in_bytes" 41943040 -} - -@test "runc update --kernel-memory (uninitialized)" { - [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup - requires cgroups_kmem - - set_cgroups_path - - # run a detached busybox to work with - runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroups_kmem - [ "$status" -eq 0 ] - - # update kernel memory limit - runc update test_cgroups_kmem --kernel-memory 50331648 - # Since kernel 4.6, we can update kernel memory without initialization - # because it's accounted by default. - if [[ "$KERNEL_MAJOR" -lt 4 || ("$KERNEL_MAJOR" -eq 4 && "$KERNEL_MINOR" -le 5) ]]; then - [ ! "$status" -eq 0 ] - else - [ "$status" -eq 0 ] - check_cgroup_value "memory.kmem.limit_in_bytes" 50331648 - fi -} - @test "runc create (no limits + no cgrouppath + no permission) succeeds" { runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroups_permissions [ "$status" -eq 0 ] diff --git a/tests/integration/helpers.bash b/tests/integration/helpers.bash index a35ec62d71e..8bd4194aef4 100644 --- a/tests/integration/helpers.bash +++ b/tests/integration/helpers.bash @@ -313,12 +313,6 @@ function requires() { skip_me=1 fi ;; - cgroups_kmem) - init_cgroup_paths - if [ ! -e "${CGROUP_MEMORY_BASE_PATH}/memory.kmem.limit_in_bytes" ]; then - skip_me=1 - fi - ;; cgroups_rt) init_cgroup_paths if [ ! -e "${CGROUP_CPU_BASE_PATH}/cpu.rt_period_us" ]; then diff --git a/update.go b/update.go index 4e9b3f4f1d4..127a6e71d45 100644 --- a/update.go +++ b/update.go @@ -10,6 +10,7 @@ import ( "strconv" "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/sirupsen/logrus" "github.com/docker/go-units" "github.com/opencontainers/runc/libcontainer/configs" @@ -38,9 +39,7 @@ The accepted format is as follow (unchanged values can be omitted): "memory": { "limit": 0, "reservation": 0, - "swap": 0, - "kernel": 0, - "kernelTCP": 0 + "swap": 0 }, "cpu": { "shares": 0, @@ -95,11 +94,11 @@ other options are ignored. }, cli.StringFlag{ Name: "kernel-memory", - Usage: "Kernel memory limit (in bytes)", + Usage: "(obsoleted; do not use)", }, cli.StringFlag{ Name: "kernel-memory-tcp", - Usage: "Kernel memory limit (in bytes) for tcp buffer", + Usage: "(obsoleted; do not use)", }, cli.StringFlag{ Name: "memory", @@ -248,9 +247,14 @@ other options are ignored. *pair.dest = v } } + r.Pids.Limit = int64(context.Int("pids-limit")) } + if *r.Memory.Kernel != 0 || *r.Memory.KernelTCP != 0 { + logrus.Warn("Kernel memory settings are ignored and will be removed") + } + // Update the values config.Cgroups.Resources.BlkioWeight = *r.BlockIO.Weight @@ -288,8 +292,6 @@ other options are ignored. config.Cgroups.Resources.CpuRtRuntime = *r.CPU.RealtimeRuntime config.Cgroups.Resources.CpusetCpus = r.CPU.Cpus config.Cgroups.Resources.CpusetMems = r.CPU.Mems - config.Cgroups.Resources.KernelMemory = *r.Memory.Kernel - config.Cgroups.Resources.KernelMemoryTCP = *r.Memory.KernelTCP config.Cgroups.Resources.Memory = *r.Memory.Limit config.Cgroups.Resources.MemoryReservation = *r.Memory.Reservation config.Cgroups.Resources.MemorySwap = *r.Memory.Swap