From ff3b4f3bb4a5d1adf0de646b73eee495e1cf9a69 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Mon, 1 Aug 2022 15:44:44 -0700 Subject: [PATCH 1/7] restore: fix ignoring --manage-cgroups-mode Merge the logic of setPageServer, setManageCgroupsMode, and setEmptyNsMask into criuOptions. This does three things: 1. Fixes ignoring --manage-cgroups-mode on restore; 2. Simplifies the code in checkpoint.go and restore.go; 3. Ensures issues like 1 won't happen again. Signed-off-by: Kir Kolyshkin --- checkpoint.go | 86 ++++++++++++++++++++++++++++----------------------- restore.go | 28 ----------------- 2 files changed, 47 insertions(+), 67 deletions(-) diff --git a/checkpoint.go b/checkpoint.go index b8bfa045d6c..ac32bbb50d6 100644 --- a/checkpoint.go +++ b/checkpoint.go @@ -67,17 +67,6 @@ checkpointed.`, return err } - // these are the mandatory criu options for a container - if err := setPageServer(context, options); err != nil { - return err - } - if err := setManageCgroupsMode(context, options); err != nil { - return err - } - if err := setEmptyNsMask(context, options); err != nil { - return err - } - err = container.Checkpoint(options) if err == nil && !(options.LeaveRunning || options.PreDump) { // Destroy the container unless we tell CRIU to keep it. @@ -119,59 +108,78 @@ func prepareImagePaths(context *cli.Context) (string, string, error) { return imagePath, parentPath, nil } -func setPageServer(context *cli.Context, options *libcontainer.CriuOpts) error { - // xxx following criu opts are optional - // The dump image can be sent to a criu page server +func criuOptions(context *cli.Context) (*libcontainer.CriuOpts, error) { + imagePath, parentPath, err := prepareImagePaths(context) + if err != nil { + return nil, err + } + + opts := &libcontainer.CriuOpts{ + ImagesDirectory: imagePath, + WorkDirectory: context.String("work-path"), + ParentImage: parentPath, + LeaveRunning: context.Bool("leave-running"), + TcpEstablished: context.Bool("tcp-established"), + ExternalUnixConnections: context.Bool("ext-unix-sk"), + ShellJob: context.Bool("shell-job"), + FileLocks: context.Bool("file-locks"), + PreDump: context.Bool("pre-dump"), + AutoDedup: context.Bool("auto-dedup"), + LazyPages: context.Bool("lazy-pages"), + StatusFd: context.Int("status-fd"), + LsmProfile: context.String("lsm-profile"), + LsmMountContext: context.String("lsm-mount-context"), + } + + // CRIU options below may or may not be set. + if psOpt := context.String("page-server"); psOpt != "" { address, port, err := net.SplitHostPort(psOpt) if err != nil || address == "" || port == "" { - return errors.New("Use --page-server ADDRESS:PORT to specify page server") + return nil, errors.New("Use --page-server ADDRESS:PORT to specify page server") } portInt, err := strconv.Atoi(port) if err != nil { - return errors.New("Invalid port number") + return nil, errors.New("Invalid port number") } - options.PageServer = libcontainer.CriuPageServerInfo{ + opts.PageServer = libcontainer.CriuPageServerInfo{ Address: address, Port: int32(portInt), } } - return nil -} -func setManageCgroupsMode(context *cli.Context, options *libcontainer.CriuOpts) error { if cgOpt := context.String("manage-cgroups-mode"); cgOpt != "" { switch cgOpt { case "soft": - options.ManageCgroupsMode = criu.CriuCgMode_SOFT + opts.ManageCgroupsMode = criu.CriuCgMode_SOFT case "full": - options.ManageCgroupsMode = criu.CriuCgMode_FULL + opts.ManageCgroupsMode = criu.CriuCgMode_FULL case "strict": - options.ManageCgroupsMode = criu.CriuCgMode_STRICT + opts.ManageCgroupsMode = criu.CriuCgMode_STRICT default: - return errors.New("Invalid manage cgroups mode") + return nil, errors.New("Invalid manage cgroups mode") } } - return nil -} -var namespaceMapping = map[specs.LinuxNamespaceType]int{ - specs.NetworkNamespace: unix.CLONE_NEWNET, -} - -func setEmptyNsMask(context *cli.Context, options *libcontainer.CriuOpts) error { - /* Runc doesn't manage network devices and their configuration */ + // runc doesn't manage network devices and their configuration. nsmask := unix.CLONE_NEWNET - for _, ns := range context.StringSlice("empty-ns") { - f, exists := namespaceMapping[specs.LinuxNamespaceType(ns)] - if !exists { - return fmt.Errorf("namespace %q is not supported", ns) + if context.IsSet("empty-ns") { + namespaceMapping := map[specs.LinuxNamespaceType]int{ + specs.NetworkNamespace: unix.CLONE_NEWNET, + } + + for _, ns := range context.StringSlice("empty-ns") { + f, exists := namespaceMapping[specs.LinuxNamespaceType(ns)] + if !exists { + return nil, fmt.Errorf("namespace %q is not supported", ns) + } + nsmask |= f } - nsmask |= f } - options.EmptyNs = uint32(nsmask) - return nil + opts.EmptyNs = uint32(nsmask) + + return opts, nil } diff --git a/restore.go b/restore.go index ccd1b232bc9..30ce89a2d9a 100644 --- a/restore.go +++ b/restore.go @@ -3,7 +3,6 @@ package main import ( "os" - "github.com/opencontainers/runc/libcontainer" "github.com/opencontainers/runc/libcontainer/userns" "github.com/sirupsen/logrus" "github.com/urfave/cli" @@ -113,9 +112,6 @@ using the runc checkpoint command.`, if err != nil { return err } - if err := setEmptyNsMask(context, options); err != nil { - return err - } status, err := startContainer(context, CT_ACT_RESTORE, options) if err != nil { return err @@ -126,27 +122,3 @@ using the runc checkpoint command.`, return nil }, } - -func criuOptions(context *cli.Context) (*libcontainer.CriuOpts, error) { - imagePath, parentPath, err := prepareImagePaths(context) - if err != nil { - return nil, err - } - - return &libcontainer.CriuOpts{ - ImagesDirectory: imagePath, - WorkDirectory: context.String("work-path"), - ParentImage: parentPath, - LeaveRunning: context.Bool("leave-running"), - TcpEstablished: context.Bool("tcp-established"), - ExternalUnixConnections: context.Bool("ext-unix-sk"), - ShellJob: context.Bool("shell-job"), - FileLocks: context.Bool("file-locks"), - PreDump: context.Bool("pre-dump"), - AutoDedup: context.Bool("auto-dedup"), - LazyPages: context.Bool("lazy-pages"), - StatusFd: context.Int("status-fd"), - LsmProfile: context.String("lsm-profile"), - LsmMountContext: context.String("lsm-mount-context"), - }, nil -} From 212d25e853adb4194e05a124ce81ee50d8dd6b16 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Mon, 1 Aug 2022 16:02:48 -0700 Subject: [PATCH 2/7] checkpoint/restore: add --manage-cgroups-mode ignore - add the new mode and document it; - slightly improve the --help output; - slightly simplify the parsing code. Signed-off-by: Kir Kolyshkin --- checkpoint.go | 26 ++++++++++++++------------ man/runc-checkpoint.8.md | 2 +- man/runc-restore.8.md | 2 +- restore.go | 2 +- 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/checkpoint.go b/checkpoint.go index ac32bbb50d6..a8a27f248bc 100644 --- a/checkpoint.go +++ b/checkpoint.go @@ -38,7 +38,7 @@ checkpointed.`, cli.StringFlag{Name: "page-server", Value: "", Usage: "ADDRESS:PORT of the page server"}, cli.BoolFlag{Name: "file-locks", Usage: "handle file locks, for safety"}, cli.BoolFlag{Name: "pre-dump", Usage: "dump container's memory information only, leave the container running after this"}, - cli.StringFlag{Name: "manage-cgroups-mode", Value: "", Usage: "cgroups mode: 'soft' (default), 'full' and 'strict'"}, + cli.StringFlag{Name: "manage-cgroups-mode", Value: "", Usage: "cgroups mode: soft|full|strict|ignore (default: soft)"}, cli.StringSliceFlag{Name: "empty-ns", Usage: "create a namespace, but don't restore its properties"}, cli.BoolFlag{Name: "auto-dedup", Usage: "enable auto deduplication of memory images"}, }, @@ -149,17 +149,19 @@ func criuOptions(context *cli.Context) (*libcontainer.CriuOpts, error) { } } - if cgOpt := context.String("manage-cgroups-mode"); cgOpt != "" { - switch cgOpt { - case "soft": - opts.ManageCgroupsMode = criu.CriuCgMode_SOFT - case "full": - opts.ManageCgroupsMode = criu.CriuCgMode_FULL - case "strict": - opts.ManageCgroupsMode = criu.CriuCgMode_STRICT - default: - return nil, errors.New("Invalid manage cgroups mode") - } + switch context.String("manage-cgroups-mode") { + case "": + // do nothing + case "soft": + opts.ManageCgroupsMode = criu.CriuCgMode_SOFT + case "full": + opts.ManageCgroupsMode = criu.CriuCgMode_FULL + case "strict": + opts.ManageCgroupsMode = criu.CriuCgMode_STRICT + case "ignore": + opts.ManageCgroupsMode = criu.CriuCgMode_IGNORE + default: + return nil, errors.New("Invalid manage-cgroups-mode value") } // runc doesn't manage network devices and their configuration. diff --git a/man/runc-checkpoint.8.md b/man/runc-checkpoint.8.md index 373259d4ccf..a7dad29d3b3 100644 --- a/man/runc-checkpoint.8.md +++ b/man/runc-checkpoint.8.md @@ -57,7 +57,7 @@ together with **criu lazy-pages**. See : Do a pre-dump, i.e. dump container's memory information only, leaving the container running. See [criu iterative migration](https://criu.org/Iterative_migration). -**--manage-cgroups-mode** **soft**|**full**|**strict**. +**--manage-cgroups-mode** **soft**|**full**|**strict**|**ignore**. : Cgroups mode. Default is **soft**. See [criu --manage-cgroups option](https://criu.org/CLI/opt/--manage-cgroups). diff --git a/man/runc-restore.8.md b/man/runc-restore.8.md index a2b3da6c6fa..4f9d55f443a 100644 --- a/man/runc-restore.8.md +++ b/man/runc-restore.8.md @@ -37,7 +37,7 @@ image files directory. : Allow checkpoint/restore of file locks. See [criu --file-locks option](https://criu.org/CLI/opt/--file-locks). -**--manage-cgroups-mode** **soft**|**full**|**strict**. +**--manage-cgroups-mode** **soft**|**full**|**strict**|**ignore**. : Cgroups mode. Default is **soft**. See [criu --manage-cgroups option](https://criu.org/CLI/opt/--manage-cgroups). diff --git a/restore.go b/restore.go index 30ce89a2d9a..d65afcfc788 100644 --- a/restore.go +++ b/restore.go @@ -52,7 +52,7 @@ using the runc checkpoint command.`, cli.StringFlag{ Name: "manage-cgroups-mode", Value: "", - Usage: "cgroups mode: 'soft' (default), 'full' and 'strict'", + Usage: "cgroups mode: soft|full|strict|ignore (default: soft)", }, cli.StringFlag{ Name: "bundle, b", From 3438ef30b259fa4a7b11e2cd6eb496696318b886 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Mon, 1 Aug 2022 16:05:00 -0700 Subject: [PATCH 3/7] restore: fix --manage-cgroups-mode ignore on cgroup v2 When manage-cgroups-mode: ignore is used, criu still needs to know the cgroup path to work properly (see [1]). Revert "libct/criuApplyCgroups: don't set cgroup paths for v2" This reverts commit d5c57dcea6d8b19c0f87fc0a212d3020215af06e. [1]: https://github.com/checkpoint-restore/criu/issues/1793#issuecomment-1086675168 Signed-off-by: Kir Kolyshkin --- libcontainer/container_linux.go | 5 ----- 1 file changed, 5 deletions(-) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index c099e458aed..a3f4c44f8b4 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -1560,11 +1560,6 @@ func (c *Container) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { return err } - if cgroups.IsCgroup2UnifiedMode() { - return nil - } - // the stuff below is cgroupv1-specific - path := fmt.Sprintf("/proc/%d/cgroup", pid) cgroupsPaths, err := cgroups.ParseCgroupFile(path) if err != nil { From e8cf8783d1245a7ffd86e88d42da1532a324e5c4 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Mon, 1 Aug 2022 16:29:08 -0700 Subject: [PATCH 4/7] libct/criuApplyCgroups: add a TODO I don't want to implement it now, because this might result in some new issues, but this is definitely something that is worth implementing. Signed-off-by: Kir Kolyshkin --- libcontainer/container_linux.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index a3f4c44f8b4..b498e33bce2 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -1560,6 +1560,8 @@ func (c *Container) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { return err } + // TODO(@kolyshkin): should we use c.cgroupManager.GetPaths() + // instead of reading /proc/pid/cgroup? path := fmt.Sprintf("/proc/%d/cgroup", pid) cgroupsPaths, err := cgroups.ParseCgroupFile(path) if err != nil { From d4582ae2f1956ed0be25658ed54a89ce431a1942 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Mon, 1 Aug 2022 18:15:58 -0700 Subject: [PATCH 5/7] tests/int: add "--manage-cgroups-mode ignore" test This test checks that the container is restored into a different cgroup. To do so, a user should - use --manage-cgroups-mode ignore on both checkpoint and restore; - change the cgroupsPath value in config.json before restoring. The test does some checks to ensure that its logic is correct, and that after the restore the old (original) cgroup does not exist, the new one exists, and the container's init is in that new cgroup. Signed-off-by: Kir Kolyshkin --- tests/integration/checkpoint.bats | 41 +++++++++++++++++++++++++++++++ tests/integration/helpers.bash | 30 +++++++++++++--------- 2 files changed, 60 insertions(+), 11 deletions(-) diff --git a/tests/integration/checkpoint.bats b/tests/integration/checkpoint.bats index 615ef8cb65f..fe351e7476a 100644 --- a/tests/integration/checkpoint.bats +++ b/tests/integration/checkpoint.bats @@ -405,3 +405,44 @@ function simple_cr() { # busybox should be back up and running testcontainer test_busybox running } + +@test "checkpoint then restore into a different cgroup (via --manage-cgroups-mode ignore)" { + set_resources_limit + set_cgroups_path + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + testcontainer test_busybox running + + local orig_path + orig_path=$(get_cgroup_path "pids") + # Check that the cgroup exists. + test -d "$orig_path" + + runc checkpoint --work-path ./work-dir --manage-cgroups-mode ignore test_busybox + grep -B 5 Error ./work-dir/dump.log || true + [ "$status" -eq 0 ] + testcontainer test_busybox checkpointed + # Check that the cgroup is gone. + ! test -d "$orig_path" + + # Restore into a different cgroup. + set_cgroups_path # Changes the path. + runc restore -d --manage-cgroups-mode ignore --pid-file pid \ + --work-path ./work-dir --console-socket "$CONSOLE_SOCKET" test_busybox + grep -B 5 Error ./work-dir/restore.log || true + [ "$status" -eq 0 ] + testcontainer test_busybox running + + # Check that the old cgroup path doesn't exist. + ! test -d "$orig_path" + + # Check that the new path exists. + local new_path + new_path=$(get_cgroup_path "pids") + test -d "$new_path" + + # Check that container's init is in the new cgroup. + local pid + pid=$(cat "pid") + grep -q "${REL_CGROUPS_PATH}$" "/proc/$pid/cgroup" +} diff --git a/tests/integration/helpers.bash b/tests/integration/helpers.bash index a9b70ffd887..f07996450b7 100644 --- a/tests/integration/helpers.bash +++ b/tests/integration/helpers.bash @@ -232,19 +232,27 @@ function set_cgroups_path() { update_config '.linux.cgroupsPath |= "'"${OCI_CGROUPS_PATH}"'"' } -# Get a value from a cgroup file. -function get_cgroup_value() { - local source=$1 - local cgroup var current - +# Get a path to cgroup directory, based on controller name. +# Parameters: +# $1: controller name (like "pids") or a file name (like "pids.max"). +function get_cgroup_path() { if [ -v CGROUP_V2 ]; then - cgroup=$CGROUP_PATH - else - var=${source%%.*} # controller name (e.g. memory) - var=CGROUP_${var^^}_BASE_PATH # variable name (e.g. CGROUP_MEMORY_BASE_PATH) - eval cgroup=\$"${var}${REL_CGROUPS_PATH}" + echo "$CGROUP_PATH" + return fi - cat "$cgroup/$source" + + local var cgroup + var=${1%%.*} # controller name (e.g. memory) + var=CGROUP_${var^^}_BASE_PATH # variable name (e.g. CGROUP_MEMORY_BASE_PATH) + eval cgroup=\$"${var}${REL_CGROUPS_PATH}" + echo "$cgroup" +} + +# Get a value from a cgroup file. +function get_cgroup_value() { + local cgroup + cgroup="$(get_cgroup_path "$1")" + cat "$cgroup/$1" } # Helper to check a if value in a cgroup file matches the expected one. From 683528783e38bb058c680bc8fa472b6a778f5904 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 4 Aug 2022 11:56:15 -0700 Subject: [PATCH 6/7] man/runc-restore: describe restore into different cgroup Signed-off-by: Kir Kolyshkin --- man/runc-restore.8.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/man/runc-restore.8.md b/man/runc-restore.8.md index 4f9d55f443a..eab50db9717 100644 --- a/man/runc-restore.8.md +++ b/man/runc-restore.8.md @@ -41,6 +41,11 @@ image files directory. : Cgroups mode. Default is **soft**. See [criu --manage-cgroups option](https://criu.org/CLI/opt/--manage-cgroups). +: In particular, to restore the container into a different cgroup, +**--manage-cgroups-mode ignore** must be used during both +**checkpoint** and **restore**, and the _container_id_ (or +**cgroupsPath** property in OCI config, if set) must be changed. + **--bundle**|**-b** _path_ : Path to the root of the bundle directory. Default is current directory. From c4aa452b18ae1c47596d6ce7623c90ff22fa9dcd Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 4 Aug 2022 11:57:31 -0700 Subject: [PATCH 7/7] tests/int/checkpoint: fix lazy migration flakiness When doing a lazy checkpoint/restore, we should not restore into the same cgroup, otherwise there is a race which result in occasional killing of the restored container (GH #2760, #2924). The fix is to use --manage-cgroup-mode=ignore, which allows to restore into a different cgroup. Note that since cgroupsPath is not set in config.json, the cgroup is derived from the container name, so calling set_cgroups_path is not needed. For the previous (unsuccessful) attempt to fix this, as well as detailed (and apparently correct) analysis, see commit 36fe3cc28c35d7157dc8b. Signed-off-by: Kir Kolyshkin --- tests/integration/checkpoint.bats | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/tests/integration/checkpoint.bats b/tests/integration/checkpoint.bats index fe351e7476a..0a8e58a2c9c 100644 --- a/tests/integration/checkpoint.bats +++ b/tests/integration/checkpoint.bats @@ -224,7 +224,14 @@ function simple_cr() { # TCP port for lazy migration port=27277 - __runc checkpoint --lazy-pages --page-server 0.0.0.0:${port} --status-fd ${lazy_w} --work-path ./work-dir --image-path ./image-dir test_busybox & + __runc checkpoint \ + --lazy-pages \ + --page-server 0.0.0.0:${port} \ + --status-fd ${lazy_w} \ + --manage-cgroups-mode=ignore \ + --work-path ./work-dir \ + --image-path ./image-dir \ + test_busybox & cpt_pid=$! # wait for lazy page server to be ready @@ -246,14 +253,18 @@ function simple_cr() { lp_pid=$! # Restore lazily from checkpoint. - # The restored container needs a different name (as well as systemd - # unit name, in case systemd cgroup driver is used) as the checkpointed - # container is not yet destroyed. It is only destroyed at that point - # in time when the last page is lazily transferred to the destination. + # + # The restored container needs a different name and a different cgroup + # (and a different systemd unit name, in case systemd cgroup driver is + # used) as the checkpointed container is not yet destroyed. It is only + # destroyed at that point in time when the last page is lazily + # transferred to the destination. + # # Killing the CRIU on the checkpoint side will let the container # continue to run if the migration failed at some point. - [ -v RUNC_USE_SYSTEMD ] && set_cgroups_path - runc_restore_with_pipes ./image-dir test_busybox_restore --lazy-pages + runc_restore_with_pipes ./image-dir test_busybox_restore \ + --lazy-pages \ + --manage-cgroups-mode=ignore wait $cpt_pid