From 3a4b3af661830b181136dcbd983653b7e51f4ece Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Fri, 9 Jun 2023 14:05:16 -0700 Subject: [PATCH 1/5] tests/int/cgroups: remove useless/wrong setting There is no such thing as linux.resources.memorySwap (the mem+swap is set as linux.resources.memory.swap). As it is not used in this test anyway, remove it. Fixes: 4929c05ad18ab99686040d8b40a969c3f884717c Signed-off-by: Kir Kolyshkin (cherry picked from commit dacb3aaa0dc11b3af7d6d6ef1fb93536f3530b3a) Signed-off-by: Kir Kolyshkin --- tests/integration/cgroups.bats | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/cgroups.bats b/tests/integration/cgroups.bats index 589e3d8f878..e6e136e7f16 100644 --- a/tests/integration/cgroups.bats +++ b/tests/integration/cgroups.bats @@ -233,7 +233,6 @@ function setup() { set_cgroups_path # CPU shares of 3333 corresponds to CPU weight of 128. update_config ' .linux.resources.memory |= {"limit": 33554432} - | .linux.resources.memorySwap |= {"limit": 33554432} | .linux.resources.cpu |= { "shares": 3333, "quota": 40000, From 71e760076896951f2ec26f4eec636fb7fd03ac60 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Mon, 5 Jun 2023 18:28:44 -0700 Subject: [PATCH 2/5] libct/cg/sd: remove logging from resetFailedUnit Sometimes we call resetFailedUnit as a cleanup measure, and we don't care if it fails or not. So, move error reporting to its callers, and ignore error in cases we don't really expect it to succeed. Signed-off-by: Kir Kolyshkin (cherry picked from commit 91b4cd25b7c698ff158afddc8d918546d3966fdc) Signed-off-by: Kir Kolyshkin --- libcontainer/cgroups/systemd/common.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/libcontainer/cgroups/systemd/common.go b/libcontainer/cgroups/systemd/common.go index 5d561facebc..e3e0dabd8aa 100644 --- a/libcontainer/cgroups/systemd/common.go +++ b/libcontainer/cgroups/systemd/common.go @@ -370,7 +370,10 @@ retry: // In case a unit with the same name exists, this may // be a leftover failed unit. Reset it, so systemd can // remove it, and retry once. - resetFailedUnit(cm, unitName) + err = resetFailedUnit(cm, unitName) + if err != nil { + logrus.Warnf("unable to reset failed unit: %v", err) + } retry = false goto retry } @@ -385,11 +388,11 @@ retry: close(statusChan) // Please refer to https://pkg.go.dev/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit if s != "done" { - resetFailedUnit(cm, unitName) + _ = resetFailedUnit(cm, unitName) return fmt.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s) } case <-timeout.C: - resetFailedUnit(cm, unitName) + _ = resetFailedUnit(cm, unitName) return errors.New("Timeout waiting for systemd to create " + unitName) } @@ -420,13 +423,10 @@ func stopUnit(cm *dbusConnManager, unitName string) error { return nil } -func resetFailedUnit(cm *dbusConnManager, name string) { - err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { +func resetFailedUnit(cm *dbusConnManager, name string) error { + return cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { return c.ResetFailedUnitContext(context.TODO(), name) }) - if err != nil { - logrus.Warnf("unable to reset failed unit: %v", err) - } } func getUnitTypeProperty(cm *dbusConnManager, unitName string, unitType string, propertyName string) (*systemdDbus.Property, error) { From 1188c5a192743aac472430c4713c1a3fe0d95a8e Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Mon, 5 Jun 2023 12:09:04 -0700 Subject: [PATCH 3/5] runc delete: call systemd's reset-failed runc delete is supposed to remove all the container's artefacts. In case systemd cgroup driver is used, and the systemd unit has failed (e.g. oom-killed), systemd won't remove the unit (that is, unless the "CollectMode: inactive-or-failed" property is set). Call reset-failed from manager.Destroy so the failed unit will be removed during "runc delete". Signed-off-by: Kir Kolyshkin (cherry picked from commit 43564a7b55a9299a8ef0ddaf5d01295b7a418928) Signed-off-by: Kir Kolyshkin --- libcontainer/cgroups/systemd/common.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libcontainer/cgroups/systemd/common.go b/libcontainer/cgroups/systemd/common.go index e3e0dabd8aa..c5b476e2cc8 100644 --- a/libcontainer/cgroups/systemd/common.go +++ b/libcontainer/cgroups/systemd/common.go @@ -420,6 +420,10 @@ func stopUnit(cm *dbusConnManager, unitName string) error { return errors.New("Timed out while waiting for systemd to remove " + unitName) } } + + // In case of a failed unit, let systemd remove it. + _ = resetFailedUnit(cm, unitName) + return nil } From ebdd4fa6b17ed55541c40cde40da5d7e11438881 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 13 Jun 2023 15:04:30 -0700 Subject: [PATCH 4/5] [1.1] tests/int: add "requires systemd_vNNN" Signed-off-by: Kir Kolyshkin (cherry picked from commit 58a811f6aaa6051fdf807ac1de9f95d74589dc69) Signed-off-by: Kir Kolyshkin --- tests/integration/helpers.bash | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/integration/helpers.bash b/tests/integration/helpers.bash index b2575544810..23e32e978c2 100644 --- a/tests/integration/helpers.bash +++ b/tests/integration/helpers.bash @@ -441,6 +441,12 @@ function requires() { skip_me=1 fi ;; + systemd_v*) + var=${var#systemd_v} + if [ "$(systemd_version)" -lt "$var" ]; then + skip "requires systemd >= v${var}" + fi + ;; no_systemd) if [ -n "${RUNC_USE_SYSTEMD}" ]; then skip_me=1 From ef6491ec9f2ee6a833e4a4837f1aca067135cc4b Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Mon, 12 Jun 2023 17:24:24 -0700 Subject: [PATCH 5/5] tests/int/delete: make sure runc delete removes failed unit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The passing run (with the fix) looks like this: ---- delete.bats ✓ runc delete removes failed systemd unit [4556] runc spec (status=0): runc run -d --console-socket /tmp/bats-run-B08vu1/runc.lbQwU5/tty/sock test-failed-unit (status=0): Warning: The unit file, source configuration file or drop-ins of runc-cgroups-integration-test-12869.scope changed on disk. Run 'systemctl daemon-reload' to reload units. × runc-cgroups-integration-test-12869.scope - libcontainer container integration-test-12869 Loaded: loaded (/run/systemd/transient/runc-cgroups-integration-test-12869.scope; transient) Transient: yes Drop-In: /run/systemd/transient/runc-cgroups-integration-test-12869.scope.d └─50-DevicePolicy.conf, 50-DeviceAllow.conf Active: failed (Result: timeout) since Tue 2023-06-13 14:41:38 PDT; 751ms ago Duration: 2.144s CPU: 8ms Jun 13 14:41:34 kir-rhat systemd[1]: Started runc-cgroups-integration-test-12869.scope - libcontainer container integration-test-12869. Jun 13 14:41:37 kir-rhat systemd[1]: runc-cgroups-integration-test-12869.scope: Scope reached runtime time limit. Stopping. Jun 13 14:41:38 kir-rhat systemd[1]: runc-cgroups-integration-test-12869.scope: Stopping timed out. Killing. Jun 13 14:41:38 kir-rhat systemd[1]: runc-cgroups-integration-test-12869.scope: Killing process 1107438 (sleep) with signal SIGKILL. Jun 13 14:41:38 kir-rhat systemd[1]: runc-cgroups-integration-test-12869.scope: Failed with result 'timeout'. runc delete test-failed-unit (status=0): Unit runc-cgroups-integration-test-12869.scope could not be found. ---- Before the fix, the test was failing like this: ---- delete.bats ✗ runc delete removes failed systemd unit (in test file tests/integration/delete.bats, line 194) `run -4 systemctl status "$SD_UNIT_NAME"' failed, expected exit code 4, got 3 .... ---- Signed-off-by: Kir Kolyshkin (cherry picked from commit ad040b1caf545928a465eef69f6a6c7436cf7e57) Signed-off-by: Kir Kolyshkin --- tests/integration/delete.bats | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/integration/delete.bats b/tests/integration/delete.bats index bb0205c4ebf..adcca2c46b3 100644 --- a/tests/integration/delete.bats +++ b/tests/integration/delete.bats @@ -168,3 +168,30 @@ EOF # check delete subcgroups success [ ! -d "$CGROUP_PATH"/foo ] } + +@test "runc delete removes failed systemd unit" { + requires systemd_v244 # Older systemd lacks RuntimeMaxSec support. + + set_cgroups_path + # shellcheck disable=SC2016 + update_config ' .annotations += { + "org.systemd.property.RuntimeMaxSec": "2", + "org.systemd.property.TimeoutStopSec": "1" + } + | .process.args |= ["/bin/sleep", "10"]' + + runc run -d --console-socket "$CONSOLE_SOCKET" test-failed-unit + [ "$status" -eq 0 ] + + wait_for_container 10 1 test-failed-unit stopped + + local user="" + [ $EUID -ne 0 ] && user="--user" + + # Expect "unit is not active" exit code. + run -3 systemctl status $user "$SD_UNIT_NAME" + + runc delete test-failed-unit + # Expect "no such unit" exit code. + run -4 systemctl status $user "$SD_UNIT_NAME" +}