From 53ea2ae660dcb94815065cbd3c8bf47a0cd54db3 Mon Sep 17 00:00:00 2001 From: Felipe Santos Date: Sun, 10 Oct 2021 13:53:51 -0300 Subject: [PATCH 01/11] Disable rootless optimizations if cgroupv2 is not present Signed-off-by: Felipe Santos --- images/base/files/usr/local/bin/entrypoint | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/images/base/files/usr/local/bin/entrypoint b/images/base/files/usr/local/bin/entrypoint index bf8fb7cedd..d852311162 100755 --- a/images/base/files/usr/local/bin/entrypoint +++ b/images/base/files/usr/local/bin/entrypoint @@ -24,7 +24,7 @@ set -o pipefail userns="" if grep -Eqv "0[[:space:]]+0[[:space:]]+4294967295" /proc/self/uid_map; then userns="1" - echo 'INFO: running in a user namespace (experimental)' + echo "INFO: running in a user namespace (experimental)" >&2 fi validate_userns() { @@ -41,7 +41,9 @@ validate_userns() { if [[ ! -f "/sys/fs/cgroup/cgroup.controllers" ]]; then echo "ERROR: UserNS: cgroup v2 needs to be enabled" >&2 - exit 1 + echo "WARN: UserNS: Falling back to disabling user namespace optimizations" >&2 + userns="" + return fi for f in cpu memory pids; do if ! grep -qw $f /sys/fs/cgroup/cgroup.controllers; then From 9a95af7b590289a6f09de44e4c3a5acf3248acf0 Mon Sep 17 00:00:00 2001 From: Felipe Santos Date: Mon, 11 Oct 2021 13:18:39 -0300 Subject: [PATCH 02/11] Disable `cgroupv2` detection in `entrypoint` --- images/base/files/usr/local/bin/entrypoint | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/images/base/files/usr/local/bin/entrypoint b/images/base/files/usr/local/bin/entrypoint index d852311162..13bf5878f4 100755 --- a/images/base/files/usr/local/bin/entrypoint +++ b/images/base/files/usr/local/bin/entrypoint @@ -38,19 +38,6 @@ validate_userns() { if [[ "${nofile_hard}" -lt "${nofile_hard_expected}" ]]; then echo "WARN: UserNS: expected RLIMIT_NOFILE to be at least ${nofile_hard_expected}, got ${nofile_hard}" >&2 fi - - if [[ ! 
-f "/sys/fs/cgroup/cgroup.controllers" ]]; then - echo "ERROR: UserNS: cgroup v2 needs to be enabled" >&2 - echo "WARN: UserNS: Falling back to disabling user namespace optimizations" >&2 - userns="" - return - fi - for f in cpu memory pids; do - if ! grep -qw $f /sys/fs/cgroup/cgroup.controllers; then - echo "ERROR: UserNS: $f controller needs to be delegated" >&2 - exit 1 - fi - done } configure_containerd() { From 761109c702967e2e31e6e22a27b48f03be85de24 Mon Sep 17 00:00:00 2001 From: Felipe Santos Date: Mon, 11 Oct 2021 20:24:16 +0000 Subject: [PATCH 03/11] Delegate rootless detection to kind --- images/base/files/usr/local/bin/entrypoint | 35 ++++++++++++------- .../internal/providers/docker/provision.go | 9 +++++ .../internal/providers/podman/provider.go | 2 +- .../internal/providers/podman/provision.go | 9 +++++ 4 files changed, 41 insertions(+), 14 deletions(-) diff --git a/images/base/files/usr/local/bin/entrypoint b/images/base/files/usr/local/bin/entrypoint index 13bf5878f4..2bb7c43383 100755 --- a/images/base/files/usr/local/bin/entrypoint +++ b/images/base/files/usr/local/bin/entrypoint @@ -18,31 +18,40 @@ set -o errexit set -o nounset set -o pipefail -# If /proc/self/uid_map 4294967295 mappings, we are in the initial user namespace, i.e. the host. -# Otherwise we are in a non-initial user namespace. 
-# https://github.com/opencontainers/runc/blob/v1.0.0-rc92/libcontainer/system/linux.go#L109-L118 -userns="" -if grep -Eqv "0[[:space:]]+0[[:space:]]+4294967295" /proc/self/uid_map; then - userns="1" - echo "INFO: running in a user namespace (experimental)" >&2 +rootless="" +if [[ -n "${KIND_ROOTLESS-}" ]]; then + rootless=1 fi -validate_userns() { - if [[ -z "${userns}" ]]; then +validate_rootless() { + if [[ -z "${rootless}" ]]; then return fi + echo 'INFO: running in rootless mode (experimental)' >&2 local nofile_hard nofile_hard="$(ulimit -Hn)" local nofile_hard_expected="64000" if [[ "${nofile_hard}" -lt "${nofile_hard_expected}" ]]; then - echo "WARN: UserNS: expected RLIMIT_NOFILE to be at least ${nofile_hard_expected}, got ${nofile_hard}" >&2 + echo "WARN: rootless: expected RLIMIT_NOFILE to be at least ${nofile_hard_expected}, got ${nofile_hard}" >&2 fi + + if [[ ! -f "/sys/fs/cgroup/cgroup.controllers" ]]; then + echo "ERROR: rootless: cgroup v2 needs to be enabled" >&2 + exit 1 + fi + + for f in cpu memory pids; do + if ! 
grep -qw $f /sys/fs/cgroup/cgroup.controllers; then + echo "ERROR: rootless: $f controller needs to be delegated" >&2 + exit 1 + fi + done } configure_containerd() { local snapshotter=${KIND_EXPERIMENTAL_CONTAINERD_SNAPSHOTTER:-} - if [[ -n "$userns" ]]; then + if [[ -n "$rootless" ]]; then # userns (rootless) configs # Adjust oomScoreAdj @@ -91,7 +100,7 @@ fix_mount() { sync fi - if [[ -z "${userns}" ]]; then + if [[ -z "${rootless}" ]]; then echo 'INFO: remounting /sys read-only' # systemd-in-a-container should have read only /sys # https://systemd.io/CONTAINER_INTERFACE/ @@ -343,7 +352,7 @@ enable_network_magic(){ } # validate state -validate_userns +validate_rootless # run pre-init fixups # NOTE: it's important that we do configure* first in this order to avoid races diff --git a/pkg/cluster/internal/providers/docker/provision.go b/pkg/cluster/internal/providers/docker/provision.go index cd9b532f5c..a2d771039d 100644 --- a/pkg/cluster/internal/providers/docker/provision.go +++ b/pkg/cluster/internal/providers/docker/provision.go @@ -243,6 +243,15 @@ func runArgsForNode(node *config.Node, clusterIPFamily config.ClusterIPFamily, n args..., ) + // let the container know that it's running in rootless mode + info, err := info() + if err != nil { + return nil, err + } + if info.Rootless { + args = append(args, "-e", "KIND_ROOTLESS=1") + } + // convert mounts and port mappings to container run args args = append(args, generateMountBindings(node.ExtraMounts...)...) mappingArgs, err := generatePortMappings(clusterIPFamily, node.ExtraPortMappings...) 
diff --git a/pkg/cluster/internal/providers/podman/provider.go b/pkg/cluster/internal/providers/podman/provider.go index 53cc1c5a8b..4d93eee449 100644 --- a/pkg/cluster/internal/providers/podman/provider.go +++ b/pkg/cluster/internal/providers/podman/provider.go @@ -407,7 +407,7 @@ func info(logger log.Logger) (*providers.ProviderInfo, error) { SupportsPidsLimit: true, // not guaranteed to be correct SupportsCPUShares: true, // not guaranteed to be correct } - if info.Rootless { + if logger != nil && info.Rootless { logger.Warn("Cgroup controller detection is not implemented for Podman. " + "If you see cgroup-related errors, you might need to set systemd property \"Delegate=yes\", see https://kind.sigs.k8s.io/docs/user/rootless/") } diff --git a/pkg/cluster/internal/providers/podman/provision.go b/pkg/cluster/internal/providers/podman/provision.go index 8b6263ab3c..688e79292c 100644 --- a/pkg/cluster/internal/providers/podman/provision.go +++ b/pkg/cluster/internal/providers/podman/provision.go @@ -206,6 +206,15 @@ func runArgsForNode(node *config.Node, clusterIPFamily config.ClusterIPFamily, n args..., ) + // let the container know that it's running in rootless mode + info, err := info(nil) + if err != nil { + return nil, err + } + if info.Rootless { + args = append(args, "-e", "KIND_ROOTLESS=1") + } + // convert mounts and port mappings to container run args args = append(args, generateMountBindings(node.ExtraMounts...)...) mappingArgs, err := generatePortMappings(clusterIPFamily, node.ExtraPortMappings...) 
From 620b7d3929312913494c498f50c84280fd5ac7f4 Mon Sep 17 00:00:00 2001 From: Felipe Santos Date: Mon, 11 Oct 2021 20:54:40 +0000 Subject: [PATCH 04/11] Disable repeated checks --- images/base/files/usr/local/bin/entrypoint | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/images/base/files/usr/local/bin/entrypoint b/images/base/files/usr/local/bin/entrypoint index 2bb7c43383..2f44afe0e1 100755 --- a/images/base/files/usr/local/bin/entrypoint +++ b/images/base/files/usr/local/bin/entrypoint @@ -35,18 +35,6 @@ validate_rootless() { if [[ "${nofile_hard}" -lt "${nofile_hard_expected}" ]]; then echo "WARN: rootless: expected RLIMIT_NOFILE to be at least ${nofile_hard_expected}, got ${nofile_hard}" >&2 fi - - if [[ ! -f "/sys/fs/cgroup/cgroup.controllers" ]]; then - echo "ERROR: rootless: cgroup v2 needs to be enabled" >&2 - exit 1 - fi - - for f in cpu memory pids; do - if ! grep -qw $f /sys/fs/cgroup/cgroup.controllers; then - echo "ERROR: rootless: $f controller needs to be delegated" >&2 - exit 1 - fi - done } configure_containerd() { From 95f7540203fc56069fcea319daf51667af57a12d Mon Sep 17 00:00:00 2001 From: Felipe Santos Date: Tue, 12 Oct 2021 01:45:22 -0300 Subject: [PATCH 05/11] Rename rootless back to userns --- images/base/files/usr/local/bin/entrypoint | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/images/base/files/usr/local/bin/entrypoint b/images/base/files/usr/local/bin/entrypoint index 2f44afe0e1..c16230f1af 100755 --- a/images/base/files/usr/local/bin/entrypoint +++ b/images/base/files/usr/local/bin/entrypoint @@ -18,28 +18,28 @@ set -o errexit set -o nounset set -o pipefail -rootless="" +userns="" if [[ -n "${KIND_ROOTLESS-}" ]]; then - rootless=1 + userns=1 fi -validate_rootless() { - if [[ -z "${rootless}" ]]; then +validate_userns() { + if [[ -z "${userns}" ]]; then return fi - echo 'INFO: running in rootless mode (experimental)' >&2 + echo 'INFO: running in a user namespace (experimental)' >&2 
local nofile_hard nofile_hard="$(ulimit -Hn)" local nofile_hard_expected="64000" if [[ "${nofile_hard}" -lt "${nofile_hard_expected}" ]]; then - echo "WARN: rootless: expected RLIMIT_NOFILE to be at least ${nofile_hard_expected}, got ${nofile_hard}" >&2 + echo "WARN: UserNS: expected RLIMIT_NOFILE to be at least ${nofile_hard_expected}, got ${nofile_hard}" >&2 fi } configure_containerd() { local snapshotter=${KIND_EXPERIMENTAL_CONTAINERD_SNAPSHOTTER:-} - if [[ -n "$rootless" ]]; then + if [[ -n "$userns" ]]; then # userns (rootless) configs # Adjust oomScoreAdj @@ -88,7 +88,7 @@ fix_mount() { sync fi - if [[ -z "${rootless}" ]]; then + if [[ -z "${userns}" ]]; then echo 'INFO: remounting /sys read-only' # systemd-in-a-container should have read only /sys # https://systemd.io/CONTAINER_INTERFACE/ @@ -340,7 +340,7 @@ enable_network_magic(){ } # validate state -validate_rootless +validate_userns # run pre-init fixups # NOTE: it's important that we do configure* first in this order to avoid races From 382a570dacc7fd5a0b88cb477d8fe4bddf81fc86 Mon Sep 17 00:00:00 2001 From: Felipe Santos Date: Tue, 12 Oct 2021 01:57:50 -0300 Subject: [PATCH 06/11] Keep both userns and rootless variables --- images/base/files/usr/local/bin/entrypoint | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/images/base/files/usr/local/bin/entrypoint b/images/base/files/usr/local/bin/entrypoint index c16230f1af..955bc39bf3 100755 --- a/images/base/files/usr/local/bin/entrypoint +++ b/images/base/files/usr/local/bin/entrypoint @@ -18,9 +18,17 @@ set -o errexit set -o nounset set -o pipefail +# If /proc/self/uid_map 4294967295 mappings, we are in the initial user namespace, i.e. the host. +# Otherwise we are in a non-initial user namespace. 
+# https://github.com/opencontainers/runc/blob/v1.0.0-rc92/libcontainer/system/linux.go#L109-L118 userns="" +if grep -Eqv "0[[:space:]]+0[[:space:]]+4294967295" /proc/self/uid_map; then + userns="1" +fi + +rootless="" if [[ -n "${KIND_ROOTLESS-}" ]]; then - userns=1 + rootless=1 fi validate_userns() { @@ -39,8 +47,8 @@ validate_userns() { configure_containerd() { local snapshotter=${KIND_EXPERIMENTAL_CONTAINERD_SNAPSHOTTER:-} - if [[ -n "$userns" ]]; then - # userns (rootless) configs + if [[ -n "$rootless" ]]; then + # rootless configs # Adjust oomScoreAdj sed -i 's/restrict_oom_score_adj = false/restrict_oom_score_adj = true/' /etc/containerd/config.toml @@ -88,14 +96,14 @@ fix_mount() { sync fi - if [[ -z "${userns}" ]]; then + if [[ -z "${rootless}" ]]; then echo 'INFO: remounting /sys read-only' # systemd-in-a-container should have read only /sys # https://systemd.io/CONTAINER_INTERFACE/ # however, we need other things from `docker run --privileged` ... # and this flag also happens to make /sys rw, amongst other things # - # This step is skipped when running inside UserNS, because it fails with EACCES. + # This step is skipped when running inside rootless mode, because it fails with EACCES. 
mount -o remount,ro /sys fi From e55e1dd97aee0052ba9641972e72dd50e6b96f67 Mon Sep 17 00:00:00 2001 From: Felipe Santos Date: Tue, 12 Oct 2021 02:05:47 -0300 Subject: [PATCH 07/11] Adjust oomScoreAdj when in userns --- images/base/files/usr/local/bin/entrypoint | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/images/base/files/usr/local/bin/entrypoint b/images/base/files/usr/local/bin/entrypoint index 955bc39bf3..1207923d00 100755 --- a/images/base/files/usr/local/bin/entrypoint +++ b/images/base/files/usr/local/bin/entrypoint @@ -47,12 +47,11 @@ validate_userns() { configure_containerd() { local snapshotter=${KIND_EXPERIMENTAL_CONTAINERD_SNAPSHOTTER:-} - if [[ -n "$rootless" ]]; then - # rootless configs - + if [[ -n "$userns" ]]; then # Adjust oomScoreAdj sed -i 's/restrict_oom_score_adj = false/restrict_oom_score_adj = true/' /etc/containerd/config.toml - + fi + if [[ -n "$rootless" ]]; then # Use fuse-overlayfs by default: https://github.com/kubernetes-sigs/kind/issues/2275 snapshotter="fuse-overlayfs" else From cd020960c290ea2bb03fddf2266ad6b3542589f8 Mon Sep 17 00:00:00 2001 From: Felipe Santos Date: Tue, 12 Oct 2021 02:31:59 -0300 Subject: [PATCH 08/11] Skip mount /sys as ro in userns --- images/base/files/usr/local/bin/entrypoint | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/images/base/files/usr/local/bin/entrypoint b/images/base/files/usr/local/bin/entrypoint index 1207923d00..cf871a9996 100755 --- a/images/base/files/usr/local/bin/entrypoint +++ b/images/base/files/usr/local/bin/entrypoint @@ -95,14 +95,14 @@ fix_mount() { sync fi - if [[ -z "${rootless}" ]]; then + if [[ -z "${userns}" ]]; then echo 'INFO: remounting /sys read-only' # systemd-in-a-container should have read only /sys # https://systemd.io/CONTAINER_INTERFACE/ # however, we need other things from `docker run --privileged` ... 
# and this flag also happens to make /sys rw, amongst other things # - # This step is skipped when running inside rootless mode, because it fails with EACCES. + # This step is skipped when running inside UserNS, because it fails with EACCES. mount -o remount,ro /sys fi From 8d100f66a26810a19898fac3a90ba6d4985bc85d Mon Sep 17 00:00:00 2001 From: Felipe Santos Date: Tue, 12 Oct 2021 02:35:37 -0300 Subject: [PATCH 09/11] Revert "Skip mount /sys as ro in userns" This reverts commit cd020960c290ea2bb03fddf2266ad6b3542589f8. --- images/base/files/usr/local/bin/entrypoint | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/images/base/files/usr/local/bin/entrypoint b/images/base/files/usr/local/bin/entrypoint index cf871a9996..1207923d00 100755 --- a/images/base/files/usr/local/bin/entrypoint +++ b/images/base/files/usr/local/bin/entrypoint @@ -95,14 +95,14 @@ fix_mount() { sync fi - if [[ -z "${userns}" ]]; then + if [[ -z "${rootless}" ]]; then echo 'INFO: remounting /sys read-only' # systemd-in-a-container should have read only /sys # https://systemd.io/CONTAINER_INTERFACE/ # however, we need other things from `docker run --privileged` ... # and this flag also happens to make /sys rw, amongst other things # - # This step is skipped when running inside UserNS, because it fails with EACCES. + # This step is skipped when running inside rootless mode, because it fails with EACCES. 
mount -o remount,ro /sys fi From 54489fe2496982b5f625a4d23d6e6c3ad4c30e9d Mon Sep 17 00:00:00 2001 From: Felipe Santos Date: Tue, 12 Oct 2021 02:55:42 -0300 Subject: [PATCH 10/11] Unconditionally mount /sys and ignore error on userns --- images/base/files/usr/local/bin/entrypoint | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/images/base/files/usr/local/bin/entrypoint b/images/base/files/usr/local/bin/entrypoint index 1207923d00..569b0dc940 100755 --- a/images/base/files/usr/local/bin/entrypoint +++ b/images/base/files/usr/local/bin/entrypoint @@ -95,16 +95,14 @@ fix_mount() { sync fi - if [[ -z "${rootless}" ]]; then - echo 'INFO: remounting /sys read-only' - # systemd-in-a-container should have read only /sys - # https://systemd.io/CONTAINER_INTERFACE/ - # however, we need other things from `docker run --privileged` ... - # and this flag also happens to make /sys rw, amongst other things - # - # This step is skipped when running inside rootless mode, because it fails with EACCES. - mount -o remount,ro /sys - fi + echo 'INFO: remounting /sys read-only' + # systemd-in-a-container should have read only /sys + # https://systemd.io/CONTAINER_INTERFACE/ + # however, we need other things from `docker run --privileged` ... + # and this flag also happens to make /sys rw, amongst other things + # + # This step is ignored when running inside UserNS, because it may fail with EACCES. 
+ mount -o remount,ro /sys || [[ -n "$userns" ]] echo 'INFO: making mounts shared' >&2 # for mount propagation From d0686ec43c8c0b0ea4f51672a6acbe9913f41929 Mon Sep 17 00:00:00 2001 From: Felipe Santos Date: Tue, 12 Oct 2021 03:10:04 -0300 Subject: [PATCH 11/11] Enhance error ignoring logs --- images/base/files/usr/local/bin/entrypoint | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/images/base/files/usr/local/bin/entrypoint b/images/base/files/usr/local/bin/entrypoint index 569b0dc940..1eda5814a1 100755 --- a/images/base/files/usr/local/bin/entrypoint +++ b/images/base/files/usr/local/bin/entrypoint @@ -102,7 +102,11 @@ fix_mount() { # and this flag also happens to make /sys rw, amongst other things # # This step is ignored when running inside UserNS, because it may fail with EACCES. - mount -o remount,ro /sys || [[ -n "$userns" ]] + if ! mount -o remount,ro /sys; then + # errexit is suspended inside an if condition, so fail explicitly outside UserNS. + [[ -n "$userns" ]] || exit 1 + echo 'INFO: UserNS: ignoring mount fail' >&2 + fi echo 'INFO: making mounts shared' >&2 # for mount propagation