From 28dc2e6f6d5fe891641a90354e1f5fa4321093bf Mon Sep 17 00:00:00 2001 From: Akihiro Suda Date: Tue, 14 Aug 2018 10:45:20 +0900 Subject: [PATCH] rootless: make /sys/fs/cgroup/* read-only The default rootless spec bind-mounts /sys with "rbind,ro" (because mounting sysfs requires netns to be unshared), however, it does not make subfilesystems mounted under /sys read-only. So, files under /sys/fs/cgroup were unexpectedly writable when they are chmod/chowned via privileged helpers such as pam_cgfs. This patch fixes the issue by mounting /sys/fs/cgroup/* as read-only explicitly. Signed-off-by: Akihiro Suda --- libcontainer/rootfs_linux.go | 12 ++++ libcontainer/specconv/example.go | 100 +++++++++++++++++++++++++------ tests/integration/cgroups.bats | 16 +++++ tests/integration/mounts.bats | 9 +++ 4 files changed, 120 insertions(+), 17 deletions(-) diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index cf715d66497..4075e09c0e3 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -297,6 +297,12 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error { if err := mountToRootfs(tmpfs, rootfs, mountLabel); err != nil { return err } + // Add a dummy directory for cgroup2. + // Without this hack, `df` fails with `df: /sys/fs/cgroup/unified: No such file or directory` + // when /sys is bind-mounted from the host to the container (typically when netns is not unshared), + if err := os.MkdirAll(filepath.Join(rootfs, m.Destination, "unified"), 0755); err != nil { + return err + } for _, b := range binds { if err := mountToRootfs(b, rootfs, mountLabel); err != nil { return err @@ -360,6 +366,12 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) { var binds []*configs.Mount for _, mm := range mounts { + // when /sys is bind-mounted from the host to the container (typically when netns is not unshared), + // mm.Mountpoint can be like "/containers/foo/rootfs/sys/fs/cgroup/systemd/user.slice/user-1001.slice/session-1.scope/foo". + // + if !strings.HasPrefix(mm.Mountpoint, "/sys/fs/cgroup") { + continue + } dir, err := mm.GetOwnCgroup(cgroupPaths) if err != nil { return nil, err diff --git a/libcontainer/specconv/example.go b/libcontainer/specconv/example.go index c113b337f34..c75704a6f1b 100644 --- a/libcontainer/specconv/example.go +++ b/libcontainer/specconv/example.go @@ -2,8 +2,11 @@ package specconv import ( "os" + "path/filepath" + "sort" "strings" + "github.com/opencontainers/runc/libcontainer/mount" "github.com/opencontainers/runtime-spec/specs-go" ) @@ -189,13 +192,10 @@ func ToRootless(spec *specs.Spec) { }} // Fix up mounts. - var mounts []specs.Mount - for _, mount := range spec.Mounts { - // Ignore all mounts that are under /sys. - if strings.HasPrefix(mount.Destination, "/sys") { - continue - } - + mountInfo, _ := mount.GetMounts() + fixupRootlessBindSys(spec, mountInfo) + for i := range spec.Mounts { + mount := &spec.Mounts[i] // Remove all gid= and uid= mappings. var options []string for _, option := range mount.Options { @@ -203,19 +203,85 @@ func ToRootless(spec *specs.Spec) { options = append(options, option) } } - mount.Options = options - mounts = append(mounts, mount) } - // Add the sysfs mount as an rbind. - mounts = append(mounts, specs.Mount{ - Source: "/sys", - Destination: "/sys", - Type: "none", - Options: []string{"rbind", "nosuid", "noexec", "nodev", "ro"}, - }) - spec.Mounts = mounts // Remove cgroup settings. spec.Linux.Resources = nil } + +// fixupRootlessBindSys bind-mounts /sys. +// This is required only when netns is not unshared. +func fixupRootlessBindSys(spec *specs.Spec, mountInfo []*mount.Info) { + // Fix up mounts. + var ( + mounts []specs.Mount + mountsUnderSys []specs.Mount + ) + for _, m := range spec.Mounts { + // ignore original /sys (cannot be mounted when netns is not unshared) + if filepath.Clean(m.Destination) == "/sys" { + continue + } else if strings.HasPrefix(m.Destination, "/sys") { + // Append all mounts that are under /sys (e.g. /sys/fs/cgroup )later + mountsUnderSys = append(mountsUnderSys, m) + } else { + mounts = append(mounts, m) + } + } + mounts = append(mounts, []specs.Mount{ + // Add the sysfs mount as an rbind. + // Note: + // * "ro" does not make submounts read-only recursively. + // * rbind work for sysfs but bind does not. + { + Source: "/sys", + Destination: "/sys", + Type: "none", + Options: []string{"rbind", "nosuid", "noexec", "nodev", "ro"}, + }, + }...) + spec.Mounts = append(mounts, mountsUnderSys...) + var ( + maskedPaths []string + maskedPathsWritable = make(map[string]struct{}) + ) + for _, masked := range spec.Linux.MaskedPaths { + // e.g. when /sys/firmware is masked and /sys/firmware/efi/efivars is in /proc/self/mountinfo, + // we need to mask /sys/firmware/efi/efivars as well. + // (otherwise `df` fails with "df: /sys/firmware/efi/efivars: No such file or directory") + // also, to mask /sys/firmware/efi/efivars, we need to mask /sys/firmware as a writable tmpfs + if strings.HasPrefix(masked, "/sys") { + for _, mi := range mountInfo { + // e.g. mi.Mountpoint = /sys/firmware/efi/efivars, masked = /sys/firmware + if strings.HasPrefix(mi.Mountpoint, masked) { + maskedPathsWritable[masked] = struct{}{} + // mi.Mountpoint is added to maskedPathsWritable for ease of supporting nested case + maskedPathsWritable[mi.Mountpoint] = struct{}{} + } + } + } + if _, ok := maskedPathsWritable[masked]; !ok { + maskedPaths = append(maskedPaths, masked) + } + } + spec.Linux.MaskedPaths = maskedPaths + for _, s := range sortMapKeyStrings(maskedPathsWritable) { + spec.Mounts = append(spec.Mounts, + specs.Mount{ + Source: "none", + Destination: s, + Type: "tmpfs", + Options: []string{"nosuid", "noexec", "nodev", "mode=0755"}, + }) + } +} + +func sortMapKeyStrings(m map[string]struct{}) []string { + var ss []string + for s := range m { + ss = append(ss, s) + } + sort.Strings(ss) + return ss +} diff --git a/tests/integration/cgroups.bats b/tests/integration/cgroups.bats index 17812abfdcc..ba15f7e5246 100644 --- a/tests/integration/cgroups.bats +++ b/tests/integration/cgroups.bats @@ -125,3 +125,19 @@ EOF [ "$status" -eq 0 ] [[ ${lines[0]} == *"cgroups_exec"* ]] } + +@test "runc exec write cgroup (limits + cgrouppath + permission on the cgroup dir) fails" { + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + + set_cgroups_path "$BUSYBOX_BUNDLE" + set_resources_limit "$BUSYBOX_BUNDLE" + + runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_permissions + [ "$status" -eq 0 ] + +# In previous version of rootless runc, the whole /sys was just recursively bind-mounted +# but not recursively read-only + runc exec test_cgroups_permissions mkdir -p /sys/fs/cgroup/cpuset/runc-cgroups-integration-test/dummy + [ "$status" -eq 1 ] + [[ ${lines[0]} == *"Read-only file system"* ]] +} diff --git a/tests/integration/mounts.bats b/tests/integration/mounts.bats index c35b3c5f78a..1e07ba2e47d 100755 --- a/tests/integration/mounts.bats +++ b/tests/integration/mounts.bats @@ -19,3 +19,12 @@ function teardown() { [ "$status" -eq 0 ] [[ "${lines[0]}" =~ '/tmp/bind/config.json' ]] } + +# make sure there is no "df: /foo/bar: No such file or directory" issue in the default configuration +@test "runc run [df]" { + CONFIG=$(jq '.process.args = ["df"]' config.json) + echo "${CONFIG}" >config.json + + runc run test_df + [ "$status" -eq 0 ] +}