Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

rootless: make /sys/fs/cgroup/* read-only #1869

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions libcontainer/rootfs_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,12 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
if err := mountToRootfs(tmpfs, rootfs, mountLabel); err != nil {
return err
}
// Add a dummy directory for cgroup2.
// Without this hack, `df` fails with `df: /sys/fs/cgroup/unified: No such file or directory`
// when /sys is bind-mounted from the host to the container (typically when netns is not unshared),
if err := os.MkdirAll(filepath.Join(rootfs, m.Destination, "unified"), 0755); err != nil {
return err
}
for _, b := range binds {
if err := mountToRootfs(b, rootfs, mountLabel); err != nil {
return err
Expand Down Expand Up @@ -360,6 +366,12 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
var binds []*configs.Mount

for _, mm := range mounts {
// when /sys is bind-mounted from the host to the container (typically when netns is not unshared),
// mm.Mountpoint can be like "/containers/foo/rootfs/sys/fs/cgroup/systemd/user.slice/user-1001.slice/session-1.scope/foo".
//
if !strings.HasPrefix(mm.Mountpoint, "/sys/fs/cgroup") {
continue
}
dir, err := mm.GetOwnCgroup(cgroupPaths)
if err != nil {
return nil, err
Expand Down
100 changes: 83 additions & 17 deletions libcontainer/specconv/example.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@ package specconv

import (
"os"
"path/filepath"
"sort"
"strings"

"github.com/opencontainers/runc/libcontainer/mount"
"github.com/opencontainers/runtime-spec/specs-go"
)

Expand Down Expand Up @@ -189,33 +192,96 @@ func ToRootless(spec *specs.Spec) {
}}

// Fix up mounts.
var mounts []specs.Mount
for _, mount := range spec.Mounts {
// Ignore all mounts that are under /sys.
if strings.HasPrefix(mount.Destination, "/sys") {
continue
}

mountInfo, _ := mount.GetMounts()
fixupRootlessBindSys(spec, mountInfo)
for i := range spec.Mounts {
mount := &spec.Mounts[i]
// Remove all gid= and uid= mappings.
var options []string
for _, option := range mount.Options {
if !strings.HasPrefix(option, "gid=") && !strings.HasPrefix(option, "uid=") {
options = append(options, option)
}
}

mount.Options = options
mounts = append(mounts, mount)
}
// Add the sysfs mount as an rbind.
mounts = append(mounts, specs.Mount{
Source: "/sys",
Destination: "/sys",
Type: "none",
Options: []string{"rbind", "nosuid", "noexec", "nodev", "ro"},
})
spec.Mounts = mounts

// Remove cgroup settings.
spec.Linux.Resources = nil
}

// fixupRootlessBindSys bind-mounts /sys.
// This is required only when netns is not unshared.
func fixupRootlessBindSys(spec *specs.Spec, mountInfo []*mount.Info) {
// Fix up mounts.
var (
mounts []specs.Mount
mountsUnderSys []specs.Mount
)
for _, m := range spec.Mounts {
// ignore original /sys (cannot be mounted when netns is not unshared)
if filepath.Clean(m.Destination) == "/sys" {
continue
} else if strings.HasPrefix(m.Destination, "/sys") {
// Append all mounts that are under /sys (e.g. /sys/fs/cgroup )later
mountsUnderSys = append(mountsUnderSys, m)
} else {
mounts = append(mounts, m)
}
}
mounts = append(mounts, []specs.Mount{
// Add the sysfs mount as an rbind.
// Note:
// * "ro" does not make submounts read-only recursively.
// * rbind work for sysfs but bind does not.
{
Source: "/sys",
Destination: "/sys",
Type: "none",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should this be "bind"?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is the only place where I see it used. In the runtime-specs example bind is used as the type for a bind mount (https://github.com/opencontainers/runtime-spec/blob/master/config.md#example-linux), both Docker and Podman generate "bind".

Anyway since it is only in the example and not in the specs, and you are just moving this code snippet, it is probably fine to leave it as it is.

Copy link
Member

@cyphar cyphar Aug 17, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

type: "none" is entirely valid. There was a bug previously in runc when we didn't handle this properly, but the type field (when it comes to mount(2)) doesn't affect bindmounts -- what matters is that the bind option is set. Docker set bind presumably by accident, and Podman probably copied the mistake (to be clear -- it doesn't matter either way, but it does lead to confusion).

But this code is used for more than example -- it's used by at least a few projects to convert a configuration to the rootless version of said configuration.

Options: []string{"rbind", "nosuid", "noexec", "nodev", "ro"},
},
}...)
spec.Mounts = append(mounts, mountsUnderSys...)
var (
maskedPaths []string
maskedPathsWritable = make(map[string]struct{})
)
for _, masked := range spec.Linux.MaskedPaths {
// e.g. when /sys/firmware is masked and /sys/firmware/efi/efivars is in /proc/self/mountinfo,
// we need to mask /sys/firmware/efi/efivars as well.
// (otherwise `df` fails with "df: /sys/firmware/efi/efivars: No such file or directory")
// also, to mask /sys/firmware/efi/efivars, we need to mask /sys/firmware as a writable tmpfs
if strings.HasPrefix(masked, "/sys") {
for _, mi := range mountInfo {
// e.g. mi.Mountpoint = /sys/firmware/efi/efivars, masked = /sys/firmware
if strings.HasPrefix(mi.Mountpoint, masked) {
maskedPathsWritable[masked] = struct{}{}
// mi.Mountpoint is added to maskedPathsWritable for ease of supporting nested case
maskedPathsWritable[mi.Mountpoint] = struct{}{}
}
}
}
if _, ok := maskedPathsWritable[masked]; !ok {
maskedPaths = append(maskedPaths, masked)
}
}
spec.Linux.MaskedPaths = maskedPaths
for _, s := range sortMapKeyStrings(maskedPathsWritable) {
spec.Mounts = append(spec.Mounts,
specs.Mount{
Source: "none",
Destination: s,
Type: "tmpfs",
Options: []string{"nosuid", "noexec", "nodev", "mode=0755"},
})
}
}

func sortMapKeyStrings(m map[string]struct{}) []string {
var ss []string
for s := range m {
ss = append(ss, s)
}
sort.Strings(ss)
return ss
}
16 changes: 16 additions & 0 deletions tests/integration/cgroups.bats
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,19 @@ EOF
[ "$status" -eq 0 ]
[[ ${lines[0]} == *"cgroups_exec"* ]]
}

@test "runc exec write cgroup (limits + cgrouppath + permission on the cgroup dir) fails" {
[[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup

set_cgroups_path "$BUSYBOX_BUNDLE"
set_resources_limit "$BUSYBOX_BUNDLE"

runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_permissions
[ "$status" -eq 0 ]

# In previous version of rootless runc, the whole /sys was just recursively bind-mounted
# but not recursively read-only
runc exec test_cgroups_permissions mkdir -p /sys/fs/cgroup/cpuset/runc-cgroups-integration-test/dummy
[ "$status" -eq 1 ]
[[ ${lines[0]} == *"Read-only file system"* ]]
}
9 changes: 9 additions & 0 deletions tests/integration/mounts.bats
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,12 @@ function teardown() {
[ "$status" -eq 0 ]
[[ "${lines[0]}" =~ '/tmp/bind/config.json' ]]
}

# make sure there is no "df: /foo/bar: No such file or directory" issue in the default configuration
@test "runc run [df]" {
CONFIG=$(jq '.process.args = ["df"]' config.json)
echo "${CONFIG}" >config.json

runc run test_df
[ "$status" -eq 0 ]
}