Skip to content

Commit

Permalink
libct/cgroups: add SkipDevices to Resources
Browse files Browse the repository at this point in the history
The kubelet uses libct/cgroups code to set up cgroups. It creates a
parent cgroup (kubepods) to put the containers into.

The problem (for cgroupv2 that uses eBPF for device configuration) is
the hard requirement to have devices cgroup configured results in
leaking an eBPF program upon every kubelet restart.  program. If kubelet
is restarted 64+ times, the cgroup can't be configured anymore.

Work around this by adding a SkipDevices flag to Resources.

A check was added so that if SkipDevices is set, such a "container"
can't be started (to make sure it is only used for non-containers).

Signed-off-by: Kir Kolyshkin <[email protected]>
  • Loading branch information
kolyshkin committed Jul 2, 2020
1 parent 1b94395 commit 108ee85
Show file tree
Hide file tree
Showing 7 changed files with 48 additions and 26 deletions.
5 changes: 4 additions & 1 deletion libcontainer/cgroups/fs/devices.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ func (s *DevicesGroup) Name() string {
}

func (s *DevicesGroup) Apply(d *cgroupData) error {
if d.config.SkipDevices {
return nil
}
_, err := d.join("devices")
if err != nil {
// We will return error even it's `not found` error, devices
Expand Down Expand Up @@ -52,7 +55,7 @@ func buildEmulator(rules []*configs.DeviceRule) (*devices.Emulator, error) {
}

func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
if system.RunningInUserNS() {
if system.RunningInUserNS() || cgroup.SkipDevices {
return nil
}

Expand Down
2 changes: 1 addition & 1 deletion libcontainer/cgroups/fs/fs.go
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ func (m *manager) Apply(pid int) (err error) {
if err != nil {
// The non-presence of the devices subsystem is
// considered fatal for security reasons.
if cgroups.IsNotFound(err) && sys.Name() != "devices" {
if cgroups.IsNotFound(err) && (c.SkipDevices || sys.Name() != "devices") {
continue
}
return err
Expand Down
3 changes: 3 additions & 0 deletions libcontainer/cgroups/fs2/devices.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ func canSkipEBPFError(cgroup *configs.Cgroup) bool {
}

func setDevices(dirPath string, cgroup *configs.Cgroup) error {
if cgroup.SkipDevices {
return nil
}
// XXX: This is currently a white-list (but all callers pass a blacklist of
// devices). This is bad for a whole variety of reasons, but will need
// to be fixed with co-ordinated effort with downstreams.
Expand Down
27 changes: 15 additions & 12 deletions libcontainer/cgroups/systemd/v1.go
Original file line number Diff line number Diff line change
Expand Up @@ -379,24 +379,27 @@ func (m *legacyManager) Set(container *configs.Config) error {
return err
}

// Figure out the current freezer state, so we can revert to it after we
// temporarily freeze the container.
targetFreezerState, err := m.GetFreezerState()
if err != nil {
return err
}
if targetFreezerState == configs.Undefined {
targetFreezerState = configs.Thawed
}

// We have to freeze the container while systemd sets the cgroup settings.
// The reason for this is that systemd's application of DeviceAllow rules
// is done disruptively, resulting in spurrious errors to common devices
// (unlike our fs driver, they will happily write deny-all rules to running
// containers). So we freeze the container to avoid them hitting the cgroup
// error. But if the freezer cgroup isn't supported, we just warn about it.
if err := m.Freeze(configs.Frozen); err != nil {
logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
targetFreezerState := configs.Undefined
if !m.cgroups.SkipDevices {
// Figure out the current freezer state, so we can revert to it after we
// temporarily freeze the container.
targetFreezerState, err = m.GetFreezerState()
if err != nil {
return err
}
if targetFreezerState == configs.Undefined {
targetFreezerState = configs.Thawed
}

if err := m.Freeze(configs.Frozen); err != nil {
logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
}
}

if err := dbusConnection.SetUnitProperties(getUnitName(container.Cgroups), true, properties...); err != nil {
Expand Down
27 changes: 15 additions & 12 deletions libcontainer/cgroups/systemd/v2.go
Original file line number Diff line number Diff line change
Expand Up @@ -298,24 +298,27 @@ func (m *unifiedManager) Set(container *configs.Config) error {
return err
}

// Figure out the current freezer state, so we can revert to it after we
// temporarily freeze the container.
targetFreezerState, err := m.GetFreezerState()
if err != nil {
return err
}
if targetFreezerState == configs.Undefined {
targetFreezerState = configs.Thawed
}

// We have to freeze the container while systemd sets the cgroup settings.
// The reason for this is that systemd's application of DeviceAllow rules
// is done disruptively, resulting in spurrious errors to common devices
// (unlike our fs driver, they will happily write deny-all rules to running
// containers). So we freeze the container to avoid them hitting the cgroup
// error. But if the freezer cgroup isn't supported, we just warn about it.
if err := m.Freeze(configs.Frozen); err != nil {
logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
targetFreezerState := configs.Undefined
if !m.cgroups.SkipDevices {
// Figure out the current freezer state, so we can revert to it after we
// temporarily freeze the container.
targetFreezerState, err = m.GetFreezerState()
if err != nil {
return err
}
if targetFreezerState == configs.Undefined {
targetFreezerState = configs.Thawed
}

if err := m.Freeze(configs.Frozen); err != nil {
logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
}
}

if err := dbusConnection.SetUnitProperties(getUnitName(m.cgroups), true, properties...); err != nil {
Expand Down
7 changes: 7 additions & 0 deletions libcontainer/configs/cgroup_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,4 +126,11 @@ type Resources struct {

// CpuWeight sets a proportional bandwidth limit.
CpuWeight uint64 `json:"cpu_weight"`

// SkipDevices allows to skip configuring device permissions.
// Used by e.g. kubelet while creating a parent cgroup (kubepods)
// common for many containers.
//
// NOTE it is impossible to start a container which has this flag set.
SkipDevices bool `json:"skip_devices"`
}
3 changes: 3 additions & 0 deletions libcontainer/container_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,9 @@ func (c *linuxContainer) Set(config configs.Config) error {
func (c *linuxContainer) Start(process *Process) error {
c.m.Lock()
defer c.m.Unlock()
if c.config.Cgroups.Resources.SkipDevices {
return newGenericError(errors.New("can't start container with SkipDevices set"), ConfigInvalid)
}
if process.Init {
if err := c.createExecFifo(); err != nil {
return err
Expand Down

0 comments on commit 108ee85

Please sign in to comment.