diff --git a/daemon/mgr/container.go b/daemon/mgr/container.go index 84b12ceae..5b8a590ca 100644 --- a/daemon/mgr/container.go +++ b/daemon/mgr/container.go @@ -312,7 +312,7 @@ func (mgr *ContainerManager) StartExec(ctx context.Context, execid string, confi c.meta.Config.User = execConfig.User } - if err = setupProcessUser(ctx, c.meta, &SpecWrapper{s: &specs.Spec{Process: process}}); err != nil { + if err = setupUser(ctx, c.meta, &specs.Spec{Process: process}); err != nil { return err } @@ -607,32 +607,17 @@ func (mgr *ContainerManager) start(ctx context.Context, c *Container, detachKeys } func (mgr *ContainerManager) createContainerdContainer(ctx context.Context, c *Container) error { - // new a default spec. - s, err := ctrd.NewDefaultSpec(ctx, c.ID()) - if err != nil { - return errors.Wrapf(err, "failed to generate spec: %s", c.ID()) - } - - var cgroupsParent string - if c.meta.HostConfig.CgroupParent != "" { - cgroupsParent = c.meta.HostConfig.CgroupParent - } else if mgr.Config.CgroupParent != "" { - cgroupsParent = mgr.Config.CgroupParent - } - - // cgroupsPath must be absolute path - // call filepath.Clean is to avoid bad - // path just like../../../.../../BadPath - if cgroupsParent != "" { - if !filepath.IsAbs(cgroupsParent) { - cgroupsParent = filepath.Clean("/" + cgroupsParent) - } - - s.Linux.CgroupsPath = filepath.Join(cgroupsParent, c.ID()) + // CgroupParent from HostConfig will be first priority to use, + // then will be value from mgr.Config.CgroupParent + if c.meta.HostConfig.CgroupParent == "" { + c.meta.HostConfig.CgroupParent = mgr.Config.CgroupParent } - var prioArr []int - var argsArr [][]string + var ( + err error + prioArr []int + argsArr [][]string + ) if mgr.containerPlugin != nil { prioArr, argsArr, err = mgr.containerPlugin.PreStart(c) if err != nil { @@ -641,7 +626,6 @@ func (mgr *ContainerManager) createContainerdContainer(ctx context.Context, c *C } sw := &SpecWrapper{ - s: s, ctrMgr: mgr, volMgr: mgr.VolumeMgr, netMgr: mgr.NetworkMgr, 
@@ -649,10 +633,8 @@ func (mgr *ContainerManager) createContainerdContainer(ctx context.Context, c *C argsArr: argsArr, } - for _, setup := range SetupFuncs() { - if err = setup(ctx, c.meta, sw); err != nil { - return err - } + if err = createSpec(ctx, c.meta, sw); err != nil { + return err } // open container's stdio. @@ -660,15 +642,12 @@ func (mgr *ContainerManager) createContainerdContainer(ctx context.Context, c *C if err != nil { return errors.Wrap(err, "failed to open io") } - if io.Stdin != nil && io.Stdin.OpenStdin() { - s.Process.Terminal = true - } if err := mgr.Client.CreateContainer(ctx, &ctrd.Container{ ID: c.ID(), Image: c.Image(), Runtime: c.meta.HostConfig.Runtime, - Spec: s, + Spec: sw.s, IO: io, }); err != nil { logrus.Errorf("failed to create new containerd container: %s", err.Error()) diff --git a/daemon/mgr/spec.go b/daemon/mgr/spec.go index acf7cdb65..28a9c931b 100644 --- a/daemon/mgr/spec.go +++ b/daemon/mgr/spec.go @@ -3,7 +3,10 @@ package mgr import ( "context" + "github.com/alibaba/pouch/ctrd" + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/pkg/errors" ) // SpecWrapper wraps the container's specs and add manager operations. @@ -17,79 +20,46 @@ type SpecWrapper struct { argsArr [][]string } -// SetupFunc defines spec setup function type. 
-type SetupFunc func(ctx context.Context, m *ContainerMeta, s *SpecWrapper) error - -var setupFunc = []SetupFunc{ - // process - setupProcessArgs, - setupProcessCwd, - setupProcessEnv, - setupProcessTTY, - setupProcessUser, - setupCap, - setupNoNewPrivileges, - setupOOMScoreAdj, - - // cgroup - setupCgroupCPUShare, - setupCgroupCPUSet, - setupCgroupCPUPeriod, - setupCgroupCPUQuota, - setupCgroupMemory, - setupCgroupMemorySwap, - setupCgroupMemorySwappiness, - setupDisableOOMKill, - - // namespaces - setupUserNamespace, - setupNetworkNamespace, - setupIpcNamespace, - setupPidNamespace, - setupUtsNamespace, - - // volume spec - setupMounts, - - // network spec - setupNetwork, - - // host device spec - setupDevices, - - // linux-platform-specifc spec - setupSysctl, - setupAppArmor, - setupCapabilities, - setupSeccomp, - setupSELinux, +// createSpec create a runtime-spec. +func createSpec(ctx context.Context, c *ContainerMeta, specWrapper *SpecWrapper) error { + // new a default spec from containerd. + s, err := ctrd.NewDefaultSpec(ctx, c.ID) + if err != nil { + return errors.Wrapf(err, "failed to generate spec: %s", c.ID) + } + specWrapper.s = s - // blkio spec - setupBlkio, - setupDiskQuota, + s.Hostname = c.Config.Hostname.String() + s.Root = &specs.Root{ + Path: c.BaseFS, + Readonly: c.HostConfig.ReadonlyRootfs, + } - // IntelRdtL3Cbm - setupIntelRdt, + // create Spec.Process spec + if err := setupProcess(ctx, c, s); err != nil { + return err + } - // annotations in spec - setupAnnotations, + // create Spec.Mounts spec + if err := setupMounts(ctx, c, s); err != nil { + return err + } - // rootfs spec - setupRoot, + // create Spec.Annotations + if err := setupAnnotations(ctx, c, s); err != nil { + return err + } - //hook - setupHook, -} + // create Spec.Hooks spec + if err := setupHook(ctx, c, specWrapper); err != nil { + return err + } -// Register is used to registe spec setup function. 
-func Register(f SetupFunc) { - if setupFunc == nil { - setupFunc = make([]SetupFunc, 0) + // platform-specified spec setting + // TODO: support Windows and Solaris platforms + if err := populatePlatform(ctx, c, specWrapper); err != nil { + return err } - setupFunc = append(setupFunc, f) -} -// SetupFuncs returns all the spec setup functions. -func SetupFuncs() []SetupFunc { - return setupFunc + return nil } diff --git a/daemon/mgr/spec_annotations.go b/daemon/mgr/spec_annotations.go index f0f9cc65d..fd5596cfa 100644 --- a/daemon/mgr/spec_annotations.go +++ b/daemon/mgr/spec_annotations.go @@ -4,16 +4,16 @@ import ( "context" "strconv" + specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" ) // setupAnnotations extracts other related options from HostConfig and locate them in spec's annotations which will be dealt by vendored runc. -func setupAnnotations(ctx context.Context, meta *ContainerMeta, spec *SpecWrapper) error { - s := spec.s - - r := meta.HostConfig.Resources - - s.Annotations = make(map[string]string) +func setupAnnotations(ctx context.Context, c *ContainerMeta, s *specs.Spec) error { + if s.Annotations == nil { + s.Annotations = make(map[string]string) + } + r := c.HostConfig.Resources if r.MemoryWmarkRatio != nil { s.Annotations["__memory_wmark_ratio"] = strconv.FormatInt(*r.MemoryWmarkRatio, 10) @@ -28,7 +28,7 @@ func setupAnnotations(ctx context.Context, meta *ContainerMeta, spec *SpecWrappe s.Annotations["__schedule_latency_switch"] = strconv.FormatInt(r.ScheLatSwitch, 10) // add additional spec annotations - annotations := meta.Config.SpecAnnotation + annotations := c.Config.SpecAnnotation for k, v := range annotations { if _, exist := s.Annotations[k]; exist { logrus.Warnf("Duplicate spec annotation: %s=%s", k, v) diff --git a/daemon/mgr/spec_blkio.go b/daemon/mgr/spec_blkio.go deleted file mode 100644 index 2735ebcc6..000000000 --- a/daemon/mgr/spec_blkio.go +++ /dev/null @@ -1,128 +0,0 @@ -package mgr - -import ( -
"context" - "os" - "path/filepath" - "strconv" - "syscall" - - "github.com/alibaba/pouch/apis/types" - "github.com/alibaba/pouch/storage/quota" - specs "github.com/opencontainers/runtime-spec/specs-go" -) - -func setupBlkio(ctx context.Context, meta *ContainerMeta, spec *SpecWrapper) error { - s := spec.s - r := meta.HostConfig.Resources - - weightDevice, err := getWeightDevice(r.BlkioWeightDevice) - if err != nil { - return err - } - readBpsDevice, err := getThrottleDevice(r.BlkioDeviceReadBps) - if err != nil { - return err - } - writeBpsDevice, err := getThrottleDevice(r.BlkioDeviceWriteBps) - if err != nil { - return err - } - readIOpsDevice, err := getThrottleDevice(r.BlkioDeviceReadIOps) - if err != nil { - return err - } - writeIOpsDevice, err := getThrottleDevice(r.BlkioDeviceWriteIOps) - if err != nil { - return err - } - - if s.Linux.Resources == nil { - s.Linux.Resources = &specs.LinuxResources{} - } - - s.Linux.Resources.BlockIO = &specs.LinuxBlockIO{ - Weight: &r.BlkioWeight, - WeightDevice: weightDevice, - ThrottleReadBpsDevice: readBpsDevice, - ThrottleReadIOPSDevice: readIOpsDevice, - ThrottleWriteBpsDevice: writeBpsDevice, - ThrottleWriteIOPSDevice: writeIOpsDevice, - } - - return nil -} - -func getWeightDevice(devs []*types.WeightDevice) ([]specs.LinuxWeightDevice, error) { - var stat syscall.Stat_t - var weightDevice []specs.LinuxWeightDevice - - for _, dev := range devs { - if err := syscall.Stat(dev.Path, &stat); err != nil { - return nil, err - } - - d := specs.LinuxWeightDevice{ - Weight: &dev.Weight, - } - d.Major = int64(stat.Rdev / 256) - d.Minor = int64(stat.Rdev % 256) - weightDevice = append(weightDevice, d) - } - - return weightDevice, nil -} - -func getThrottleDevice(devs []*types.ThrottleDevice) ([]specs.LinuxThrottleDevice, error) { - var stat syscall.Stat_t - var ThrottleDevice []specs.LinuxThrottleDevice - - for _, dev := range devs { - if err := syscall.Stat(dev.Path, &stat); err != nil { - return nil, err - } - - d := 
specs.LinuxThrottleDevice{ - Rate: dev.Rate, - } - d.Major = int64(stat.Rdev / 256) - d.Minor = int64(stat.Rdev % 256) - ThrottleDevice = append(ThrottleDevice, d) - } - - return ThrottleDevice, nil -} - -func setupDiskQuota(ctx context.Context, meta *ContainerMeta, spec *SpecWrapper) error { - s := spec.s - - rootFSQuota := quota.GetDefaultQuota(meta.Config.DiskQuota) - if rootFSQuota == "" { - return nil - } - - qid := "0" - if meta.Config.QuotaID != "" { - qid = meta.Config.QuotaID - } - - if s.Hooks == nil { - s.Hooks = &specs.Hooks{} - } - if s.Hooks.Prestart == nil { - s.Hooks.Prestart = []specs.Hook{} - } - - target, err := os.Readlink(filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")) - if err != nil { - return err - } - - quotaPrestart := specs.Hook{ - Path: target, - Args: []string{"set-diskquota", meta.BaseFS, rootFSQuota, qid}, - } - s.Hooks.Prestart = append(s.Hooks.Prestart, quotaPrestart) - - return nil -} diff --git a/daemon/mgr/spec_cgroup_cpu.go b/daemon/mgr/spec_cgroup_cpu.go deleted file mode 100644 index 6a6d4711d..000000000 --- a/daemon/mgr/spec_cgroup_cpu.go +++ /dev/null @@ -1,52 +0,0 @@ -package mgr - -import ( - "context" - - specs "github.com/opencontainers/runtime-spec/specs-go" -) - -func setupCgroupCPUShare(ctx context.Context, meta *ContainerMeta, spec *SpecWrapper) error { - s := spec.s - if s.Linux.Resources.CPU == nil { - s.Linux.Resources.CPU = &specs.LinuxCPU{} - } - cpu := s.Linux.Resources.CPU - - v := uint64(meta.HostConfig.CPUShares) - cpu.Shares = &v - return nil -} - -func setupCgroupCPUSet(ctx context.Context, meta *ContainerMeta, spec *SpecWrapper) error { - s := spec.s - if s.Linux.Resources.CPU == nil { - s.Linux.Resources.CPU = &specs.LinuxCPU{} - } - cpu := s.Linux.Resources.CPU - cpu.Cpus = meta.HostConfig.CpusetCpus - cpu.Mems = meta.HostConfig.CpusetMems - return nil -} - -func setupCgroupCPUPeriod(ctx context.Context, meta *ContainerMeta, spec *SpecWrapper) error { - s := spec.s - if s.Linux.Resources.CPU 
== nil { - s.Linux.Resources.CPU = &specs.LinuxCPU{} - } - cpu := s.Linux.Resources.CPU - period := uint64(meta.HostConfig.CPUPeriod) - cpu.Period = &period - return nil -} - -func setupCgroupCPUQuota(ctx context.Context, meta *ContainerMeta, spec *SpecWrapper) error { - s := spec.s - if s.Linux.Resources.CPU == nil { - s.Linux.Resources.CPU = &specs.LinuxCPU{} - } - cpu := s.Linux.Resources.CPU - quota := meta.HostConfig.CPUQuota - cpu.Quota = "a - return nil -} diff --git a/daemon/mgr/spec_cgroup_memory.go b/daemon/mgr/spec_cgroup_memory.go deleted file mode 100644 index 41a0c9885..000000000 --- a/daemon/mgr/spec_cgroup_memory.go +++ /dev/null @@ -1,56 +0,0 @@ -package mgr - -import ( - "context" - - specs "github.com/opencontainers/runtime-spec/specs-go" -) - -func getCgroupMemory(s *specs.Spec) *specs.LinuxMemory { - if s.Linux.Resources.Memory == nil { - s.Linux.Resources.Memory = &specs.LinuxMemory{} - } - return s.Linux.Resources.Memory -} - -func setupCgroupMemory(ctx context.Context, meta *ContainerMeta, spec *SpecWrapper) error { - s := spec.s - mem := getCgroupMemory(s) - - v := meta.HostConfig.Memory - mem.Limit = &v - return nil -} - -func setupCgroupMemorySwap(ctx context.Context, meta *ContainerMeta, spec *SpecWrapper) error { - s := spec.s - mem := getCgroupMemory(s) - - v := meta.HostConfig.MemorySwap - mem.Swap = &v - return nil -} - -func setupCgroupMemorySwappiness(ctx context.Context, meta *ContainerMeta, spec *SpecWrapper) error { - s := spec.s - mem := getCgroupMemory(s) - - var v uint64 - if meta.HostConfig.MemorySwappiness != nil { - v = uint64(*(meta.HostConfig.MemorySwappiness)) - } - mem.Swappiness = &v - return nil -} - -func setupDisableOOMKill(ctx context.Context, meta *ContainerMeta, spec *SpecWrapper) error { - s := spec.s - mem := getCgroupMemory(s) - - var v bool - if meta.HostConfig.OomKillDisable != nil { - v = bool(*meta.HostConfig.OomKillDisable) - } - mem.DisableOOMKiller = &v - return nil -} diff --git 
a/daemon/mgr/spec_devices.go b/daemon/mgr/spec_devices.go deleted file mode 100644 index 67ea75d32..000000000 --- a/daemon/mgr/spec_devices.go +++ /dev/null @@ -1,130 +0,0 @@ -package mgr - -import ( - "context" - "fmt" - "os" - "path/filepath" - "strings" - - "github.com/alibaba/pouch/apis/opts" - - "github.com/opencontainers/runc/libcontainer/configs" - "github.com/opencontainers/runc/libcontainer/devices" - specs "github.com/opencontainers/runtime-spec/specs-go" -) - -func u32Ptr(i int64) *uint32 { u := uint32(i); return &u } -func fmPtr(i int64) *os.FileMode { fm := os.FileMode(i); return &fm } - -// linuxDevice convert a libcontainer configs.Device to a specs.LinuxDevice object. -func linuxDevice(d *configs.Device) specs.LinuxDevice { - return specs.LinuxDevice{ - Type: string(d.Type), - Path: d.Path, - Major: d.Major, - Minor: d.Minor, - FileMode: fmPtr(int64(d.FileMode)), - UID: u32Ptr(int64(d.Uid)), - GID: u32Ptr(int64(d.Gid)), - } -} - -func deviceCgroup(d *configs.Device) specs.LinuxDeviceCgroup { - t := string(d.Type) - return specs.LinuxDeviceCgroup{ - Allow: true, - Type: t, - Major: &d.Major, - Minor: &d.Minor, - Access: d.Permissions, - } -} - -func devicesFromPath(pathOnHost, pathInContainer, cgroupPermissions string) (devs []specs.LinuxDevice, devPermissions []specs.LinuxDeviceCgroup, err error) { - resolvedPathOnHost := pathOnHost - - // check if it is a symbolic link - if src, e := os.Lstat(pathOnHost); e == nil && src.Mode()&os.ModeSymlink == os.ModeSymlink { - if linkedPathOnHost, e := filepath.EvalSymlinks(pathOnHost); e == nil { - resolvedPathOnHost = linkedPathOnHost - } - } - - device, err := devices.DeviceFromPath(resolvedPathOnHost, cgroupPermissions) - if err == nil { - device.Path = pathInContainer - return append(devs, linuxDevice(device)), append(devPermissions, deviceCgroup(device)), nil - } - - // if the device is not a device node - // try to see if it's a directory holding many devices - if err == devices.ErrNotADevice { - - // 
check if it is a directory - if src, e := os.Stat(resolvedPathOnHost); e == nil && src.IsDir() { - - // mount the internal devices recursively - filepath.Walk(resolvedPathOnHost, func(dpath string, f os.FileInfo, e error) error { - childDevice, e := devices.DeviceFromPath(dpath, cgroupPermissions) - if e != nil { - // ignore the device - return nil - } - - // add the device to userSpecified devices - childDevice.Path = strings.Replace(dpath, resolvedPathOnHost, pathInContainer, 1) - devs = append(devs, linuxDevice(childDevice)) - devPermissions = append(devPermissions, deviceCgroup(childDevice)) - - return nil - }) - } - } - - if len(devs) > 0 { - return devs, devPermissions, nil - } - - return devs, devPermissions, fmt.Errorf("error gathering device information while adding custom device %q: %s", pathOnHost, err) -} - -func setupDevices(ctx context.Context, meta *ContainerMeta, spec *SpecWrapper) error { - var devs []specs.LinuxDevice - s := spec.s - devPermissions := s.Linux.Resources.Devices - if meta.HostConfig.Privileged { - hostDevices, err := devices.HostDevices() - if err != nil { - return err - } - for _, d := range hostDevices { - devs = append(devs, linuxDevice(d)) - } - devPermissions = []specs.LinuxDeviceCgroup{ - { - Allow: true, - Access: "rwm", - }, - } - } else { - for _, deviceMapping := range meta.HostConfig.Devices { - if !opts.ValidateDeviceMode(deviceMapping.CgroupPermissions) { - return fmt.Errorf("%s invalid device mode: %s", deviceMapping.PathOnHost, deviceMapping.CgroupPermissions) - } - d, dPermissions, err := devicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions) - if err != nil { - return err - } - devs = append(devs, d...) - devPermissions = append(devPermissions, dPermissions...) - } - } - - if s.Linux.Devices == nil { - s.Linux.Devices = []specs.LinuxDevice{} - } - s.Linux.Devices = append(s.Linux.Devices, devs...) 
- s.Linux.Resources.Devices = devPermissions - return nil -} diff --git a/daemon/mgr/spec_hook.go b/daemon/mgr/spec_hook.go index 3fc7c3b4c..6dbe43fcb 100644 --- a/daemon/mgr/spec_hook.go +++ b/daemon/mgr/spec_hook.go @@ -2,53 +2,72 @@ package mgr import ( "context" + "os" + "path/filepath" "sort" + "strconv" "strings" + "github.com/alibaba/pouch/storage/quota" specs "github.com/opencontainers/runtime-spec/specs-go" ) //setup hooks specified by user via plugins, if set rich mode and init-script exists set init-script -func setupHook(ctx context.Context, c *ContainerMeta, spec *SpecWrapper) error { - // if no init script specified and no hook plugin setup, skip it - if (!c.Config.Rich || c.Config.InitScript == "") && len(spec.argsArr) == 0 { - return nil - } - - if spec.s.Hooks == nil { - spec.s.Hooks = &specs.Hooks{} - } - - if spec.s.Hooks.Prestart == nil { - spec.s.Hooks.Prestart = []specs.Hook{} +func setupHook(ctx context.Context, c *ContainerMeta, specWrapper *SpecWrapper) error { + s := specWrapper.s + if s.Hooks == nil { + s.Hooks = &specs.Hooks{ + Prestart: []specs.Hook{}, + Poststart: []specs.Hook{}, + Poststop: []specs.Hook{}, + } } - if len(spec.argsArr) > 0 { + // setup plugin hook, if no hook plugin setup, skip this part. + argsArr := specWrapper.argsArr + prioArr := specWrapper.prioArr + if len(argsArr) > 0 { var hookArr []*wrapperEmbedPrestart - for i, hook := range spec.s.Hooks.Prestart { + for i, hook := range s.Hooks.Prestart { hookArr = append(hookArr, &wrapperEmbedPrestart{-i, append([]string{hook.Path}, hook.Args...)}) } - priorityArr := spec.prioArr - argsArr := spec.argsArr - for i, p := range priorityArr { + for i, p := range prioArr { hookArr = append(hookArr, &wrapperEmbedPrestart{p, argsArr[i]}) } sortedArr := hookArray(hookArr) sort.Sort(sortedArr) - spec.s.Hooks.Prestart = sortedArr.toOciPrestartHook() + s.Hooks.Prestart = append(s.Hooks.Prestart, sortedArr.toOciPrestartHook()...) 
} - args := strings.Fields(c.Config.InitScript) - if len(args) == 0 { - return nil + // setup rich mode container hook, if rich mode is not enabled or no init script is specified, skip this part. + if c.Config.Rich && c.Config.InitScript != "" { + args := strings.Fields(c.Config.InitScript) + if len(args) > 0 { + s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{ + Path: args[0], + Args: args[1:], + }) + } } - preStartHook := specs.Hook{ - Path: args[0], - Args: args[1:], - } + // setup diskquota hook, if rootFSQuota is not set, skip this part. + rootFSQuota := quota.GetDefaultQuota(c.Config.DiskQuota) + if rootFSQuota != "" { + qid := "0" + if c.Config.QuotaID != "" { + qid = c.Config.QuotaID + } - spec.s.Hooks.Prestart = append(spec.s.Hooks.Prestart, preStartHook) + target, err := os.Readlink(filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")) + if err != nil { + return err + } + + s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{ + Path: target, + Args: []string{"set-diskquota", c.BaseFS, rootFSQuota, qid}, + }) + } return nil } diff --git a/daemon/mgr/spec_linux.go b/daemon/mgr/spec_linux.go index 1603e7108..ddbde1916 100644 --- a/daemon/mgr/spec_linux.go +++ b/daemon/mgr/spec_linux.go @@ -6,12 +6,21 @@ import ( "fmt" "io/ioutil" "os" + "path/filepath" + "strconv" + "strings" + "syscall" + + "github.com/alibaba/pouch/apis/opts" + "github.com/alibaba/pouch/apis/types" "github.com/containerd/containerd/contrib/seccomp" - "github.com/docker/docker/daemon/caps" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" specs "github.com/opencontainers/runtime-spec/specs-go" ) +// FIXME: these constants have no relation with spec, move them. const ( // ProfileNamePrefix is the prefix for loading profiles on a localhost. Eg. localhost/profileName. ProfileNamePrefix = "localhost/" @@ -26,72 +35,83 @@ const ( ) // Setup linux-platform-sepecific specification.
+func populatePlatform(ctx context.Context, c *ContainerMeta, specWrapper *SpecWrapper) error { + s := specWrapper.s + if s.Linux == nil { + s.Linux = &specs.Linux{} + } -func setupSysctl(ctx context.Context, meta *ContainerMeta, spec *SpecWrapper) error { - spec.s.Linux.Sysctl = meta.HostConfig.Sysctls - return nil -} + // same with containerd use. or make it a variable + cgroupsParent := "default" + if c.HostConfig.CgroupParent != "" { + cgroupsParent = c.HostConfig.CgroupParent + } + + // cgroupsPath must be absolute path + // call filepath.Clean is to avoid bad + // path just like../../../.../../BadPath + if !filepath.IsAbs(cgroupsParent) { + cgroupsParent = filepath.Clean("/" + cgroupsParent) + } + s.Linux.CgroupsPath = filepath.Join(cgroupsParent, c.ID) -// isAppArmorEnabled returns true if apparmor is enabled for the host. -// This function is forked from -// https://github.com/opencontainers/runc/blob/1a81e9ab1f138c091fe5c86d0883f87716088527/libcontainer/apparmor/apparmor.go -// to avoid the libapparmor dependency. -func isAppArmorEnabled() bool { - if _, err := os.Stat("/sys/kernel/security/apparmor"); err == nil && os.Getenv("container") == "" { - if _, err = os.Stat("/sbin/apparmor_parser"); err == nil { - buf, err := ioutil.ReadFile("/sys/module/apparmor/parameters/enabled") - return err == nil && len(buf) > 1 && buf[0] == 'Y' + s.Linux.Sysctl = c.HostConfig.Sysctls + + if c.HostConfig.IntelRdtL3Cbm != "" { + s.Linux.IntelRdt = &specs.LinuxIntelRdt{ + L3CacheSchema: c.HostConfig.IntelRdtL3Cbm, } } - return false -} -func setupAppArmor(ctx context.Context, meta *ContainerMeta, spec *SpecWrapper) error { - if !isAppArmorEnabled() { - // Return if the apparmor is disabled. 
- return nil + // setup fields that depend on whether the container is privileged + if !c.HostConfig.Privileged { + s.Linux.MountLabel = c.MountLabel + } else { + s.Linux.ReadonlyPaths = nil + s.Linux.MaskedPaths = nil } - appArmorProfile := meta.AppArmorProfile - switch appArmorProfile { - case ProfileNameUnconfined: - return nil - case ProfileRuntimeDefault: - // TODO: handle runtime default case. - return nil - case "": - if meta.HostConfig.Privileged { - return nil - } - // TODO: if user does not specify the AppArmor and the container is not in privilege mode, - // we need to specify it as default case, handle it later. - return nil - default: - spec.s.Process.ApparmorProfile = appArmorProfile + // start to setup linux seccomp + if err := setupSeccomp(ctx, c, s); err != nil { + return err + } + + // start to setup linux resource + if err := setupResource(ctx, c, s); err != nil { + return err + } + + // start to setup linux namespace + if err := setupNamespaces(ctx, c, specWrapper); err != nil { + return err } return nil } -func setupSeccomp(ctx context.Context, meta *ContainerMeta, spec *SpecWrapper) error { - if meta.HostConfig.Privileged { +// setupSeccomp creates seccomp security settings spec. +func setupSeccomp(ctx context.Context, c *ContainerMeta, s *specs.Spec) error { + if c.HostConfig.Privileged { return nil } + if s.Linux.Seccomp == nil { + s.Linux.Seccomp = &specs.LinuxSeccomp{} + } + // TODO: check whether seccomp is enable in your kernel, if not, cannot run a custom seccomp prifle.
- seccompProfile := meta.SeccompProfile + seccompProfile := c.SeccompProfile switch seccompProfile { case ProfileNameUnconfined: return nil case ProfilePouchDefault, "": - spec.s.Linux.Seccomp = seccomp.DefaultProfile(spec.s) + s.Linux.Seccomp = seccomp.DefaultProfile(s) default: - spec.s.Linux.Seccomp = &specs.LinuxSeccomp{} data, err := ioutil.ReadFile(seccompProfile) if err != nil { return fmt.Errorf("failed to load seccomp profile %q: %v", seccompProfile, err) } - err = json.Unmarshal(data, spec.s.Linux.Seccomp) + err = json.Unmarshal(data, s.Linux.Seccomp) if err != nil { return fmt.Errorf("failed to decode seccomp profile %q: %v", seccompProfile, err) } @@ -100,41 +120,493 @@ func setupSeccomp(ctx context.Context, meta *ContainerMeta, spec *SpecWrapper) e return nil } -func setupSELinux(ctx context.Context, meta *ContainerMeta, spec *SpecWrapper) error { - if !meta.HostConfig.Privileged { - spec.s.Process.SelinuxLabel = meta.ProcessLabel - spec.s.Linux.MountLabel = meta.MountLabel +// setupResource creates linux resource spec. +func setupResource(ctx context.Context, c *ContainerMeta, s *specs.Spec) error { + if s.Linux.Resources == nil { + s.Linux.Resources = &specs.LinuxResources{} + } + + // start to setup cpu and memory cgroup + setupCPU(ctx, c.HostConfig.Resources, s) + setupMemory(ctx, c.HostConfig.Resources, s) + + // start to setup blkio cgroup + if err := setupBlkio(ctx, c.HostConfig.Resources, s); err != nil { + return err + } + + // start to setup device cgroup + if err := setupDevices(ctx, c, s); err != nil { + return err + } + + //TODO: need to support Pids, HugepageLimits, Network cgroup settings + + return nil +} + +// setupBlkio creates linux blkio resource spec.
+func setupBlkio(ctx context.Context, r types.Resources, s *specs.Spec) error { + weightDevice, err := getWeightDevice(r.BlkioWeightDevice) + if err != nil { + return err + } + readBpsDevice, err := getThrottleDevice(r.BlkioDeviceReadBps) + if err != nil { + return err + } + writeBpsDevice, err := getThrottleDevice(r.BlkioDeviceWriteBps) + if err != nil { + return err + } + readIOpsDevice, err := getThrottleDevice(r.BlkioDeviceReadIOps) + if err != nil { + return err + } + writeIOpsDevice, err := getThrottleDevice(r.BlkioDeviceWriteIOps) + if err != nil { + return err + } + + s.Linux.Resources.BlockIO = &specs.LinuxBlockIO{ + Weight: &r.BlkioWeight, + WeightDevice: weightDevice, + ThrottleReadBpsDevice: readBpsDevice, + ThrottleReadIOPSDevice: readIOpsDevice, + ThrottleWriteBpsDevice: writeBpsDevice, + ThrottleWriteIOPSDevice: writeIOpsDevice, + } + + return nil +} + +func getWeightDevice(devs []*types.WeightDevice) ([]specs.LinuxWeightDevice, error) { + var stat syscall.Stat_t + var weightDevice []specs.LinuxWeightDevice + + for _, dev := range devs { + if err := syscall.Stat(dev.Path, &stat); err != nil { + return nil, err + } + + d := specs.LinuxWeightDevice{ + Weight: &dev.Weight, + } + d.Major = int64(stat.Rdev / 256) + d.Minor = int64(stat.Rdev % 256) + weightDevice = append(weightDevice, d) + } + + return weightDevice, nil +} + +func getThrottleDevice(devs []*types.ThrottleDevice) ([]specs.LinuxThrottleDevice, error) { + var stat syscall.Stat_t + var ThrottleDevice []specs.LinuxThrottleDevice + + for _, dev := range devs { + if err := syscall.Stat(dev.Path, &stat); err != nil { + return nil, err + } + + d := specs.LinuxThrottleDevice{ + Rate: dev.Rate, + } + d.Major = int64(stat.Rdev / 256) + d.Minor = int64(stat.Rdev % 256) + ThrottleDevice = append(ThrottleDevice, d) + } + + return ThrottleDevice, nil +} + +// setupCPU creates linux cpu resource spec. +func setupCPU(ctx context.Context, r types.Resources, s *specs.Spec) { + cpu := &specs.LinuxCPU{ +
Cpus: r.CpusetCpus, + Mems: r.CpusetMems, + } + + if r.CPUShares != 0 { + v := uint64(r.CPUShares) + cpu.Shares = &v + } + + if r.CPUPeriod != 0 { + v := uint64(r.CPUPeriod) + cpu.Period = &v + } + + if r.CPUQuota != 0 { + v := int64(r.CPUQuota) + cpu.Quota = &v + } + + s.Linux.Resources.CPU = cpu +} + +// setupMemory creates linux memory resource spec. +func setupMemory(ctx context.Context, r types.Resources, s *specs.Spec) { + memory := &specs.LinuxMemory{} + if r.Memory != 0 { + v := r.Memory + memory.Limit = &v + } + + if r.MemorySwap != 0 { + v := r.MemorySwap + memory.Swap = &v + } + + if r.MemorySwappiness != nil { + v := uint64(*r.MemorySwappiness) + memory.Swappiness = &v + } + + if r.OomKillDisable != nil { + v := bool(*r.OomKillDisable) + memory.DisableOOMKiller = &v } + s.Linux.Resources.Memory = memory +} + +// setupDevices creates linux device resource spec. +func setupDevices(ctx context.Context, c *ContainerMeta, s *specs.Spec) error { + var devs []specs.LinuxDevice + devPermissions := s.Linux.Resources.Devices + if c.HostConfig.Privileged { + hostDevices, err := devices.HostDevices() + if err != nil { + return err + } + for _, d := range hostDevices { + devs = append(devs, linuxDevice(d)) + } + devPermissions = []specs.LinuxDeviceCgroup{ + { + Allow: true, + Access: "rwm", + }, + } + } else { + for _, deviceMapping := range c.HostConfig.Devices { + if !opts.ValidateDeviceMode(deviceMapping.CgroupPermissions) { + return fmt.Errorf("%s invalid device mode: %s", deviceMapping.PathOnHost, deviceMapping.CgroupPermissions) + } + d, dPermissions, err := devicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions) + if err != nil { + return err + } + devs = append(devs, d...) + devPermissions = append(devPermissions, dPermissions...) + } + } + + s.Linux.Devices = append(s.Linux.Devices, devs...) + s.Linux.Resources.Devices = append(s.Linux.Resources.Devices, devPermissions...)
return nil } -func setupCapabilities(ctx context.Context, meta *ContainerMeta, spec *SpecWrapper) error { - var caplist []string - var err error +func u32Ptr(i int64) *uint32 { u := uint32(i); return &u } +func fmPtr(i int64) *os.FileMode { fm := os.FileMode(i); return &fm } + +// linuxDevice convert a libcontainer configs.Device to a specs.LinuxDevice object. +func linuxDevice(d *configs.Device) specs.LinuxDevice { + return specs.LinuxDevice{ + Type: string(d.Type), + Path: d.Path, + Major: d.Major, + Minor: d.Minor, + FileMode: fmPtr(int64(d.FileMode)), + UID: u32Ptr(int64(d.Uid)), + GID: u32Ptr(int64(d.Gid)), + } +} + +func deviceCgroup(d *configs.Device) specs.LinuxDeviceCgroup { + t := string(d.Type) + return specs.LinuxDeviceCgroup{ + Allow: true, + Type: t, + Major: &d.Major, + Minor: &d.Minor, + Access: d.Permissions, + } +} + +func devicesFromPath(pathOnHost, pathInContainer, cgroupPermissions string) (devs []specs.LinuxDevice, devPermissions []specs.LinuxDeviceCgroup, err error) { + resolvedPathOnHost := pathOnHost + + // check if it is a symbolic link + if src, e := os.Lstat(pathOnHost); e == nil && src.Mode()&os.ModeSymlink == os.ModeSymlink { + if linkedPathOnHost, e := filepath.EvalSymlinks(pathOnHost); e == nil { + resolvedPathOnHost = linkedPathOnHost + } + } + + device, err := devices.DeviceFromPath(resolvedPathOnHost, cgroupPermissions) + if err == nil { + device.Path = pathInContainer + return append(devs, linuxDevice(device)), append(devPermissions, deviceCgroup(device)), nil + } + + // if the device is not a device node + // try to see if it's a directory holding many devices + if err == devices.ErrNotADevice { + + // check if it is a directory + if src, e := os.Stat(resolvedPathOnHost); e == nil && src.IsDir() { + + // mount the internal devices recursively + filepath.Walk(resolvedPathOnHost, func(dpath string, f os.FileInfo, e error) error { + childDevice, e := devices.DeviceFromPath(dpath, cgroupPermissions) + if e != nil { + // ignore the 
device + return nil + } + + // add the device to userSpecified devices + childDevice.Path = strings.Replace(dpath, resolvedPathOnHost, pathInContainer, 1) + devs = append(devs, linuxDevice(childDevice)) + devPermissions = append(devPermissions, deviceCgroup(childDevice)) + + return nil + }) + } + } + + if len(devs) > 0 { + return devs, devPermissions, nil + } + + return devs, devPermissions, fmt.Errorf("error gathering device information while adding custom device %q: %s", pathOnHost, err) +} + +// setupNamespaces creates linux namespaces spec. +func setupNamespaces(ctx context.Context, c *ContainerMeta, specWrapper *SpecWrapper) error { + // create user namespace spec + if err := setupUserNamespace(ctx, c, specWrapper); err != nil { + return err + } + + // create network namespace spec + if err := setupNetworkNamespace(ctx, c, specWrapper); err != nil { + return err + } + + // create ipc namespace spec + if err := setupIpcNamespace(ctx, c, specWrapper); err != nil { + return err + } + + // create pid namespace spec + if err := setupPidNamespace(ctx, c, specWrapper); err != nil { + return err + } - capabilities := spec.s.Process.Capabilities - if meta.HostConfig.Privileged { - caplist = caps.GetAllCapabilities() - } else if caplist, err = caps.TweakCapabilities(capabilities.Effective, meta.HostConfig.CapAdd, meta.HostConfig.CapDrop); err != nil { + // create uts namespace spec + if err := setupUtsNamespace(ctx, c, specWrapper); err != nil { return err } - capabilities.Effective = caplist - capabilities.Bounding = caplist - capabilities.Permitted = caplist - capabilities.Inheritable = caplist return nil } -func setupIntelRdt(ctx context.Context, meta *ContainerMeta, spec *SpecWrapper) error { - s := spec.s +// isEmpty indicates whether namespace mode is empty. 
// isContainer reports whether mode has the form "container:<name-or-id>",
// i.e. the namespace of another container is to be used.
func isContainer(mode string) bool {
	return strings.HasPrefix(mode, "container:")
}

// connectedContainer returns whatever follows the first ':' in mode, which for
// a "container:<name-or-id>" mode is the name or id of the container whose
// namespace is shared. It returns "" when mode contains no ':'.
func connectedContainer(mode string) string {
	sep := strings.Index(mode, ":")
	if sep < 0 {
		return ""
	}
	return mode[sep+1:]
}
+ + return c, nil +} + +func getPidContainer(ctx context.Context, mgr ContainerMgr, id string) (*ContainerMeta, error) { + // Check the container exists. + c, err := mgr.Get(ctx, id) + if err != nil { + return nil, fmt.Errorf("can't join PID namespace of %q: %v", id, err) + } + + // TODO: check whether the container is running and not restarting. + + return c, nil +} + +// TODO +func setupUserNamespace(ctx context.Context, c *ContainerMeta, specWrapper *SpecWrapper) error { + return nil +} + +func setupNetworkNamespace(ctx context.Context, c *ContainerMeta, specWrapper *SpecWrapper) error { + if c.Config.NetworkDisabled { + return nil + } + + s := specWrapper.s + ns := specs.LinuxNamespace{Type: specs.NetworkNamespace} + + networkMode := c.HostConfig.NetworkMode + if IsContainer(networkMode) { + origContainer, err := specWrapper.ctrMgr.Get(ctx, strings.SplitN(networkMode, ":", 2)[1]) + if err != nil { + return err + } + if c.ID == origContainer.ID { + return fmt.Errorf("can not join own network") + } else if origContainer.State.Status != types.StatusRunning { + return fmt.Errorf("can not join network of a non running container: %s", origContainer.ID) + } + + ns.Path = fmt.Sprintf("/proc/%d/ns/net", origContainer.State.Pid) + } else if IsHost(networkMode) { + ns.Path = c.NetworkSettings.SandboxKey + } + setNamespace(s, ns) + + for _, ns := range s.Linux.Namespaces { + if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled { + target, err := os.Readlink(filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")) + if err != nil { + return err + } + + netnsPrestart := specs.Hook{ + Path: target, + Args: []string{"libnetwork-setkey", c.ID, specWrapper.netMgr.Controller().ID()}, + } + s.Hooks.Prestart = append(s.Hooks.Prestart, netnsPrestart) + } + } + return nil +} + +func setupIpcNamespace(ctx context.Context, c *ContainerMeta, specWrapper *SpecWrapper) error { + s := specWrapper.s + ipcMode := c.HostConfig.IpcMode + switch { + case isContainer(ipcMode): 
+ ns := specs.LinuxNamespace{Type: specs.IPCNamespace} + c, err := getIpcContainer(ctx, specWrapper.ctrMgr, connectedContainer(ipcMode)) + if err != nil { + return fmt.Errorf("setup container ipc namespace mode failed: %v", err) } + ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", c.State.Pid) + setNamespace(s, ns) + case isHost(ipcMode): + removeNamespace(s, specs.IPCNamespace) + default: + ns := specs.LinuxNamespace{Type: specs.IPCNamespace} + setNamespace(s, ns) } + return nil +} +func setupPidNamespace(ctx context.Context, c *ContainerMeta, specWrapper *SpecWrapper) error { + s := specWrapper.s + pidMode := c.HostConfig.PidMode + switch { + case isContainer(pidMode): + ns := specs.LinuxNamespace{Type: specs.PIDNamespace} + c, err := getPidContainer(ctx, specWrapper.ctrMgr, connectedContainer(pidMode)) + if err != nil { + return fmt.Errorf("setup container pid namespace mode failed: %v", err) + } + ns.Path = fmt.Sprintf("/proc/%d/ns/pid", c.State.Pid) + setNamespace(s, ns) + case isHost(pidMode): + removeNamespace(s, specs.PIDNamespace) + default: + ns := specs.LinuxNamespace{Type: specs.PIDNamespace} + setNamespace(s, ns) + } return nil } + +func setupUtsNamespace(ctx context.Context, c *ContainerMeta, specWrapper *SpecWrapper) error { + s := specWrapper.s + utsMode := c.HostConfig.UTSMode + switch { + case isHost(utsMode): + removeNamespace(s, specs.UTSNamespace) + // remove hostname + s.Hostname = "" + default: + ns := specs.LinuxNamespace{Type: specs.UTSNamespace} + setNamespace(s, ns) + } + return nil +} + +func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) { + for i, n := range s.Linux.Namespaces { + if n.Type == ns.Type { + s.Linux.Namespaces[i] = ns + return + } + } + s.Linux.Namespaces = append(s.Linux.Namespaces, ns) +} + +func removeNamespace(s *specs.Spec, nsType specs.LinuxNamespaceType) { + for i, n := range s.Linux.Namespaces { + if n.Type == nsType { + s.Linux.Namespaces = append(s.Linux.Namespaces[:i], s.Linux.Namespaces[i+1:]...) 
+ return + } + } +} diff --git a/daemon/mgr/spec_volume.go b/daemon/mgr/spec_mount.go similarity index 90% rename from daemon/mgr/spec_volume.go rename to daemon/mgr/spec_mount.go index e3e6ab4a1..04cc6a99c 100644 --- a/daemon/mgr/spec_volume.go +++ b/daemon/mgr/spec_mount.go @@ -17,8 +17,8 @@ func clearReadonly(m *specs.Mount) { m.Options = opts } -func setupMounts(ctx context.Context, c *ContainerMeta, spec *SpecWrapper) error { - s := spec.s +// setupMounts create mount spec. +func setupMounts(ctx context.Context, c *ContainerMeta, s *specs.Spec) error { mounts := s.Mounts if c.HostConfig == nil { return nil @@ -73,8 +73,6 @@ func setupMounts(ctx context.Context, c *ContainerMeta, spec *SpecWrapper) error } } } - s.Linux.ReadonlyPaths = nil - s.Linux.MaskedPaths = nil } return nil } diff --git a/daemon/mgr/spec_namespace.go b/daemon/mgr/spec_namespace.go deleted file mode 100644 index a660fd540..000000000 --- a/daemon/mgr/spec_namespace.go +++ /dev/null @@ -1,217 +0,0 @@ -package mgr - -import ( - "context" - "fmt" - "os" - "path/filepath" - "strconv" - "strings" - - "github.com/alibaba/pouch/apis/types" - specs "github.com/opencontainers/runtime-spec/specs-go" -) - -// isEmpty indicates whether namespace mode is empty. -func isEmpty(mode string) bool { - return mode == "" -} - -// isNone indicates whether container's namespace mode is set to "none". -func isNone(mode string) bool { - return mode == "none" -} - -// isHost indicates whether the container shares the host's corresponding namespace. -func isHost(mode string) bool { - return mode == "host" -} - -// isShareable indicates whether the containers namespace can be shared with another container. -func isShareable(mode string) bool { - return mode == "shareable" -} - -// isContainer indicates whether the container uses another container's corresponding namespace. 
-func isContainer(mode string) bool { - parts := strings.SplitN(mode, ":", 2) - return len(parts) > 1 && parts[0] == "container" -} - -// isPrivate indicates whether the container uses its own namespace. -func isPrivate(ns specs.LinuxNamespaceType, mode string) bool { - switch ns { - case specs.IPCNamespace: - return mode == "private" - case specs.NetworkNamespace, specs.PIDNamespace: - return !(isHost(mode) || isContainer(mode)) - case specs.UserNamespace, specs.UTSNamespace: - return !(isHost(mode)) - } - return false -} - -// connectedContainer is the id or name of the container whose namespace this container share with. -func connectedContainer(mode string) string { - parts := strings.SplitN(mode, ":", 2) - if len(parts) == 2 { - return parts[1] - } - return "" -} - -func getIpcContainer(ctx context.Context, mgr ContainerMgr, id string) (*ContainerMeta, error) { - // Check whether the container exists. - c, err := mgr.Get(ctx, id) - if err != nil { - return nil, fmt.Errorf("can't join IPC namespace of container %q: %v", id, err) - } - - // TODO: check whether the container is running and not restarting. - - // TODO: check whether the container's ipc namespace is shareable. - - return c, nil -} - -func getPidContainer(ctx context.Context, mgr ContainerMgr, id string) (*ContainerMeta, error) { - // Check the container exists. - c, err := mgr.Get(ctx, id) - if err != nil { - return nil, fmt.Errorf("can't join PID namespace of %q: %v", id, err) - } - - // TODO: check whether the container is running and not restarting. 
- - return c, nil -} - -// TODO -func setupUserNamespace(ctx context.Context, meta *ContainerMeta, spec *SpecWrapper) error { - return nil -} - -func setupNetworkNamespace(ctx context.Context, meta *ContainerMeta, spec *SpecWrapper) error { - if meta.Config.NetworkDisabled { - return nil - } - - s := spec.s - ns := specs.LinuxNamespace{Type: specs.NetworkNamespace} - - networkMode := meta.HostConfig.NetworkMode - if IsContainer(networkMode) { - origContainer, err := spec.ctrMgr.Get(ctx, strings.SplitN(networkMode, ":", 2)[1]) - if err != nil { - return err - } - if meta.ID == origContainer.ID { - return fmt.Errorf("can not join own network") - } else if origContainer.State.Status != types.StatusRunning { - return fmt.Errorf("can not join network of a non running container: %s", origContainer.ID) - } - - ns.Path = fmt.Sprintf("/proc/%d/ns/net", origContainer.State.Pid) - } else if IsHost(networkMode) { - ns.Path = meta.NetworkSettings.SandboxKey - } - setNamespace(s, ns) - - if s.Hooks == nil { - s.Hooks = &specs.Hooks{} - } - if s.Hooks.Prestart == nil { - s.Hooks.Prestart = []specs.Hook{} - } - - for _, ns := range s.Linux.Namespaces { - if ns.Type == "network" && ns.Path == "" && !meta.Config.NetworkDisabled { - target, err := os.Readlink(filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")) - if err != nil { - return err - } - - netnsPrestart := specs.Hook{ - Path: target, - Args: []string{"libnetwork-setkey", meta.ID, spec.netMgr.Controller().ID()}, - } - s.Hooks.Prestart = append(s.Hooks.Prestart, netnsPrestart) - } - } - return nil -} - -func setupIpcNamespace(ctx context.Context, meta *ContainerMeta, spec *SpecWrapper) error { - ipcMode := meta.HostConfig.IpcMode - switch { - case isContainer(ipcMode): - ns := specs.LinuxNamespace{Type: specs.IPCNamespace} - c, err := getIpcContainer(ctx, spec.ctrMgr, connectedContainer(ipcMode)) - if err != nil { - return fmt.Errorf("setup container ipc namespace mode failed: %v", err) - } - ns.Path = 
fmt.Sprintf("/proc/%d/ns/ipc", c.State.Pid) - setNamespace(spec.s, ns) - case isHost(ipcMode): - removeNamespace(spec.s, specs.IPCNamespace) - default: - ns := specs.LinuxNamespace{Type: specs.IPCNamespace} - setNamespace(spec.s, ns) - } - return nil -} - -func setupPidNamespace(ctx context.Context, meta *ContainerMeta, spec *SpecWrapper) error { - pidMode := meta.HostConfig.PidMode - switch { - case isContainer(pidMode): - ns := specs.LinuxNamespace{Type: specs.PIDNamespace} - c, err := getPidContainer(ctx, spec.ctrMgr, connectedContainer(pidMode)) - if err != nil { - return fmt.Errorf("setup container pid namespace mode failed: %v", err) - } - ns.Path = fmt.Sprintf("/proc/%d/ns/pid", c.State.Pid) - setNamespace(spec.s, ns) - case isHost(pidMode): - removeNamespace(spec.s, specs.PIDNamespace) - default: - ns := specs.LinuxNamespace{Type: specs.PIDNamespace} - setNamespace(spec.s, ns) - } - return nil -} - -func setupUtsNamespace(ctx context.Context, meta *ContainerMeta, spec *SpecWrapper) error { - utsMode := meta.HostConfig.UTSMode - switch { - case isHost(utsMode): - removeNamespace(spec.s, specs.UTSNamespace) - default: - ns := specs.LinuxNamespace{Type: specs.UTSNamespace} - setNamespace(spec.s, ns) - // set hostname - if hostname := meta.Config.Hostname.String(); hostname != "" { - spec.s.Hostname = hostname - } - } - return nil -} - -func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) { - for i, n := range s.Linux.Namespaces { - if n.Type == ns.Type { - s.Linux.Namespaces[i] = ns - return - } - } - s.Linux.Namespaces = append(s.Linux.Namespaces, ns) -} - -func removeNamespace(s *specs.Spec, nsType specs.LinuxNamespaceType) { - for i, n := range s.Linux.Namespaces { - if n.Type == nsType { - s.Linux.Namespaces = append(s.Linux.Namespaces[:i], s.Linux.Namespaces[i+1:]...) 
- return - } - } -} diff --git a/daemon/mgr/spec_network.go b/daemon/mgr/spec_network.go deleted file mode 100644 index 88feb7717..000000000 --- a/daemon/mgr/spec_network.go +++ /dev/null @@ -1,14 +0,0 @@ -package mgr - -import ( - "context" -) - -func setupNetwork(ctx context.Context, c *ContainerMeta, spec *SpecWrapper) error { - s := spec.s - - s.Hostname = c.Config.Hostname.String() - //TODO setup network parameters - - return nil -} diff --git a/daemon/mgr/spec_process.go b/daemon/mgr/spec_process.go index ca250683d..f231d1b94 100644 --- a/daemon/mgr/spec_process.go +++ b/daemon/mgr/spec_process.go @@ -2,67 +2,72 @@ package mgr import ( "context" + "io/ioutil" "os" + "github.com/alibaba/pouch/apis/types" "github.com/alibaba/pouch/pkg/user" + + "github.com/docker/docker/daemon/caps" specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" ) -func setupCap(ctx context.Context, c *ContainerMeta, spec *SpecWrapper) error { - //TODO setup capabilities - return nil -} +// setupProcess setups spec process. +func setupProcess(ctx context.Context, c *ContainerMeta, s *specs.Spec) error { + if s.Process == nil { + s.Process = &specs.Process{} + } + config := c.Config -func setupProcessArgs(ctx context.Context, c *ContainerMeta, spec *SpecWrapper) error { - s := spec.s - args := c.Config.Entrypoint - if args == nil { - args = []string{} + cwd := config.WorkingDir + if cwd == "" { + cwd = "/" } - if len(c.Config.Cmd) > 0 { - args = append(args, c.Config.Cmd...) + + s.Process.Args = append(config.Entrypoint, config.Cmd...) + s.Process.Env = append(s.Process.Env, createEnvironment(c)...) 
+ s.Process.Cwd = cwd + s.Process.Terminal = config.Tty + + if s.Process.Terminal { + s.Process.Env = append(s.Process.Env, "TERM=xterm") } - s.Process.Args = args - return nil -} -func setupProcessEnv(ctx context.Context, c *ContainerMeta, spec *SpecWrapper) error { - s := spec.s - if s.Process.Env == nil { - s.Process.Env = c.Config.Env - } else { - s.Process.Env = append(s.Process.Env, c.Config.Env...) + if !c.HostConfig.Privileged { + s.Process.SelinuxLabel = c.ProcessLabel + s.Process.NoNewPrivileges = c.NoNewPrivileges + } - //set env for rich container mode - s.Process.Env = append(s.Process.Env, richContainerModeEnv(c)...) + if err := setupUser(ctx, c, s); err != nil { + return err + } - return nil -} + if c.HostConfig.OomScoreAdj != 0 { + v := int(c.HostConfig.OomScoreAdj) + s.Process.OOMScoreAdj = &v + } -func setupProcessCwd(ctx context.Context, c *ContainerMeta, spec *SpecWrapper) error { - s := spec.s - if c.Config.WorkingDir == "" { - s.Process.Cwd = "/" - } else { - s.Process.Cwd = c.Config.WorkingDir + if err := setupCapabilities(ctx, c.HostConfig, s); err != nil { + return err } - return nil -} -func setupProcessTTY(ctx context.Context, c *ContainerMeta, spec *SpecWrapper) error { - s := spec.s - s.Process.Terminal = c.Config.Tty - if s.Process.Env != nil { - s.Process.Env = append(s.Process.Env, "TERM=xterm") - } else { - s.Process.Env = []string{"TERM=xterm"} + if err := setupAppArmor(ctx, c, s); err != nil { + return err } + return nil } -func setupProcessUser(ctx context.Context, c *ContainerMeta, spec *SpecWrapper) (err error) { +func createEnvironment(c *ContainerMeta) []string { + env := c.Config.Env + env = append(env, richContainerModeEnv(c)...) + + return env +} + +func setupUser(ctx context.Context, c *ContainerMeta, s *specs.Spec) (err error) { // container rootfs is created by containerd, pouch just creates a snapshot // id and keeps it in memory. 
If container is in start process, we can not // find if user if exist in container image, so we do some simple check. @@ -82,7 +87,7 @@ func setupProcessUser(ctx context.Context, c *ContainerMeta, spec *SpecWrapper) additionalGids := user.GetAdditionalGids(c.HostConfig.GroupAdd) - spec.s.Process.User = specs.User{ + s.Process.User = specs.User{ UID: uid, GID: gid, AdditionalGids: additionalGids, @@ -90,17 +95,66 @@ func setupProcessUser(ctx context.Context, c *ContainerMeta, spec *SpecWrapper) return nil } -func setupNoNewPrivileges(ctx context.Context, meta *ContainerMeta, spec *SpecWrapper) error { - if meta.HostConfig.Privileged { - return nil +func setupCapabilities(ctx context.Context, hostConfig *types.HostConfig, s *specs.Spec) error { + var caplist []string + var err error + + if s.Process.Capabilities == nil { + s.Process.Capabilities = &specs.LinuxCapabilities{} + } + capabilities := s.Process.Capabilities + + if hostConfig.Privileged { + caplist = caps.GetAllCapabilities() + } else if caplist, err = caps.TweakCapabilities(capabilities.Effective, hostConfig.CapAdd, hostConfig.CapDrop); err != nil { + return err } + capabilities.Effective = caplist + capabilities.Bounding = caplist + capabilities.Permitted = caplist + capabilities.Inheritable = caplist - spec.s.Process.NoNewPrivileges = meta.NoNewPrivileges + s.Process.Capabilities = capabilities return nil } -func setupOOMScoreAdj(ctx context.Context, c *ContainerMeta, spec *SpecWrapper) (err error) { - v := int(c.HostConfig.OomScoreAdj) - spec.s.Process.OOMScoreAdj = &v +// isAppArmorEnabled returns true if apparmor is enabled for the host. +// This function is forked from +// https://github.com/opencontainers/runc/blob/1a81e9ab1f138c091fe5c86d0883f87716088527/libcontainer/apparmor/apparmor.go +// to avoid the libapparmor dependency. 
// isAppArmorEnabled reports whether AppArmor is usable on this host.
//
// It returns true only when the "container" environment variable is empty,
// both /sys/kernel/security/apparmor and /sbin/apparmor_parser exist, and
// /sys/module/apparmor/parameters/enabled is longer than one byte and starts
// with 'Y'.
//
// The check is forked from
// https://github.com/opencontainers/runc/blob/1a81e9ab1f138c091fe5c86d0883f87716088527/libcontainer/apparmor/apparmor.go
// to avoid the libapparmor dependency.
func isAppArmorEnabled() bool {
	if os.Getenv("container") != "" {
		return false
	}
	if _, err := os.Stat("/sys/kernel/security/apparmor"); err != nil {
		return false
	}
	if _, err := os.Stat("/sbin/apparmor_parser"); err != nil {
		return false
	}

	content, err := ioutil.ReadFile("/sys/module/apparmor/parameters/enabled")
	return err == nil && len(content) > 1 && content[0] == 'Y'
}