From d3a27ca34f603e90951aab50481c40310cba8f1d Mon Sep 17 00:00:00 2001 From: Malhar Vora Date: Wed, 17 Jan 2018 06:57:03 -0800 Subject: [PATCH] agent: Add missing nsenter import We need to import 'nsenter' package to make 'init' function to be able to spawn a new container. Fixes: #95 Signed-off-by: Malhar Vora --- Gopkg.lock | 3 +- agent.go | 1 + .../runc/libcontainer/nsenter/README.md | 44 + .../runc/libcontainer/nsenter/namespace.h | 32 + .../runc/libcontainer/nsenter/nsenter.go | 12 + .../libcontainer/nsenter/nsenter_gccgo.go | 25 + .../runc/libcontainer/nsenter/nsenter_test.go | 175 ++++ .../nsenter/nsenter_unsupported.go | 5 + .../runc/libcontainer/nsenter/nsexec.c | 964 ++++++++++++++++++ 9 files changed, 1260 insertions(+), 1 deletion(-) create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/nsenter/README.md create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/nsenter/namespace.h create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_gccgo.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_test.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_unsupported.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c diff --git a/Gopkg.lock b/Gopkg.lock index 93f4406d9e..fcc10c98f8 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -99,6 +99,7 @@ "libcontainer/intelrdt", "libcontainer/keys", "libcontainer/mount", + "libcontainer/nsenter", "libcontainer/seccomp", "libcontainer/specconv", "libcontainer/stacktrace", @@ -254,6 +255,6 @@ [solve-meta] analyzer-name = "dep" analyzer-version = 1 - inputs-digest = "c256d8a4f49dca7b592e1f6101059e9563cc6173cde2c9b622c31e71f14564b1" + inputs-digest = "c38d31b9fcac7acea4857468060fdf0ea2d68c14a70f59d24752bba917f271d8" solver-name = "gps-cdcl" solver-version = 1 diff --git a/agent.go b/agent.go index 278075bd63..ad559037b0 100644 --- a/agent.go +++ b/agent.go @@ -21,6 +21,7 @@ import ( pb "github.com/kata-containers/agent/protocols/grpc" "github.com/opencontainers/runc/libcontainer" "github.com/opencontainers/runc/libcontainer/configs" + _ "github.com/opencontainers/runc/libcontainer/nsenter" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" "google.golang.org/grpc" diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/README.md b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/README.md new file mode 100644 index 0000000000..5757013712 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/README.md @@ -0,0 +1,44 @@ +## nsenter + +The `nsenter` package registers a special init constructor that is called before +the Go runtime has a chance to boot. This provides us the ability to `setns` on +existing namespaces and avoid the issues that the Go runtime has with multiple +threads. This constructor will be called if this package is registered, +imported, in your go application. + +The `nsenter` package will `import "C"` and it uses [cgo](https://golang.org/cmd/cgo/) +package. In cgo, if the import of "C" is immediately preceded by a comment, that comment, +called the preamble, is used as a header when compiling the C parts of the package. +So every time we import package `nsenter`, the C code function `nsexec()` would be +called. And package `nsenter` is now only imported in `main_unix.go`, so every time +before we call `cmd.Start` on linux, that C code would run. + +Because `nsexec()` must be run before the Go runtime in order to use the +Linux kernel namespace, you must `import` this library into a package if +you plan to use `libcontainer` directly. Otherwise Go will not execute +the `nsexec()` constructor, which means that the re-exec will not cause +the namespaces to be joined. You can import it like this: + +```go +import _ "github.com/opencontainers/runc/libcontainer/nsenter" +``` + +`nsexec()` will first get the file descriptor number for the init pipe +from the environment variable `_LIBCONTAINER_INITPIPE` (which was opened +by the parent and kept open across the fork-exec of the `nsexec()` init +process). The init pipe is used to read bootstrap data (namespace paths, +clone flags, uid and gid mappings, and the console path) from the parent +process. `nsexec()` will then call `setns(2)` to join the namespaces +provided in the bootstrap data (if available), `clone(2)` a child process +with the provided clone flags, update the user and group ID mappings, do +some further miscellaneous setup steps, and then send the PID of the +child process to the parent of the `nsexec()` "caller". Finally, +the parent `nsexec()` will exit and the child `nsexec()` process will +return to allow the Go runtime take over. + +NOTE: We do both `setns(2)` and `clone(2)` even if we don't have any +CLONE_NEW* clone flags because we must fork a new process in order to +enter the PID namespace. + + + diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/namespace.h b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/namespace.h new file mode 100644 index 0000000000..9e9bdca05e --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/namespace.h @@ -0,0 +1,32 @@ +#ifndef NSENTER_NAMESPACE_H +#define NSENTER_NAMESPACE_H + +#ifndef _GNU_SOURCE +# define _GNU_SOURCE +#endif +#include + +/* All of these are taken from include/uapi/linux/sched.h */ +#ifndef CLONE_NEWNS +# define CLONE_NEWNS 0x00020000 /* New mount namespace group */ +#endif +#ifndef CLONE_NEWCGROUP +# define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */ +#endif +#ifndef CLONE_NEWUTS +# define CLONE_NEWUTS 0x04000000 /* New utsname namespace */ +#endif +#ifndef CLONE_NEWIPC +# define CLONE_NEWIPC 0x08000000 /* New ipc namespace */ +#endif +#ifndef CLONE_NEWUSER +# define CLONE_NEWUSER 0x10000000 /* New user namespace */ +#endif +#ifndef CLONE_NEWPID +# define CLONE_NEWPID 0x20000000 /* New pid namespace */ +#endif +#ifndef CLONE_NEWNET +# define CLONE_NEWNET 0x40000000 /* New network namespace */ +#endif + +#endif /* NSENTER_NAMESPACE_H */ diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter.go b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter.go new file mode 100644 index 0000000000..07f4d63e43 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter.go @@ -0,0 +1,12 @@ +// +build linux,!gccgo + +package nsenter + +/* +#cgo CFLAGS: -Wall +extern void nsexec(); +void __attribute__((constructor)) init(void) { + nsexec(); +} +*/ +import "C" diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_gccgo.go b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_gccgo.go new file mode 100644 index 0000000000..63c7a3ec22 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_gccgo.go @@ -0,0 +1,25 @@ +// +build linux,gccgo + +package nsenter + +/* +#cgo CFLAGS: -Wall +extern void nsexec(); +void __attribute__((constructor)) init(void) { + nsexec(); +} +*/ +import "C" + +// AlwaysFalse is here to stay false +// (and be exported so the compiler doesn't optimize out its reference) +var AlwaysFalse bool + +func init() { + if AlwaysFalse { + // by referencing this C init() in a noop test, it will ensure the compiler + // links in the C function. + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65134 + C.init() + } +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_test.go b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_test.go new file mode 100644 index 0000000000..532155535d --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_test.go @@ -0,0 +1,175 @@ +package nsenter + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "io/ioutil" + "os" + "os/exec" + "strings" + "testing" + + "github.com/opencontainers/runc/libcontainer" + "github.com/vishvananda/netlink/nl" + + "golang.org/x/sys/unix" +) + +type pid struct { + Pid int `json:"Pid"` +} + +func TestNsenterValidPaths(t *testing.T) { + args := []string{"nsenter-exec"} + parent, child, err := newPipe() + if err != nil { + t.Fatalf("failed to create pipe %v", err) + } + + namespaces := []string{ + // join pid ns of the current process + fmt.Sprintf("pid:/proc/%d/ns/pid", os.Getpid()), + } + cmd := &exec.Cmd{ + Path: os.Args[0], + Args: args, + ExtraFiles: []*os.File{child}, + Env: []string{"_LIBCONTAINER_INITPIPE=3"}, + Stdout: os.Stdout, + Stderr: os.Stderr, + } + + if err := cmd.Start(); err != nil { + t.Fatalf("nsenter failed to start %v", err) + } + // write cloneFlags + r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0) + r.AddData(&libcontainer.Int32msg{ + Type: libcontainer.CloneFlagsAttr, + Value: uint32(unix.CLONE_NEWNET), + }) + r.AddData(&libcontainer.Bytemsg{ + Type: libcontainer.NsPathsAttr, + Value: []byte(strings.Join(namespaces, ",")), + }) + if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil { + t.Fatal(err) + } + + decoder := json.NewDecoder(parent) + var pid *pid + + if err := cmd.Wait(); err != nil { + t.Fatalf("nsenter exits with a non-zero exit status") + } + if err := decoder.Decode(&pid); err != nil { + dir, _ := ioutil.ReadDir(fmt.Sprintf("/proc/%d/ns", os.Getpid())) + for _, d := range dir { + t.Log(d.Name()) + } + t.Fatalf("%v", err) + } + + p, err := os.FindProcess(pid.Pid) + if err != nil { + t.Fatalf("%v", err) + } + p.Wait() +} + +func TestNsenterInvalidPaths(t *testing.T) { + args := []string{"nsenter-exec"} + parent, child, err := newPipe() + if err != nil { + t.Fatalf("failed to create pipe %v", err) + } + + namespaces := []string{ + // join pid ns of the current process + fmt.Sprintf("pid:/proc/%d/ns/pid", -1), + } + cmd := &exec.Cmd{ + Path: os.Args[0], + Args: args, + ExtraFiles: []*os.File{child}, + Env: []string{"_LIBCONTAINER_INITPIPE=3"}, + } + + if err := cmd.Start(); err != nil { + t.Fatal(err) + } + // write cloneFlags + r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0) + r.AddData(&libcontainer.Int32msg{ + Type: libcontainer.CloneFlagsAttr, + Value: uint32(unix.CLONE_NEWNET), + }) + r.AddData(&libcontainer.Bytemsg{ + Type: libcontainer.NsPathsAttr, + Value: []byte(strings.Join(namespaces, ",")), + }) + if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil { + t.Fatal(err) + } + + if err := cmd.Wait(); err == nil { + t.Fatalf("nsenter exits with a zero exit status") + } +} + +func TestNsenterIncorrectPathType(t *testing.T) { + args := []string{"nsenter-exec"} + parent, child, err := newPipe() + if err != nil { + t.Fatalf("failed to create pipe %v", err) + } + + namespaces := []string{ + // join pid ns of the current process + fmt.Sprintf("net:/proc/%d/ns/pid", os.Getpid()), + } + cmd := &exec.Cmd{ + Path: os.Args[0], + Args: args, + ExtraFiles: []*os.File{child}, + Env: []string{"_LIBCONTAINER_INITPIPE=3"}, + } + + if err := cmd.Start(); err != nil { + t.Fatal(err) + } + // write cloneFlags + r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0) + r.AddData(&libcontainer.Int32msg{ + Type: libcontainer.CloneFlagsAttr, + Value: uint32(unix.CLONE_NEWNET), + }) + r.AddData(&libcontainer.Bytemsg{ + Type: libcontainer.NsPathsAttr, + Value: []byte(strings.Join(namespaces, ",")), + }) + if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil { + t.Fatal(err) + } + + if err := cmd.Wait(); err == nil { + t.Fatalf("nsenter exits with a zero exit status") + } +} + +func init() { + if strings.HasPrefix(os.Args[0], "nsenter-") { + os.Exit(0) + } + return +} + +func newPipe() (parent *os.File, child *os.File, err error) { + fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) + if err != nil { + return nil, nil, err + } + return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_unsupported.go new file mode 100644 index 0000000000..ac701ca393 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_unsupported.go @@ -0,0 +1,5 @@ +// +build !linux !cgo + +package nsenter + +import "C" diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c new file mode 100644 index 0000000000..a6a107e6e6 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c @@ -0,0 +1,964 @@ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + + +#include +#include +#include + +/* Get all of the CLONE_NEW* flags. */ +#include "namespace.h" + +/* Synchronisation values. */ +enum sync_t { + SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */ + SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */ + SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */ + SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */ + SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */ + SYNC_CHILD_READY = 0x45, /* The child or grandchild is ready to return. */ + + /* XXX: This doesn't help with segfaults and other such issues. */ + SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */ +}; + +/* longjmp() arguments. */ +#define JUMP_PARENT 0x00 +#define JUMP_CHILD 0xA0 +#define JUMP_INIT 0xA1 + +/* JSON buffer. */ +#define JSON_MAX 4096 + +/* Assume the stack grows down, so arguments should be above it. */ +struct clone_t { + /* + * Reserve some space for clone() to locate arguments + * and retcode in this place + */ + char stack[4096] __attribute__ ((aligned(16))); + char stack_ptr[0]; + + /* There's two children. This is used to execute the different code. */ + jmp_buf *env; + int jmpval; +}; + +struct nlconfig_t { + char *data; + + /* Process settings. */ + uint32_t cloneflags; + char *oom_score_adj; + size_t oom_score_adj_len; + + /* User namespace settings.*/ + char *uidmap; + size_t uidmap_len; + char *gidmap; + size_t gidmap_len; + char *namespaces; + size_t namespaces_len; + uint8_t is_setgroup; + + /* Rootless container settings.*/ + uint8_t is_rootless; + char *uidmappath; + size_t uidmappath_len; + char *gidmappath; + size_t gidmappath_len; +}; + +/* + * List of netlink message types sent to us as part of bootstrapping the init. + * These constants are defined in libcontainer/message_linux.go. + */ +#define INIT_MSG 62000 +#define CLONE_FLAGS_ATTR 27281 +#define NS_PATHS_ATTR 27282 +#define UIDMAP_ATTR 27283 +#define GIDMAP_ATTR 27284 +#define SETGROUP_ATTR 27285 +#define OOM_SCORE_ADJ_ATTR 27286 +#define ROOTLESS_ATTR 27287 +#define UIDMAPPATH_ATTR 27288 +#define GIDMAPPATH_ATTR 27289 + +/* + * Use the raw syscall for versions of glibc which don't include a function for + * it, namely (glibc 2.12). + */ +#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14 +# define _GNU_SOURCE +# include "syscall.h" +# if !defined(SYS_setns) && defined(__NR_setns) +# define SYS_setns __NR_setns +# endif + +#ifndef SYS_setns +# error "setns(2) syscall not supported by glibc version" +#endif + +int setns(int fd, int nstype) +{ + return syscall(SYS_setns, fd, nstype); +} +#endif + +/* XXX: This is ugly. */ +static int syncfd = -1; + +/* TODO(cyphar): Fix this so it correctly deals with syncT. */ +#define bail(fmt, ...) \ + do { \ + int ret = __COUNTER__ + 1; \ + fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__); \ + if (syncfd >= 0) { \ + enum sync_t s = SYNC_ERR; \ + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) \ + fprintf(stderr, "nsenter: failed: write(s)"); \ + if (write(syncfd, &ret, sizeof(ret)) != sizeof(ret)) \ + fprintf(stderr, "nsenter: failed: write(ret)"); \ + } \ + exit(ret); \ + } while(0) + +static int write_file(char *data, size_t data_len, char *pathfmt, ...) +{ + int fd, len, ret = 0; + char path[PATH_MAX]; + + va_list ap; + va_start(ap, pathfmt); + len = vsnprintf(path, PATH_MAX, pathfmt, ap); + va_end(ap); + if (len < 0) + return -1; + + fd = open(path, O_RDWR); + if (fd < 0) { + return -1; + } + + len = write(fd, data, data_len); + if (len != data_len) { + ret = -1; + goto out; + } + +out: + close(fd); + return ret; +} + +enum policy_t { + SETGROUPS_DEFAULT = 0, + SETGROUPS_ALLOW, + SETGROUPS_DENY, +}; + +/* This *must* be called before we touch gid_map. */ +static void update_setgroups(int pid, enum policy_t setgroup) +{ + char *policy; + + switch (setgroup) { + case SETGROUPS_ALLOW: + policy = "allow"; + break; + case SETGROUPS_DENY: + policy = "deny"; + break; + case SETGROUPS_DEFAULT: + default: + /* Nothing to do. */ + return; + } + + if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) { + /* + * If the kernel is too old to support /proc/pid/setgroups, + * open(2) or write(2) will return ENOENT. This is fine. + */ + if (errno != ENOENT) + bail("failed to write '%s' to /proc/%d/setgroups", policy, pid); + } +} + +static int try_mapping_tool(const char *app, int pid, char *map, size_t map_len) +{ + int child; + + /* + * If @app is NULL, execve will segfault. Just check it here and bail (if + * we're in this path, the caller is already getting desparate and there + * isn't a backup to this failing). This usually would be a configuration + * or programming issue. + */ + if (!app) + bail("mapping tool not present"); + + child = fork(); + if (child < 0) + bail("failed to fork"); + + if (!child) { +#define MAX_ARGV 20 + char *argv[MAX_ARGV]; + char *envp[] = {NULL}; + char pid_fmt[16]; + int argc = 0; + char *next; + + snprintf(pid_fmt, 16, "%d", pid); + + argv[argc++] = (char *) app; + argv[argc++] = pid_fmt; + /* + * Convert the map string into a list of argument that + * newuidmap/newgidmap can understand. + */ + + while (argc < MAX_ARGV) { + if (*map == '\0') { + argv[argc++] = NULL; + break; + } + argv[argc++] = map; + next = strpbrk(map, "\n "); + if (next == NULL) + break; + *next++ = '\0'; + map = next + strspn(next, "\n "); + } + + execve(app, argv, envp); + bail("failed to execv"); + } else { + int status; + + while (true) { + if (waitpid(child, &status, 0) < 0) { + if (errno == EINTR) + continue; + bail("failed to waitpid"); + } + if (WIFEXITED(status) || WIFSIGNALED(status)) + return WEXITSTATUS(status); + } + } + + return -1; +} + +static void update_uidmap(const char *path, int pid, char *map, size_t map_len) +{ + if (map == NULL || map_len <= 0) + return; + + if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) { + if (errno != EPERM) + bail("failed to update /proc/%d/uid_map", pid); + if (try_mapping_tool(path, pid, map, map_len)) + bail("failed to use newuid map on %d", pid); + } +} + +static void update_gidmap(const char *path, int pid, char *map, size_t map_len) +{ + if (map == NULL || map_len <= 0) + return; + + if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) { + if (errno != EPERM) + bail("failed to update /proc/%d/gid_map", pid); + if (try_mapping_tool(path, pid, map, map_len)) + bail("failed to use newgid map on %d", pid); + } +} + +static void update_oom_score_adj(char *data, size_t len) +{ + if (data == NULL || len <= 0) + return; + + if (write_file(data, len, "/proc/self/oom_score_adj") < 0) + bail("failed to update /proc/self/oom_score_adj"); +} + +/* A dummy function that just jumps to the given jumpval. */ +static int child_func(void *arg) __attribute__ ((noinline)); +static int child_func(void *arg) +{ + struct clone_t *ca = (struct clone_t *)arg; + longjmp(*ca->env, ca->jmpval); +} + +static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline)); +static int clone_parent(jmp_buf *env, int jmpval) +{ + struct clone_t ca = { + .env = env, + .jmpval = jmpval, + }; + + return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca); +} + +/* + * Gets the init pipe fd from the environment, which is used to read the + * bootstrap data and tell the parent what the new pid is after we finish + * setting up the environment. + */ +static int initpipe(void) +{ + int pipenum; + char *initpipe, *endptr; + + initpipe = getenv("_LIBCONTAINER_INITPIPE"); + if (initpipe == NULL || *initpipe == '\0') + return -1; + + pipenum = strtol(initpipe, &endptr, 10); + if (*endptr != '\0') + bail("unable to parse _LIBCONTAINER_INITPIPE"); + + return pipenum; +} + +/* Returns the clone(2) flag for a namespace, given the name of a namespace. */ +static int nsflag(char *name) +{ + if (!strcmp(name, "cgroup")) + return CLONE_NEWCGROUP; + else if (!strcmp(name, "ipc")) + return CLONE_NEWIPC; + else if (!strcmp(name, "mnt")) + return CLONE_NEWNS; + else if (!strcmp(name, "net")) + return CLONE_NEWNET; + else if (!strcmp(name, "pid")) + return CLONE_NEWPID; + else if (!strcmp(name, "user")) + return CLONE_NEWUSER; + else if (!strcmp(name, "uts")) + return CLONE_NEWUTS; + + /* If we don't recognise a name, fallback to 0. */ + return 0; +} + +static uint32_t readint32(char *buf) +{ + return *(uint32_t *) buf; +} + +static uint8_t readint8(char *buf) +{ + return *(uint8_t *) buf; +} + +static void nl_parse(int fd, struct nlconfig_t *config) +{ + size_t len, size; + struct nlmsghdr hdr; + char *data, *current; + + /* Retrieve the netlink header. */ + len = read(fd, &hdr, NLMSG_HDRLEN); + if (len != NLMSG_HDRLEN) + bail("invalid netlink header length %zu", len); + + if (hdr.nlmsg_type == NLMSG_ERROR) + bail("failed to read netlink message"); + + if (hdr.nlmsg_type != INIT_MSG) + bail("unexpected msg type %d", hdr.nlmsg_type); + + /* Retrieve data. */ + size = NLMSG_PAYLOAD(&hdr, 0); + current = data = malloc(size); + if (!data) + bail("failed to allocate %zu bytes of memory for nl_payload", size); + + len = read(fd, data, size); + if (len != size) + bail("failed to read netlink payload, %zu != %zu", len, size); + + /* Parse the netlink payload. */ + config->data = data; + while (current < data + size) { + struct nlattr *nlattr = (struct nlattr *)current; + size_t payload_len = nlattr->nla_len - NLA_HDRLEN; + + /* Advance to payload. */ + current += NLA_HDRLEN; + + /* Handle payload. */ + switch (nlattr->nla_type) { + case CLONE_FLAGS_ATTR: + config->cloneflags = readint32(current); + break; + case ROOTLESS_ATTR: + config->is_rootless = readint8(current); + break; + case OOM_SCORE_ADJ_ATTR: + config->oom_score_adj = current; + config->oom_score_adj_len = payload_len; + break; + case NS_PATHS_ATTR: + config->namespaces = current; + config->namespaces_len = payload_len; + break; + case UIDMAP_ATTR: + config->uidmap = current; + config->uidmap_len = payload_len; + break; + case GIDMAP_ATTR: + config->gidmap = current; + config->gidmap_len = payload_len; + break; + case UIDMAPPATH_ATTR: + config->uidmappath = current; + config->uidmappath_len = payload_len; + break; + case GIDMAPPATH_ATTR: + config->gidmappath = current; + config->gidmappath_len = payload_len; + break; + case SETGROUP_ATTR: + config->is_setgroup = readint8(current); + break; + default: + bail("unknown netlink message type %d", nlattr->nla_type); + } + + current += NLA_ALIGN(payload_len); + } +} + +void nl_free(struct nlconfig_t *config) +{ + free(config->data); +} + +void join_namespaces(char *nslist) +{ + int num = 0, i; + char *saveptr = NULL; + char *namespace = strtok_r(nslist, ",", &saveptr); + struct namespace_t { + int fd; + int ns; + char type[PATH_MAX]; + char path[PATH_MAX]; + } *namespaces = NULL; + + if (!namespace || !strlen(namespace) || !strlen(nslist)) + bail("ns paths are empty"); + + /* + * We have to open the file descriptors first, since after + * we join the mnt namespace we might no longer be able to + * access the paths. + */ + do { + int fd; + char *path; + struct namespace_t *ns; + + /* Resize the namespace array. */ + namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t)); + if (!namespaces) + bail("failed to reallocate namespace array"); + ns = &namespaces[num - 1]; + + /* Split 'ns:path'. */ + path = strstr(namespace, ":"); + if (!path) + bail("failed to parse %s", namespace); + *path++ = '\0'; + + fd = open(path, O_RDONLY); + if (fd < 0) + bail("failed to open %s", path); + + ns->fd = fd; + ns->ns = nsflag(namespace); + strncpy(ns->path, path, PATH_MAX); + } while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL); + + /* + * The ordering in which we join namespaces is important. We should + * always join the user namespace *first*. This is all guaranteed + * from the container_linux.go side of this, so we're just going to + * follow the order given to us. + */ + + for (i = 0; i < num; i++) { + struct namespace_t ns = namespaces[i]; + + if (setns(ns.fd, ns.ns) < 0) + bail("failed to setns to %s", ns.path); + + close(ns.fd); + } + + free(namespaces); +} + +void nsexec(void) +{ + int pipenum; + jmp_buf env; + int sync_child_pipe[2], sync_grandchild_pipe[2]; + struct nlconfig_t config = {0}; + + /* + * If we don't have an init pipe, just return to the go routine. + * We'll only get an init pipe for start or exec. + */ + pipenum = initpipe(); + if (pipenum == -1) + return; + + /* Parse all of the netlink configuration. */ + nl_parse(pipenum, &config); + + /* Set oom_score_adj. This has to be done before !dumpable because + * /proc/self/oom_score_adj is not writeable unless you're an privileged + * user (if !dumpable is set). All children inherit their parent's + * oom_score_adj value on fork(2) so this will always be propagated + * properly. + */ + update_oom_score_adj(config.oom_score_adj, config.oom_score_adj_len); + + /* + * Make the process non-dumpable, to avoid various race conditions that + * could cause processes in namespaces we're joining to access host + * resources (or potentially execute code). + * + * However, if the number of namespaces we are joining is 0, we are not + * going to be switching to a different security context. Thus setting + * ourselves to be non-dumpable only breaks things (like rootless + * containers), which is the recommendation from the kernel folks. + */ + if (config.namespaces) { + if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) + bail("failed to set process as non-dumpable"); + } + + /* Pipe so we can tell the child when we've finished setting up. */ + if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_child_pipe) < 0) + bail("failed to setup sync pipe between parent and child"); + + /* + * We need a new socketpair to sync with grandchild so we don't have + * race condition with child. + */ + if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_grandchild_pipe) < 0) + bail("failed to setup sync pipe between parent and grandchild"); + + /* TODO: Currently we aren't dealing with child deaths properly. */ + + /* + * Okay, so this is quite annoying. + * + * In order for this unsharing code to be more extensible we need to split + * up unshare(CLONE_NEWUSER) and clone() in various ways. The ideal case + * would be if we did clone(CLONE_NEWUSER) and the other namespaces + * separately, but because of SELinux issues we cannot really do that. But + * we cannot just dump the namespace flags into clone(...) because several + * usecases (such as rootless containers) require more granularity around + * the namespace setup. In addition, some older kernels had issues where + * CLONE_NEWUSER wasn't handled before other namespaces (but we cannot + * handle this while also dealing with SELinux so we choose SELinux support + * over broken kernel support). + * + * However, if we unshare(2) the user namespace *before* we clone(2), then + * all hell breaks loose. + * + * The parent no longer has permissions to do many things (unshare(2) drops + * all capabilities in your old namespace), and the container cannot be set + * up to have more than one {uid,gid} mapping. This is obviously less than + * ideal. In order to fix this, we have to first clone(2) and then unshare. + * + * Unfortunately, it's not as simple as that. We have to fork to enter the + * PID namespace (the PID namespace only applies to children). Since we'll + * have to double-fork, this clone_parent() call won't be able to get the + * PID of the _actual_ init process (without doing more synchronisation than + * I can deal with at the moment). So we'll just get the parent to send it + * for us, the only job of this process is to update + * /proc/pid/{setgroups,uid_map,gid_map}. + * + * And as a result of the above, we also need to setns(2) in the first child + * because if we join a PID namespace in the topmost parent then our child + * will be in that namespace (and it will not be able to give us a PID value + * that makes sense without resorting to sending things with cmsg). + * + * This also deals with an older issue caused by dumping cloneflags into + * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so + * we have to unshare(2) before clone(2) in order to do this. This was fixed + * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was + * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're + * aware, the last mainline kernel which had this bug was Linux 3.12. + * However, we cannot comment on which kernels the broken patch was + * backported to. + * + * -- Aleksa "what has my life come to?" Sarai + */ + + switch (setjmp(env)) { + /* + * Stage 0: We're in the parent. Our job is just to create a new child + * (stage 1: JUMP_CHILD) process and write its uid_map and + * gid_map. That process will go on to create a new process, then + * it will send us its PID which we will send to the bootstrap + * process. + */ + case JUMP_PARENT: { + int len; + pid_t child, first_child = -1; + char buf[JSON_MAX]; + bool ready = false; + + /* For debugging. */ + prctl(PR_SET_NAME, (unsigned long) "runc:[0:PARENT]", 0, 0, 0); + + /* Start the process of getting a container. */ + child = clone_parent(&env, JUMP_CHILD); + if (child < 0) + bail("unable to fork: child_func"); + + /* + * State machine for synchronisation with the children. + * + * Father only return when both child and grandchild are + * ready, so we can receive all possible error codes + * generated by children. + */ + while (!ready) { + enum sync_t s; + int ret; + + syncfd = sync_child_pipe[1]; + close(sync_child_pipe[0]); + + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with child: next state"); + + switch (s) { + case SYNC_ERR: + /* We have to mirror the error code of the child. */ + if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret)) + bail("failed to sync with child: read(error code)"); + + exit(ret); + case SYNC_USERMAP_PLS: + /* + * Enable setgroups(2) if we've been asked to. But we also + * have to explicitly disable setgroups(2) if we're + * creating a rootless container (this is required since + * Linux 3.19). + */ + if (config.is_rootless && config.is_setgroup) { + kill(child, SIGKILL); + bail("cannot allow setgroup in an unprivileged user namespace setup"); + } + + if (config.is_setgroup) + update_setgroups(child, SETGROUPS_ALLOW); + if (config.is_rootless) + update_setgroups(child, SETGROUPS_DENY); + + /* Set up mappings. */ + update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len); + update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len); + + s = SYNC_USERMAP_ACK; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(child, SIGKILL); + bail("failed to sync with child: write(SYNC_USERMAP_ACK)"); + } + break; + case SYNC_RECVPID_PLS: { + first_child = child; + + /* Get the init_func pid. */ + if (read(syncfd, &child, sizeof(child)) != sizeof(child)) { + kill(first_child, SIGKILL); + bail("failed to sync with child: read(childpid)"); + } + + /* Send ACK. */ + s = SYNC_RECVPID_ACK; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(first_child, SIGKILL); + kill(child, SIGKILL); + bail("failed to sync with child: write(SYNC_RECVPID_ACK)"); + } + } + break; + case SYNC_CHILD_READY: + ready = true; + break; + default: + bail("unexpected sync value: %u", s); + } + } + + /* Now sync with grandchild. */ + + ready = false; + while (!ready) { + enum sync_t s; + int ret; + + syncfd = sync_grandchild_pipe[1]; + close(sync_grandchild_pipe[0]); + + s = SYNC_GRANDCHILD; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(child, SIGKILL); + bail("failed to sync with child: write(SYNC_GRANDCHILD)"); + } + + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with child: next state"); + + switch (s) { + case SYNC_ERR: + /* We have to mirror the error code of the child. */ + if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret)) + bail("failed to sync with child: read(error code)"); + + exit(ret); + case SYNC_CHILD_READY: + ready = true; + break; + default: + bail("unexpected sync value: %u", s); + } + } + + /* + * Send the init_func pid and the pid of the first child back to our parent. + * + * We need to send both back because we can't reap the first child we created (CLONE_PARENT). + * It becomes the responsibility of our parent to reap the first child. + */ + len = snprintf(buf, JSON_MAX, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child); + if (len < 0) { + kill(child, SIGKILL); + bail("unable to generate JSON for child pid"); + } + if (write(pipenum, buf, len) != len) { + kill(child, SIGKILL); + bail("unable to send child pid to bootstrapper"); + } + + exit(0); + } + + /* + * Stage 1: We're in the first child process. Our job is to join any + * provided namespaces in the netlink payload and unshare all + * of the requested namespaces. If we've been asked to + * CLONE_NEWUSER, we will ask our parent (stage 0) to set up + * our user mappings for us. Then, we create a new child + * (stage 2: JUMP_INIT) for PID namespace. We then send the + * child's PID to our parent (stage 0). + */ + case JUMP_CHILD: { + pid_t child; + enum sync_t s; + + /* We're in a child and thus need to tell the parent if we die. */ + syncfd = sync_child_pipe[0]; + close(sync_child_pipe[1]); + + /* For debugging. */ + prctl(PR_SET_NAME, (unsigned long) "runc:[1:CHILD]", 0, 0, 0); + + /* + * We need to setns first. We cannot do this earlier (in stage 0) + * because of the fact that we forked to get here (the PID of + * [stage 2: JUMP_INIT]) would be meaningless). We could send it + * using cmsg(3) but that's just annoying. + */ + if (config.namespaces) + join_namespaces(config.namespaces); + + /* + * Unshare all of the namespaces. Now, it should be noted that this + * ordering might break in the future (especially with rootless + * containers). But for now, it's not possible to split this into + * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues. + * + * Note that we don't merge this with clone() because there were + * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID) + * was broken, so we'll just do it the long way anyway. + */ + if (unshare(config.cloneflags) < 0) + bail("failed to unshare namespaces"); + + /* + * Deal with user namespaces first. They are quite special, as they + * affect our ability to unshare other namespaces and are used as + * context for privilege checks. + */ + if (config.cloneflags & CLONE_NEWUSER) { + /* + * We don't have the privileges to do any mapping here (see the + * clone_parent rant). So signal our parent to hook us up. + */ + + /* Switching is only necessary if we joined namespaces. */ + if (config.namespaces) { + if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0) + bail("failed to set process as dumpable"); + } + s = SYNC_USERMAP_PLS; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with parent: write(SYNC_USERMAP_PLS)"); + + /* ... wait for mapping ... */ + + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with parent: read(SYNC_USERMAP_ACK)"); + if (s != SYNC_USERMAP_ACK) + bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s); + /* Switching is only necessary if we joined namespaces. */ + if (config.namespaces) { + if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) + bail("failed to set process as dumpable"); + } + } + + /* + * TODO: What about non-namespace clone flags that we're dropping here? + * + * We fork again because of PID namespace, setns(2) or unshare(2) don't + * change the PID namespace of the calling process, because doing so + * would change the caller's idea of its own PID (as reported by getpid()), + * which would break many applications and libraries, so we must fork + * to actually enter the new PID namespace. + */ + child = clone_parent(&env, JUMP_INIT); + if (child < 0) + bail("unable to fork: init_func"); + + /* Send the child to our parent, which knows what it's doing. */ + s = SYNC_RECVPID_PLS; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(child, SIGKILL); + bail("failed to sync with parent: write(SYNC_RECVPID_PLS)"); + } + if (write(syncfd, &child, sizeof(child)) != sizeof(child)) { + kill(child, SIGKILL); + bail("failed to sync with parent: write(childpid)"); + } + + /* ... wait for parent to get the pid ... */ + + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(child, SIGKILL); + bail("failed to sync with parent: read(SYNC_RECVPID_ACK)"); + } + if (s != SYNC_RECVPID_ACK) { + kill(child, SIGKILL); + bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s); + } + + s = SYNC_CHILD_READY; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(child, SIGKILL); + bail("failed to sync with parent: write(SYNC_CHILD_READY)"); + } + + /* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */ + exit(0); + } + + /* + * Stage 2: We're the final child process, and the only process that will + * actually return to the Go runtime. Our job is to just do the + * final cleanup steps and then return to the Go runtime to allow + * init_linux.go to run. + */ + case JUMP_INIT: { + /* + * We're inside the child now, having jumped from the + * start_child() code after forking in the parent. + */ + enum sync_t s; + + /* We're in a child and thus need to tell the parent if we die. */ + syncfd = sync_grandchild_pipe[0]; + close(sync_grandchild_pipe[1]); + close(sync_child_pipe[0]); + close(sync_child_pipe[1]); + + /* For debugging. */ + prctl(PR_SET_NAME, (unsigned long) "runc:[2:INIT]", 0, 0, 0); + + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with parent: read(SYNC_GRANDCHILD)"); + if (s != SYNC_GRANDCHILD) + bail("failed to sync with parent: SYNC_GRANDCHILD: got %u", s); + + if (setsid() < 0) + bail("setsid failed"); + + if (setuid(0) < 0) + bail("setuid failed"); + + if (setgid(0) < 0) + bail("setgid failed"); + + if (!config.is_rootless && config.is_setgroup) { + if (setgroups(0, NULL) < 0) + bail("setgroups failed"); + } + + s = SYNC_CHILD_READY; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with patent: write(SYNC_CHILD_READY)"); + + /* Close sync pipes. */ + close(sync_grandchild_pipe[0]); + + /* Free netlink data. */ + nl_free(&config); + + /* Finish executing, let the Go runtime take over. */ + return; + } + default: + bail("unexpected jump value"); + } + + /* Should never be reached. */ + bail("should never be reached"); +}