From 2cd9c31b995cff313c828dafbfeea082c7318c3f Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Thu, 21 Jul 2016 10:02:08 +1000 Subject: [PATCH] nsenter: guarantee correct user namespace ordering Depending on your SELinux setup, the order in which you join namespaces can be important. In general, user namespaces should *always* be joined and unshared first because then the other namespaces are correctly pinned and you have the right privileges within them. This is also very useful for rootless containers, as well as older kernels that had essentially broken unshare(2) and clone(2) implementations. This also includes huge refactorings in how we spawn processes for complicated reasons that I don't want to get into because it will make me spiral into a cloud of rage. The reasoning is in the giant comment in clone_parent. Have fun. In addition, because we now create multiple children with CLONE_PARENT, we cannot wait for them to SIGCHLD us in the case of a death. Thus, we have to resort to having a child kindly send us their exit code before they die. Hopefully this all works okay, but at this point there's not much more that we can do. Signed-off-by: Aleksa Sarai --- libcontainer/container_linux.go | 3 +- libcontainer/nsenter/namespace.h | 32 ++ libcontainer/nsenter/nsexec.c | 707 ++++++++++++++++++++----------- 3 files changed, 501 insertions(+), 241 deletions(-) create mode 100644 libcontainer/nsenter/namespace.h diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 34cac634783..4ba2735d010 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -1224,12 +1224,13 @@ func (c *linuxContainer) currentState() (*State, error) { func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) { paths := []string{} order := []configs.NamespaceType{ + // The user namespace *must* be done first.
+ configs.NEWUSER, configs.NEWIPC, configs.NEWUTS, configs.NEWNET, configs.NEWPID, configs.NEWNS, - configs.NEWUSER, } // Remove namespaces that we don't need to join. diff --git a/libcontainer/nsenter/namespace.h b/libcontainer/nsenter/namespace.h new file mode 100644 index 00000000000..9e9bdca05e1 --- /dev/null +++ b/libcontainer/nsenter/namespace.h @@ -0,0 +1,32 @@ +#ifndef NSENTER_NAMESPACE_H +#define NSENTER_NAMESPACE_H + +#ifndef _GNU_SOURCE +# define _GNU_SOURCE +#endif +#include + +/* All of these are taken from include/uapi/linux/sched.h */ +#ifndef CLONE_NEWNS +# define CLONE_NEWNS 0x00020000 /* New mount namespace group */ +#endif +#ifndef CLONE_NEWCGROUP +# define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */ +#endif +#ifndef CLONE_NEWUTS +# define CLONE_NEWUTS 0x04000000 /* New utsname namespace */ +#endif +#ifndef CLONE_NEWIPC +# define CLONE_NEWIPC 0x08000000 /* New ipc namespace */ +#endif +#ifndef CLONE_NEWUSER +# define CLONE_NEWUSER 0x10000000 /* New user namespace */ +#endif +#ifndef CLONE_NEWPID +# define CLONE_NEWPID 0x20000000 /* New pid namespace */ +#endif +#ifndef CLONE_NEWNET +# define CLONE_NEWNET 0x40000000 /* New network namespace */ +#endif + +#endif /* NSENTER_NAMESPACE_H */ diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index ce8fab380d1..d3a50b04ce0 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -24,27 +24,51 @@ #include #include -#define SYNC_VAL 0x42 -#define JUMP_VAL 0x43 +/* Get all of the CLONE_NEW* flags. */ +#include "namespace.h" + +/* Synchronisation values. */ +enum sync_t { + SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */ + SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */ + SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */ + SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */ + + /* XXX: This doesn't help with segfaults and other such issues. 
*/ + SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */ +}; + +/* longjmp() arguments. */ +#define JUMP_PARENT 0x00 +#define JUMP_CHILD 0xA0 +#define JUMP_INIT 0xA1 + +/* JSON buffer. */ +#define JSON_MAX 4096 /* Assume the stack grows down, so arguments should be above it. */ -struct clone_arg { +struct clone_t { /* * Reserve some space for clone() to locate arguments * and retcode in this place */ char stack[4096] __attribute__ ((aligned(16))); char stack_ptr[0]; + + /* There's two children. This is used to execute the different code. */ jmp_buf *env; + int jmpval; }; struct nlconfig_t { char *data; uint32_t cloneflags; char *uidmap; - int uidmap_len; + size_t uidmap_len; char *gidmap; - int gidmap_len; + size_t gidmap_len; + char *namespaces; + size_t namespaces_len; uint8_t is_setgroup; int consolefd; }; @@ -82,80 +106,24 @@ int setns(int fd, int nstype) } #endif +/* XXX: This is ugly. */ +static int syncfd = -1; + /* TODO(cyphar): Fix this so it correctly deals with syncT. */ -#define bail(fmt, ...) \ - do { \ - fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__); \ - exit(__COUNTER__ + 1); \ +#define bail(fmt, ...) 
\ + do { \ + int ret = __COUNTER__ + 1; \ + fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__); \ + if (syncfd >= 0) { \ + enum sync_t s = SYNC_ERR; \ + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) \ + fprintf(stderr, "nsenter: failed: write(s)"); \ + if (write(syncfd, &ret, sizeof(ret)) != sizeof(ret)) \ + fprintf(stderr, "nsenter: failed: write(ret)"); \ + } \ + exit(ret); \ } while(0) -static int child_func(void *arg) -{ - struct clone_arg *ca = (struct clone_arg *)arg; - longjmp(*ca->env, JUMP_VAL); -} - -static int clone_parent(jmp_buf *env, int flags) __attribute__ ((noinline)); -static int clone_parent(jmp_buf *env, int flags) -{ - int child; - struct clone_arg ca = { - .env = env, - }; - - child = clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD | flags, &ca); - - /* - * On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so we have - * to unshare(2) before clone(2) in order to do this. This was fixed in - * upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was - * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. - * - * As far as we're aware, the last mainline kernel which had this bug was - * Linux 3.12. However, we cannot comment on which kernels the broken patch - * was backported to. - */ - if (errno == EINVAL) { - if (unshare(flags) < 0) - bail("unable to unshare namespaces"); - child = clone(child_func, ca.stack_ptr, SIGCHLD | CLONE_PARENT, &ca); - } - - return child; -} - -/* - * Gets the init pipe fd from the environment, which is used to read the - * bootstrap data and tell the parent what the new pid is after we finish - * setting up the environment. 
- */ -static int initpipe(void) -{ - int pipenum; - char *initpipe, *endptr; - - initpipe = getenv("_LIBCONTAINER_INITPIPE"); - if (initpipe == NULL || *initpipe == '\0') - return -1; - - errno = 0; - pipenum = strtol(initpipe, &endptr, 10); - if (errno != 0 || *endptr != '\0') - bail("unable to parse _LIBCONTAINER_INITPIPE"); - - return pipenum; -} - -static uint32_t readint32(char *buf) -{ - return *(uint32_t *) buf; -} - -static uint8_t readint8(char *buf) -{ - return *(uint8_t *) buf; -} - static int write_file(char *data, size_t data_len, char *pathfmt, ...) { int fd, len, ret = 0; @@ -185,18 +153,28 @@ static int write_file(char *data, size_t data_len, char *pathfmt, ...) return ret; } -#define SETGROUPS_ALLOW "allow" -#define SETGROUPS_DENY "deny" +enum policy_t { + SETGROUPS_DEFAULT = 0, + SETGROUPS_ALLOW, + SETGROUPS_DENY, +}; /* This *must* be called before we touch gid_map. */ -static void update_setgroups(int pid, bool setgroup) +static void update_setgroups(int pid, enum policy_t setgroup) { char *policy; - if (setgroup) - policy = SETGROUPS_ALLOW; - else - policy = SETGROUPS_DENY; + switch (setgroup) { + case SETGROUPS_ALLOW: + policy = "allow"; + break; + case SETGROUPS_DENY: + policy = "deny"; + break; + case SETGROUPS_DEFAULT: + /* Nothing to do. */ + return; + } if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) { /* @@ -226,84 +204,78 @@ static void update_gidmap(int pid, char *map, int map_len) bail("failed to update /proc/%d/gid_map", pid); } -#define JSON_MAX 4096 +/* A dummy function that just jumps to the given jumpval. 
*/ +static int child_func(void *arg) __attribute__ ((noinline)); +static int child_func(void *arg) +{ + struct clone_t *ca = (struct clone_t *)arg; + longjmp(*ca->env, ca->jmpval); +} -static void start_child(int pipenum, jmp_buf *env, int syncpipe[2], struct nlconfig_t *config) +static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline)); +static int clone_parent(jmp_buf *env, int jmpval) { - int len, childpid; - char buf[JSON_MAX]; - uint8_t syncval; + struct clone_t ca = { + .env = env, + .jmpval = jmpval, + }; - /* - * We must fork to actually enter the PID namespace, and use - * CLONE_PARENT so that the child init can have the right parent - * (the bootstrap process). Also so we don't need to forward the - * child's exit code or resend its death signal. - */ - childpid = clone_parent(env, config->cloneflags); - if (childpid < 0) - bail("unable to fork"); - - /* Update setgroups, uid_map and gid_map for the process if provided. */ - if (config->is_setgroup) - update_setgroups(childpid, true); - update_uidmap(childpid, config->uidmap, config->uidmap_len); - update_gidmap(childpid, config->gidmap, config->gidmap_len); - - /* Send the sync signal to the child. */ - close(syncpipe[0]); - syncval = SYNC_VAL; - if (write(syncpipe[1], &syncval, sizeof(syncval)) != sizeof(syncval)) - bail("failed to write sync byte to child"); - - /* Send the child pid back to our parent */ - len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", childpid); - if (len < 0 || write(pipenum, buf, len) != len) { - kill(childpid, SIGKILL); - bail("unable to send a child pid"); - } + return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca); +} + +/* + * Gets the init pipe fd from the environment, which is used to read the + * bootstrap data and tell the parent what the new pid is after we finish + * setting up the environment. 
+ */ +static int initpipe(void) +{ + int pipenum; + char *initpipe, *endptr; + + initpipe = getenv("_LIBCONTAINER_INITPIPE"); + if (initpipe == NULL || *initpipe == '\0') + return -1; + + pipenum = strtol(initpipe, &endptr, 10); + if (*endptr != '\0') + bail("unable to parse _LIBCONTAINER_INITPIPE"); - exit(0); + return pipenum; } /* Returns the clone(2) flag for a namespace, given the name of a namespace. */ static int nsflag(char *name) { - if (false) - /* dummy */ ; -#ifdef CLONE_NEWCGROUP - else if (!strcmp(name, "cgroup")) + if (!strcmp(name, "cgroup")) return CLONE_NEWCGROUP; -#endif -#ifdef CLONE_NEWIPC else if (!strcmp(name, "ipc")) return CLONE_NEWIPC; -#endif -#ifdef CLONE_NEWNS else if (!strcmp(name, "mnt")) return CLONE_NEWNS; -#endif -#ifdef CLONE_NEWNET else if (!strcmp(name, "net")) return CLONE_NEWNET; -#endif -#ifdef CLONE_NEWPID else if (!strcmp(name, "pid")) return CLONE_NEWPID; -#endif -#ifdef CLONE_NEWUSER else if (!strcmp(name, "user")) return CLONE_NEWUSER; -#endif -#ifdef CLONE_NEWUTS else if (!strcmp(name, "uts")) return CLONE_NEWUTS; -#endif /* If we don't recognise a name, fallback to 0. */ return 0; } +static uint32_t readint32(char *buf) +{ + return *(uint32_t *) buf; +} + +static uint8_t readint8(char *buf) +{ + return *(uint8_t *) buf; +} + static void nl_parse(int fd, struct nlconfig_t *config) { size_t len, size; @@ -348,78 +320,17 @@ static void nl_parse(int fd, struct nlconfig_t *config) break; case CONSOLE_PATH_ATTR: /* - * The context in which this is done (before or after we - * join the other namespaces) will affect how the path - * resolution of the console works. This order is not - * decided here, but rather in container_linux.go. We just - * follow the order given by the netlink message. + * We open the console here because we currently evaluate console + * paths from the *host* namespaces. 
*/ config->consolefd = open(current, O_RDWR); if (config->consolefd < 0) bail("failed to open console %s", current); break; - case NS_PATHS_ATTR:{ - /* - * Open each namespace path and setns it in the - * order provided to us. We currently don't have - * any context for what kind of namespace we're - * joining, so just blindly do it. - */ - char *saveptr = NULL; - char *ns = strtok_r(current, ",", &saveptr); - int num = 0, i; - - struct namespace_t { - int fd; - int ns; - char *path; - } *nses = NULL; - - if (!ns || !strlen(current)) - bail("ns paths are empty"); - - /* - * We have to open the file descriptors first, since after - * we join the mnt namespace we might no longer be able to - * access the paths. - */ - do { - int fd; - char *path; - - /* Resize the namespace array. */ - nses = realloc(nses, ++num * sizeof(struct namespace_t)); - - /* Split 'ns:path'. */ - path = strstr(ns, ":"); - if (!path) - bail("failed to parse %s", ns); - *path++ = '\0'; - - fd = open(path, O_RDONLY); - if (fd < 0) - bail("failed to open %s", ns); - - nses[num - 1] = (struct namespace_t) { - .fd = fd, - .ns = nsflag(ns), - .path = path, - }; - } while ((ns = strtok_r(NULL, ",", &saveptr)) != NULL); - - for (i = 0; i < num; i++) { - struct namespace_t ns = nses[i]; - - /* Actually join the namespaces. 
*/ - if (setns(ns.fd, ns.ns) < 0) - bail("failed to setns to %s", ns.path); - - close(ns.fd); - } - - free(nses); - break; - } + case NS_PATHS_ATTR: + config->namespaces = current; + config->namespaces_len = payload_len; + break; case UIDMAP_ATTR: config->uidmap = current; config->uidmap_len = payload_len; @@ -444,6 +355,71 @@ void nl_free(struct nlconfig_t *config) free(config->data); } +void join_namespaces(char *nslist) +{ + int num = 0, i; + char *saveptr = NULL; + char *namespace = strtok_r(nslist, ",", &saveptr); + struct namespace_t { + int fd; + int ns; + char type[PATH_MAX]; + char path[PATH_MAX]; + } *namespaces = NULL; + + if (!namespace || !strlen(namespace) || !strlen(nslist)) + bail("ns paths are empty"); + + /* + * We have to open the file descriptors first, since after + * we join the mnt namespace we might no longer be able to + * access the paths. + */ + do { + int fd; + char *path; + struct namespace_t *ns; + + /* Resize the namespace array. */ + namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t)); + if (!namespaces) + bail("failed to reallocate namespace array"); + ns = &namespaces[num - 1]; + + /* Split 'ns:path'. */ + path = strstr(namespace, ":"); + if (!path) + bail("failed to parse %s", namespace); + *path++ = '\0'; + + fd = open(path, O_RDONLY); + if (fd < 0) + bail("failed to open %s", namespace); + + ns->fd = fd; + ns->ns = nsflag(namespace); + strncpy(ns->path, path, PATH_MAX); + } while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL); + + /* + * The ordering in which we join namespaces is important. We should + * always join the user namespace *first*. This is all guaranteed + * from the container_linux.go side of this, so we're just going to + * follow the order given to us. 
+ */ + + for (i = 0; i < num; i++) { + struct namespace_t ns = namespaces[i]; + + if (setns(ns.fd, ns.ns) < 0) + bail("failed to setns to %s", ns.path); + + close(ns.fd); + } + + free(namespaces); +} + void nsexec(void) { int pipenum; @@ -464,60 +440,311 @@ void nsexec(void) /* clone(2) flags are mandatory. */ if (config.cloneflags == -1) - bail("missing clone_flags"); + bail("missing cloneflags"); /* Pipe so we can tell the child when we've finished setting up. */ - if (pipe(syncpipe) < 0) + if (socketpair(AF_LOCAL, SOCK_STREAM, 0, syncpipe) < 0) bail("failed to setup sync pipe between parent and child"); - /* Set up the jump point. */ - if (setjmp(env) == JUMP_VAL) { - /* - * We're inside the child now, having jumped from the - * start_child() code after forking in the parent. - */ - uint8_t s = 0; - int consolefd = config.consolefd; + /* TODO: Currently we aren't dealing with child deaths properly. */ + + /* + * Okay, so this is quite annoying. + * + * In order to make sure that deal with older kernels (when CLONE_NEWUSER + * wasn't guaranteed to be done first if you specify multiple namespaces in + * a clone(2) invocation) as well as with certain usecases like rootless + * containers, we cannot just dump all of the cloneflags into clone(2). + * However, if we unshare(2) the user namespace *before* we clone(2), then + * all hell breaks loose. + * + * The parent no longer has permissions to do many things (unshare(2) drops + * all capabilities in your old namespace), and the container cannot be set + * up to have more than one {uid,gid} mapping. This is obviously less than + * ideal. In order to fix this, we have to first clone(2) and then unshare. + * + * Unfortunately, it's not as simple as that. We have to fork to enter the + * PID namespace (the PID namespace only applies to children). 
Since we'll + * have to double-fork, this clone_parent() call won't be able to get the + * PID of the _actual_ init process (without doing more synchronisation than + * I can deal with at the moment). So we'll just get the parent to send it + * for us, the only job of this process is to update + * /proc/pid/{setgroups,uid_map,gid_map}. + * + * And as a result of the above, we also need to setns(2) in the first child + * because if we join a PID namespace in the topmost parent then our child + * will be in that namespace (and it will not be able to give us a PID value + * that makes sense without resorting to sending things with cmsg). + * + * This also deals with an older issue caused by dumping cloneflags into + * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so + * we have to unshare(2) before clone(2) in order to do this. This was fixed + * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was + * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're + * aware, the last mainline kernel which had this bug was Linux 3.12. + * However, we cannot comment on which kernels the broken patch was + * backported to. + * + * -- Aleksa "what has my life come to?" Sarai + */ + + switch (setjmp(env)) { + /* + * Stage 0: We're in the parent. Our job is just to create a new child + * (stage 1: JUMP_CHILD) process and write its uid_map and + * gid_map. That process will go on to create a new process, then + * it will send us its PID which we will send to the bootstrap + * process. + */ + case JUMP_PARENT: { + int len; + pid_t child; + char buf[JSON_MAX]; + + /* For debugging. */ + prctl(PR_SET_NAME, (unsigned long) "runc:[0:PARENT]", 0, 0, 0); + + /* Start the process of getting a container. */ + child = clone_parent(&env, JUMP_CHILD); + if (child < 0) + bail("unable to fork: child_func"); + + /* State machine for synchronisation with the children. 
*/ + while (true) { + enum sync_t s; + + /* This doesn't need to be global, we're in the parent. */ + int syncfd = syncpipe[1]; + + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with child: next state"); + + switch (s) { + case SYNC_ERR: { + /* We have to mirror the error code of the child. */ + int ret; + + if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret)) + bail("failed to sync with child: read(error code)"); + + exit(ret); + } + break; + case SYNC_USERMAP_PLS: + /* Enable setgroups(2) if we've been asked to. */ + if (config.is_setgroup) + update_setgroups(child, SETGROUPS_ALLOW); + + /* Set up mappings. */ + update_uidmap(child, config.uidmap, config.uidmap_len); + update_gidmap(child, config.gidmap, config.gidmap_len); + + s = SYNC_USERMAP_ACK; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(child, SIGKILL); + bail("failed to sync with child: write(SYNC_USERMAP_ACK)"); + } + break; + case SYNC_USERMAP_ACK: + /* We should _never_ receive acks. */ + kill(child, SIGKILL); + bail("failed to sync with child: unexpected SYNC_USERMAP_ACK"); + break; + case SYNC_RECVPID_PLS: { + pid_t old = child; + + /* Get the init_func pid. */ + if (read(syncfd, &child, sizeof(child)) != sizeof(child)) { + kill(old, SIGKILL); + bail("failed to sync with child: read(childpid)"); + } + + /* Send ACK. */ + s = SYNC_RECVPID_ACK; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(old, SIGKILL); + kill(child, SIGKILL); + bail("failed to sync with child: write(SYNC_RECVPID_ACK)"); + } + } + + /* Leave the loop. */ + goto out; + case SYNC_RECVPID_ACK: + /* We should _never_ receive acks. */ + kill(child, SIGKILL); + bail("failed to sync with child: unexpected SYNC_RECVPID_ACK"); + break; + } + } + + out: + /* Send the init_func pid back to our parent. 
*/ + len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", child); + if (len < 0) { + kill(child, SIGKILL); + bail("unable to generate JSON for child pid"); + } + if (write(pipenum, buf, len) != len) { + kill(child, SIGKILL); + bail("unable to send child pid to bootstrapper"); + } + + exit(0); + } + + /* + * Stage 1: We're in the first child process. Our job is to join any + * provided user namespaces in the netlink payload. If we've been + * asked to CLONE_NEWUSER, we will unshare the user namespace and + * ask our parent (stage 0) to set up our user mappings for us. + * Then, we unshare the rest of the requested namespaces and + * create a new child (stage 2: JUMP_INIT). We then send the + * child's PID to our parent (stage 0). + */ + case JUMP_CHILD: { + pid_t child; + enum sync_t s; - /* Close the writing side of pipe. */ - close(syncpipe[1]); + /* We're in a child and thus need to tell the parent if we die. */ + syncfd = syncpipe[0]; - /* Sync with parent. */ - if (read(syncpipe[0], &s, sizeof(s)) != sizeof(s) || s != SYNC_VAL) - bail("failed to read sync byte from parent"); + /* For debugging. */ + prctl(PR_SET_NAME, (unsigned long) "runc:[1:CHILD]", 0, 0, 0); - if (setsid() < 0) - bail("setsid failed"); + /* + * We need to setns first. We cannot do this earlier (in stage 0) + * because of the fact that we forked to get here (the PID of + * [stage 2: JUMP_INIT]) would be meaningless). We could send it + * using cmsg(3) but that's just annoying. + */ + if (config.namespaces) + join_namespaces(config.namespaces); + + /* + * Deal with user namespaces first. They are quite special, as they + * affect our ability to unshare other namespaces and are used as + * context for privilege checks. + */ + if (config.cloneflags & CLONE_NEWUSER) { + /* Create a new user namespace. */ + if (unshare(CLONE_NEWUSER) < 0) + bail("failed to unshare user namespace"); + + /* + * We don't have the privileges to do any mapping here (see the + * clone_parent rant). 
So signal our parent to hook us up. + */ + + s = SYNC_USERMAP_PLS; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with parent: write(SYNC_USERMAP_PLS)"); + + /* ... wait for mapping ... */ + + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with parent: read(SYNC_USERMAP_ACK)"); + if (s != SYNC_USERMAP_ACK) + bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s); + + config.cloneflags &= ~CLONE_NEWUSER; + } - if (setuid(0) < 0) - bail("setuid failed"); + /* + * Now we can unshare the rest of the namespaces. We can't be sure if the + * current kernel supports clone(CLONE_PARENT | CLONE_NEWPID), so we'll + * just do it the long way anyway. + */ + if (unshare(config.cloneflags) < 0) + bail("failed to unshare namespaces"); + + /* TODO: What about non-namespace clone flags that we're dropping here? */ + child = clone_parent(&env, JUMP_INIT); + if (child < 0) + bail("unable to fork: init_func"); + + /* Send the child to our parent, which knows what it's doing. */ + s = SYNC_RECVPID_PLS; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(child, SIGKILL); + bail("failed to sync with parent: write(SYNC_RECVPID_PLS)"); + } + if (write(syncfd, &child, sizeof(child)) != sizeof(child)) { + kill(child, SIGKILL); + bail("failed to sync with parent: write(childpid)"); + } - if (setgid(0) < 0) - bail("setgid failed"); + /* ... wait for parent to get the pid ... 
*/ - if (setgroups(0, NULL) < 0) - bail("setgroups failed"); + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(child, SIGKILL); + bail("failed to sync with parent: read(SYNC_RECVPID_ACK)"); + } + if (s != SYNC_RECVPID_ACK) { + kill(child, SIGKILL); + bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s); + } - if (consolefd != -1) { - if (ioctl(consolefd, TIOCSCTTY, 0) < 0) - bail("ioctl TIOCSCTTY failed"); - if (dup3(consolefd, STDIN_FILENO, 0) != STDIN_FILENO) - bail("failed to dup stdin"); - if (dup3(consolefd, STDOUT_FILENO, 0) != STDOUT_FILENO) - bail("failed to dup stdout"); - if (dup3(consolefd, STDERR_FILENO, 0) != STDERR_FILENO) - bail("failed to dup stderr"); + /* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */ + exit(0); } - /* Free netlink data. */ - nl_free(&config); + /* + * Stage 2: We're the final child process, and the only process that will + * actually return to the Go runtime. Our job is to just do the + * final cleanup steps and then return to the Go runtime to allow + * init_linux.go to run. + */ + case JUMP_INIT: { + /* + * We're inside the child now, having jumped from the + * start_child() code after forking in the parent. + */ + int consolefd = config.consolefd; - /* Finish executing, let the Go runtime take over. */ - return; - } + /* We're in a child and thus need to tell the parent if we die. */ + syncfd = syncpipe[0]; - /* Run the parent code. */ - start_child(pipenum, &env, syncpipe, &config); + /* For debugging. 
*/ + prctl(PR_SET_NAME, (unsigned long) "runc:[1:INIT]", 0, 0, 0); + + if (setsid() < 0) + bail("setsid failed"); + + if (setuid(0) < 0) + bail("setuid failed"); + + if (setgid(0) < 0) + bail("setgid failed"); + + if (setgroups(0, NULL) < 0) + bail("setgroups failed"); + + if (consolefd != -1) { + if (ioctl(consolefd, TIOCSCTTY, 0) < 0) + bail("ioctl TIOCSCTTY failed"); + if (dup3(consolefd, STDIN_FILENO, 0) != STDIN_FILENO) + bail("failed to dup stdin"); + if (dup3(consolefd, STDOUT_FILENO, 0) != STDOUT_FILENO) + bail("failed to dup stdout"); + if (dup3(consolefd, STDERR_FILENO, 0) != STDERR_FILENO) + bail("failed to dup stderr"); + } + + /* Close sync pipes. */ + close(syncpipe[0]); + close(syncpipe[1]); + + /* Free netlink data. */ + nl_free(&config); + + /* Finish executing, let the Go runtime take over. */ + return; + } + default: + bail("unexpected jump value"); + break; + } /* Should never be reached. */ bail("should never be reached");