diff --git a/init.go b/init.go
index 08351fdb488..59ce1e8c739 100644
--- a/init.go
+++ b/init.go
@@ -4,6 +4,7 @@ import (
 	"fmt"
 	"os"
 	"runtime"
+	"strconv"
 
 	"github.com/opencontainers/runc/libcontainer"
 	"github.com/opencontainers/runc/libcontainer/logs"
@@ -23,8 +24,13 @@ func init() {
 			panic(fmt.Sprintf("libcontainer: failed to parse log level: %q: %v", level, err))
 		}
 
+		logPipeFdStr := os.Getenv("_LIBCONTAINER_LOGPIPE")
+		logPipeFd, err := strconv.Atoi(logPipeFdStr)
+		if err != nil {
+			panic(fmt.Sprintf("libcontainer: failed to convert environment variable _LIBCONTAINER_LOGPIPE=%s to int: %s", logPipeFdStr, err))
+		}
 		err = logs.ConfigureLogging(logs.Config{
-			LogPipeFd: os.Getenv("_LIBCONTAINER_LOGPIPE"),
+			LogPipeFd: logPipeFd,
 			LogFormat: "json",
 			LogLevel:  logLevel,
 		})
diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index 65e2eace8c8..ac4beb63c5a 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -357,12 +357,26 @@ type openResult struct {
 	err  error
 }
 
-func (c *linuxContainer) start(process *Process) error {
+func (c *linuxContainer) start(process *Process) (retErr error) {
 	parent, err := c.newParentProcess(process)
 	if err != nil {
 		return newSystemErrorWithCause(err, "creating new parent process")
 	}
-	parent.forwardChildLogs()
+
+	logsDone := parent.forwardChildLogs()
+	if logsDone != nil {
+		defer func() {
+			// Wait for log forwarder to finish. This depends
+			// on init closing the _LIBCONTAINER_LOGPIPE log fd:
+			// For standard init, this is done by closeLogFD.
+			// For setns init, the fd is closed upon execve.
+			err := <-logsDone
+			if err != nil && retErr == nil {
+				retErr = newSystemErrorWithCause(err, "forwarding init logs")
+			}
+		}()
+	}
+
 	if err := parent.start(); err != nil {
 		return newSystemErrorWithCause(err, "starting container process")
 	}
diff --git a/libcontainer/container_linux_test.go b/libcontainer/container_linux_test.go
index 2a9af6ad28e..78071a92651 100644
--- a/libcontainer/container_linux_test.go
+++ b/libcontainer/container_linux_test.go
@@ -139,7 +139,8 @@ func (m *mockProcess) externalDescriptors() []string {
 func (m *mockProcess) setExternalDescriptors(newFds []string) {
 }
 
-func (m *mockProcess) forwardChildLogs() {
+func (m *mockProcess) forwardChildLogs() chan error {
+	return nil
 }
 
 func TestGetContainerPids(t *testing.T) {
diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go
index 0b4cf59eb19..972b0f2b5d9 100644
--- a/libcontainer/factory_linux.go
+++ b/libcontainer/factory_linux.go
@@ -365,6 +365,12 @@ func (l *LinuxFactory) StartInitialization() (err error) {
 		defer consoleSocket.Close()
 	}
 
+	logPipeFdStr := os.Getenv("_LIBCONTAINER_LOGPIPE")
+	logPipeFd, err := strconv.Atoi(logPipeFdStr)
+	if err != nil {
+		return fmt.Errorf("unable to convert _LIBCONTAINER_LOGPIPE=%s to int: %s", logPipeFdStr, err)
+	}
+
 	// clear the current process's environment to clean any libcontainer
 	// specific env vars.
 	os.Clearenv()
@@ -387,7 +393,7 @@ func (l *LinuxFactory) StartInitialization() (err error) {
 		}
 	}()
 
-	i, err := newContainerInit(it, pipe, consoleSocket, fifofd)
+	i, err := newContainerInit(it, pipe, consoleSocket, fifofd, logPipeFd)
 	if err != nil {
 		return err
 	}
diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
index c57af0eebb8..dcebeda11a9 100644
--- a/libcontainer/init_linux.go
+++ b/libcontainer/init_linux.go
@@ -35,8 +35,8 @@ const (
 )
 
 type pid struct {
-	Pid           int `json:"pid"`
-	PidFirstChild int `json:"pid_first"`
+	Pid           int `json:"stage2_pid"`
+	PidFirstChild int `json:"stage1_pid"`
 }
 
 // network is an internal struct used to setup container networks.
@@ -76,7 +76,7 @@ type initer interface {
 	Init() error
 }
 
-func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd int) (initer, error) {
+func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int) (initer, error) {
 	var config *initConfig
 	if err := json.NewDecoder(pipe).Decode(&config); err != nil {
 		return nil, err
@@ -90,6 +90,7 @@ func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd
 			pipe:          pipe,
 			consoleSocket: consoleSocket,
 			config:        config,
+			logFd:         logFd,
 		}, nil
 	case initStandard:
 		return &linuxStandardInit{
@@ -98,6 +99,7 @@ func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd
 			parentPid:     unix.Getppid(),
 			config:        config,
 			fifoFd:        fifoFd,
+			logFd:         logFd,
 		}, nil
 	}
 	return nil, fmt.Errorf("unknown init type %q", t)
diff --git a/libcontainer/logs/logs.go b/libcontainer/logs/logs.go
index 1077e7b0145..dd4c4a6a101 100644
--- a/libcontainer/logs/logs.go
+++ b/libcontainer/logs/logs.go
@@ -6,14 +6,14 @@ import (
 	"fmt"
 	"io"
 	"os"
-	"strconv"
 	"sync"
 
+	"github.com/pkg/errors"
 	"github.com/sirupsen/logrus"
 )
 
 var (
-	configureMutex = sync.Mutex{}
+	configureMutex sync.Mutex
 	// loggingConfigured will be set once logging has been configured via invoking `ConfigureLogging`.
 	// Subsequent invocations of `ConfigureLogging` would be no-op
 	loggingConfigured = false
@@ -23,41 +23,44 @@ type Config struct {
 	LogLevel    logrus.Level
 	LogFormat   string
 	LogFilePath string
-	LogPipeFd   string
+	LogPipeFd   int
 }
 
-func ForwardLogs(logPipe io.Reader) {
-	lineReader := bufio.NewReader(logPipe)
-	for {
-		line, err := lineReader.ReadBytes('\n')
-		if len(line) > 0 {
-			processEntry(line)
-		}
-		if err == io.EOF {
-			logrus.Debugf("log pipe has been closed: %+v", err)
-			return
+func ForwardLogs(logPipe io.ReadCloser) chan error {
+	done := make(chan error, 1)
+	s := bufio.NewScanner(logPipe)
+
+	go func() {
+		for s.Scan() {
+			processEntry(s.Bytes())
 		}
-		if err != nil {
-			logrus.Errorf("log pipe read error: %+v", err)
+		if err := logPipe.Close(); err != nil {
+			logrus.Errorf("error closing log source: %v", err)
 		}
-	}
+		done <- s.Err()
+		close(done)
+	}()
+
+	return done
 }
 
 func processEntry(text []byte) {
-	type jsonLog struct {
+	if len(text) == 0 {
+		return
+	}
+
+	var jl struct {
 		Level string `json:"level"`
 		Msg   string `json:"msg"`
 	}
-
-	var jl jsonLog
 	if err := json.Unmarshal(text, &jl); err != nil {
-		logrus.Errorf("failed to decode %q to json: %+v", text, err)
+		logrus.Errorf("failed to decode %q to json: %v", text, err)
 		return
 	}
 
 	lvl, err := logrus.ParseLevel(jl.Level)
 	if err != nil {
-		logrus.Errorf("failed to parse log level %q: %v\n", jl.Level, err)
+		logrus.Errorf("failed to parse log level %q: %v", jl.Level, err)
 		return
 	}
 	logrus.StandardLogger().Logf(lvl, jl.Msg)
@@ -68,18 +71,15 @@ func ConfigureLogging(config Config) error {
 	defer configureMutex.Unlock()
 
 	if loggingConfigured {
-		logrus.Debug("logging has already been configured")
-		return nil
+		return errors.New("logging has already been configured")
 	}
 
 	logrus.SetLevel(config.LogLevel)
 
-	if config.LogPipeFd != "" {
-		logPipeFdInt, err := strconv.Atoi(config.LogPipeFd)
-		if err != nil {
-			return fmt.Errorf("failed to convert _LIBCONTAINER_LOGPIPE environment variable value %q to int: %v", config.LogPipeFd, err)
-		}
-		logrus.SetOutput(os.NewFile(uintptr(logPipeFdInt), "logpipe"))
+	// XXX: while 0 is a valid fd (usually stdin), here we assume
+	// that we never deliberately set LogPipeFd to 0.
+	if config.LogPipeFd > 0 {
+		logrus.SetOutput(os.NewFile(uintptr(config.LogPipeFd), "logpipe"))
 	} else if config.LogFilePath != "" {
 		f, err := os.OpenFile(config.LogFilePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND|os.O_SYNC, 0644)
 		if err != nil {
diff --git a/libcontainer/logs/logs_linux_test.go b/libcontainer/logs/logs_linux_test.go
index ec9ae4fc8b2..0baf7c4cbc0 100644
--- a/libcontainer/logs/logs_linux_test.go
+++ b/libcontainer/logs/logs_linux_test.go
@@ -1,10 +1,10 @@
 package logs
 
 import (
-	"errors"
+	"bytes"
+	"io"
 	"io/ioutil"
 	"os"
-	"strings"
 	"testing"
 	"time"
 
@@ -12,141 +12,109 @@ import (
 )
 
 func TestLoggingToFile(t *testing.T) {
-	logW, logFile, _ := runLogForwarding(t)
-	defer os.Remove(logFile)
-	defer logW.Close()
+	l := runLogForwarding(t)
 
-	logToLogWriter(t, logW, `{"level": "info","msg":"kitten"}`)
-
-	logFileContent := waitForLogContent(t, logFile)
-	if !strings.Contains(logFileContent, "kitten") {
-		t.Fatalf("%s does not contain kitten", logFileContent)
-	}
+	logToLogWriter(t, l, `{"level": "info","msg":"kitten"}`)
+	finish(t, l)
+	check(t, l, "kitten")
 }
 
 func TestLogForwardingDoesNotStopOnJsonDecodeErr(t *testing.T) {
-	logW, logFile, _ := runLogForwarding(t)
-	defer os.Remove(logFile)
-	defer logW.Close()
-
-	logToLogWriter(t, logW, "invalid-json-with-kitten")
+	l := runLogForwarding(t)
 
-	logFileContent := waitForLogContent(t, logFile)
-	if !strings.Contains(logFileContent, "failed to decode") {
-		t.Fatalf("%q does not contain decoding error", logFileContent)
-	}
-
-	truncateLogFile(t, logFile)
+	logToLogWriter(t, l, "invalid-json-with-kitten")
+	checkWait(t, l, "failed to decode")
 
-	logToLogWriter(t, logW, `{"level": "info","msg":"puppy"}`)
+	truncateLogFile(t, l.file)
 
-	logFileContent = waitForLogContent(t, logFile)
-	if !strings.Contains(logFileContent, "puppy") {
-		t.Fatalf("%s does not contain puppy", logFileContent)
-	}
+	logToLogWriter(t, l, `{"level": "info","msg":"puppy"}`)
+	finish(t, l)
+	check(t, l, "puppy")
 }
 
 func TestLogForwardingDoesNotStopOnLogLevelParsingErr(t *testing.T) {
-	logW, logFile, _ := runLogForwarding(t)
-	defer os.Remove(logFile)
-	defer logW.Close()
+	l := runLogForwarding(t)
 
-	logToLogWriter(t, logW, `{"level": "alert","msg":"puppy"}`)
+	logToLogWriter(t, l, `{"level": "alert","msg":"puppy"}`)
+	checkWait(t, l, "failed to parse log level")
 
-	logFileContent := waitForLogContent(t, logFile)
-	if !strings.Contains(logFileContent, "failed to parse log level") {
-		t.Fatalf("%q does not contain log level parsing error", logFileContent)
-	}
-
-	truncateLogFile(t, logFile)
-
-	logToLogWriter(t, logW, `{"level": "info","msg":"puppy"}`)
+	truncateLogFile(t, l.file)
 
-	logFileContent = waitForLogContent(t, logFile)
-	if !strings.Contains(logFileContent, "puppy") {
-		t.Fatalf("%s does not contain puppy", logFileContent)
-	}
+	logToLogWriter(t, l, `{"level": "info","msg":"puppy"}`)
+	finish(t, l)
+	check(t, l, "puppy")
 }
 
 func TestLogForwardingStopsAfterClosingTheWriter(t *testing.T) {
-	logW, logFile, doneForwarding := runLogForwarding(t)
-	defer os.Remove(logFile)
+	l := runLogForwarding(t)
 
-	logToLogWriter(t, logW, `{"level": "info","msg":"sync"}`)
+	logToLogWriter(t, l, `{"level": "info","msg":"sync"}`)
 
-	logFileContent := waitForLogContent(t, logFile)
-	if !strings.Contains(logFileContent, "sync") {
-		t.Fatalf("%q does not contain sync message", logFileContent)
-	}
-
-	logW.Close()
+	// Do not use finish() here as we check done pipe ourselves.
+	l.w.Close()
 	select {
-	case <-doneForwarding:
+	case <-l.done:
 	case <-time.After(10 * time.Second):
 		t.Fatal("log forwarding did not stop after closing the pipe")
 	}
+
+	check(t, l, "sync")
 }
 
-func logToLogWriter(t *testing.T, logW *os.File, message string) {
-	_, err := logW.Write([]byte(message + "\n"))
+func logToLogWriter(t *testing.T, l *log, message string) {
+	t.Helper()
+	_, err := l.w.Write([]byte(message + "\n"))
 	if err != nil {
 		t.Fatalf("failed to write %q to log writer: %v", message, err)
 	}
 }
 
-func runLogForwarding(t *testing.T) (*os.File, string, chan struct{}) {
+type log struct {
+	w    io.WriteCloser
+	file string
+	done chan error
+}
+
+func runLogForwarding(t *testing.T) *log {
+	t.Helper()
 	logR, logW, err := os.Pipe()
 	if err != nil {
 		t.Fatal(err)
 	}
+	t.Cleanup(func() {
+		logR.Close()
+		logW.Close()
+	})
 
 	tempFile, err := ioutil.TempFile("", "")
 	if err != nil {
 		t.Fatal(err)
 	}
+	defer tempFile.Close()
 	logFile := tempFile.Name()
+	t.Cleanup(func() { os.Remove(logFile) })
 
 	logConfig := Config{LogLevel: logrus.InfoLevel, LogFormat: "json", LogFilePath: logFile}
-	return logW, logFile, startLogForwarding(t, logConfig, logR)
-}
-
-func startLogForwarding(t *testing.T, logConfig Config, logR *os.File) chan struct{} {
 	loggingConfigured = false
 	if err := ConfigureLogging(logConfig); err != nil {
 		t.Fatal(err)
 	}
-	doneForwarding := make(chan struct{})
-	go func() {
-		ForwardLogs(logR)
-		close(doneForwarding)
-	}()
-	return doneForwarding
-}
 
-func waitForLogContent(t *testing.T, logFile string) string {
-	startTime := time.Now()
+	done := ForwardLogs(logR)
 
-	for {
-		if time.Now().After(startTime.Add(10 * time.Second)) {
-			t.Fatal(errors.New("No content in log file after 10 seconds"))
-			break
-		}
+	return &log{w: logW, done: done, file: logFile}
+}
 
-		fileContent, err := ioutil.ReadFile(logFile)
-		if err != nil {
-			t.Fatal(err)
-		}
-		if len(fileContent) == 0 {
-			continue
-		}
-		return string(fileContent)
+func finish(t *testing.T, l *log) {
+	t.Helper()
+	l.w.Close()
+	if err := <-l.done; err != nil {
+		t.Fatalf("ForwardLogs: %v", err)
 	}
-
-	return ""
 }
 
 func truncateLogFile(t *testing.T, logFile string) {
-	file, err := os.OpenFile(logFile, os.O_RDWR, 0666)
+	file, err := os.OpenFile(logFile, os.O_RDWR, 0o600)
 	if err != nil {
 		t.Fatalf("failed to open log file: %v", err)
 		return
@@ -158,3 +126,40 @@ func truncateLogFile(t *testing.T, logFile string) {
 		t.Fatalf("failed to truncate log file: %v", err)
 	}
 }
+
+// check checks that file contains txt
+func check(t *testing.T, l *log, txt string) {
+	t.Helper()
+	contents, err := ioutil.ReadFile(l.file)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !bytes.Contains(contents, []byte(txt)) {
+		t.Fatalf("%q does not contain %q", string(contents), txt)
+	}
+}
+
+// checkWait checks that file contains txt. If the file is empty,
+// it waits until it's not.
+func checkWait(t *testing.T, l *log, txt string) {
+	t.Helper()
+	const (
+		delay = 100 * time.Millisecond
+		iter  = 3
+	)
+	for i := 0; ; i++ {
+		st, err := os.Stat(l.file)
+		if err != nil {
+			t.Fatal(err)
+		}
+		if st.Size() > 0 {
+			break
+		}
+		if i == iter {
+			t.Fatalf("waited %s for file %s to be non-empty but it still is", iter*delay, l.file)
+		}
+		time.Sleep(delay)
+	}
+
+	check(t, l, txt)
+}
diff --git a/libcontainer/nsenter/escape.c b/libcontainer/nsenter/escape.c
new file mode 100644
index 00000000000..56f99f12ba0
--- /dev/null
+++ b/libcontainer/nsenter/escape.c
@@ -0,0 +1,93 @@
+#include <stdlib.h>
+
+static char hex(char i)
+{
+	if (i >= 0 && i < 10) {
+		return '0' + i;
+	}
+	if (i >= 10 && i < 16) {
+		return 'a' + i - 10;
+	}
+	return '?';
+}
+
+/*
+ * Escape the string to be usable as JSON string.
+ */
+char *escape_json_string(char *str)
+{
+	int i, j = 0;
+	char *out;
+
+	// Avoid malloc by checking first if escaping is required.
+	// While at it, count how much additional space we need.
+	// XXX: the counting code need to be in sync with the rest!
+	for (i = 0; str[i] != '\0'; i++) {
+		switch (str[i]) {
+		case '\\':
+		case '"':
+		case '\b':
+		case '\n':
+		case '\r':
+		case '\t':
+		case '\f':
+			j += 2;
+			break;
+		default:
+			if (str[i] < ' ') {
+				// \u00xx
+				j += 6;
+			}
+		}
+	}
+	if (j == 0) {
+		// nothing to escape
+		return str;
+	}
+
+	out = malloc(i + j);
+	if (!out) {
+		exit(1);
+	}
+	for (i = j = 0; str[i] != '\0'; i++, j++) {
+		switch (str[i]) {
+		case '"':
+		case '\\':
+			out[j++] = '\\';
+			out[j] = str[i];
+			continue;
+		}
+		if (str[i] >= ' ') {
+			out[j] = str[i];
+			continue;
+		}
+		out[j++] = '\\';
+		switch (str[i]) {
+		case '\b':
+			out[j] = 'b';
+			break;
+		case '\n':
+			out[j] = 'n';
+			break;
+		case '\r':
+			out[j] = 'r';
+			break;
+		case '\t':
+			out[j] = 't';
+			break;
+		case '\f':
+			out[j] = 'f';
+			break;
+		default:
+			out[j++] = 'u';
+			out[j++] = '0';
+			out[j++] = '0';
+			out[j++] = hex(str[i] >> 4);
+			out[j] = hex(str[i] & 0x0f);
+		}
+	}
+	out[j] = '\0';
+
+	free(str);
+	return out;
+}
diff --git a/libcontainer/nsenter/escape.go b/libcontainer/nsenter/escape.go
new file mode 100644
index 00000000000..c8a9860c7cd
--- /dev/null
+++ b/libcontainer/nsenter/escape.go
@@ -0,0 +1,24 @@
+package nsenter
+
+// This file is part of escape_json_string unit test. It would be a part
+// of escape_test.go if Go would allow cgo to be used in _test.go files.
+
+// #include <stdlib.h>
+// #include "escape.h"
+import "C"
+
+import (
+	"testing"
+	"unsafe"
+)
+
+func testEscapeJsonString(t *testing.T, input, want string) {
+	in := C.CString(input)
+	out := C.escape_json_string(in)
+	got := C.GoString(out)
+	C.free(unsafe.Pointer(out))
+	t.Logf("input: %q, output: %q", input, got)
+	if got != want {
+		t.Errorf("Failed on input: %q, want %q, got %q", input, want, got)
+	}
+}
diff --git a/libcontainer/nsenter/escape.h b/libcontainer/nsenter/escape.h
new file mode 100644
index 00000000000..80d660a4338
--- /dev/null
+++ b/libcontainer/nsenter/escape.h
@@ -0,0 +1 @@
+extern char *escape_json_string(char *str);
diff --git a/libcontainer/nsenter/escape_test.go b/libcontainer/nsenter/escape_test.go
new file mode 100644
index 00000000000..bd28a29146b
--- /dev/null
+++ b/libcontainer/nsenter/escape_test.go
@@ -0,0 +1,23 @@
+package nsenter
+
+import "testing"
+
+func TestEscapeJsonString(t *testing.T) {
+	testCases := []struct {
+		input, output string
+	}{
+		{"", ""},
+		{"abcdef", "abcdef"},
+		{`\\\\\\`, `\\\\\\\\\\\\`},
+		{`with"quote`, `with\"quote`},
+		{"\n\r\b\t\f\\", `\n\r\b\t\f\\`},
+		{"\007", "\\u0007"},
+		{"\017 \020 \037", "\\u000f \\u0010 \\u001f"},
+		{"\033", "\\u001b"},
+		{"<->", "<->"},
+	}
+
+	for _, tc := range testCases {
+		testEscapeJsonString(t, tc.input, tc.output)
+	}
+}
diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
index a33f2fcc371..a67b4f6bbbd 100644
--- a/libcontainer/nsenter/nsexec.c
+++ b/libcontainer/nsenter/nsexec.c
@@ -28,6 +28,7 @@
 
 /* Get all of the CLONE_NEW* flags. */
 #include "namespace.h"
+#include "escape.h"
 
 /* Synchronisation values. */
 enum sync_t {
@@ -36,7 +37,7 @@ enum sync_t {
 	SYNC_RECVPID_PLS = 0x42,	/* Tell parent we're sending the PID. */
 	SYNC_RECVPID_ACK = 0x43,	/* PID was correctly received by parent. */
 	SYNC_GRANDCHILD = 0x44,	/* The grandchild is ready to run. */
-	SYNC_CHILD_READY = 0x45,	/* The child or grandchild is ready to return. */
+	SYNC_CHILD_FINISH = 0x45,	/* The child or grandchild has finished. */
 };
 
 /*
@@ -45,10 +46,14 @@ enum sync_t {
  */
 #define CREATECGROUPNS 0x80
 
+#define STAGE_SETUP  -1
 /* longjmp() arguments. */
-#define JUMP_PARENT 0x00
-#define JUMP_CHILD  0xA0
-#define JUMP_INIT   0xA1
+#define STAGE_PARENT  0
+#define STAGE_CHILD   1
+#define STAGE_INIT    2
+
+/* Stores the current stage of nsexec. */
+int current_stage = STAGE_SETUP;
 
 /* Assume the stack grows down, so arguments should be above it. */
 struct clone_t {
@@ -56,7 +61,7 @@ struct clone_t {
 	 * Reserve some space for clone() to locate arguments
 	 * and retcode in this place
 	 */
-	char stack[4096] __attribute__ ((aligned(16)));
+	char stack[4096] __attribute__((aligned(16)));
 	char stack_ptr[0];
 
 	/* There's two children. This is used to execute the different code. */
@@ -134,33 +139,43 @@ int setns(int fd, int nstype)
 }
 #endif
 
-static void write_log_with_info(const char *level, const char *function, int line, const char *format, ...)
+static void write_log(const char *level, const char *format, ...)
 {
-	char message[1024] = {};
-
+	char *message = NULL, *stage = NULL;
 	va_list args;
+	int ret;
 
 	if (logfd < 0 || level == NULL)
-		return;
+		goto out;
 
 	va_start(args, format);
-	if (vsnprintf(message, sizeof(message), format, args) < 0)
-		goto done;
-
-	dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s:%d %s\"}\n", level, function, line, message);
-done:
+	ret = vasprintf(&message, format, args);
 	va_end(args);
-}
+	if (ret < 0)
+		goto out;
 
-#define write_log(level, fmt, ...) \
-	write_log_with_info((level), __FUNCTION__, __LINE__, (fmt), ##__VA_ARGS__)
+	message = escape_json_string(message);
+
+	if (current_stage == STAGE_SETUP)
+		stage = strdup("nsexec");
+	else
+		ret = asprintf(&stage, "nsexec-%d", current_stage);
+	if (ret < 0)
+		goto out;
+
+	dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s[%d]: %s\"}\n", level, stage, getpid(), message);
+
+ out:
+	free(message);
+	free(stage);
+}
 
 /* XXX: This is ugly. */
 static int syncfd = -1;
 
 #define bail(fmt, ...)                                       \
 	do {                                                       \
-		write_log(FATAL, "nsenter: " fmt ": %m", ##__VA_ARGS__); \
+		write_log(FATAL, fmt ": %m", ##__VA_ARGS__); \
 		exit(1);                                                 \
 	} while(0)
 
@@ -297,9 +312,11 @@ static void update_uidmap(const char *path, int pid, char *map, size_t map_len)
 	if (map == NULL || map_len <= 0)
 		return;
 
+	write_log(DEBUG, "update /proc/%d/uid_map to '%s'", pid, map);
 	if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) {
 		if (errno != EPERM)
 			bail("failed to update /proc/%d/uid_map", pid);
+		write_log(DEBUG, "update /proc/%d/uid_map got -EPERM (trying %s)", pid, path);
 		if (try_mapping_tool(path, pid, map, map_len))
 			bail("failed to use newuid map on %d", pid);
 	}
@@ -310,9 +327,11 @@ static void update_gidmap(const char *path, int pid, char *map, size_t map_len)
 	if (map == NULL || map_len <= 0)
 		return;
 
+	write_log(DEBUG, "update /proc/%d/gid_map to '%s'", pid, map);
 	if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) {
 		if (errno != EPERM)
 			bail("failed to update /proc/%d/gid_map", pid);
+		write_log(DEBUG, "update /proc/%d/gid_map got -EPERM (trying %s)", pid, path);
 		if (try_mapping_tool(path, pid, map, map_len))
 			bail("failed to use newgid map on %d", pid);
 	}
@@ -323,19 +342,20 @@ static void update_oom_score_adj(char *data, size_t len)
 	if (data == NULL || len <= 0)
 		return;
 
+	write_log(DEBUG, "update /proc/self/oom_score_adj to '%s'", data);
 	if (write_file(data, len, "/proc/self/oom_score_adj") < 0)
 		bail("failed to update /proc/self/oom_score_adj");
 }
 
 /* A dummy function that just jumps to the given jumpval. */
-static int child_func(void *arg) __attribute__ ((noinline));
+static int child_func(void *arg) __attribute__((noinline));
 static int child_func(void *arg)
 {
 	struct clone_t *ca = (struct clone_t *)arg;
 	longjmp(*ca->env, ca->jmpval);
 }
 
-static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline));
+static int clone_parent(jmp_buf *env, int jmpval) __attribute__((noinline));
 static int clone_parent(jmp_buf *env, int jmpval)
 {
 	struct clone_t ca = {
@@ -507,7 +527,6 @@ void join_namespaces(char *nslist)
 	char *namespace = strtok_r(nslist, ",", &saveptr);
 	struct namespace_t {
 		int fd;
-		int ns;
 		char type[PATH_MAX];
 		char path[PATH_MAX];
 	} *namespaces = NULL;
@@ -542,7 +561,7 @@ void join_namespaces(char *nslist)
 			bail("failed to open %s", path);
 
 		ns->fd = fd;
-		ns->ns = nsflag(namespace);
+		strncpy(ns->type, namespace, PATH_MAX - 1);
 		strncpy(ns->path, path, PATH_MAX - 1);
 		ns->path[PATH_MAX - 1] = '\0';
 	} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
@@ -555,12 +574,14 @@ void join_namespaces(char *nslist)
 	 */
 
 	for (i = 0; i < num; i++) {
-		struct namespace_t ns = namespaces[i];
+		struct namespace_t *ns = &namespaces[i];
+		int flag = nsflag(ns->type);
 
-		if (setns(ns.fd, ns.ns) < 0)
-			bail("failed to setns to %s", ns.path);
+		write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", flag, ns->type, ns->path);
+		if (setns(ns->fd, flag) < 0)
+			bail("failed to setns into %s namespace", ns->type);
 
-		close(ns.fd);
+		close(ns->fd);
 	}
 
 	free(namespaces);
@@ -569,6 +590,14 @@ void join_namespaces(char *nslist)
 /* Defined in cloned_binary.c. */
 extern int ensure_cloned_binary(void);
 
+static inline int sane_kill(pid_t pid, int signum)
+{
+	if (pid > 0)
+		return kill(pid, signum);
+	else
+		return 0;
+}
+
 void nsexec(void)
 {
 	int pipenum;
@@ -598,7 +627,7 @@ void nsexec(void)
 	if (ensure_cloned_binary() < 0)
 		bail("could not ensure we are a cloned binary");
 
-	write_log(DEBUG, "nsexec started");
+	write_log(DEBUG, "=> nsexec container setup");
 
 	/* Parse all of the netlink configuration. */
 	nl_parse(pipenum, &config);
@@ -622,6 +651,7 @@ void nsexec(void)
 	 * containers), which is the recommendation from the kernel folks.
 	 */
 	if (config.namespaces) {
+		write_log(DEBUG, "set process as non-dumpable");
 		if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
 			bail("failed to set process as non-dumpable");
 	}
@@ -686,45 +716,49 @@ void nsexec(void)
 	 * -- Aleksa "what has my life come to?" Sarai
 	 */
 
-	switch (setjmp(env)) {
+	current_stage = setjmp(env);
+	switch (current_stage) {
 		/*
 		 * Stage 0: We're in the parent. Our job is just to create a new child
-		 *          (stage 1: JUMP_CHILD) process and write its uid_map and
+		 *          (stage 1: STAGE_CHILD) process and write its uid_map and
 		 *          gid_map. That process will go on to create a new process, then
 		 *          it will send us its PID which we will send to the bootstrap
 		 *          process.
 		 */
-	case JUMP_PARENT:{
+	case STAGE_PARENT:{
 			int len;
-			pid_t child, first_child = -1;
-			bool ready = false;
+			pid_t stage1_pid = -1, stage2_pid = -1;
+			bool stage1_complete, stage2_complete;
 
 			/* For debugging. */
 			prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0);
+			write_log(DEBUG, "~> nsexec stage-0");
 
 			/* Start the process of getting a container. */
-			child = clone_parent(&env, JUMP_CHILD);
-			if (child < 0)
-				bail("unable to fork: child_func");
+			write_log(DEBUG, "spawn stage-1");
+			stage1_pid = clone_parent(&env, STAGE_CHILD);
+			if (stage1_pid < 0)
+				bail("unable to spawn stage-1");
 
-			/*
-			 * State machine for synchronisation with the children.
-			 *
-			 * Father only return when both child and grandchild are
-			 * ready, so we can receive all possible error codes
-			 * generated by children.
-			 */
 			syncfd = sync_child_pipe[1];
 			close(sync_child_pipe[0]);
 
-			while (!ready) {
+			/*
+			 * State machine for synchronisation with the children. We only
+			 * return once both the child and grandchild are ready.
+			 */
+			write_log(DEBUG, "-> stage-1 synchronisation loop");
+			stage1_complete = false;
+			while (!stage1_complete) {
 				enum sync_t s;
 
 				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
-					bail("failed to sync with child: next state");
+					bail("failed to sync with stage-1: next state");
 
 				switch (s) {
 				case SYNC_USERMAP_PLS:
+					write_log(DEBUG, "stage-1 requested userns mappings");
+
 					/*
 					 * Enable setgroups(2) if we've been asked to. But we also
 					 * have to explicitly disable setgroups(2) if we're
@@ -735,70 +769,78 @@ void nsexec(void)
 					 * For rootless multi-entry mapping, config.is_setgroup shall be true and
 					 * newuidmap/newgidmap shall be used.
 					 */
-
 					if (config.is_rootless_euid && !config.is_setgroup)
-						update_setgroups(child, SETGROUPS_DENY);
+						update_setgroups(stage1_pid, SETGROUPS_DENY);
 
 					/* Set up mappings. */
-					update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len);
-					update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len);
+					update_uidmap(config.uidmappath, stage1_pid, config.uidmap, config.uidmap_len);
+					update_gidmap(config.gidmappath, stage1_pid, config.gidmap, config.gidmap_len);
 
 					s = SYNC_USERMAP_ACK;
 					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-						kill(child, SIGKILL);
-						bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
+						sane_kill(stage1_pid, SIGKILL);
+						sane_kill(stage2_pid, SIGKILL);
+						bail("failed to sync with stage-1: write(SYNC_USERMAP_ACK)");
 					}
 					break;
-				case SYNC_RECVPID_PLS:{
-						first_child = child;
-
-						/* Get the init_func pid. */
-						if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
-							kill(first_child, SIGKILL);
-							bail("failed to sync with child: read(childpid)");
-						}
-
-						/* Send ACK. */
-						s = SYNC_RECVPID_ACK;
-						if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-							kill(first_child, SIGKILL);
-							kill(child, SIGKILL);
-							bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
-						}
-
-						/* Send the init_func pid back to our parent.
-						 *
-						 * Send the init_func pid and the pid of the first child back to our parent.
-						 * We need to send both back because we can't reap the first child we created (CLONE_PARENT).
-						 * It becomes the responsibility of our parent to reap the first child.
-						 */
-						len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child);
-						if (len < 0) {
-							kill(child, SIGKILL);
-							bail("unable to generate JSON for child pid");
-						}
+				case SYNC_RECVPID_PLS:
+					write_log(DEBUG, "stage-1 requested pid to be forwarded");
+
+					/* Get the stage-2 pid. */
+					if (read(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {
+						sane_kill(stage1_pid, SIGKILL);
+						sane_kill(stage2_pid, SIGKILL);
+						bail("failed to sync with stage-1: read(stage2_pid)");
+					}
+
+					/* Send ACK. */
+					s = SYNC_RECVPID_ACK;
+					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
+						sane_kill(stage1_pid, SIGKILL);
+						sane_kill(stage2_pid, SIGKILL);
+						bail("failed to sync with stage-1: write(SYNC_RECVPID_ACK)");
+					}
+
+					/*
+					 * Send both the stage-1 and stage-2 pids back to runc.
+					 * runc needs the stage-2 to continue process management,
+					 * but because stage-1 was spawned with CLONE_PARENT we
+					 * cannot reap it within stage-0 and thus we need to ask
+					 * runc to reap the zombie for us.
+					 */
+					write_log(DEBUG, "forward stage-1 (%d) and stage-2 (%d) pids to runc",
+						  stage1_pid, stage2_pid);
+					len =
+					    dprintf(pipenum, "{\"stage1_pid\":%d,\"stage2_pid\":%d}\n", stage1_pid,
+						    stage2_pid);
+					if (len < 0) {
+						sane_kill(stage1_pid, SIGKILL);
+						sane_kill(stage2_pid, SIGKILL);
+						bail("failed to sync with runc: write(pid-JSON)");
 					}
 					break;
-				case SYNC_CHILD_READY:
-					ready = true;
+				case SYNC_CHILD_FINISH:
+					write_log(DEBUG, "stage-1 complete");
+					stage1_complete = true;
 					break;
 				default:
 					bail("unexpected sync value: %u", s);
 				}
 			}
+			write_log(DEBUG, "<- stage-1 synchronisation loop");
 
 			/* Now sync with grandchild. */
-
 			syncfd = sync_grandchild_pipe[1];
 			close(sync_grandchild_pipe[0]);
-
-			ready = false;
-			while (!ready) {
+			write_log(DEBUG, "-> stage-2 synchronisation loop");
+			stage2_complete = false;
+			while (!stage2_complete) {
 				enum sync_t s;
 
+				write_log(DEBUG, "signalling stage-2 to run");
 				s = SYNC_GRANDCHILD;
 				if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-					kill(child, SIGKILL);
+					sane_kill(stage2_pid, SIGKILL);
 					bail("failed to sync with child: write(SYNC_GRANDCHILD)");
 				}
 
@@ -806,27 +848,31 @@ void nsexec(void)
 					bail("failed to sync with child: next state");
 
 				switch (s) {
-				case SYNC_CHILD_READY:
-					ready = true;
+				case SYNC_CHILD_FINISH:
+					write_log(DEBUG, "stage-2 complete");
+					stage2_complete = true;
 					break;
 				default:
 					bail("unexpected sync value: %u", s);
 				}
 			}
+			write_log(DEBUG, "<- stage-2 synchronisation loop");
+			write_log(DEBUG, "<~ nsexec stage-0");
 			exit(0);
 		}
+		break;
 
 		/*
 		 * Stage 1: We're in the first child process. Our job is to join any
-		 *          provided namespaces in the netlink payload and unshare all
-		 *          of the requested namespaces. If we've been asked to
-		 *          CLONE_NEWUSER, we will ask our parent (stage 0) to set up
-		 *          our user mappings for us. Then, we create a new child
-		 *          (stage 2: JUMP_INIT) for PID namespace. We then send the
-		 *          child's PID to our parent (stage 0).
+		 *          provided namespaces in the netlink payload and unshare all of
+		 *          the requested namespaces. If we've been asked to CLONE_NEWUSER,
+		 *          we will ask our parent (stage 0) to set up our user mappings
+		 *          for us. Then, we create a new child (stage 2: STAGE_INIT) for
+		 *          PID namespace. We then send the child's PID to our parent
+		 *          (stage 0).
 		 */
-	case JUMP_CHILD:{
-			pid_t child;
+	case STAGE_CHILD:{
+			pid_t stage2_pid = -1;
 			enum sync_t s;
 
 			/* We're in a child and thus need to tell the parent if we die. */
@@ -835,11 +881,12 @@ void nsexec(void)
 
 			/* For debugging. */
 			prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0);
+			write_log(DEBUG, "~> nsexec stage-1");
 
 			/*
 			 * We need to setns first. We cannot do this earlier (in stage 0)
 			 * because of the fact that we forked to get here (the PID of
-			 * [stage 2: JUMP_INIT]) would be meaningless). We could send it
+			 * [stage 2: STAGE_INIT]) would be meaningless). We could send it
 			 * using cmsg(3) but that's just annoying.
 			 */
 			if (config.namespaces)
@@ -865,40 +912,50 @@ void nsexec(void)
 			 * problem.
 			 */
 			if (config.cloneflags & CLONE_NEWUSER) {
+				write_log(DEBUG, "unshare user namespace");
 				if (unshare(CLONE_NEWUSER) < 0)
 					bail("failed to unshare user namespace");
 				config.cloneflags &= ~CLONE_NEWUSER;
 
 				/*
-				 * We don't have the privileges to do any mapping here (see the
-				 * clone_parent rant). So signal our parent to hook us up.
+				 * We need to set ourselves as dumpable temporarily so that the
+				 * parent process can write to our procfs files.
 				 */
-
-				/* Switching is only necessary if we joined namespaces. */
 				if (config.namespaces) {
+					write_log(DEBUG, "temporarily set process as dumpable");
 					if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)
-						bail("failed to set process as dumpable");
+						bail("failed to temporarily set process as dumpable");
 				}
+
+				/*
+				 * We don't have the privileges to do any mapping here (see the
+				 * clone_parent rant). So signal stage-0 to do the mapping for
+				 * us.
+				 */
+				write_log(DEBUG, "request stage-0 to map user namespace");
 				s = SYNC_USERMAP_PLS;
 				if (write(syncfd, &s, sizeof(s)) != sizeof(s))
 					bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
 
 				/* ... wait for mapping ... */
-
+				write_log(DEBUG, "request stage-0 to map user namespace");
 				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
 					bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
 				if (s != SYNC_USERMAP_ACK)
 					bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
-				/* Switching is only necessary if we joined namespaces. */
+
+				/* Revert temporary re-dumpable setting. */
 				if (config.namespaces) {
+					write_log(DEBUG, "re-set process as non-dumpable");
 					if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
-						bail("failed to set process as dumpable");
+						bail("failed to re-set process as non-dumpable");
 				}
 
 				/* Become root in the namespace proper. */
 				if (setresuid(0, 0, 0) < 0)
 					bail("failed to become root in user namespace");
 			}
+
 			/*
 			 * Unshare all of the namespaces. Now, it should be noted that this
 			 * ordering might break in the future (especially with rootless
@@ -909,8 +966,9 @@ void nsexec(void)
 			 * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
 			 * was broken, so we'll just do it the long way anyway.
 			 */
+			write_log(DEBUG, "unshare remaining namespace (except cgroupns)");
 			if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)
-				bail("failed to unshare namespaces");
+				bail("failed to unshare remaining namespaces (except cgroupns)");
 
 			/*
 			 * TODO: What about non-namespace clone flags that we're dropping here?
@@ -921,41 +979,45 @@ void nsexec(void)
 			 * which would break many applications and libraries, so we must fork
 			 * to actually enter the new PID namespace.
 			 */
-			child = clone_parent(&env, JUMP_INIT);
-			if (child < 0)
-				bail("unable to fork: init_func");
+			write_log(DEBUG, "spawn stage-2");
+			stage2_pid = clone_parent(&env, STAGE_INIT);
+			if (stage2_pid < 0)
+				bail("unable to spawn stage-2");
 
 			/* Send the child to our parent, which knows what it's doing. */
+			write_log(DEBUG, "request stage-0 to forward stage-2 pid (%d)", stage2_pid);
 			s = SYNC_RECVPID_PLS;
 			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-				kill(child, SIGKILL);
+				sane_kill(stage2_pid, SIGKILL);
 				bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
 			}
-			if (write(syncfd, &child, sizeof(child)) != sizeof(child)) {
-				kill(child, SIGKILL);
-				bail("failed to sync with parent: write(childpid)");
+			if (write(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {
+				sane_kill(stage2_pid, SIGKILL);
+				bail("failed to sync with parent: write(stage2_pid)");
 			}
 
 			/* ... wait for parent to get the pid ... */
-
 			if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
-				kill(child, SIGKILL);
+				sane_kill(stage2_pid, SIGKILL);
 				bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
 			}
 			if (s != SYNC_RECVPID_ACK) {
-				kill(child, SIGKILL);
+				sane_kill(stage2_pid, SIGKILL);
 				bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
 			}
 
-			s = SYNC_CHILD_READY;
+			write_log(DEBUG, "signal completion to stage-0");
+			s = SYNC_CHILD_FINISH;
 			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-				kill(child, SIGKILL);
-				bail("failed to sync with parent: write(SYNC_CHILD_READY)");
+				sane_kill(stage2_pid, SIGKILL);
+				bail("failed to sync with parent: write(SYNC_CHILD_FINISH)");
 			}
 
-			/* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */
+			/* Our work is done. [Stage 2: STAGE_INIT] is doing the rest of the work. */
+			write_log(DEBUG, "<~ nsexec stage-1");
 			exit(0);
 		}
+		break;
 
 		/*
 		 * Stage 2: We're the final child process, and the only process that will
@@ -963,7 +1025,7 @@ void nsexec(void)
 		 *          final cleanup steps and then return to the Go runtime to allow
 		 *          init_linux.go to run.
 		 */
-	case JUMP_INIT:{
+	case STAGE_INIT:{
 			/*
 			 * We're inside the child now, having jumped from the
 			 * start_child() code after forking in the parent.
@@ -978,6 +1040,7 @@ void nsexec(void)
 
 			/* For debugging. */
 			prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0);
+			write_log(DEBUG, "~> nsexec stage-2");
 
 			if (read(syncfd, &s, sizeof(s)) != sizeof(s))
 				bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
@@ -998,21 +1061,30 @@ void nsexec(void)
 					bail("setgroups failed");
 			}
 
-			/* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */
+			/*
+			 * Wait until our topmost parent has finished cgroup setup in
+			 * p.manager.Apply().
+			 *
+			 * TODO(cyphar): Check if this code is actually needed because we
+			 *               should be in the cgroup even from stage-0, so
+			 *               waiting until now might not make sense.
+			 */
 			if (config.cloneflags & CLONE_NEWCGROUP) {
 				uint8_t value;
 				if (read(pipenum, &value, sizeof(value)) != sizeof(value))
 					bail("read synchronisation value failed");
 				if (value == CREATECGROUPNS) {
+					write_log(DEBUG, "unshare cgroup namespace");
 					if (unshare(CLONE_NEWCGROUP) < 0)
 						bail("failed to unshare cgroup namespace");
 				} else
 					bail("received unknown synchronisation value");
 			}
 
-			s = SYNC_CHILD_READY;
+			write_log(DEBUG, "signal completion to stage-0");
+			s = SYNC_CHILD_FINISH;
 			if (write(syncfd, &s, sizeof(s)) != sizeof(s))
-				bail("failed to sync with patent: write(SYNC_CHILD_READY)");
+				bail("failed to sync with patent: write(SYNC_CHILD_FINISH)");
 
 			/* Close sync pipes. */
 			close(sync_grandchild_pipe[0]);
@@ -1021,10 +1093,13 @@ void nsexec(void)
 			nl_free(&config);
 
 			/* Finish executing, let the Go runtime take over. */
+			write_log(DEBUG, "<= nsexec container setup");
+			write_log(DEBUG, "booting up go runtime ...");
 			return;
 		}
+		break;
 	default:
-		bail("unexpected jump value");
+		bail("unknown stage '%d' for jump value", current_stage);
 	}
 
 	/* Should never be reached. */
diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go
index 6f2828770ee..8315d709aaa 100644
--- a/libcontainer/process_linux.go
+++ b/libcontainer/process_linux.go
@@ -51,7 +51,7 @@ type parentProcess interface {
 
 	setExternalDescriptors(fds []string)
 
-	forwardChildLogs()
+	forwardChildLogs() chan error
 }
 
 type filePair struct {
@@ -251,8 +251,8 @@ func (p *setnsProcess) setExternalDescriptors(newFds []string) {
 	p.fds = newFds
 }
 
-func (p *setnsProcess) forwardChildLogs() {
-	go logs.ForwardLogs(p.logFilePair.parent)
+func (p *setnsProcess) forwardChildLogs() chan error {
+	return logs.ForwardLogs(p.logFilePair.parent)
 }
 
 type initProcess struct {
@@ -605,8 +605,8 @@ func (p *initProcess) setExternalDescriptors(newFds []string) {
 	p.fds = newFds
 }
 
-func (p *initProcess) forwardChildLogs() {
-	go logs.ForwardLogs(p.logFilePair.parent)
+func (p *initProcess) forwardChildLogs() chan error {
+	return logs.ForwardLogs(p.logFilePair.parent)
 }
 
 func getPipeFds(pid int) ([]string, error) {
diff --git a/libcontainer/restored_process.go b/libcontainer/restored_process.go
index f861e82d1b2..97565d7fa4b 100644
--- a/libcontainer/restored_process.go
+++ b/libcontainer/restored_process.go
@@ -77,7 +77,8 @@ func (p *restoredProcess) setExternalDescriptors(newFds []string) {
 	p.fds = newFds
 }
 
-func (p *restoredProcess) forwardChildLogs() {
+func (p *restoredProcess) forwardChildLogs() chan error {
+	return nil
 }
 
 // nonChildProcess represents a process where the calling process is not
@@ -125,5 +126,6 @@ func (p *nonChildProcess) setExternalDescriptors(newFds []string) {
 	p.fds = newFds
 }
 
-func (p *nonChildProcess) forwardChildLogs() {
+func (p *nonChildProcess) forwardChildLogs() chan error {
+	return nil
 }
diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go
index 6b1e9a6e97c..9e7fc37dc5a 100644
--- a/libcontainer/setns_init_linux.go
+++ b/libcontainer/setns_init_linux.go
@@ -12,6 +12,7 @@ import (
 	"github.com/opencontainers/runc/libcontainer/system"
 	"github.com/opencontainers/selinux/go-selinux"
 	"github.com/pkg/errors"
+	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 )
 
@@ -21,6 +22,7 @@ type linuxSetnsInit struct {
 	pipe          *os.File
 	consoleSocket *os.File
 	config        *initConfig
+	logFd         int
 }
 
 func (l *linuxSetnsInit) getSessionRingName() string {
@@ -86,5 +88,13 @@ func (l *linuxSetnsInit) Init() error {
 			return newSystemErrorWithCause(err, "init seccomp")
 		}
 	}
+	logrus.Debugf("init: about to exec")
+	// Close the log pipe fd so the parent's ForwardLogs can exit.
+	// Usually this would be done upon system.Execv below, as the fd
+	// has CLOEXEC flag set, but it may fail.
+	if err := unix.Close(l.logFd); err != nil {
+		return newSystemErrorWithCause(err, "closing log pipe fd")
+	}
+
 	return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
 }
diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go
index 7ec506c462e..0dee51b624c 100644
--- a/libcontainer/standard_init_linux.go
+++ b/libcontainer/standard_init_linux.go
@@ -16,6 +16,7 @@ import (
 	"github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/opencontainers/selinux/go-selinux"
 	"github.com/pkg/errors"
+	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 )
 
@@ -24,6 +25,7 @@ type linuxStandardInit struct {
 	consoleSocket *os.File
 	parentPid     int
 	fifoFd        int
+	logFd         int
 	config        *initConfig
 }
 
@@ -180,7 +182,19 @@ func (l *linuxStandardInit) Init() error {
 		return err
 	}
 	// Close the pipe to signal that we have completed our init.
+	logrus.Debugf("init: closing the pipe to signal completion")
 	l.pipe.Close()
+
+	// Close the log pipe fd so the parent's ForwardLogs can exit.
+	// Usually this would be done upon execve as the file descriptor
+	// has CLOEXEC flag set, but (1) execve might not happen due to
+	// any kind of error below; (2) in case of runc start, it spawns
+	// runc init and exits, while runc init (this code) waits on fifo
+	// for an indefinitely long time.
+	if err := unix.Close(l.logFd); err != nil {
+		return newSystemErrorWithCause(err, "closing log pipe fd")
+	}
+
 	// Wait for the FIFO to be opened on the other side before exec-ing the
 	// user process. We open it through /proc/self/fd/$fd, because the fd that
 	// was given to us was an O_PATH fd to the fifo itself. Linux allows us to
diff --git a/main.go b/main.go
index 57d7f4282e6..97bce5bf5a3 100644
--- a/main.go
+++ b/main.go
@@ -142,7 +142,7 @@ func main() {
 				fmt.Fprintln(os.Stderr, "the path in $XDG_RUNTIME_DIR must be writable by the user")
 				fatal(err)
 			}
-			if err := os.Chmod(root, 0700|os.ModeSticky); err != nil {
+			if err := os.Chmod(root, os.FileMode(0o700)|os.ModeSticky); err != nil {
 				fmt.Fprintln(os.Stderr, "you should check permission of the path in $XDG_RUNTIME_DIR")
 				fatal(err)
 			}
@@ -150,6 +150,10 @@ func main() {
 		if err := reviseRootDir(context); err != nil {
 			return err
 		}
+		// let init configure logging on its own
+		if args := context.Args(); args != nil && args.First() == "init" {
+			return nil
+		}
 		return logs.ConfigureLogging(createLogConfig(context))
 	}
 
@@ -176,9 +180,9 @@ func (f *FatalWriter) Write(p []byte) (n int, err error) {
 
 func createLogConfig(context *cli.Context) logs.Config {
 	logFilePath := context.GlobalString("log")
-	logPipeFd := ""
+	logPipeFd := 0
 	if logFilePath == "" {
-		logPipeFd = "2"
+		logPipeFd = 2
 	}
 	config := logs.Config{
 		LogPipeFd:   logPipeFd,
diff --git a/tests/integration/debug.bats b/tests/integration/debug.bats
index 16f1b2fb0cc..333745e3969 100644
--- a/tests/integration/debug.bats
+++ b/tests/integration/debug.bats
@@ -10,16 +10,20 @@ function teardown() {
 	teardown_bundle
 }
 
+function check_debug() {
+	[[ "$*" == *"nsexec container setup"* ]]
+	[[ "$*" == *"child process in init()"* ]]
+	[[ "$*" == *"init: closing the pipe to signal completion"* ]]
+}
+
 @test "global --debug" {
 	# run hello-world
 	runc --debug run test_hello
-	echo "${output}"
 	[ "$status" -eq 0 ]
 
 	# check expected debug output was sent to stderr
 	[[ "${output}" == *"level=debug"* ]]
-	[[ "${output}" == *"nsexec started"* ]]
-	[[ "${output}" == *"child process in init()"* ]]
+	check_debug "$output"
 }
 
 @test "global --debug to --log" {
@@ -30,14 +34,11 @@ function teardown() {
 	# check output does not include debug info
 	[[ "${output}" != *"level=debug"* ]]
 
-	# check log.out was generated
-	[ -e log.out ]
-
+	cat log.out >&2
 	# check expected debug output was sent to log.out
 	output=$(cat log.out)
 	[[ "${output}" == *"level=debug"* ]]
-	[[ "${output}" == *"nsexec started"* ]]
-	[[ "${output}" == *"child process in init()"* ]]
+	check_debug "$output"
 }
 
 @test "global --debug to --log --log-format 'text'" {
@@ -48,14 +49,11 @@ function teardown() {
 	# check output does not include debug info
 	[[ "${output}" != *"level=debug"* ]]
 
-	# check log.out was generated
-	[ -e log.out ]
-
+	cat log.out >&2
 	# check expected debug output was sent to log.out
 	output=$(cat log.out)
 	[[ "${output}" == *"level=debug"* ]]
-	[[ "${output}" == *"nsexec started"* ]]
-	[[ "${output}" == *"child process in init()"* ]]
+	check_debug "$output"
 }
 
 @test "global --debug to --log --log-format 'json'" {
@@ -66,12 +64,9 @@ function teardown() {
 	# check output does not include debug info
 	[[ "${output}" != *"level=debug"* ]]
 
-	# check log.out was generated
-	[ -e log.out ]
-
+	cat log.out >&2
 	# check expected debug output was sent to log.out
 	output=$(cat log.out)
 	[[ "${output}" == *'"level":"debug"'* ]]
-	[[ "${output}" == *"nsexec started"* ]]
-	[[ "${output}" == *"child process in init()"* ]]
+	check_debug "$output"
 }
diff --git a/tests/integration/exec.bats b/tests/integration/exec.bats
index b7c9becbf47..3b825e4b768 100644
--- a/tests/integration/exec.bats
+++ b/tests/integration/exec.bats
@@ -132,6 +132,38 @@ function teardown() {
 	echo hello >preserve-fds.test
 	# fd 3 is used by bats, so we use 4
 	exec 4<preserve-fds.test
-	output=$(__runc exec --preserve-fds=2 test_busybox cat /proc/self/fd/4)
+	runc exec --preserve-fds=2 test_busybox cat /proc/self/fd/4
+	[ "$status" -eq 0 ]
 	[[ "${output}" == "hello" ]]
 }
+
+function check_exec_debug() {
+	[[ "$*" == *"nsexec container setup"* ]]
+	[[ "$*" == *"child process in init()"* ]]
+	[[ "$*" == *"init: about to exec"* ]]
+}
+
+@test "runc --debug exec" {
+	runc run -d --console-socket "$CONSOLE_SOCKET" test
+	[ "$status" -eq 0 ]
+
+	runc --debug exec test true
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *"level=debug"* ]]
+	check_exec_debug "$output"
+}
+
+@test "runc --debug --log exec" {
+	runc run -d --console-socket "$CONSOLE_SOCKET" test
+	[ "$status" -eq 0 ]
+
+	runc --debug --log log.out exec test true
+	# check output does not include debug info
+	[[ "${output}" != *"level=debug"* ]]
+
+	cat log.out >&2
+	# check expected debug output was sent to log.out
+	output=$(cat log.out)
+	[[ "${output}" == *"level=debug"* ]]
+	check_exec_debug "$output"
+}