Skip to content

Commit

Permalink
Merge pull request #110 from Roblox/stdout_stderr_issue
Browse files Browse the repository at this point in the history
Fix: Stderr and stdout of existing processes are lost after a restart of nomad
  • Loading branch information
shishir-a412ed authored Sep 13, 2021
2 parents 42df527 + 7f50f0f commit 5168f15
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 9 deletions.
16 changes: 8 additions & 8 deletions containerd/containerd.go
Original file line number Diff line number Diff line change
Expand Up @@ -341,12 +341,7 @@ func (d *Driver) loadContainer(id string) (containerd.Container, error) {
}

func (d *Driver) createTask(container containerd.Container, stdoutPath, stderrPath string) (containerd.Task, error) {
stdout, err := openFIFO(stdoutPath)
if err != nil {
return nil, err
}

stderr, err := openFIFO(stderrPath)
stdout, stderr, err := getStdoutStderrFifos(stdoutPath, stderrPath)
if err != nil {
return nil, err
}
Expand All @@ -357,9 +352,14 @@ func (d *Driver) createTask(container containerd.Container, stdoutPath, stderrPa
return container.NewTask(ctxWithTimeout, cio.NewCreator(cio.WithStreams(nil, stdout, stderr)))
}

func (d *Driver) getTask(container containerd.Container) (containerd.Task, error) {
func (d *Driver) getTask(container containerd.Container, stdoutPath, stderrPath string) (containerd.Task, error) {
stdout, stderr, err := getStdoutStderrFifos(stdoutPath, stderrPath)
if err != nil {
return nil, err
}

ctxWithTimeout, cancel := context.WithTimeout(d.ctxContainerd, 30*time.Second)
defer cancel()

return container.Task(ctxWithTimeout, cio.Load)
return container.Task(ctxWithTimeout, cio.NewAttach(cio.WithStreams(nil, stdout, stderr)))
}
6 changes: 5 additions & 1 deletion containerd/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,8 @@ type TaskConfig struct {
type TaskState struct {
StartedAt time.Time
ContainerName string
StdoutPath string
StderrPath string
}

type Driver struct {
Expand Down Expand Up @@ -496,6 +498,8 @@ func (d *Driver) StartTask(cfg *drivers.TaskConfig) (*drivers.TaskHandle, *drive
driverState := TaskState{
StartedAt: h.startedAt,
ContainerName: containerName,
StdoutPath: cfg.StdoutPath,
StderrPath: cfg.StderrPath,
}

if err := handle.SetDriverState(&driverState); err != nil {
Expand Down Expand Up @@ -539,7 +543,7 @@ func (d *Driver) RecoverTask(handle *drivers.TaskHandle) error {
return fmt.Errorf("Error in recovering container: %v", err)
}

task, err := d.getTask(container)
task, err := d.getTask(container, taskState.StdoutPath, taskState.StderrPath)
if err != nil {
return fmt.Errorf("Error in recovering task: %v", err)
}
Expand Down
14 changes: 14 additions & 0 deletions containerd/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,20 @@ func buildMountpoint(mountType, mountTarget, mountSource string, mountOptions []
return m
}

// getStdoutStderrFifos return the container's stdout and stderr FIFO's.
func getStdoutStderrFifos(stdoutPath, stderrPath string) (*os.File, *os.File, error) {
stdout, err := openFIFO(stdoutPath)
if err != nil {
return nil, nil, err
}

stderr, err := openFIFO(stderrPath)
if err != nil {
return nil, nil, err
}
return stdout, stderr, nil
}

// FIFO's are named pipes in linux.
// openFIFO() opens the nomad task stdout/stderr pipes and returns the fd.
func openFIFO(path string) (*os.File, error) {
Expand Down
24 changes: 24 additions & 0 deletions tests/run_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ main() {
setup
echo "INFO: Checking if nomad-driver-containerd is up and running, and nomad is ready to accept jobs."
is_containerd_driver_active
is_nomad_ready

run_tests $@
exit $PASS_STATUS
Expand Down Expand Up @@ -202,4 +203,27 @@ is_containerd_driver_active() {
fi
}

is_nomad_ready() {
i="0"
while test $i -lt 5
do
set +e
status=$(curl -s http://127.0.0.1:4646/v1/nodes|jq '.[0] ."Status"')
rc=$?
set -e
if [[ $rc -eq 0 && $status = \"ready\" ]]; then
echo "INFO: nomad is ready to accept jobs."
break
fi
echo "INFO: nomad is initializing, sleep for 4 seconds."
sleep 4s
i=$[$i+1]
done

if [ $i -ge 5 ]; then
echo "ERROR: nomad didn't come up. exit 1."
exit 1
fi
}

main "$@"

0 comments on commit 5168f15

Please sign in to comment.