From 1c49e81cec66051a81df11f9c5eb496bf7764c54 Mon Sep 17 00:00:00 2001 From: Juanadelacuesta <8647634+Juanadelacuesta@users.noreply.github.com> Date: Tue, 30 Apr 2024 18:21:06 +0200 Subject: [PATCH 01/24] func: set nomad as subreaper, clean processes in cgroup and wait for signals --- drivers/exec/driver.go | 11 +++++ drivers/shared/executor/executor_linux.go | 49 ++++++++++++++++++++++- go.mod | 1 + go.sum | 2 + 4 files changed, 62 insertions(+), 1 deletion(-) diff --git a/drivers/exec/driver.go b/drivers/exec/driver.go index 072b430a715..a43a2cb1f4f 100644 --- a/drivers/exec/driver.go +++ b/drivers/exec/driver.go @@ -28,6 +28,7 @@ import ( "github.com/hashicorp/nomad/plugins/drivers/utils" "github.com/hashicorp/nomad/plugins/shared/hclspec" pstructs "github.com/hashicorp/nomad/plugins/shared/structs" + "golang.org/x/sys/unix" ) const ( @@ -423,6 +424,13 @@ func (d *Driver) RecoverTask(handle *drivers.TaskHandle) error { return nil } +func (d *Driver) SetAsSubreaper() { + d.logger.Info(" setting nomad as subreaper") + if err := unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, uintptr(1), 0, 0, 0); err != nil { + d.logger.Error("unable to set as subreaper", err) + } +} + func (d *Driver) StartTask(cfg *drivers.TaskConfig) (*drivers.TaskHandle, *drivers.DriverNetwork, error) { if _, ok := d.tasks.Get(cfg.ID); ok { return nil, nil, fmt.Errorf("task with ID %q already started", cfg.ID) @@ -496,6 +504,8 @@ func (d *Driver) StartTask(cfg *drivers.TaskConfig) (*drivers.TaskHandle, *drive Capabilities: caps, } + d.SetAsSubreaper() + ps, err := exec.Launch(execCmd) if err != nil { pluginClient.Kill() @@ -585,6 +595,7 @@ func (d *Driver) StopTask(taskID string, timeout time.Duration, signal string) e } func (d *Driver) DestroyTask(taskID string, force bool) error { + handle, ok := d.tasks.Get(taskID) if !ok { return drivers.ErrTaskNotFound diff --git a/drivers/shared/executor/executor_linux.go b/drivers/shared/executor/executor_linux.go index ff5d03cfbe3..8f0243f741b 100644 --- a/drivers/shared/executor/executor_linux.go +++ b/drivers/shared/executor/executor_linux.go @@ -12,6 +12,7 @@ import ( "io" "os" "os/exec" + "os/signal" "path" "path/filepath" "strings" @@ -97,6 +98,17 @@ func (l *LibcontainerExecutor) ListProcesses() *set.Set[int] { return procstats.List(l.command) } +func (l *LibcontainerExecutor) killOrphans(path string) { + c, e := cgroups.GetAllPids("/sys/fs/cgroup" + path) + for i, p := range c { + l.logger.Info(" killing process", i, p) + syscall.Kill(p, syscall.SIGKILL) + } + + l.logger.Info(fmt.Sprintf(" ********* pid: %+v error:%+v", c, e)) + //syscall.Kill(-gid, syscall.SIGKILL) +} + // Launch creates a new container in libcontainer and starts a new process with it func (l *LibcontainerExecutor) Launch(command *ExecCommand) (*ProcessState, error) { l.logger.Trace("preparing to launch command", "command", command.Cmd, "args", strings.Join(command.Args, " ")) @@ -127,6 +139,8 @@ func (l *LibcontainerExecutor) Launch(command *ExecCommand) (*ProcessState, erro return nil, fmt.Errorf("failed to configure container(%s): %v", l.id, err) } + l.logger.Debug(" c goup path ", containerCfg.Cgroups.Path) + l.killOrphans(containerCfg.Cgroups.Path) container, err := factory.Create(l.id, containerCfg) if err != nil { return nil, fmt.Errorf("failed to create container(%s): %v", l.id, err) @@ -163,9 +177,11 @@ func (l *LibcontainerExecutor) Launch(command *ExecCommand) (*ProcessState, erro Init: true, } + l.logger.Debug("launching", "command", command.Args) if command.User != "" { process.User = command.User } + l.userProc = process l.totalCpuStats = cpustats.New(l.compute) @@ -188,7 +204,19 @@ func (l *LibcontainerExecutor) Launch(command *ExecCommand) (*ProcessState, erro // be multiplexed l.userProcExited = make(chan interface{}) - go l.wait() + ctx, cancel := context.WithCancel(context.Background()) + + go func() { + defer cancel() + + go l.wait() + + signal := l.catchSignals(ctx) + err = container.Signal(signal, true) + if err != nil { + l.logger.Error("failed send signal to process(%s): %v", l.id, err) + } + }() return &ProcessState{ Pid: pid, @@ -197,6 +225,24 @@ func (l *LibcontainerExecutor) Launch(command *ExecCommand) (*ProcessState, erro }, nil } +func (l *LibcontainerExecutor) catchSignals(ctx context.Context) os.Signal { + + sigch := make(chan os.Signal, 4) + defer close(sigch) + + l.logger.Debug("waiting for signls") + signal.Notify(sigch, syscall.SIGHUP, syscall.SIGQUIT, syscall.SIGTERM, syscall.SIGINT, syscall.SIGSEGV) + defer signal.Stop(sigch) + + select { + case <-ctx.Done(): + return nil + + case s := <-sigch: + return s + } +} + // Wait waits until a process has exited and returns it's exitcode and errors func (l *LibcontainerExecutor) Wait(ctx context.Context) (*ProcessState, error) { select { @@ -779,6 +825,7 @@ func (l *LibcontainerExecutor) configureCG2(cfg *runc.Config, command *ExecComma func (l *LibcontainerExecutor) newLibcontainerConfig(command *ExecCommand) (*runc.Config, error) { cfg := &runc.Config{ + ParentDeathSignal: 9, Cgroups: &runc.Cgroup{ Resources: &runc.Resources{ MemorySwappiness: nil, diff --git a/go.mod b/go.mod index 4eeedab9ea5..a1147e5ce5b 100644 --- a/go.mod +++ b/go.mod @@ -175,6 +175,7 @@ require ( github.com/cilium/ebpf v0.9.1 // indirect github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible // indirect github.com/circonus-labs/circonusllhist v0.1.3 // indirect + github.com/containerd/cgroups v1.1.0 // indirect github.com/containerd/console v1.0.3 // indirect github.com/containerd/containerd v1.7.13 // indirect github.com/containerd/log v0.1.0 // indirect diff --git a/go.sum b/go.sum index 2c5d1ec5cac..5e7b9a86ce7 100644 --- a/go.sum +++ b/go.sum @@ -335,6 +335,8 @@ github.com/cncf/xds/go v0.0.0-20211001041855-01bcc9b48dfe/go.mod h1:eXthEFrGJvWH github.com/cncf/xds/go v0.0.0-20211011173535-cb28da3451f1/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= github.com/container-storage-interface/spec v1.7.0 h1:gW8eyFQUZWWrMWa8p1seJ28gwDoN5CVJ4uAbQ+Hdycw= github.com/container-storage-interface/spec v1.7.0/go.mod h1:JYuzLqr9VVNoDJl44xp/8fmCOvWPDKzuGTwCoklhuqk= +github.com/containerd/cgroups v1.1.0 h1:v8rEWFl6EoqHB+swVNjVoCJE8o3jX7e8nqBGPLaDFBM= +github.com/containerd/cgroups v1.1.0/go.mod h1:6ppBcbh/NOOUU+dMKrykgaBnK9lCIBxHqJDGwsa1mIw= github.com/containerd/console v1.0.1/go.mod h1:XUsP6YE/mKtz6bxc+I8UiKKTP04qjQL4qcS3XoQ5xkw= github.com/containerd/console v1.0.3 h1:lIr7SlA5PxZyMV30bDW0MGbiOPXwc63yRuCP0ARubLw= github.com/containerd/console v1.0.3/go.mod h1:7LqA/THxQ86k76b8c/EMSiaJ3h1eZkMkXar0TQ1gf3U= From 08ad0fa9252ccd89f0cf11461b688b560f62457e Mon Sep 17 00:00:00 2001 From: Juanadelacuesta <8647634+Juanadelacuesta@users.noreply.github.com> Date: Tue, 30 Apr 2024 18:53:39 +0200 Subject: [PATCH 02/24] style: clean up the orphans function --- client/lib/cgroupslib/editor.go | 4 ++++ drivers/shared/executor/executor_linux.go | 22 +++++++++++++++------- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/client/lib/cgroupslib/editor.go b/client/lib/cgroupslib/editor.go index e3f3db231ab..682c8be10c5 100644 --- a/client/lib/cgroupslib/editor.go +++ b/client/lib/cgroupslib/editor.go @@ -21,6 +21,10 @@ const ( root = "/sys/fs/cgroup" ) +func GetNomadCGRoot() string { + return root +} + // OpenPath creates a handle for modifying cgroup interface files under // the given directory. // diff --git a/drivers/shared/executor/executor_linux.go b/drivers/shared/executor/executor_linux.go index 8f0243f741b..922aa74b5fa 100644 --- a/drivers/shared/executor/executor_linux.go +++ b/drivers/shared/executor/executor_linux.go @@ -98,15 +98,23 @@ func (l *LibcontainerExecutor) ListProcesses() *set.Set[int] { return procstats.List(l.command) } -func (l *LibcontainerExecutor) killOrphans(path string) { - c, e := cgroups.GetAllPids("/sys/fs/cgroup" + path) - for i, p := range c { - l.logger.Info(" killing process", i, p) - syscall.Kill(p, syscall.SIGKILL) +// killOrphans kills processes that ended up reparented to Nomad when the +// executor was unexpectedly killed. +func (l *LibcontainerExecutor) killOrphans(nomadRelativePath string) { + root := cgroupslib.GetNomadCGRoot() + orphansPIDs, err := cgroups.GetAllPids(filepath.Join(root, nomadRelativePath)) + if err != nil { + l.logger.Error("unable to get orphan allocs PIDs", err) + return } - l.logger.Info(fmt.Sprintf(" ********* pid: %+v error:%+v", c, e)) - //syscall.Kill(-gid, syscall.SIGKILL) + for _, pid := range orphansPIDs { + l.logger.Info("killing process:", orphansPIDs) + err := syscall.Kill(pid, syscall.SIGKILL) + if err != nil { + l.logger.Error("unable to send signal to process", pid, err) + } + } } // Launch creates a new container in libcontainer and starts a new process with it From 52a96b2d013e4da61643152746839f0c43ec98d5 Mon Sep 17 00:00:00 2001 From: Juanadelacuesta <8647634+Juanadelacuesta@users.noreply.github.com> Date: Tue, 30 Apr 2024 18:54:37 +0200 Subject: [PATCH 03/24] func: comment out subreaper asignation --- client/lib/cgroupslib/editor.go | 2 +- drivers/exec/driver.go | 11 ----------- drivers/shared/executor/executor_linux.go | 2 +- drivers/shared/executor/utils.go | 4 ++++ drivers/shared/executor/utils_unix.go | 6 ++++++ 5 files changed, 12 insertions(+), 13 deletions(-) diff --git a/client/lib/cgroupslib/editor.go b/client/lib/cgroupslib/editor.go index 682c8be10c5..e0abfc70634 100644 --- a/client/lib/cgroupslib/editor.go +++ b/client/lib/cgroupslib/editor.go @@ -21,7 +21,7 @@ const ( root = "/sys/fs/cgroup" ) -func GetNomadCGRoot() string { +func GetDefautlRoot() string { return root } diff --git a/drivers/exec/driver.go b/drivers/exec/driver.go index a43a2cb1f4f..072b430a715 100644 --- a/drivers/exec/driver.go +++ b/drivers/exec/driver.go @@ -28,7 +28,6 @@ import ( "github.com/hashicorp/nomad/plugins/drivers/utils" "github.com/hashicorp/nomad/plugins/shared/hclspec" pstructs "github.com/hashicorp/nomad/plugins/shared/structs" - "golang.org/x/sys/unix" ) const ( @@ -424,13 +423,6 @@ func (d *Driver) RecoverTask(handle *drivers.TaskHandle) error { return nil } -func (d *Driver) SetAsSubreaper() { - d.logger.Info(" setting nomad as subreaper") - if err := unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, uintptr(1), 0, 0, 0); err != nil { - d.logger.Error("unable to set as subreaper", err) - } -} - func (d *Driver) StartTask(cfg *drivers.TaskConfig) (*drivers.TaskHandle, *drivers.DriverNetwork, error) { if _, ok := d.tasks.Get(cfg.ID); ok { return nil, nil, fmt.Errorf("task with ID %q already started", cfg.ID) @@ -504,8 +496,6 @@ func (d *Driver) StartTask(cfg *drivers.TaskConfig) (*drivers.TaskHandle, *drive Capabilities: caps, } - d.SetAsSubreaper() - ps, err := exec.Launch(execCmd) if err != nil { pluginClient.Kill() @@ -595,7 +585,6 @@ func (d *Driver) StopTask(taskID string, timeout time.Duration, signal string) e } func (d *Driver) DestroyTask(taskID string, force bool) error { - handle, ok := d.tasks.Get(taskID) if !ok { return drivers.ErrTaskNotFound diff --git a/drivers/shared/executor/executor_linux.go b/drivers/shared/executor/executor_linux.go index 922aa74b5fa..1d1fb4a9598 100644 --- a/drivers/shared/executor/executor_linux.go +++ b/drivers/shared/executor/executor_linux.go @@ -101,7 +101,7 @@ func (l *LibcontainerExecutor) ListProcesses() *set.Set[int] { // killOrphans kills processes that ended up reparented to Nomad when the // executor was unexpectedly killed. func (l *LibcontainerExecutor) killOrphans(nomadRelativePath string) { - root := cgroupslib.GetNomadCGRoot() + root := cgroupslib.GetDefautlRoot() orphansPIDs, err := cgroups.GetAllPids(filepath.Join(root, nomadRelativePath)) if err != nil { l.logger.Error("unable to get orphan allocs PIDs", err) diff --git a/drivers/shared/executor/utils.go b/drivers/shared/executor/utils.go index b74c5372823..260d6e00a5e 100644 --- a/drivers/shared/executor/utils.go +++ b/drivers/shared/executor/utils.go @@ -72,6 +72,10 @@ func CreateExecutor( isolateCommand(config.Cmd) } + /* if err := setAsSubreaper(); err != nil { + return nil, nil, fmt.Errorf("unable to set nomad as subreaper: %v", err) + } */ + return newExecutorClient(config, logger) } diff --git a/drivers/shared/executor/utils_unix.go b/drivers/shared/executor/utils_unix.go index b5ced65043e..6d73e10ce50 100644 --- a/drivers/shared/executor/utils_unix.go +++ b/drivers/shared/executor/utils_unix.go @@ -9,6 +9,8 @@ package executor import ( "os/exec" "syscall" + + "golang.org/x/sys/unix" ) // isolateCommand sets the setsid flag in exec.Cmd to true so that the process @@ -20,3 +22,7 @@ func isolateCommand(cmd *exec.Cmd) { } cmd.SysProcAttr.Setsid = true } + +func setAsSubreaper() error { + return unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, uintptr(1), 0, 0, 0) +} From 34c633e59d8c6acf8bde7d8a0f5358e9cf4a0c6e Mon Sep 17 00:00:00 2001 From: Juanadelacuesta <8647634+Juanadelacuesta@users.noreply.github.com> Date: Wed, 1 May 2024 17:54:28 +0200 Subject: [PATCH 04/24] func: remove the subreaper --- drivers/shared/executor/utils.go | 4 ---- drivers/shared/executor/utils_unix.go | 6 ------ go.mod | 1 - go.sum | 2 -- 4 files changed, 13 deletions(-) diff --git a/drivers/shared/executor/utils.go b/drivers/shared/executor/utils.go index 260d6e00a5e..b74c5372823 100644 --- a/drivers/shared/executor/utils.go +++ b/drivers/shared/executor/utils.go @@ -72,10 +72,6 @@ func CreateExecutor( isolateCommand(config.Cmd) } - /* if err := setAsSubreaper(); err != nil { - return nil, nil, fmt.Errorf("unable to set nomad as subreaper: %v", err) - } */ - return newExecutorClient(config, logger) } diff --git a/drivers/shared/executor/utils_unix.go b/drivers/shared/executor/utils_unix.go index 6d73e10ce50..b5ced65043e 100644 --- a/drivers/shared/executor/utils_unix.go +++ b/drivers/shared/executor/utils_unix.go @@ -9,8 +9,6 @@ package executor import ( "os/exec" "syscall" - - "golang.org/x/sys/unix" ) // isolateCommand sets the setsid flag in exec.Cmd to true so that the process @@ -22,7 +20,3 @@ func isolateCommand(cmd *exec.Cmd) { } cmd.SysProcAttr.Setsid = true } - -func setAsSubreaper() error { - return unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, uintptr(1), 0, 0, 0) -} diff --git a/go.mod b/go.mod index a1147e5ce5b..4eeedab9ea5 100644 --- a/go.mod +++ b/go.mod @@ -175,7 +175,6 @@ require ( github.com/cilium/ebpf v0.9.1 // indirect github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible // indirect github.com/circonus-labs/circonusllhist v0.1.3 // indirect - github.com/containerd/cgroups v1.1.0 // indirect github.com/containerd/console v1.0.3 // indirect github.com/containerd/containerd v1.7.13 // indirect github.com/containerd/log v0.1.0 // indirect diff --git a/go.sum b/go.sum index 5e7b9a86ce7..2c5d1ec5cac 100644 --- a/go.sum +++ b/go.sum @@ -335,8 +335,6 @@ github.com/cncf/xds/go v0.0.0-20211001041855-01bcc9b48dfe/go.mod h1:eXthEFrGJvWH github.com/cncf/xds/go v0.0.0-20211011173535-cb28da3451f1/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= github.com/container-storage-interface/spec v1.7.0 h1:gW8eyFQUZWWrMWa8p1seJ28gwDoN5CVJ4uAbQ+Hdycw= github.com/container-storage-interface/spec v1.7.0/go.mod h1:JYuzLqr9VVNoDJl44xp/8fmCOvWPDKzuGTwCoklhuqk= -github.com/containerd/cgroups v1.1.0 h1:v8rEWFl6EoqHB+swVNjVoCJE8o3jX7e8nqBGPLaDFBM= -github.com/containerd/cgroups v1.1.0/go.mod h1:6ppBcbh/NOOUU+dMKrykgaBnK9lCIBxHqJDGwsa1mIw= github.com/containerd/console v1.0.1/go.mod h1:XUsP6YE/mKtz6bxc+I8UiKKTP04qjQL4qcS3XoQ5xkw= github.com/containerd/console v1.0.3 h1:lIr7SlA5PxZyMV30bDW0MGbiOPXwc63yRuCP0ARubLw= github.com/containerd/console v1.0.3/go.mod h1:7LqA/THxQ86k76b8c/EMSiaJ3h1eZkMkXar0TQ1gf3U= From bba68b0075ae1ff215616180b7b07ac4b31bf5cf Mon Sep 17 00:00:00 2001 From: Juanadelacuesta <8647634+Juanadelacuesta@users.noreply.github.com> Date: Fri, 3 May 2024 16:34:59 +0200 Subject: [PATCH 05/24] style: clean up logs --- drivers/shared/executor/executor_linux.go | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/shared/executor/executor_linux.go b/drivers/shared/executor/executor_linux.go index 1d1fb4a9598..70158cf5671 100644 --- a/drivers/shared/executor/executor_linux.go +++ b/drivers/shared/executor/executor_linux.go @@ -98,9 +98,11 @@ func (l *LibcontainerExecutor) ListProcesses() *set.Set[int] { return procstats.List(l.command) } -// killOrphans kills processes that ended up reparented to Nomad when the -// executor was unexpectedly killed. -func (l *LibcontainerExecutor) killOrphans(nomadRelativePath string) { +// cleanOldProcessesInCGroup kills processes that ended up reparented to Nomad when the +// executor was unexpectedly killed and nomad can reconnect to. +func (l *LibcontainerExecutor) cleanOldProcessesInCGroup(nomadRelativePath string) { + l.logger.Debug("removing old processes", "path", nomadRelativePath) + root := cgroupslib.GetDefautlRoot() orphansPIDs, err := cgroups.GetAllPids(filepath.Join(root, nomadRelativePath)) if err != nil { @@ -147,8 +149,7 @@ func (l *LibcontainerExecutor) Launch(command *ExecCommand) (*ProcessState, erro return nil, fmt.Errorf("failed to configure container(%s): %v", l.id, err) } - l.logger.Debug(" c goup path ", containerCfg.Cgroups.Path) - l.killOrphans(containerCfg.Cgroups.Path) + l.cleanOldProcessesInCGroup(containerCfg.Cgroups.Path) container, err := factory.Create(l.id, containerCfg) if err != nil { return nil, fmt.Errorf("failed to create container(%s): %v", l.id, err) @@ -220,7 +221,8 @@ func (l *LibcontainerExecutor) Launch(command *ExecCommand) (*ProcessState, erro go l.wait() signal := l.catchSignals(ctx) - err = container.Signal(signal, true) + + err = l.Signal(signal) if err != nil { l.logger.Error("failed send signal to process(%s): %v", l.id, err) } @@ -373,8 +375,9 @@ func (l *LibcontainerExecutor) Shutdown(signal string, grace time.Duration) erro case <-l.userProcExited: return nil case <-time.After(time.Second * 15): - return fmt.Errorf("process failed to exit after 15 seconds") } + + return fmt.Errorf("process failed to exit after 15 seconds") } // UpdateResources updates the resource isolation with new values to be enforced From bdfef994c38f69dd86be5e4aaceaf129e066c156 Mon Sep 17 00:00:00 2001 From: Juanadelacuesta <8647634+Juanadelacuesta@users.noreply.github.com> Date: Wed, 8 May 2024 18:21:14 +0200 Subject: [PATCH 06/24] func: add testing for the cleaning of olf processes --- drivers/shared/executor/executor_linux.go | 6 +- .../shared/executor/executor_linux_test.go | 75 +++++++++++++++++++ go.mod | 2 +- go.sum | 8 +- 4 files changed, 83 insertions(+), 8 deletions(-) diff --git a/drivers/shared/executor/executor_linux.go b/drivers/shared/executor/executor_linux.go index 70158cf5671..5288a78d9f5 100644 --- a/drivers/shared/executor/executor_linux.go +++ b/drivers/shared/executor/executor_linux.go @@ -98,10 +98,10 @@ func (l *LibcontainerExecutor) ListProcesses() *set.Set[int] { return procstats.List(l.command) } -// cleanOldProcessesInCGroup kills processes that ended up reparented to Nomad when the -// executor was unexpectedly killed and nomad can reconnect to. +// cleanOldProcessesInCGroup kills processes that might ended up orphans when the +// executor was unexpectedly killed and nomad can't reconnect to them. func (l *LibcontainerExecutor) cleanOldProcessesInCGroup(nomadRelativePath string) { - l.logger.Debug("removing old processes", "path", nomadRelativePath) + l.logger.Info("removing old processes", "path", nomadRelativePath) root := cgroupslib.GetDefautlRoot() orphansPIDs, err := cgroups.GetAllPids(filepath.Join(root, nomadRelativePath)) diff --git a/drivers/shared/executor/executor_linux_test.go b/drivers/shared/executor/executor_linux_test.go index 31a4d7f8575..20e33203fca 100644 --- a/drivers/shared/executor/executor_linux_test.go +++ b/drivers/shared/executor/executor_linux_test.go @@ -7,6 +7,7 @@ import ( "context" "fmt" "os" + "os/exec" "path/filepath" "regexp" "strconv" @@ -856,3 +857,77 @@ func TestExecCommand_getCgroupOr_v1_relative(t *testing.T) { result2 := ec.getCgroupOr("cpuset", "/sys/fs/cgroup/cpuset/nomad/abc123") must.Eq(t, result2, "/sys/fs/cgroup/cpuset/custom/path") } + +func createCGroup(fullpath string) error { + return os.MkdirAll(fullpath, 0755) +} + +func TestExecutor2_OOMKilled(t *testing.T) { + ci.Parallel(t) + + testutil.ExecCompatible(t) + testutil.CgroupsCompatible(t) + + testExecCmd := testExecutorCommandWithChroot(t) + + allocDir := testExecCmd.allocDir + defer allocDir.Destroy() + + fullCGroupPath := testExecCmd.command.Resources.LinuxResources.CpusetCgroupPath + + execCmd := testExecCmd.command + execCmd.Cmd = "/bin/sleep" + execCmd.Args = []string{"20"} + + execCmd.ResourceLimits = true + execCmd.ModePID = "private" + execCmd.ModeIPC = "private" + + // Create the CGroup the executor's command will run in and populate it with one process + err := createCGroup(fullCGroupPath) + must.NoError(t, err) + + cmd := exec.Command("/bin/sleep", "3000") + err = cmd.Start() + must.NoError(t, err) + + go func() { + err := cmd.Wait() + //This process will be killed by the executor as a prerequisite to run + // the executors command. + must.Error(t, err) + }() + + pid := cmd.Process.Pid + must.Positive(t, pid) + + cgInterface := cgroupslib.OpenPath(fullCGroupPath) + err = cgInterface.Write("cgroup.procs", strconv.Itoa(pid)) + must.NoError(t, err) + + pids, err := cgInterface.PIDs() + must.NoError(t, err) + must.One(t, pids.Size()) + + // Run the executor normally and make sure the process that was originally running + // as part of the CGroup was killed, and only the executor's process is running. + execInterface := NewExecutorWithIsolation(testlog.HCLogger(t), compute) + executor := execInterface.(*LibcontainerExecutor) + defer executor.Shutdown("SIGKILL", 0) + + ps, err := executor.Launch(execCmd) + must.NoError(t, err) + must.Positive(t, ps.Pid) + + pids, err = cgInterface.PIDs() + must.NoError(t, err) + must.One(t, pids.Size()) + must.True(t, pids.Contains(ps.Pid)) + + estate, err := executor.Wait(context.Background()) + must.NoError(t, err) + must.Zero(t, estate.ExitCode) + + must.NoError(t, executor.Shutdown("", 0)) + executor.Wait(context.Background()) +} diff --git a/go.mod b/go.mod index 4eeedab9ea5..2f35a66a4a7 100644 --- a/go.mod +++ b/go.mod @@ -172,7 +172,7 @@ require ( github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/checkpoint-restore/go-criu/v5 v5.3.0 // indirect github.com/cheggaaa/pb/v3 v3.0.5 // indirect - github.com/cilium/ebpf v0.9.1 // indirect + github.com/cilium/ebpf v0.11.0 // indirect github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible // indirect github.com/circonus-labs/circonusllhist v0.1.3 // indirect github.com/containerd/console v1.0.3 // indirect diff --git a/go.sum b/go.sum index 2c5d1ec5cac..c7b01295c66 100644 --- a/go.sum +++ b/go.sum @@ -317,8 +317,8 @@ github.com/cheggaaa/pb/v3 v3.0.5/go.mod h1:X1L61/+36nz9bjIsrDU52qHKOQukUQe2Ge+Yv github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= -github.com/cilium/ebpf v0.9.1 h1:64sn2K3UKw8NbP/blsixRpF3nXuyhz/VjRlRzvlBRu4= -github.com/cilium/ebpf v0.9.1/go.mod h1:+OhNOIXx/Fnu1IE8bJz2dzOA+VSfyTfdNUVdlQnxUFY= +github.com/cilium/ebpf v0.11.0 h1:V8gS/bTCCjX9uUnkUFUpPsksM8n1lXBAvHcpiFk1X2Y= +github.com/cilium/ebpf v0.11.0/go.mod h1:WE7CZAnqOL2RouJ4f1uyNhqr2P4CCvXFIqdRDUgWsVs= github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible h1:C29Ae4G5GtYyYMm1aztcyj/J5ckgJm2zwdDajFbx1NY= github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible/go.mod h1:nmEj6Dob7S7YxXgwXpfOuvO54S+tGdZdw9fuRZt25Ag= github.com/circonus-labs/circonusllhist v0.1.3 h1:TJH+oke8D16535+jHExHj4nQvzlZrj7ug5D7I/orNUA= @@ -421,8 +421,8 @@ github.com/felixge/httpsnoop v1.0.1/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSw github.com/felixge/httpsnoop v1.0.3 h1:s/nj+GCswXYzN5v2DpNMuMQYe+0DDwt5WVCU6CWBdXk= github.com/felixge/httpsnoop v1.0.3/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/form3tech-oss/jwt-go v3.2.2+incompatible/go.mod h1:pbq4aXjuKjdthFRnoDwaVPLA+WlJuPGy+QneDUgJi2k= -github.com/frankban/quicktest v1.14.3 h1:FJKSZTDHjyhriyC81FLQ0LY93eSai0ZyR/ZIkd3ZUKE= -github.com/frankban/quicktest v1.14.3/go.mod h1:mgiwOwqx65TmIk1wJ6Q7wvnVMocbUorkibMOrVTHZps= +github.com/frankban/quicktest v1.14.5 h1:dfYrrRyLtiqT9GyKXgdh+k4inNeTvmGbuSgZ3lx3GhA= +github.com/frankban/quicktest v1.14.5/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY= From 4d58b78836b6adef0c069d5b8c083df9c48a8cdf Mon Sep 17 00:00:00 2001 From: Juanadelacuesta <8647634+Juanadelacuesta@users.noreply.github.com> Date: Wed, 8 May 2024 18:26:16 +0200 Subject: [PATCH 07/24] func: add extra test to make sure the old process is dead --- drivers/shared/executor/executor_linux_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/shared/executor/executor_linux_test.go b/drivers/shared/executor/executor_linux_test.go index 20e33203fca..90ee8d9bebd 100644 --- a/drivers/shared/executor/executor_linux_test.go +++ b/drivers/shared/executor/executor_linux_test.go @@ -862,7 +862,7 @@ func createCGroup(fullpath string) error { return os.MkdirAll(fullpath, 0755) } -func TestExecutor2_OOMKilled(t *testing.T) { +func TestExecutor_CleanOldProcessesInCGroup(t *testing.T) { ci.Parallel(t) testutil.ExecCompatible(t) @@ -923,6 +923,7 @@ func TestExecutor2_OOMKilled(t *testing.T) { must.NoError(t, err) must.One(t, pids.Size()) must.True(t, pids.Contains(ps.Pid)) + must.False(t, pids.Contains(pid)) estate, err := executor.Wait(context.Background()) must.NoError(t, err) From 25fd974fc49a8055bd4d057fe3a07b96f0cc950a Mon Sep 17 00:00:00 2001 From: Juanadelacuesta <8647634+Juanadelacuesta@users.noreply.github.com> Date: Wed, 8 May 2024 18:36:06 +0200 Subject: [PATCH 08/24] func: Add changelog --- .changelog/20500.txt | 3 +++ drivers/shared/executor/executor_linux_test.go | 14 ++++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) create mode 100644 .changelog/20500.txt diff --git a/.changelog/20500.txt b/.changelog/20500.txt new file mode 100644 index 00000000000..f6b51313636 --- /dev/null +++ b/.changelog/20500.txt @@ -0,0 +1,3 @@ +```release-note:bug +client: get rid of old exec allocs before starting new ones, to avoid accidentally leaving running allocs in case of an error +``` diff --git a/drivers/shared/executor/executor_linux_test.go b/drivers/shared/executor/executor_linux_test.go index 90ee8d9bebd..7a79158d955 100644 --- a/drivers/shared/executor/executor_linux_test.go +++ b/drivers/shared/executor/executor_linux_test.go @@ -858,8 +858,12 @@ func TestExecCommand_getCgroupOr_v1_relative(t *testing.T) { must.Eq(t, result2, "/sys/fs/cgroup/cpuset/custom/path") } -func createCGroup(fullpath string) error { - return os.MkdirAll(fullpath, 0755) +func createCGroup(fullpath string) (cgroupslib.Interface, error) { + if err := os.MkdirAll(fullpath, 0755); err != nil { + return nil, err + } + + return cgroupslib.OpenPath(fullpath), nil } func TestExecutor_CleanOldProcessesInCGroup(t *testing.T) { @@ -877,14 +881,13 @@ func TestExecutor_CleanOldProcessesInCGroup(t *testing.T) { execCmd := testExecCmd.command execCmd.Cmd = "/bin/sleep" - execCmd.Args = []string{"20"} - + execCmd.Args = []string{"1"} execCmd.ResourceLimits = true execCmd.ModePID = "private" execCmd.ModeIPC = "private" // Create the CGroup the executor's command will run in and populate it with one process - err := createCGroup(fullCGroupPath) + cgInterface, err := createCGroup(fullCGroupPath) must.NoError(t, err) cmd := exec.Command("/bin/sleep", "3000") @@ -901,7 +904,6 @@ func TestExecutor_CleanOldProcessesInCGroup(t *testing.T) { pid := cmd.Process.Pid must.Positive(t, pid) - cgInterface := cgroupslib.OpenPath(fullCGroupPath) err = cgInterface.Write("cgroup.procs", strconv.Itoa(pid)) must.NoError(t, err) From a10f608c15e43a5bba0770c37e019f49b264807b Mon Sep 17 00:00:00 2001 From: Juana De La Cuesta Date: Fri, 10 May 2024 10:36:53 +0200 Subject: [PATCH 09/24] Update drivers/shared/executor/executor_linux.go Co-authored-by: Tim Gross --- drivers/shared/executor/executor_linux.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/shared/executor/executor_linux.go b/drivers/shared/executor/executor_linux.go index 5288a78d9f5..2029fd9a747 100644 --- a/drivers/shared/executor/executor_linux.go +++ b/drivers/shared/executor/executor_linux.go @@ -103,7 +103,7 @@ func (l *LibcontainerExecutor) ListProcesses() *set.Set[int] { func (l *LibcontainerExecutor) cleanOldProcessesInCGroup(nomadRelativePath string) { l.logger.Info("removing old processes", "path", nomadRelativePath) - root := cgroupslib.GetDefautlRoot() + root := cgroupslib.GetDefaultRoot() orphansPIDs, err := cgroups.GetAllPids(filepath.Join(root, nomadRelativePath)) if err != nil { l.logger.Error("unable to get orphan allocs PIDs", err) From ae456962f7be2553899691b3dd544c8681bf3e2b Mon Sep 17 00:00:00 2001 From: Juana De La Cuesta Date: Fri, 10 May 2024 10:37:20 +0200 Subject: [PATCH 10/24] Update drivers/shared/executor/executor_linux.go Co-authored-by: Tim Gross --- drivers/shared/executor/executor_linux.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/shared/executor/executor_linux.go b/drivers/shared/executor/executor_linux.go index 2029fd9a747..0e3b0e828c3 100644 --- a/drivers/shared/executor/executor_linux.go +++ b/drivers/shared/executor/executor_linux.go @@ -114,7 +114,7 @@ func (l *LibcontainerExecutor) cleanOldProcessesInCGroup(nomadRelativePath strin l.logger.Info("killing process:", orphansPIDs) err := syscall.Kill(pid, syscall.SIGKILL) if err != nil { - l.logger.Error("unable to send signal to process", pid, err) + l.logger.Error("unable to send signal to process", "pid", pid, "error", err) } } } From b1c347cf2b42f07f217a3d4df8c3565773208046 Mon Sep 17 00:00:00 2001 From: Juana De La Cuesta Date: Fri, 10 May 2024 10:37:26 +0200 Subject: [PATCH 11/24] Update drivers/shared/executor/executor_linux.go Co-authored-by: Tim Gross --- drivers/shared/executor/executor_linux.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/shared/executor/executor_linux.go b/drivers/shared/executor/executor_linux.go index 0e3b0e828c3..96a09bcd840 100644 --- a/drivers/shared/executor/executor_linux.go +++ b/drivers/shared/executor/executor_linux.go @@ -240,7 +240,7 @@ func (l *LibcontainerExecutor) catchSignals(ctx context.Context) os.Signal { sigch := make(chan os.Signal, 4) defer close(sigch) - l.logger.Debug("waiting for signls") + l.logger.Trace("waiting for signals") signal.Notify(sigch, syscall.SIGHUP, syscall.SIGQUIT, syscall.SIGTERM, syscall.SIGINT, syscall.SIGSEGV) defer signal.Stop(sigch) From 48406ce65fab69fba98330ff69863adbb418c8d8 Mon Sep 17 00:00:00 2001 From: Juana De La Cuesta Date: Fri, 10 May 2024 10:37:37 +0200 Subject: [PATCH 12/24] Update drivers/shared/executor/executor_linux.go Co-authored-by: Tim Gross --- drivers/shared/executor/executor_linux.go | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/shared/executor/executor_linux.go b/drivers/shared/executor/executor_linux.go index 96a09bcd840..77cdbb36a98 100644 --- a/drivers/shared/executor/executor_linux.go +++ b/drivers/shared/executor/executor_linux.go @@ -186,7 +186,6 @@ func (l *LibcontainerExecutor) Launch(command *ExecCommand) (*ProcessState, erro Init: true, } - l.logger.Debug("launching", "command", command.Args) if command.User != "" { process.User = command.User } From 3ba9e8519297b6c8da4576419f1a3be1c7f98c8f Mon Sep 17 00:00:00 2001 From: Juana De La Cuesta Date: Fri, 10 May 2024 10:37:48 +0200 Subject: [PATCH 13/24] Update client/lib/cgroupslib/editor.go Co-authored-by: Tim Gross --- client/lib/cgroupslib/editor.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/lib/cgroupslib/editor.go b/client/lib/cgroupslib/editor.go index e0abfc70634..96ef95c4c95 100644 --- a/client/lib/cgroupslib/editor.go +++ b/client/lib/cgroupslib/editor.go @@ -21,7 +21,7 @@ const ( root = "/sys/fs/cgroup" ) -func GetDefautlRoot() string { +func GetDefaultRoot() string { return root } From bdd09481d7415c305c9bc5cc16e555b9ec70b87b Mon Sep 17 00:00:00 2001 From: Juana De La Cuesta Date: Fri, 10 May 2024 10:37:55 +0200 Subject: [PATCH 14/24] Update .changelog/20500.txt Co-authored-by: Tim Gross --- .changelog/20500.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.changelog/20500.txt b/.changelog/20500.txt index f6b51313636..a272b050817 100644 --- a/.changelog/20500.txt +++ b/.changelog/20500.txt @@ -1,3 +1,3 @@ ```release-note:bug -client: get rid of old exec allocs before starting new ones, to avoid accidentally leaving running allocs in case of an error +client: terminate old exec task processes before starting new ones, to avoid accidentally leaving running processes in case of an error ``` From 2c901fe89966c45207c8e2eb89c2cb7041e41932 Mon Sep 17 00:00:00 2001 From: Juana De La Cuesta Date: Fri, 10 May 2024 13:29:38 +0200 Subject: [PATCH 15/24] Update drivers/shared/executor/executor_linux.go Co-authored-by: Tim Gross --- drivers/shared/executor/executor_linux.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/shared/executor/executor_linux.go b/drivers/shared/executor/executor_linux.go index 77cdbb36a98..ff9305eb068 100644 --- a/drivers/shared/executor/executor_linux.go +++ b/drivers/shared/executor/executor_linux.go @@ -111,7 +111,7 @@ func (l *LibcontainerExecutor) cleanOldProcessesInCGroup(nomadRelativePath strin } for _, pid := range orphansPIDs { - l.logger.Info("killing process:", orphansPIDs) + l.logger.Info("killing orphaned process", "pid", pid) err := syscall.Kill(pid, syscall.SIGKILL) if err != nil { l.logger.Error("unable to send signal to process", "pid", pid, "error", err) From 2f61a97f44574d66000ba61bfe49b1f09aff9c6e Mon Sep 17 00:00:00 2001 From: Juanadelacuesta <8647634+Juanadelacuesta@users.noreply.github.com> Date: Fri, 10 May 2024 14:00:39 +0200 Subject: [PATCH 16/24] style: change message to debug for killing orphans --- drivers/shared/executor/executor_linux.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/shared/executor/executor_linux.go b/drivers/shared/executor/executor_linux.go index ff9305eb068..ae8bc055dd3 100644 --- a/drivers/shared/executor/executor_linux.go +++ b/drivers/shared/executor/executor_linux.go @@ -101,7 +101,7 @@ func (l *LibcontainerExecutor) ListProcesses() *set.Set[int] { // cleanOldProcessesInCGroup kills processes that might ended up orphans when the // executor was unexpectedly killed and nomad can't reconnect to them. func (l *LibcontainerExecutor) cleanOldProcessesInCGroup(nomadRelativePath string) { - l.logger.Info("removing old processes", "path", nomadRelativePath) + l.logger.Debug("looking old processes", "path", nomadRelativePath) root := cgroupslib.GetDefaultRoot() orphansPIDs, err := cgroups.GetAllPids(filepath.Join(root, nomadRelativePath)) From a1c635ed7d5528880265e7d4d70566d5128d2a6b Mon Sep 17 00:00:00 2001 From: Juanadelacuesta <8647634+Juanadelacuesta@users.noreply.github.com> Date: Fri, 10 May 2024 14:57:21 +0200 Subject: [PATCH 17/24] func: Move the catch signals to the executor creation --- drivers/shared/executor/executor_linux.go | 61 +++++++++++------------ 1 file changed, 28 insertions(+), 33 deletions(-) diff --git a/drivers/shared/executor/executor_linux.go b/drivers/shared/executor/executor_linux.go index ae8bc055dd3..d7d46574581 100644 --- a/drivers/shared/executor/executor_linux.go +++ b/drivers/shared/executor/executor_linux.go @@ -79,9 +79,32 @@ type LibcontainerExecutor struct { userProc *libcontainer.Process userProcExited chan interface{} exitState *ProcessState + sigChan chan os.Signal +} + +func (l *LibcontainerExecutor) catchSignals() { + l.logger.Trace("waiting for signals") + defer signal.Stop(l.sigChan) + defer close(l.sigChan) + + signal.Notify(l.sigChan, syscall.SIGHUP, syscall.SIGQUIT, syscall.SIGTERM, syscall.SIGINT, syscall.SIGSEGV) + for { + signal := <-l.sigChan + l.logger.Info(" ****** catching it", signal) + if signal == syscall.SIGTERM || signal == syscall.SIGKILL || signal == syscall.SIGINT { + l.Shutdown("SIGINT", 0) + break + } + + if l.container != nil { + l.container.Signal(signal, false) + } + } } func NewExecutorWithIsolation(logger hclog.Logger, compute cpustats.Compute) Executor { + sigch := make(chan os.Signal, 4) + le := &LibcontainerExecutor{ id: strings.ReplaceAll(uuid.Generate(), "-", "_"), logger: logger.Named("isolated_executor"), @@ -89,7 +112,11 @@ func NewExecutorWithIsolation(logger hclog.Logger, compute cpustats.Compute) Exe totalCpuStats: cpustats.New(compute), userCpuStats: cpustats.New(compute), systemCpuStats: cpustats.New(compute), + sigChan: sigch, } + + go le.catchSignals() + le.processStats = procstats.New(compute, le) return le } @@ -211,21 +238,7 @@ func (l *LibcontainerExecutor) Launch(command *ExecCommand) (*ProcessState, erro // start a goroutine to wait on the process to complete, so Wait calls can // be multiplexed l.userProcExited = make(chan interface{}) - - ctx, cancel := context.WithCancel(context.Background()) - - go func() { - defer cancel() - - go l.wait() - - signal := l.catchSignals(ctx) - - err = l.Signal(signal) - if err != nil { - l.logger.Error("failed send signal to process(%s): %v", l.id, err) - } - }() + go l.wait() return &ProcessState{ Pid: pid, @@ -234,24 +247,6 @@ func (l *LibcontainerExecutor) Launch(command *ExecCommand) (*ProcessState, erro }, nil } -func (l *LibcontainerExecutor) catchSignals(ctx context.Context) os.Signal { - - sigch := make(chan os.Signal, 4) - defer close(sigch) - - l.logger.Trace("waiting for signals") - signal.Notify(sigch, syscall.SIGHUP, syscall.SIGQUIT, syscall.SIGTERM, syscall.SIGINT, syscall.SIGSEGV) - defer signal.Stop(sigch) - - select { - case <-ctx.Done(): - return nil - - case s := <-sigch: - return s - } -} - // Wait waits until a process has exited and returns it's exitcode and errors func (l *LibcontainerExecutor) Wait(ctx context.Context) (*ProcessState, error) { select { From 312dcfcc52b45d53bc791d6c269dfc1e0ac91150 Mon Sep 17 00:00:00 2001 From: Juanadelacuesta <8647634+Juanadelacuesta@users.noreply.github.com> Date: Fri, 10 May 2024 15:32:26 +0200 Subject: [PATCH 18/24] func: add test for catching signals --- drivers/shared/executor/executor_linux.go | 1 - .../shared/executor/executor_linux_test.go | 39 +++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/drivers/shared/executor/executor_linux.go b/drivers/shared/executor/executor_linux.go index d7d46574581..a538c55e2df 100644 --- a/drivers/shared/executor/executor_linux.go +++ b/drivers/shared/executor/executor_linux.go @@ -90,7 +90,6 @@ func (l *LibcontainerExecutor) catchSignals() { signal.Notify(l.sigChan, syscall.SIGHUP, syscall.SIGQUIT, syscall.SIGTERM, syscall.SIGINT, syscall.SIGSEGV) for { signal := <-l.sigChan - l.logger.Info(" ****** catching it", signal) if signal == syscall.SIGTERM || signal == syscall.SIGKILL || signal == syscall.SIGINT { l.Shutdown("SIGINT", 0) break diff --git a/drivers/shared/executor/executor_linux_test.go b/drivers/shared/executor/executor_linux_test.go index 7a79158d955..d72a55e9f94 100644 --- a/drivers/shared/executor/executor_linux_test.go +++ b/drivers/shared/executor/executor_linux_test.go @@ -12,6 +12,7 @@ import ( "regexp" "strconv" "strings" + "syscall" "testing" "time" @@ -28,6 +29,7 @@ import ( tu "github.com/hashicorp/nomad/testutil" lconfigs "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/devices" + "github.com/opencontainers/runtime-spec/specs-go" "github.com/shoenig/test" "github.com/shoenig/test/must" "github.com/stretchr/testify/require" @@ -934,3 +936,40 @@ func TestExecutor_CleanOldProcessesInCGroup(t *testing.T) { must.NoError(t, executor.Shutdown("", 0)) executor.Wait(context.Background()) } + +func TestExecutor_SignalCatching(t *testing.T) { + ci.Parallel(t) + + testutil.ExecCompatible(t) + testutil.CgroupsCompatible(t) + + testExecCmd := testExecutorCommandWithChroot(t) + + allocDir := testExecCmd.allocDir + defer allocDir.Destroy() + + execCmd := testExecCmd.command + execCmd.Cmd = "/bin/sleep" + execCmd.Args = []string{"100"} + execCmd.ResourceLimits = true + execCmd.ModePID = "private" + execCmd.ModeIPC = "private" + + execInterface := NewExecutorWithIsolation(testlog.HCLogger(t), compute) + + ps, err := execInterface.Launch(execCmd) + must.NoError(t, err) + must.Positive(t, ps.Pid) + + executor := execInterface.(*LibcontainerExecutor) + status, err := executor.container.OCIState() + must.NoError(t, err) + must.Eq(t, specs.StateRunning, status.Status) + + executor.sigChan <- syscall.SIGTERM + time.Sleep(1 * time.Second) + + status, err = executor.container.OCIState() + must.NoError(t, err) + must.Eq(t, specs.StateStopped, status.Status) +} From 3f1a238e2a08de52f72ef556a6d70be20ebadb7b Mon Sep 17 00:00:00 2001 From: Juanadelacuesta <8647634+Juanadelacuesta@users.noreply.github.com> Date: Fri, 10 May 2024 15:38:32 +0200 Subject: [PATCH 19/24] typo --- drivers/shared/executor/executor_linux.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/shared/executor/executor_linux.go b/drivers/shared/executor/executor_linux.go index a538c55e2df..63781a1b7dc 100644 --- a/drivers/shared/executor/executor_linux.go +++ b/drivers/shared/executor/executor_linux.go @@ -127,7 +127,7 @@ func (l *LibcontainerExecutor) ListProcesses() *set.Set[int] { // cleanOldProcessesInCGroup kills processes that might ended up orphans when the // executor was unexpectedly killed and nomad can't reconnect to them. func (l *LibcontainerExecutor) cleanOldProcessesInCGroup(nomadRelativePath string) { - l.logger.Debug("looking old processes", "path", nomadRelativePath) + l.logger.Debug("looking for old processes", "path", nomadRelativePath) root := cgroupslib.GetDefaultRoot() orphansPIDs, err := cgroups.GetAllPids(filepath.Join(root, nomadRelativePath)) From ec9dade779fa9f14bf17e415c73248c28c3c45b7 Mon Sep 17 00:00:00 2001 From: Juanadelacuesta <8647634+Juanadelacuesta@users.noreply.github.com> Date: Fri, 10 May 2024 15:41:22 +0200 Subject: [PATCH 20/24] func: Check the process number before kill it, to avoid shuting done the node by mistake --- drivers/shared/executor/executor_linux.go | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/shared/executor/executor_linux.go b/drivers/shared/executor/executor_linux.go index 63781a1b7dc..a6367196f12 100644 --- a/drivers/shared/executor/executor_linux.go +++ b/drivers/shared/executor/executor_linux.go @@ -138,6 +138,13 @@ func (l *LibcontainerExecutor) cleanOldProcessesInCGroup(nomadRelativePath strin for _, pid := range orphansPIDs { l.logger.Info("killing orphaned process", "pid", pid) + + // Avoid bringind down the whole node by mistake, very unlikly case, + // but it's better to be sure. + if pid == 1 { + continue + } + err := syscall.Kill(pid, syscall.SIGKILL) if err != nil { l.logger.Error("unable to send signal to process", "pid", pid, "error", err) @@ -368,9 +375,8 @@ func (l *LibcontainerExecutor) Shutdown(signal string, grace time.Duration) erro case <-l.userProcExited: return nil case <-time.After(time.Second * 15): + return fmt.Errorf("process failed to exit after 15 seconds") } - - return fmt.Errorf("process failed to exit after 15 seconds") } // UpdateResources updates the resource isolation with new values to be enforced From 2f50dbbd244c7bde3615f03c08324270b25f016a Mon Sep 17 00:00:00 2001 From: Juana De La Cuesta Date: Fri, 10 May 2024 15:59:37 +0200 Subject: [PATCH 21/24] Update drivers/shared/executor/executor_linux.go Co-authored-by: Tim Gross --- drivers/shared/executor/executor_linux.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/shared/executor/executor_linux.go b/drivers/shared/executor/executor_linux.go index a6367196f12..266ef6df770 100644 --- a/drivers/shared/executor/executor_linux.go +++ b/drivers/shared/executor/executor_linux.go @@ -132,7 +132,7 @@ func (l *LibcontainerExecutor) cleanOldProcessesInCGroup(nomadRelativePath strin root := cgroupslib.GetDefaultRoot() orphansPIDs, err := cgroups.GetAllPids(filepath.Join(root, nomadRelativePath)) if err != nil { - l.logger.Error("unable to get orphan allocs PIDs", err) + l.logger.Error("unable to get orphaned task PIDs", "error" err) return } From d6017b4f6577f0eec2cf289b7a7e6c732535e3ac Mon Sep 17 00:00:00 2001 From: Juanadelacuesta <8647634+Juanadelacuesta@users.noreply.github.com> Date: Fri, 10 May 2024 17:06:43 +0200 Subject: [PATCH 22/24] style: add tag to logger --- drivers/shared/executor/executor_linux.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/shared/executor/executor_linux.go b/drivers/shared/executor/executor_linux.go index 266ef6df770..09b40889c8a 100644 --- a/drivers/shared/executor/executor_linux.go +++ b/drivers/shared/executor/executor_linux.go @@ -132,7 +132,7 @@ func (l *LibcontainerExecutor) cleanOldProcessesInCGroup(nomadRelativePath strin root := cgroupslib.GetDefaultRoot() orphansPIDs, err := cgroups.GetAllPids(filepath.Join(root, nomadRelativePath)) if err != nil { - l.logger.Error("unable to get orphaned task PIDs", "error" err) + l.logger.Error("unable to get orphaned task PIDs", "error", err) return } From 9d6b62ed6952dd2c574a35603f9c92dc4676d4b5 Mon Sep 17 00:00:00 2001 From: Juanadelacuesta <8647634+Juanadelacuesta@users.noreply.github.com> Date: Mon, 13 May 2024 10:27:14 +0200 Subject: [PATCH 23/24] fix: move back ciliums version --- drivers/shared/executor/executor_linux.go | 2 +- go.mod | 2 +- go.sum | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/shared/executor/executor_linux.go b/drivers/shared/executor/executor_linux.go index 09b40889c8a..9cade945526 100644 --- a/drivers/shared/executor/executor_linux.go +++ b/drivers/shared/executor/executor_linux.go @@ -90,7 +90,7 @@ func (l *LibcontainerExecutor) catchSignals() { signal.Notify(l.sigChan, syscall.SIGHUP, syscall.SIGQUIT, syscall.SIGTERM, syscall.SIGINT, syscall.SIGSEGV) for { signal := <-l.sigChan - if signal == syscall.SIGTERM || signal == syscall.SIGKILL || signal == syscall.SIGINT { + if signal == syscall.SIGTERM || signal == syscall.SIGINT { l.Shutdown("SIGINT", 0) break } diff --git a/go.mod b/go.mod index 2f35a66a4a7..4eeedab9ea5 100644 --- a/go.mod +++ b/go.mod @@ -172,7 +172,7 @@ require ( github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/checkpoint-restore/go-criu/v5 v5.3.0 // indirect github.com/cheggaaa/pb/v3 v3.0.5 // indirect - github.com/cilium/ebpf v0.11.0 // indirect + github.com/cilium/ebpf v0.9.1 // indirect github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible // indirect github.com/circonus-labs/circonusllhist v0.1.3 // indirect github.com/containerd/console v1.0.3 // indirect diff --git a/go.sum b/go.sum index c7b01295c66..2c5d1ec5cac 100644 --- a/go.sum +++ b/go.sum @@ -317,8 +317,8 @@ github.com/cheggaaa/pb/v3 v3.0.5/go.mod h1:X1L61/+36nz9bjIsrDU52qHKOQukUQe2Ge+Yv github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= -github.com/cilium/ebpf v0.11.0 h1:V8gS/bTCCjX9uUnkUFUpPsksM8n1lXBAvHcpiFk1X2Y= -github.com/cilium/ebpf v0.11.0/go.mod h1:WE7CZAnqOL2RouJ4f1uyNhqr2P4CCvXFIqdRDUgWsVs= +github.com/cilium/ebpf v0.9.1 h1:64sn2K3UKw8NbP/blsixRpF3nXuyhz/VjRlRzvlBRu4= +github.com/cilium/ebpf v0.9.1/go.mod h1:+OhNOIXx/Fnu1IE8bJz2dzOA+VSfyTfdNUVdlQnxUFY= github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible h1:C29Ae4G5GtYyYMm1aztcyj/J5ckgJm2zwdDajFbx1NY= github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible/go.mod h1:nmEj6Dob7S7YxXgwXpfOuvO54S+tGdZdw9fuRZt25Ag= github.com/circonus-labs/circonusllhist v0.1.3 h1:TJH+oke8D16535+jHExHj4nQvzlZrj7ug5D7I/orNUA= @@ -421,8 +421,8 @@ github.com/felixge/httpsnoop v1.0.1/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSw github.com/felixge/httpsnoop v1.0.3 h1:s/nj+GCswXYzN5v2DpNMuMQYe+0DDwt5WVCU6CWBdXk= github.com/felixge/httpsnoop v1.0.3/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/form3tech-oss/jwt-go v3.2.2+incompatible/go.mod h1:pbq4aXjuKjdthFRnoDwaVPLA+WlJuPGy+QneDUgJi2k= -github.com/frankban/quicktest v1.14.5 h1:dfYrrRyLtiqT9GyKXgdh+k4inNeTvmGbuSgZ3lx3GhA= -github.com/frankban/quicktest v1.14.5/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= +github.com/frankban/quicktest v1.14.3 h1:FJKSZTDHjyhriyC81FLQ0LY93eSai0ZyR/ZIkd3ZUKE= +github.com/frankban/quicktest v1.14.3/go.mod h1:mgiwOwqx65TmIk1wJ6Q7wvnVMocbUorkibMOrVTHZps= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY= From bf0a7f5dea17727e7d2084e707d1fb03f81a3e36 Mon Sep 17 00:00:00 2001 From: Juana De La Cuesta Date: Mon, 13 May 2024 16:43:35 +0200 Subject: [PATCH 24/24] Update drivers/shared/executor/executor_linux.go Co-authored-by: Tim Gross --- drivers/shared/executor/executor_linux.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/shared/executor/executor_linux.go b/drivers/shared/executor/executor_linux.go index 9cade945526..ceb80f2c474 100644 --- a/drivers/shared/executor/executor_linux.go +++ b/drivers/shared/executor/executor_linux.go @@ -139,7 +139,7 @@ func (l *LibcontainerExecutor) cleanOldProcessesInCGroup(nomadRelativePath strin for _, pid := range orphansPIDs { l.logger.Info("killing orphaned process", "pid", pid) - // Avoid bringind down the whole node by mistake, very unlikly case, + // Avoid bringing down the whole node by mistake, very unlikely case, // but it's better to be sure. if pid == 1 { continue