Skip to content
4 changes: 4 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
Version 11.16.4
---------------
* Kill the parent and all child processes via the process group on worker timeout, and log any hanging processes

Version 11.16.3
---------------
* Fix logrus verbosity handling for debug level log
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
11.16.3
11.16.4
27 changes: 24 additions & 3 deletions mettle/worker/worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -928,22 +928,43 @@ func (w *worker) runCommand(ctx context.Context, cmd *exec.Cmd, timeout time.Dur
cmd.Stderr = &w.stderr
gracePeriod := 5 * time.Second
cmd.WaitDelay = gracePeriod
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}

cmd.Cancel = func() error {
if cmd.Process == nil {
if cmd.Process == nil || cmd.Process.Pid <= 0 {
return nil
}
return cmd.Process.Signal(syscall.SIGTERM)

// Send SIGTERM to the entire process group (-PID) created by Setpgid,
// i.e. the parent AND all children that may still be holding the pipes.
err := syscall.Kill(-cmd.Process.Pid, syscall.SIGTERM)

// If the group doesn't exist yet or is already gone, ignore the error.
if errors.Is(err, syscall.ESRCH) {
return nil
}
return err
}

err := cmd.Run()

if errors.Is(ctx.Err(), context.DeadlineExceeded) {
actionTimeout.Inc()

// fetch processes that didn't exit on SIGTERM
var processTree string
if cmd.Process != nil {
args := []string{"-g", fmt.Sprintf("%d", cmd.Process.Pid), "-o", "pid,ppid,state,%cpu,%mem,start,time,command"}
if psOut, psErr := exec.Command("ps", args...).Output(); psErr == nil {
processTree = string(psOut)
}
}
logr.WithField("hangingProcessTree", processTree).Debug("Timeout reached: Analyzing hanging group")

forceKilled := false
if ps := cmd.ProcessState; ps != nil {
if status, ok := ps.Sys().(syscall.WaitStatus); ok {
if status.Signaled() && status.Signal() == syscall.SIGKILL {
if status.Signaled() && (status.Signal() == syscall.SIGTERM || status.Signal() == syscall.SIGKILL) {
forceKilled = true
}
}
Expand Down
Loading