diff --git a/ChangeLog b/ChangeLog index f68c4220..960be660 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +Version 11.16.4 +--------------- + * Kill parent and all child processes via process group on worker timeout. Log hanging processes + Version 11.16.3 --------------- * Fix logrus verbosity handling for debug level log diff --git a/VERSION b/VERSION index 249d9bf1..1905ca4c 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -11.16.3 \ No newline at end of file +11.16.4 \ No newline at end of file diff --git a/mettle/worker/worker.go b/mettle/worker/worker.go index ff9b0209..5dc6184a 100644 --- a/mettle/worker/worker.go +++ b/mettle/worker/worker.go @@ -928,22 +928,43 @@ func (w *worker) runCommand(ctx context.Context, cmd *exec.Cmd, timeout time.Dur cmd.Stderr = &w.stderr gracePeriod := 5 * time.Second cmd.WaitDelay = gracePeriod + cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} cmd.Cancel = func() error { - if cmd.Process == nil { + if cmd.Process == nil || cmd.Process.Pid <= 0 { return nil } - return cmd.Process.Signal(syscall.SIGTERM) + + // send SIGTERM to the entire process group (-PID) created by Setpgid, + // which contains the parent AND all children holding the pipes + err := syscall.Kill(-cmd.Process.Pid, syscall.SIGTERM) + + // If the group doesn't exist yet or is already gone, ignore the error + if errors.Is(err, syscall.ESRCH) { + return nil + } + return err } err := cmd.Run() if errors.Is(ctx.Err(), context.DeadlineExceeded) { actionTimeout.Inc() + + // fetch processes that didn't exit on SIGTERM + var processTree string + if cmd.Process != nil { + args := []string{"-g", fmt.Sprintf("%d", cmd.Process.Pid), "-o", "pid,ppid,state,%cpu,%mem,start,time,command"} + if psOut, psErr := exec.Command("ps", args...).Output(); psErr == nil { + processTree = string(psOut) + } + } + logr.WithField("hangingProcessTree", processTree).Debug("Timeout reached: Analyzing hanging group") + forceKilled := false if ps := cmd.ProcessState; ps != nil { if status, ok := 
ps.Sys().(syscall.WaitStatus); ok { - if status.Signaled() && status.Signal() == syscall.SIGKILL { + if status.Signaled() && (status.Signal() == syscall.SIGTERM || status.Signal() == syscall.SIGKILL) { forceKilled = true } }