From 9f4fa699a675b6929094a0f561d839b88395ca90 Mon Sep 17 00:00:00 2001 From: olha Date: Tue, 17 Feb 2026 10:41:43 +0000 Subject: [PATCH 1/7] Kill parent and all child processes via process group on timeout. Log last stdout --- mettle/worker/worker.go | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/mettle/worker/worker.go b/mettle/worker/worker.go index ff9b0209..0e6d9850 100644 --- a/mettle/worker/worker.go +++ b/mettle/worker/worker.go @@ -928,12 +928,21 @@ func (w *worker) runCommand(ctx context.Context, cmd *exec.Cmd, timeout time.Dur cmd.Stderr = &w.stderr gracePeriod := 5 * time.Second cmd.WaitDelay = gracePeriod + cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} cmd.Cancel = func() error { - if cmd.Process == nil { + if cmd.Process == nil || cmd.Process.Pid <= 0 { return nil } - return cmd.Process.Signal(syscall.SIGTERM) + // send SIGTERM to the entire group (-PID) created by Setpgid + // where parent AND all children holding the pipes + err := syscall.Kill(-cmd.Process.Pid, syscall.SIGTERM) + + // If the group doesn't exist yet or already gone, ignore the error + if errors.Is(err, syscall.ESRCH) { + return nil + } + return err } err := cmd.Run() @@ -941,6 +950,14 @@ func (w *worker) runCommand(ctx context.Context, cmd *exec.Cmd, timeout time.Dur if errors.Is(ctx.Err(), context.DeadlineExceeded) { actionTimeout.Inc() forceKilled := false + + out := w.stdout.Bytes() + start := 0 + if len(out) > 500 { + start = len(out) - 500 + } + lastStdout := string(out[start:]) + if ps := cmd.ProcessState; ps != nil { if status, ok := ps.Sys().(syscall.WaitStatus); ok { if status.Signaled() && status.Signal() == syscall.SIGKILL { @@ -949,7 +966,7 @@ func (w *worker) runCommand(ctx context.Context, cmd *exec.Cmd, timeout time.Dur } } - msg := "Terminating process due to timeout" + msg := "Terminating process due to timeout; check 'last_stdout' for hanging point" if forceKilled { msg += "; grace period expired; process killed 
(SIGKILL)" } @@ -957,6 +974,7 @@ func (w *worker) runCommand(ctx context.Context, cmd *exec.Cmd, timeout time.Dur "hash": w.actionDigest.Hash, "timeout": timeout.String(), "gracePeriod": gracePeriod.String(), + "lastStdout": lastStdout, "forceKilled": forceKilled, }).Warn(msg) From 41e6687b7a1161d4cd0493b8d57c75e29d2a739e Mon Sep 17 00:00:00 2001 From: olha Date: Tue, 17 Feb 2026 11:25:10 +0000 Subject: [PATCH 2/7] Remove stdout log --- mettle/worker/worker.go | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/mettle/worker/worker.go b/mettle/worker/worker.go index 0e6d9850..a297e061 100644 --- a/mettle/worker/worker.go +++ b/mettle/worker/worker.go @@ -950,14 +950,6 @@ func (w *worker) runCommand(ctx context.Context, cmd *exec.Cmd, timeout time.Dur if errors.Is(ctx.Err(), context.DeadlineExceeded) { actionTimeout.Inc() forceKilled := false - - out := w.stdout.Bytes() - start := 0 - if len(out) > 500 { - start = len(out) - 500 - } - lastStdout := string(out[start:]) - if ps := cmd.ProcessState; ps != nil { if status, ok := ps.Sys().(syscall.WaitStatus); ok { if status.Signaled() && status.Signal() == syscall.SIGKILL { @@ -966,7 +958,7 @@ func (w *worker) runCommand(ctx context.Context, cmd *exec.Cmd, timeout time.Dur } } - msg := "Terminating process due to timeout; check 'last_stdout' for hanging point" + msg := "Terminating process due to timeout" if forceKilled { msg += "; grace period expired; process killed (SIGKILL)" } @@ -974,7 +966,6 @@ func (w *worker) runCommand(ctx context.Context, cmd *exec.Cmd, timeout time.Dur "hash": w.actionDigest.Hash, "timeout": timeout.String(), "gracePeriod": gracePeriod.String(), - "lastStdout": lastStdout, "forceKilled": forceKilled, }).Warn(msg) From 66ba086e2561e13593dfe7b73f0aaf7949383688 Mon Sep 17 00:00:00 2001 From: olha Date: Tue, 17 Feb 2026 11:36:19 +0000 Subject: [PATCH 3/7] Capture process state, resource usage, and full command hierarchy for the group on timeout --- mettle/worker/worker.go 
| 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mettle/worker/worker.go b/mettle/worker/worker.go index a297e061..971a8188 100644 --- a/mettle/worker/worker.go +++ b/mettle/worker/worker.go @@ -934,6 +934,12 @@ func (w *worker) runCommand(ctx context.Context, cmd *exec.Cmd, timeout time.Dur if cmd.Process == nil || cmd.Process.Pid <= 0 { return nil } + + // Capture process state, resource usage, and full command hierarchy for the group + args := []string{"-g", fmt.Sprintf("%d", cmd.Process.Pid), "-o", "pid,ppid,state,%cpu,%mem,start,time,command"} + if psOut, err := exec.Command("ps", args...).Output(); err == nil { + logr.WithField("process_tree", string(psOut)).Debug("Timeout reached: Analyzing hanging group") + } // send SIGTERM to the entire group (-PID) created by Setpgid // where parent AND all children holding the pipes err := syscall.Kill(-cmd.Process.Pid, syscall.SIGTERM) @@ -952,7 +958,7 @@ func (w *worker) runCommand(ctx context.Context, cmd *exec.Cmd, timeout time.Dur forceKilled := false if ps := cmd.ProcessState; ps != nil { if status, ok := ps.Sys().(syscall.WaitStatus); ok { - if status.Signaled() && status.Signal() == syscall.SIGKILL { + if status.Signaled() && (status.Signal() == syscall.SIGTERM || status.Signal() == syscall.SIGKILL) { forceKilled = true } } From b3c8121f9be81ecd2b8e0ed3edf465513f5c585c Mon Sep 17 00:00:00 2001 From: olha Date: Tue, 17 Feb 2026 12:12:53 +0000 Subject: [PATCH 4/7] Move hangingProcessTree to execute inside context.DeadlineExceeded block --- ChangeLog | 4 ++++ VERSION | 2 +- mettle/worker/worker.go | 24 +++++++++++++++--------- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/ChangeLog b/ChangeLog index f68c4220..f5e59d4f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +Version 11.16.3 +--------------- + * Kill parent and all child processes via process group on worker timeout. 
Log hanging processes + Version 11.16.3 --------------- * Fix logrus verbosity handling for debug level log diff --git a/VERSION b/VERSION index 249d9bf1..1905ca4c 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -11.16.3 \ No newline at end of file +11.16.4 \ No newline at end of file diff --git a/mettle/worker/worker.go b/mettle/worker/worker.go index 971a8188..402cea4e 100644 --- a/mettle/worker/worker.go +++ b/mettle/worker/worker.go @@ -935,11 +935,6 @@ func (w *worker) runCommand(ctx context.Context, cmd *exec.Cmd, timeout time.Dur return nil } - // Capture process state, resource usage, and full command hierarchy for the group - args := []string{"-g", fmt.Sprintf("%d", cmd.Process.Pid), "-o", "pid,ppid,state,%cpu,%mem,start,time,command"} - if psOut, err := exec.Command("ps", args...).Output(); err == nil { - logr.WithField("process_tree", string(psOut)).Debug("Timeout reached: Analyzing hanging group") - } // send SIGTERM to the entire group (-PID) created by Setpgid // where parent AND all children holding the pipes err := syscall.Kill(-cmd.Process.Pid, syscall.SIGTERM) @@ -955,6 +950,16 @@ func (w *worker) runCommand(ctx context.Context, cmd *exec.Cmd, timeout time.Dur if errors.Is(ctx.Err(), context.DeadlineExceeded) { actionTimeout.Inc() + + // fetch processes that didn't exit on SIGTERM + var processTree string + if cmd.Process != nil { + args := []string{"-g", fmt.Sprintf("%d", cmd.Process.Pid), "-o", "pid,ppid,state,%cpu,%mem,start,time,command"} + if psOut, psErr := exec.Command("ps", args...).Output(); psErr == nil { + processTree = string(psOut) + } + } + forceKilled := false if ps := cmd.ProcessState; ps != nil { if status, ok := ps.Sys().(syscall.WaitStatus); ok { @@ -969,10 +974,11 @@ func (w *worker) runCommand(ctx context.Context, cmd *exec.Cmd, timeout time.Dur msg += "; grace period expired; process killed (SIGKILL)" } logr.WithFields(logrus.Fields{ - "hash": w.actionDigest.Hash, - "timeout": timeout.String(), - "gracePeriod": 
gracePeriod.String(), - "forceKilled": forceKilled, + "hash": w.actionDigest.Hash, + "timeout": timeout.String(), + "gracePeriod": gracePeriod.String(), + "forceKilled": forceKilled, + "hangingProcessTree": processTree, }).Warn(msg) return ErrTimeout From 08b1659d8aec9a19928b8622c13060ae611522cb Mon Sep 17 00:00:00 2001 From: olha Date: Tue, 17 Feb 2026 12:12:53 +0000 Subject: [PATCH 5/7] Move hangingProcessTree to execute inside context.DeadlineExceeded block --- ChangeLog | 4 ++++ VERSION | 2 +- mettle/worker/worker.go | 24 +++++++++++++++--------- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/ChangeLog b/ChangeLog index f68c4220..960be660 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +Version 11.16.4 +--------------- + * Kill parent and all child processes via process group on worker timeout. Log hanging processes + Version 11.16.3 --------------- * Fix logrus verbosity handling for debug level log diff --git a/VERSION b/VERSION index 249d9bf1..1905ca4c 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -11.16.3 \ No newline at end of file +11.16.4 \ No newline at end of file diff --git a/mettle/worker/worker.go b/mettle/worker/worker.go index 971a8188..402cea4e 100644 --- a/mettle/worker/worker.go +++ b/mettle/worker/worker.go @@ -935,11 +935,6 @@ func (w *worker) runCommand(ctx context.Context, cmd *exec.Cmd, timeout time.Dur return nil } - // Capture process state, resource usage, and full command hierarchy for the group - args := []string{"-g", fmt.Sprintf("%d", cmd.Process.Pid), "-o", "pid,ppid,state,%cpu,%mem,start,time,command"} - if psOut, err := exec.Command("ps", args...).Output(); err == nil { - logr.WithField("process_tree", string(psOut)).Debug("Timeout reached: Analyzing hanging group") - } // send SIGTERM to the entire group (-PID) created by Setpgid // where parent AND all children holding the pipes err := syscall.Kill(-cmd.Process.Pid, syscall.SIGTERM) @@ -955,6 +950,16 @@ func (w *worker) runCommand(ctx context.Context, 
cmd *exec.Cmd, timeout time.Dur if errors.Is(ctx.Err(), context.DeadlineExceeded) { actionTimeout.Inc() + + // fetch processes that didn't exit on SIGTERM + var processTree string + if cmd.Process != nil { + args := []string{"-g", fmt.Sprintf("%d", cmd.Process.Pid), "-o", "pid,ppid,state,%cpu,%mem,start,time,command"} + if psOut, psErr := exec.Command("ps", args...).Output(); psErr == nil { + processTree = string(psOut) + } + } + forceKilled := false if ps := cmd.ProcessState; ps != nil { if status, ok := ps.Sys().(syscall.WaitStatus); ok { @@ -969,10 +974,11 @@ func (w *worker) runCommand(ctx context.Context, cmd *exec.Cmd, timeout time.Dur msg += "; grace period expired; process killed (SIGKILL)" } logr.WithFields(logrus.Fields{ - "hash": w.actionDigest.Hash, - "timeout": timeout.String(), - "gracePeriod": gracePeriod.String(), - "forceKilled": forceKilled, + "hash": w.actionDigest.Hash, + "timeout": timeout.String(), + "gracePeriod": gracePeriod.String(), + "forceKilled": forceKilled, + "hangingProcessTree": processTree, }).Warn(msg) return ErrTimeout From 65f2e9c396446a08ec37a63e3b2c7ef313b5686e Mon Sep 17 00:00:00 2001 From: olha Date: Tue, 17 Feb 2026 12:18:58 +0000 Subject: [PATCH 6/7] Add a separate Debug log --- mettle/worker/worker.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mettle/worker/worker.go b/mettle/worker/worker.go index 402cea4e..5dc6184a 100644 --- a/mettle/worker/worker.go +++ b/mettle/worker/worker.go @@ -959,6 +959,7 @@ func (w *worker) runCommand(ctx context.Context, cmd *exec.Cmd, timeout time.Dur processTree = string(psOut) } } + logr.WithField("hangingProcessTree", processTree).Debug("Timeout reached: Analyzing hanging group") forceKilled := false if ps := cmd.ProcessState; ps != nil { @@ -974,11 +975,10 @@ func (w *worker) runCommand(ctx context.Context, cmd *exec.Cmd, timeout time.Dur msg += "; grace period expired; process killed (SIGKILL)" } logr.WithFields(logrus.Fields{ - "hash": 
w.actionDigest.Hash, - "timeout": timeout.String(), - "gracePeriod": gracePeriod.String(), - "forceKilled": forceKilled, - "hangingProcessTree": processTree, + "hash": w.actionDigest.Hash, + "timeout": timeout.String(), + "gracePeriod": gracePeriod.String(), + "forceKilled": forceKilled, }).Warn(msg) return ErrTimeout From 974b1fc8bec7888aa7f1a4c342601b8c1b335bd0 Mon Sep 17 00:00:00 2001 From: olha Date: Tue, 17 Feb 2026 13:01:39 +0000 Subject: [PATCH 7/7] Remove duplicate from the changelog --- ChangeLog | 4 ---- 1 file changed, 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index 6aa02636..960be660 100644 --- a/ChangeLog +++ b/ChangeLog @@ -2,10 +2,6 @@ Version 11.16.4 --------------- * Kill parent and all child processes via process group on worker timeout. Log hanging processes -Version 11.16.3 ---------------- - * Kill parent and all child processes via process group on worker timeout. Log hanging processes - Version 11.16.3 --------------- * Fix logrus verbosity handling for debug level log