From d40311bf171ed6af847f0c04a9d84a1a23eb654a Mon Sep 17 00:00:00 2001 From: Olha Yeremenko Date: Mon, 2 Mar 2026 12:26:56 +0000 Subject: [PATCH 01/11] Add Debug logs with process tree (#339) * Add logProcOnTimeout() * Bump version --- ChangeLog | 4 +++ VERSION | 2 +- mettle/worker/worker.go | 76 +++++++++++++++++++++++++++++++++-------- 3 files changed, 66 insertions(+), 16 deletions(-) diff --git a/ChangeLog b/ChangeLog index c9821fca..730d40f2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +Version 11.16.6 +--------------- + * Improve runCommand timeout diagnostics with logProcOnTimeout() + Version 11.16.5 --------------- * Improve runCommand timeout diagnostics (process snapshot + output summary) diff --git a/VERSION b/VERSION index 0a926732..9817f8d3 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -11.16.5 \ No newline at end of file +11.16.6 \ No newline at end of file diff --git a/mettle/worker/worker.go b/mettle/worker/worker.go index 63f1dd81..a4a1741d 100644 --- a/mettle/worker/worker.go +++ b/mettle/worker/worker.go @@ -14,6 +14,7 @@ import ( "path" "path/filepath" "runtime" + "strconv" "strings" "syscall" "time" @@ -732,12 +733,23 @@ func (w *worker) execute(req *pb.ExecuteRequest, action *pb.Action, command *pb. duration, _ := ptypes.Duration(action.Timeout) ctx, cancel := context.WithTimeout(context.Background(), duration) defer cancel() + logr.WithFields(logrus.Fields{ "hash": w.actionDigest.Hash, "stage": "exec_action", "timeout": duration.String(), + "argv0": command.Arguments[0], + "argc": len(command.Arguments), }).Debug("Executing action - started") + logr.WithFields(logrus.Fields{ + "hash": w.actionDigest.Hash, + "sandbox": w.sandbox, + "altSandbox": w.altSandbox, + "shouldSandbox": w.shouldSandbox(command), + "argv0": command.Arguments[0], + }).Debug("Sandbox selection") + cmd := exec.CommandContext(ctx, commandPath(command), command.Arguments[1:]...) // Setting Pdeathsig should ideally make subprocesses get kill signals if we die. @@ -962,21 +974,8 @@ func (w *worker) runCommand(ctx context.Context, cmd *exec.Cmd, timeout time.Dur if !errors.Is(ctx.Err(), context.DeadlineExceeded) { return } - - pgid, _ := syscall.Getpgid(pid) - var processTree string - if pgid > 0 { - args := []string{"-g", fmt.Sprintf("%d", pgid), "-o", "pid,ppid,pgid,state,etime,wchan,%cpu,%mem,command"} - if psOut, psErr := exec.Command("ps", args...).Output(); psErr == nil { - processTree = string(psOut) - } - } - logr.WithFields(logrus.Fields{ - "hash": w.actionDigest.Hash, - "pid": pid, - "pgid": pgid, - "hangingProcessTree": processTree, - }).Debug("Deadline exceeded: Analyzing hanging group") + // # TODO(INFRA-130229): Revert this temporary change mettle timeouts investigations are completed + w.logProcOnTimeout(pid) }(pid) err := cmd.Wait() @@ -1030,6 +1029,53 @@ func (w *worker) runCommand(ctx context.Context, cmd *exec.Cmd, timeout time.Dur return err } +// logProcOnTimeout logs /proc diagnostics for the timed-out pid and its process group. +// It does not rely on `ps` existing in the container. +func (w *worker) logProcOnTimeout(pid int) { + const maxFDs = 200 + const maxTargetLen = 256 + p := strconv.Itoa(pid) + base := filepath.Join("/proc", p) + + cmdlineBytes, err := os.ReadFile(filepath.Join(base, "cmdline")) + cmdline := "" + if err == nil && len(cmdlineBytes) > 0 { + cmdline = strings.TrimSpace(strings.ReplaceAll(string(cmdlineBytes), "\x00", " ")) + } + + wchanBytes, err := os.ReadFile(filepath.Join(base, "wchan")) + wchan := "" + if err == nil && len(wchanBytes) > 0 { + wchan = strings.TrimSpace(string(wchanBytes)) + } + + var fds []string + if entries, err := os.ReadDir(filepath.Join(base, "fd")); err == nil { + for i, e := range entries { + if i >= maxFDs { + fds = append(fds, "...truncated") + break + } + target, err := os.Readlink(filepath.Join(base, "fd", e.Name())) + if err != nil { + continue + } + if len(target) > maxTargetLen { + target = target[:maxTargetLen] + "…" + } + fds = append(fds, e.Name()+" -> "+target) + } + } + + logr.WithFields(logrus.Fields{ + "hash": w.actionDigest.Hash, + "pid": pid, + "cmdline": cmdline, + "wchan": wchan, + "fds": fds, + }).Debug("Deadline exceeded: /proc snapshot") +} + // writeUncachedResult attempts to write an uncached action result proto. // This is an extension for buildbarn-browser that lets it display results of a failed build. // It returns a string that, on success, contains an appropriate message about it (they are From be74d4bc0f05ad477960aa5c8b857ceb95b35ba0 Mon Sep 17 00:00:00 2001 From: olha Date: Fri, 6 Mar 2026 09:24:31 +0000 Subject: [PATCH 02/11] Log chils processes that keep worker from exiting --- mettle/worker/worker.go | 97 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/mettle/worker/worker.go b/mettle/worker/worker.go index a4a1741d..a4076d5f 100644 --- a/mettle/worker/worker.go +++ b/mettle/worker/worker.go @@ -976,6 +976,7 @@ func (w *worker) runCommand(ctx context.Context, cmd *exec.Cmd, timeout time.Dur } // # TODO(INFRA-130229): Revert this temporary change mettle timeouts investigations are completed w.logProcOnTimeout(pid) + w.logProcChain(pid) }(pid) err := cmd.Wait() @@ -1076,6 +1077,102 @@ func (w *worker) logProcOnTimeout(pid int) { }).Debug("Deadline exceeded: /proc snapshot") } +// logProcChain logs a compact parent→child execution chain for the given PID +// This helps quickly identify which subprocess is still running when the worker +// times out +func (w *worker) logProcChain(pid int) { + const maxDepth = 10 + + readTrim := func(path string) string { + b, err := os.ReadFile(path) + if err != nil || len(b) == 0 { + return "" + } + return strings.TrimSpace(string(b)) + } + + name := func(p int) string { + n := readTrim(filepath.Join("/proc", strconv.Itoa(p), "comm")) + if n == "" { + return "?" + } + return n + } + + firstChild := func(p int) int { + b, err := os.ReadFile(filepath.Join("/proc", strconv.Itoa(p), "task", strconv.Itoa(p), "children")) + if err != nil || len(b) == 0 { + return 0 + } + f := strings.Fields(string(b)) + if len(f) == 0 { + return 0 + } + n, _ := strconv.Atoi(f[0]) + return n + } + + state := func(p int) string { + b, err := os.ReadFile(filepath.Join("/proc", strconv.Itoa(p), "status")) + if err != nil { + return "" + } + for _, ln := range strings.Split(string(b), "\n") { + k, v, ok := strings.Cut(ln, ":") + if ok && strings.TrimSpace(k) == "State" { + if f := strings.Fields(strings.TrimSpace(v)); len(f) > 0 { + return f[0] + } + } + } + return "" + } + + cur := pid + parts := []string{fmt.Sprintf("%d:%s", cur, name(cur))} + + for i := 0; i < maxDepth; i++ { + c := firstChild(cur) + if c == 0 { + break + } + parts = append(parts, fmt.Sprintf("%d:%s", c, name(c))) + cur = c + } + + tailPID := cur + tailState := state(tailPID) + + logr.WithFields(logrus.Fields{ + "hash": w.actionDigest.Hash, + "pid": pid, + "chain": strings.Join(parts, " -> "), + "tail_pid": tailPID, + "tail_name": name(tailPID), + "tail_state": tailState, + "tail_state_comment": stateComment(tailState), + "tail_wchan": readTrim(filepath.Join("/proc", strconv.Itoa(tailPID), "wchan")), + }).Debug("Timeout: process chain") +} + +// stateComment is a helper to translate tail_state code into text +func stateComment(s string) string { + switch s { + case "R": + return "running (executing on CPU)" + case "S": + return "sleeping (waiting on event / lock / pipe)" + case "D": + return "uninterruptible sleep (kernel I/O or filesystem wait)" + case "T": + return "stopped (signal or debugger)" + case "Z": + return "zombie (process exited but parent has not reaped it)" + default: + return "unknown process state" + } +} + // writeUncachedResult attempts to write an uncached action result proto. // This is an extension for buildbarn-browser that lets it display results of a failed build. // It returns a string that, on success, contains an appropriate message about it (they are From cd22353c5acf604a8d22b7e5b6b74186e6acc3b4 Mon Sep 17 00:00:00 2001 From: olha Date: Fri, 6 Mar 2026 09:29:06 +0000 Subject: [PATCH 03/11] Rebase master --- mettle/worker/worker.go | 8 -------- 1 file changed, 8 deletions(-) diff --git a/mettle/worker/worker.go b/mettle/worker/worker.go index a4076d5f..a320b31f 100644 --- a/mettle/worker/worker.go +++ b/mettle/worker/worker.go @@ -742,14 +742,6 @@ func (w *worker) execute(req *pb.ExecuteRequest, action *pb.Action, command *pb. "argc": len(command.Arguments), }).Debug("Executing action - started") - logr.WithFields(logrus.Fields{ - "hash": w.actionDigest.Hash, - "sandbox": w.sandbox, - "altSandbox": w.altSandbox, - "shouldSandbox": w.shouldSandbox(command), - "argv0": command.Arguments[0], - }).Debug("Sandbox selection") - cmd := exec.CommandContext(ctx, commandPath(command), command.Arguments[1:]...) // Setting Pdeathsig should ideally make subprocesses get kill signals if we die. From a927e8fd26e6c51a127df874cb51cb693d3931fa Mon Sep 17 00:00:00 2001 From: olha Date: Fri, 6 Mar 2026 09:25:42 +0000 Subject: [PATCH 04/11] Bump version --- ChangeLog | 4 ++++ VERSION | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 730d40f2..73039967 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +Version 11.16.7 +--------------- + * Improve runCommand timeout diagnostics with logProcChain() with child processes + Version 11.16.6 --------------- * Improve runCommand timeout diagnostics with logProcOnTimeout() diff --git a/VERSION b/VERSION index 9817f8d3..080a2cc9 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -11.16.6 \ No newline at end of file +11.16.7 \ No newline at end of file From 92073234558095622da01952c0eabd9f51de29be Mon Sep 17 00:00:00 2001 From: olha Date: Fri, 6 Mar 2026 09:39:37 +0000 Subject: [PATCH 05/11] pull chamges --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 080a2cc9..ee38035b 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -11.16.7 \ No newline at end of file +11.16.7 From 56557613e58dff49e0a6e78c2f63b47974734d9e Mon Sep 17 00:00:00 2001 From: olha Date: Fri, 6 Mar 2026 10:30:58 +0000 Subject: [PATCH 06/11] Fixes after the review --- mettle/worker/worker.go | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/mettle/worker/worker.go b/mettle/worker/worker.go index a4076d5f..d14d20b6 100644 --- a/mettle/worker/worker.go +++ b/mettle/worker/worker.go @@ -1092,7 +1092,8 @@ func (w *worker) logProcChain(pid int) { } name := func(p int) string { - n := readTrim(filepath.Join("/proc", strconv.Itoa(p), "comm")) + pidStr := strconv.Itoa(p) + n := readTrim(filepath.Join("/proc", pidStr, "comm")) if n == "" { return "?" } @@ -1100,7 +1101,8 @@ func (w *worker) logProcChain(pid int) { } firstChild := func(p int) int { - b, err := os.ReadFile(filepath.Join("/proc", strconv.Itoa(p), "task", strconv.Itoa(p), "children")) + pidStr := strconv.Itoa(p) + b, err := os.ReadFile(filepath.Join("/proc", pidStr, "task", pidStr, "children")) if err != nil || len(b) == 0 { return 0 } @@ -1108,12 +1110,16 @@ func (w *worker) logProcChain(pid int) { if len(f) == 0 { return 0 } - n, _ := strconv.Atoi(f[0]) + n, err := strconv.Atoi(f[0]) + if err != nil { + return 0 + } return n } state := func(p int) string { - b, err := os.ReadFile(filepath.Join("/proc", strconv.Itoa(p), "status")) + pidStr := strconv.Itoa(p) + b, err := os.ReadFile(filepath.Join("/proc", pidStr, "status")) if err != nil { return "" } @@ -1129,7 +1135,8 @@ func (w *worker) logProcChain(pid int) { } cur := pid - parts := []string{fmt.Sprintf("%d:%s", cur, name(cur))} + parts := make([]string, 0, maxDepth+1) + parts = append(parts, fmt.Sprintf("%d:%s", cur, name(cur))) for i := 0; i < maxDepth; i++ { c := firstChild(cur) From 602eec6a4b946f840ccb562c5a28ee2729b0754e Mon Sep 17 00:00:00 2001 From: olha Date: Fri, 6 Mar 2026 12:59:15 +0000 Subject: [PATCH 07/11] Iterate over all child processes --- mettle/worker/worker.go | 162 +++++++++++++++++++++++++--------------- 1 file changed, 100 insertions(+), 62 deletions(-) diff --git a/mettle/worker/worker.go b/mettle/worker/worker.go index d14d20b6..1b9fda93 100644 --- a/mettle/worker/worker.go +++ b/mettle/worker/worker.go @@ -1080,86 +1080,122 @@ func (w *worker) logProcOnTimeout(pid int) { // logProcChain logs a compact parent→child execution chain for the given PID // This helps quickly identify which subprocess is still running when the worker // times out +// If multiple children exist, they are listed instead. func (w *worker) logProcChain(pid int) { const maxDepth = 10 - readTrim := func(path string) string { - b, err := os.ReadFile(path) - if err != nil || len(b) == 0 { - return "" - } - return strings.TrimSpace(string(b)) - } + cur := pid + parts := make([]string, 0, maxDepth+1) + parts = append(parts, fmt.Sprintf("%d:%s", cur, procName(cur))) - name := func(p int) string { - pidStr := strconv.Itoa(p) - n := readTrim(filepath.Join("/proc", pidStr, "comm")) - if n == "" { - return "?" - } - return n - } + var childrenSummary string - firstChild := func(p int) int { - pidStr := strconv.Itoa(p) - b, err := os.ReadFile(filepath.Join("/proc", pidStr, "task", pidStr, "children")) - if err != nil || len(b) == 0 { - return 0 - } - f := strings.Fields(string(b)) - if len(f) == 0 { - return 0 - } - n, err := strconv.Atoi(f[0]) - if err != nil { - return 0 - } - return n - } + for i := 0; i < maxDepth; i++ { + children := childPIDs(cur) - state := func(p int) string { - pidStr := strconv.Itoa(p) - b, err := os.ReadFile(filepath.Join("/proc", pidStr, "status")) - if err != nil { - return "" - } - for _, ln := range strings.Split(string(b), "\n") { - k, v, ok := strings.Cut(ln, ":") - if ok && strings.TrimSpace(k) == "State" { - if f := strings.Fields(strings.TrimSpace(v)); len(f) > 0 { - return f[0] - } - } + if len(children) == 0 { + break } - return "" - } - - cur := pid - parts := make([]string, 0, maxDepth+1) - parts = append(parts, fmt.Sprintf("%d:%s", cur, name(cur))) - for i := 0; i < maxDepth; i++ { - c := firstChild(cur) - if c == 0 { + if len(children) > 1 { + items := make([]string, 0, len(children)) + for _, c := range children { + state := procState(c) + items = append(items, + fmt.Sprintf("%d:%s:%s(%s)", + c, + procName(c), + state, + stateComment(state), + ), + ) + } + childrenSummary = strings.Join(items, " | ") break } - parts = append(parts, fmt.Sprintf("%d:%s", c, name(c))) - cur = c + + cur = children[0] + parts = append(parts, fmt.Sprintf("%d:%s", cur, procName(cur))) } - tailPID := cur - tailState := state(tailPID) + tailState := procState(cur) - logr.WithFields(logrus.Fields{ + fields := logrus.Fields{ "hash": w.actionDigest.Hash, "pid": pid, "chain": strings.Join(parts, " -> "), - "tail_pid": tailPID, - "tail_name": name(tailPID), + "tail_pid": cur, + "tail_name": procName(cur), "tail_state": tailState, "tail_state_comment": stateComment(tailState), - "tail_wchan": readTrim(filepath.Join("/proc", strconv.Itoa(tailPID), "wchan")), - }).Debug("Timeout: process chain") + "tail_wchan": readTrim(filepath.Join(procBase(cur), "wchan")), + } + + if childrenSummary != "" { + fields["children"] = childrenSummary + } else { + fields["children"] = "none" + } + + logr.WithFields(fields).Debug("Deadline exceeded: process chain") +} + +// procBase returns the /proc path for a pid +func procBase(pid int) string { + return filepath.Join("/proc", strconv.Itoa(pid)) +} + +// readTrim reads a file and returns trimmed content, or "" on error/empty content +func readTrim(path string) string { + b, err := os.ReadFile(path) + if err != nil || len(b) == 0 { + return "" + } + return strings.TrimSpace(string(b)) +} + +// procName returns the process name from /proc//comm, or "?" if unavailable +func procName(pid int) string { + name := readTrim(filepath.Join(procBase(pid), "comm")) + if name == "" { + return "?" + } + return name +} + +// procState returns the Linux process state code from /proc//status +func procState(pid int) string { + b, err := os.ReadFile(filepath.Join(procBase(pid), "status")) + if err != nil { + return "" + } + for _, ln := range strings.Split(string(b), "\n") { + k, v, ok := strings.Cut(ln, ":") + if !ok || strings.TrimSpace(k) != "State" { + continue + } + if f := strings.Fields(strings.TrimSpace(v)); len(f) > 0 { + return f[0] + } + return "" + } + return "" +} + +// childPIDs returns direct children of pid from /proc//task//children +func childPIDs(pid int) []int { + b, err := os.ReadFile(filepath.Join(procBase(pid), "task", strconv.Itoa(pid), "children")) + if err != nil || len(b) == 0 { + return nil + } + out := make([]int, 0, 4) + for _, s := range strings.Fields(string(b)) { + n, err := strconv.Atoi(s) + if err == nil && n > 0 { + out = append(out, n) + } + } + return out } // stateComment is a helper to translate tail_state code into text @@ -1171,6 +1207,8 @@ func stateComment(s string) string { return "sleeping (waiting on event / lock / pipe)" case "D": return "uninterruptible sleep (kernel I/O or filesystem wait)" + case "I": + return "idle kernel thread (not doing work)" case "T": return "stopped (signal or debugger)" case "Z": From 597e49e35acd80ba9b47ac2ac1237b64d643e5e1 Mon Sep 17 00:00:00 2001 From: olha Date: Tue, 10 Mar 2026 15:59:48 +0000 Subject: [PATCH 08/11] Log all child processes --- mettle/worker/worker.go | 114 ++++++++++++++++++++-------------------- 1 file changed, 56 insertions(+), 58 deletions(-) diff --git a/mettle/worker/worker.go b/mettle/worker/worker.go index 1b9fda93..cfe3f5d8 100644 --- a/mettle/worker/worker.go +++ b/mettle/worker/worker.go @@ -1077,67 +1077,66 @@ func (w *worker) logProcOnTimeout(pid int) { }).Debug("Deadline exceeded: /proc snapshot") } -// logProcChain logs a compact parent→child execution chain for the given PID -// This helps quickly identify which subprocess is still running when the worker -// times out -// If multiple children exist, they are listed instead. +// logProcTree logs a bounded process subtree starting from pid. +// Example: +// +// 24877:bash[S - sleeping/waiting]{do_wait} -> 24890:test_binary[D - io_wait)]{tcp_recvmsg} +// 24877:bash[S - sleeping/waiting]{do_wait} -> 24891:tee[S - sleeping/waitin]{pipe_read} func (w *worker) logProcChain(pid int) { - const maxDepth = 10 + const ( + maxDepth = 6 + maxNodes = 40 + ) - cur := pid - parts := make([]string, 0, maxDepth+1) - parts = append(parts, fmt.Sprintf("%d:%s", cur, procName(cur))) + lines := make([]string, 0, maxNodes) + nodes := 0 - var childrenSummary string + var walk func(pid, depth int, path []string) + walk = func(pid, depth int, path []string) { + if depth > maxDepth || nodes >= maxNodes { + return + } - for i := 0; i < maxDepth; i++ { - children := childPIDs(cur) + current := formatProc(pid) + path = append(path, current) + children := childPIDs(pid) if len(children) == 0 { - break + lines = append(lines, strings.Join(path, " -> ")) + nodes++ + return } - if len(children) > 1 { - items := make([]string, 0, len(children)) - for _, c := range children { - state := procState(c) - items = append(items, - fmt.Sprintf("%d:%s:%s(%s)", - c, - procName(c), - state, - stateComment(state), - ), - ) + for _, c := range children { + if nodes >= maxNodes { + return } - childrenSummary = strings.Join(items, " | ") - break + walk(c, depth+1, path) } - - cur = children[0] - parts = append(parts, fmt.Sprintf("%d:%s", cur, procName(cur))) - } - - tailState := procState(cur) - - fields := logrus.Fields{ - "hash": w.actionDigest.Hash, - "pid": pid, - "chain": strings.Join(parts, " -> "), - "tail_pid": cur, - "tail_name": procName(cur), - "tail_state": tailState, - "tail_state_comment": stateComment(tailState), - "tail_wchan": readTrim(filepath.Join(procBase(cur), "wchan")), } + startDepth := 0 + walk(pid, startDepth, nil) - if childrenSummary != "" { - fields["children"] = childrenSummary - } else { - fields["children"] = "none" - } + logr.WithFields(logrus.Fields{ + "hash": w.actionDigest.Hash, + "pid": pid, + "tree": strings.Join(lines, "\n"), + }).Debug("Deadline exceeded: process tree") +} - logr.WithFields(fields).Debug("Deadline exceeded: process chain") +// formatProc creates formatted log line +func formatProc(pid int) string { + state := procState(pid) + wchan := readTrim(filepath.Join(procBase(pid), "wchan")) + + return fmt.Sprintf( + "%d:%s[%s - %s]{%s}", + pid, + procName(pid), + state, + stateComment(state), + wchan, + ) } // procBase returns the /proc path for a pid @@ -1188,10 +1187,9 @@ func childPIDs(pid int) []int { if err != nil || len(b) == 0 { return nil } - out := make([]int, 0, 4) + var out []int for _, s := range strings.Fields(string(b)) { - n, err := strconv.Atoi(s) - if err == nil && n > 0 { + if n, err := strconv.Atoi(s); err == nil && n > 0 { out = append(out, n) } } @@ -1202,19 +1200,19 @@ func childPIDs(pid int) []int { func stateComment(s string) string { switch s { case "R": - return "running (executing on CPU)" + return "running" case "S": - return "sleeping (waiting on event / lock / pipe)" + return "sleeping/waiting" case "D": - return "uninterruptible sleep (kernel I/O or filesystem wait)" + return "io_wait" case "I": - return "idle kernel thread (not doing work)" + return "idle" case "T": - return "stopped (signal or debugger)" + return "stopped" case "Z": - return "zombie (process exited but parent has not reaped it)" + return "zombie" default: - return "unknown process state" + return "unknown" } } From 6363d4678eaad06566bf5b6f23cda3d30811559d Mon Sep 17 00:00:00 2001 From: olha Date: Tue, 10 Mar 2026 16:00:53 +0000 Subject: [PATCH 09/11] improve formatting --- mettle/worker/worker.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mettle/worker/worker.go b/mettle/worker/worker.go index cfe3f5d8..fbe8a3c9 100644 --- a/mettle/worker/worker.go +++ b/mettle/worker/worker.go @@ -1087,10 +1087,8 @@ func (w *worker) logProcChain(pid int) { maxDepth = 6 maxNodes = 40 ) - lines := make([]string, 0, maxNodes) nodes := 0 - var walk func(pid, depth int, path []string) walk = func(pid, depth int, path []string) { if depth > maxDepth || nodes >= maxNodes { @@ -1106,7 +1104,6 @@ func (w *worker) logProcChain(pid int) { nodes++ return } - for _, c := range children { if nodes >= maxNodes { return @@ -1114,6 +1111,7 @@ func (w *worker) logProcChain(pid int) { walk(c, depth+1, path) } } + startDepth := 0 walk(pid, startDepth, nil) From 3941ab2a7736dc133b50a099d1d600e59984f747 Mon Sep 17 00:00:00 2001 From: olha Date: Tue, 10 Mar 2026 16:08:10 +0000 Subject: [PATCH 10/11] Return errorSummary if failed to get a snapshot --- mettle/worker/worker.go | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/mettle/worker/worker.go b/mettle/worker/worker.go index fbe8a3c9..78d0b2b3 100644 --- a/mettle/worker/worker.go +++ b/mettle/worker/worker.go @@ -1037,21 +1037,29 @@ func (w *worker) logProcOnTimeout(pid int) { const maxTargetLen = 256 p := strconv.Itoa(pid) base := filepath.Join("/proc", p) + var errs []string cmdlineBytes, err := os.ReadFile(filepath.Join(base, "cmdline")) cmdline := "" - if err == nil && len(cmdlineBytes) > 0 { + if err != nil { + errs = append(errs, "cmdline") + } else if len(cmdlineBytes) > 0 { cmdline = strings.TrimSpace(strings.ReplaceAll(string(cmdlineBytes), "\x00", " ")) } wchanBytes, err := os.ReadFile(filepath.Join(base, "wchan")) wchan := "" - if err == nil && len(wchanBytes) > 0 { + if err != nil { + errs = append(errs, "wchan") + } else if len(wchanBytes) > 0 { wchan = strings.TrimSpace(string(wchanBytes)) } var fds []string - if entries, err := os.ReadDir(filepath.Join(base, "fd")); err == nil { + entries, err := os.ReadDir(filepath.Join(base, "fd")) + if err != nil { + errs = append(errs, "fd") + } else { for i, e := range entries { if i >= maxFDs { fds = append(fds, "...truncated") @@ -1068,12 +1076,18 @@ func (w *worker) logProcOnTimeout(pid int) { } } + errSummary := "" + if len(errs) > 0 { + errSummary = strings.Join(errs, ",") + } + logr.WithFields(logrus.Fields{ - "hash": w.actionDigest.Hash, - "pid": pid, - "cmdline": cmdline, - "wchan": wchan, - "fds": fds, + "hash": w.actionDigest.Hash, + "pid": pid, + "cmdline": cmdline, + "wchan": wchan, + "fds": fds, + "proc_err": errSummary, }).Debug("Deadline exceeded: /proc snapshot") } @@ -1119,7 +1133,7 @@ func (w *worker) logProcChain(pid int) { "hash": w.actionDigest.Hash, "pid": pid, "tree": strings.Join(lines, "\n"), - }).Debug("Deadline exceeded: process tree") + }).Debug("Deadline exceeded: /proc process tree") } // formatProc creates formatted log line From 74c7457463429908af2498a0f0eb1866d07d1977 Mon Sep 17 00:00:00 2001 From: olha Date: Thu, 12 Mar 2026 14:07:50 +0000 Subject: [PATCH 11/11] Return actual error when /proc fails --- mettle/worker/worker.go | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/mettle/worker/worker.go b/mettle/worker/worker.go index 78d0b2b3..1150b8cd 100644 --- a/mettle/worker/worker.go +++ b/mettle/worker/worker.go @@ -1037,12 +1037,12 @@ func (w *worker) logProcOnTimeout(pid int) { const maxTargetLen = 256 p := strconv.Itoa(pid) base := filepath.Join("/proc", p) - var errs []string + var errs []error cmdlineBytes, err := os.ReadFile(filepath.Join(base, "cmdline")) cmdline := "" if err != nil { - errs = append(errs, "cmdline") + errs = append(errs, fmt.Errorf("cmdline: %v", err)) } else if len(cmdlineBytes) > 0 { cmdline = strings.TrimSpace(strings.ReplaceAll(string(cmdlineBytes), "\x00", " ")) } @@ -1050,7 +1050,7 @@ func (w *worker) logProcOnTimeout(pid int) { wchanBytes, err := os.ReadFile(filepath.Join(base, "wchan")) wchan := "" if err != nil { - errs = append(errs, "wchan") + errs = append(errs, fmt.Errorf("wchan: %v", err)) } else if len(wchanBytes) > 0 { wchan = strings.TrimSpace(string(wchanBytes)) } @@ -1058,7 +1058,7 @@ func (w *worker) logProcOnTimeout(pid int) { var fds []string entries, err := os.ReadDir(filepath.Join(base, "fd")) if err != nil { - errs = append(errs, "fd") + errs = append(errs, fmt.Errorf("fd: %v", err)) } else { for i, e := range entries { if i >= maxFDs { @@ -1076,9 +1076,13 @@ func (w *worker) logProcOnTimeout(pid int) { } } - errSummary := "" + var errSummary string if len(errs) > 0 { - errSummary = strings.Join(errs, ",") + s := make([]string, len(errs)) + for i, e := range errs { + s[i] = e.Error() + } + errSummary = strings.Join(s, ", ") } logr.WithFields(logrus.Fields{ @@ -1126,8 +1130,7 @@ func (w *worker) logProcChain(pid int) { } } - startDepth := 0 - walk(pid, startDepth, nil) + walk(pid, 0, nil) logr.WithFields(logrus.Fields{ "hash": w.actionDigest.Hash,