Skip to content

Commit f284a8d

Browse files
authored
refactor signal handler for improved efficiency when multiple remote targets (#608)
* refactor signal handler for improved efficiency when multiple remote targets Signed-off-by: Harper, Jason M <jason.m.harper@intel.com> * improve error handling Signed-off-by: Harper, Jason M <jason.m.harper@intel.com> --------- Signed-off-by: Harper, Jason M <jason.m.harper@intel.com>
1 parent ec352b7 commit f284a8d

1 file changed

Lines changed: 70 additions & 54 deletions

File tree

internal/workflow/signals.go

Lines changed: 70 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,16 @@ package workflow
55

66
import (
77
"context"
8+
"errors"
89
"fmt"
910
"log/slog"
1011
"os"
1112
"os/exec"
1213
"os/signal"
1314
"path/filepath"
15+
"strconv"
1416
"strings"
17+
"sync"
1518
"syscall"
1619
"time"
1720

@@ -57,74 +60,87 @@ func configureSignalHandler(myTargets []target.Target, statusFunc progress.Multi
5760
sigChannel := make(chan os.Signal, 1)
5861
signal.Notify(sigChannel, syscall.SIGINT, syscall.SIGTERM)
5962
go func() {
63+
// wait for a signal
6064
sig := <-sigChannel
6165
slog.Debug("received signal", slog.String("signal", sig.String()))
6266
// The controller script is run in its own process group, so we need to send the signal
6367
// directly to the PID of the controller. For every target, look for the controller
64-
// PID file and send SIGINT to it.
68+
// PID file and send SIGINT to it, then wait for it to exit concurrently.
69+
var wg sync.WaitGroup
6570
for _, t := range myTargets {
6671
if statusFunc != nil {
6772
_ = statusFunc(t.GetName(), "Signal received, cleaning up...")
6873
}
6974
pidFilePath := filepath.Join(t.GetTempDirectory(), script.ControllerPIDFileName)
70-
stdout, _, exitcode, err := t.RunCommandEx(exec.Command("cat", pidFilePath), 5, false, true) // #nosec G204
75+
stdout, _, _, err := t.RunCommandEx(exec.Command("cat", pidFilePath), 5, false, true) // #nosec G204
76+
// if there's an error and the error type is exec.ExitError, the file likely doesn't exist
77+
// so we can skip sending the signal to this target
7178
if err != nil {
72-
slog.Error("error retrieving target controller PID", slog.String("target", t.GetName()), slog.String("error", err.Error()))
79+
var exitErr *exec.ExitError
80+
if errors.As(err, &exitErr) {
81+
slog.Debug("target controller PID file not found, assuming script has already exited", slog.String("target", t.GetName()))
82+
continue
83+
}
84+
slog.Error("failed to retrieve target controller PID", slog.String("target", t.GetName()), slog.String("error", err.Error()))
7385
continue
7486
}
75-
if exitcode == 0 {
76-
pidStr := strings.TrimSpace(stdout)
77-
err = signalProcessOnTarget(t, pidStr, "SIGINT")
78-
if err != nil {
79-
slog.Error("error sending SIGINT signal to target controller", slog.String("target", t.GetName()), slog.String("error", err.Error()))
80-
}
87+
pid := strings.TrimSpace(stdout)
88+
// confirm pid is a valid integer
89+
if _, err := strconv.Atoi(pid); err != nil {
90+
slog.Error("invalid PID retrieved from target controller PID file", slog.String("target", t.GetName()), slog.String("pid", pid), slog.String("error", err.Error()))
91+
continue
8192
}
82-
}
83-
// now wait until all controller scripts have exited
84-
slog.Debug("waiting for controller scripts to exit")
85-
for _, t := range myTargets {
86-
// create a per-target timeout context
87-
targetTimeout := 10 * time.Second
88-
ctx, cancel := context.WithTimeout(context.Background(), targetTimeout)
89-
timedOut := false
90-
pidFilePath := filepath.Join(t.GetTempDirectory(), script.ControllerPIDFileName)
91-
// read the pid file
92-
stdout, _, exitcode, err := t.RunCommandEx(exec.Command("cat", pidFilePath), 5, false, true) // #nosec G204
93-
if err != nil || exitcode != 0 {
94-
slog.Debug("target controller PID file no longer exists, assuming script has exited or is in the process of exiting", slog.String("target", t.GetName()))
95-
cancel()
93+
// send SIGINT to the controller process on the target
94+
slog.Debug("signaling target controller process with SIGINT", slog.String("target", t.GetName()), slog.String("pid", pid))
95+
err = signalProcessOnTarget(t, pid, "SIGINT")
96+
if err != nil {
97+
slog.Error("failed to send SIGINT signal to target controller", slog.String("target", t.GetName()), slog.String("error", err.Error()))
9698
continue
9799
}
98-
pidStr := strings.TrimSpace(stdout)
99-
for {
100-
// determine if the process still exists
101-
_, _, exitcode, err = t.RunCommandEx(exec.Command("ps", "-p", pidStr), 5, false, true) // #nosec G204
102-
if err != nil || exitcode != 0 {
103-
slog.Debug("target controller process no longer exists", slog.String("target", t.GetName()))
104-
break
105-
}
106-
// check for timeout
107-
select {
108-
case <-ctx.Done():
109-
timedOut = true
110-
default:
111-
}
112-
if timedOut {
113-
if statusFunc != nil {
114-
_ = statusFunc(t.GetName(), "cleanup timeout exceeded, sending kill signal")
115-
}
116-
slog.Warn("signal handler cleanup timeout exceeded for target, sending SIGKILL", slog.String("target", t.GetName()))
117-
err = signalProcessOnTarget(t, pidStr, "SIGKILL")
100+
// spawn a goroutine to wait for this target's controller to exit
101+
wg.Add(1)
102+
go func(tgt target.Target, pid string) {
103+
defer wg.Done()
104+
// create a per-target timeout context
105+
targetTimeout := 10 * time.Second
106+
ctx, cancel := context.WithTimeout(context.Background(), targetTimeout)
107+
defer cancel()
108+
timedOut := false
109+
for {
110+
// determine if the process still exists
111+
_, _, _, err := tgt.RunCommandEx(exec.Command("ps", "-p", pid), 5, false, true) // #nosec G204
118112
if err != nil {
119-
slog.Error("error sending SIGKILL signal to target controller", slog.String("target", t.GetName()), slog.String("error", err.Error()))
113+
var exitErr *exec.ExitError
114+
if errors.As(err, &exitErr) {
115+
slog.Debug("target controller process no longer exists", slog.String("target", tgt.GetName()))
116+
break
117+
}
118+
slog.Error("failed to check target controller process", slog.String("target", tgt.GetName()), slog.String("error", err.Error()))
119+
break
120120
}
121-
break
121+
// check for timeout
122+
select {
123+
case <-ctx.Done():
124+
timedOut = true
125+
default:
126+
}
127+
if timedOut {
128+
if statusFunc != nil {
129+
_ = statusFunc(tgt.GetName(), "cleanup timeout exceeded, sending kill signal")
130+
}
131+
slog.Warn("signal handler cleanup timeout exceeded for target, sending SIGKILL", slog.String("target", tgt.GetName()))
132+
err := signalProcessOnTarget(tgt, pid, "SIGKILL")
133+
if err != nil {
134+
slog.Error("failed to send SIGKILL signal to target controller", slog.String("target", tgt.GetName()), slog.String("error", err.Error()))
135+
}
136+
break
137+
}
138+
// sleep for a short time before checking again
139+
time.Sleep(500 * time.Millisecond)
122140
}
123-
// sleep for a short time before checking again
124-
time.Sleep(500 * time.Millisecond)
125-
}
126-
cancel()
141+
}(t, pid)
127142
}
143+
wg.Wait()
128144

129145
// Race condition between the controller script deleting its PID file and it truly exiting.
130146
// Future work: reconsider decision to have the controller script delete its own PID file.
@@ -134,20 +150,20 @@ func configureSignalHandler(myTargets []target.Target, statusFunc progress.Multi
134150
time.Sleep(500 * time.Millisecond)
135151

136152
// send SIGINT to perfspect's remaining children, if any
137-
myPid := os.Getpid()
138-
children, err := util.GetChildren(myPid)
153+
perfspectPid := os.Getpid()
154+
children, err := util.GetChildren(perfspectPid)
139155
if err != nil {
140-
slog.Error("error retrieving child processes", slog.String("error", err.Error()))
156+
slog.Error("failed to retrieve perfspect's child processes", slog.String("error", err.Error()))
141157
return
142158
}
143159
if len(children) == 0 {
144-
slog.Debug("no child processes to signal")
160+
slog.Debug("perfspect has no child processes to signal")
145161
return
146162
}
147163
slog.Debug("signaling child processes", slog.String("child PIDs", fmt.Sprintf("%v", children)))
148164
err = util.SignalChildren(syscall.SIGINT)
149165
if err != nil {
150-
slog.Error("error sending signal to children", slog.String("error", err.Error()))
166+
slog.Error("failed to send SIGINT signal to perfspect's child processes", slog.String("error", err.Error()))
151167
}
152168
}()
153169
}

0 commit comments

Comments
 (0)