@@ -5,13 +5,16 @@ package workflow
55
66import (
77 "context"
8+ "errors"
89 "fmt"
910 "log/slog"
1011 "os"
1112 "os/exec"
1213 "os/signal"
1314 "path/filepath"
15+ "strconv"
1416 "strings"
17+ "sync"
1518 "syscall"
1619 "time"
1720
@@ -57,74 +60,87 @@ func configureSignalHandler(myTargets []target.Target, statusFunc progress.Multi
5760 sigChannel := make (chan os.Signal , 1 )
5861 signal .Notify (sigChannel , syscall .SIGINT , syscall .SIGTERM )
5962 go func () {
63+ // wait for a signal
6064 sig := <- sigChannel
6165 slog .Debug ("received signal" , slog .String ("signal" , sig .String ()))
6266 // The controller script is run in its own process group, so we need to send the signal
6367 // directly to the PID of the controller. For every target, look for the controller
64- // PID file and send SIGINT to it.
68+ // PID file and send SIGINT to it, then wait for it to exit concurrently.
69+ var wg sync.WaitGroup
6570 for _ , t := range myTargets {
6671 if statusFunc != nil {
6772 _ = statusFunc (t .GetName (), "Signal received, cleaning up..." )
6873 }
6974 pidFilePath := filepath .Join (t .GetTempDirectory (), script .ControllerPIDFileName )
70- stdout , _ , exitcode , err := t .RunCommandEx (exec .Command ("cat" , pidFilePath ), 5 , false , true ) // #nosec G204
75+ stdout , _ , _ , err := t .RunCommandEx (exec .Command ("cat" , pidFilePath ), 5 , false , true ) // #nosec G204
76+ // if there's an error and the error type is exec.ExitError, the file likely doesn't exist
77+ // so we can skip sending the signal to this target
7178 if err != nil {
72- slog .Error ("error retrieving target controller PID" , slog .String ("target" , t .GetName ()), slog .String ("error" , err .Error ()))
79+ var exitErr * exec.ExitError
80+ if errors .As (err , & exitErr ) {
81+ slog .Debug ("target controller PID file not found, assuming script has already exited" , slog .String ("target" , t .GetName ()))
82+ continue
83+ }
84+ slog .Error ("failed to retrieve target controller PID" , slog .String ("target" , t .GetName ()), slog .String ("error" , err .Error ()))
7385 continue
7486 }
75- if exitcode == 0 {
76- pidStr := strings .TrimSpace (stdout )
77- err = signalProcessOnTarget (t , pidStr , "SIGINT" )
78- if err != nil {
79- slog .Error ("error sending SIGINT signal to target controller" , slog .String ("target" , t .GetName ()), slog .String ("error" , err .Error ()))
80- }
87+ pid := strings .TrimSpace (stdout )
88+ // confirm pid is a valid integer
89+ if _ , err := strconv .Atoi (pid ); err != nil {
90+ slog .Error ("invalid PID retrieved from target controller PID file" , slog .String ("target" , t .GetName ()), slog .String ("pid" , pid ), slog .String ("error" , err .Error ()))
91+ continue
8192 }
82- }
83- // now wait until all controller scripts have exited
84- slog .Debug ("waiting for controller scripts to exit" )
85- for _ , t := range myTargets {
86- // create a per-target timeout context
87- targetTimeout := 10 * time .Second
88- ctx , cancel := context .WithTimeout (context .Background (), targetTimeout )
89- timedOut := false
90- pidFilePath := filepath .Join (t .GetTempDirectory (), script .ControllerPIDFileName )
91- // read the pid file
92- stdout , _ , exitcode , err := t .RunCommandEx (exec .Command ("cat" , pidFilePath ), 5 , false , true ) // #nosec G204
93- if err != nil || exitcode != 0 {
94- slog .Debug ("target controller PID file no longer exists, assuming script has exited or is in the process of exiting" , slog .String ("target" , t .GetName ()))
95- cancel ()
93+ // send SIGINT to the controller process on the target
94+ slog .Debug ("signaling target controller process with SIGINT" , slog .String ("target" , t .GetName ()), slog .String ("pid" , pid ))
95+ err = signalProcessOnTarget (t , pid , "SIGINT" )
96+ if err != nil {
97+ slog .Error ("failed to send SIGINT signal to target controller" , slog .String ("target" , t .GetName ()), slog .String ("error" , err .Error ()))
9698 continue
9799 }
98- pidStr := strings .TrimSpace (stdout )
99- for {
100- // determine if the process still exists
101- _ , _ , exitcode , err = t .RunCommandEx (exec .Command ("ps" , "-p" , pidStr ), 5 , false , true ) // #nosec G204
102- if err != nil || exitcode != 0 {
103- slog .Debug ("target controller process no longer exists" , slog .String ("target" , t .GetName ()))
104- break
105- }
106- // check for timeout
107- select {
108- case <- ctx .Done ():
109- timedOut = true
110- default :
111- }
112- if timedOut {
113- if statusFunc != nil {
114- _ = statusFunc (t .GetName (), "cleanup timeout exceeded, sending kill signal" )
115- }
116- slog .Warn ("signal handler cleanup timeout exceeded for target, sending SIGKILL" , slog .String ("target" , t .GetName ()))
117- err = signalProcessOnTarget (t , pidStr , "SIGKILL" )
100+ // spawn a goroutine to wait for this target's controller to exit
101+ wg .Add (1 )
102+ go func (tgt target.Target , pid string ) {
103+ defer wg .Done ()
104+ // create a per-target timeout context
105+ targetTimeout := 10 * time .Second
106+ ctx , cancel := context .WithTimeout (context .Background (), targetTimeout )
107+ defer cancel ()
108+ timedOut := false
109+ for {
110+ // determine if the process still exists
111+ _ , _ , _ , err := tgt .RunCommandEx (exec .Command ("ps" , "-p" , pid ), 5 , false , true ) // #nosec G204
118112 if err != nil {
119- slog .Error ("error sending SIGKILL signal to target controller" , slog .String ("target" , t .GetName ()), slog .String ("error" , err .Error ()))
113+ var exitErr * exec.ExitError
114+ if errors .As (err , & exitErr ) {
115+ slog .Debug ("target controller process no longer exists" , slog .String ("target" , tgt .GetName ()))
116+ break
117+ }
118+ slog .Error ("failed to check target controller process" , slog .String ("target" , tgt .GetName ()), slog .String ("error" , err .Error ()))
119+ break
120120 }
121- break
121+ // check for timeout
122+ select {
123+ case <- ctx .Done ():
124+ timedOut = true
125+ default :
126+ }
127+ if timedOut {
128+ if statusFunc != nil {
129+ _ = statusFunc (tgt .GetName (), "cleanup timeout exceeded, sending kill signal" )
130+ }
131+ slog .Warn ("signal handler cleanup timeout exceeded for target, sending SIGKILL" , slog .String ("target" , tgt .GetName ()))
132+ err := signalProcessOnTarget (tgt , pid , "SIGKILL" )
133+ if err != nil {
134+ slog .Error ("failed to send SIGKILL signal to target controller" , slog .String ("target" , tgt .GetName ()), slog .String ("error" , err .Error ()))
135+ }
136+ break
137+ }
138+ // sleep for a short time before checking again
139+ time .Sleep (500 * time .Millisecond )
122140 }
123- // sleep for a short time before checking again
124- time .Sleep (500 * time .Millisecond )
125- }
126- cancel ()
141+ }(t , pid )
127142 }
143+ wg .Wait ()
128144
129145 // Race condition between the controller script deleting its PID file and it truly exiting.
130146 // Future work: reconsider decision to have the controller script delete its own PID file.
@@ -134,20 +150,20 @@ func configureSignalHandler(myTargets []target.Target, statusFunc progress.Multi
134150 time .Sleep (500 * time .Millisecond )
135151
136152 // send SIGINT to perfspect's remaining children, if any
137- myPid := os .Getpid ()
138- children , err := util .GetChildren (myPid )
153+ perfspectPid := os .Getpid ()
154+ children , err := util .GetChildren (perfspectPid )
139155 if err != nil {
140- slog .Error ("error retrieving child processes" , slog .String ("error" , err .Error ()))
156+ slog .Error ("failed to retrieve perfspect's child processes" , slog .String ("error" , err .Error ()))
141157 return
142158 }
143159 if len (children ) == 0 {
144- slog .Debug ("no child processes to signal" )
160+ slog .Debug ("perfspect has no child processes to signal" )
145161 return
146162 }
147163 slog .Debug ("signaling child processes" , slog .String ("child PIDs" , fmt .Sprintf ("%v" , children )))
148164 err = util .SignalChildren (syscall .SIGINT )
149165 if err != nil {
150- slog .Error ("error sending signal to children " , slog .String ("error" , err .Error ()))
166+ slog .Error ("failed to send SIGINT signal to perfspect's child processes " , slog .String ("error" , err .Error ()))
151167 }
152168 }()
153169}
0 commit comments