@@ -5,9 +5,11 @@ import (
55 "fmt"
66 "os"
77 "os/exec"
8+ "os/signal"
89 "path/filepath"
910 "strconv"
1011 "strings"
12+ "sync"
1113 "syscall"
1214 "time"
1315
@@ -103,7 +105,43 @@ func runSupervisor() {
103105 supervisorLogFile .Sync ()
104106 }
105107
106- logSupervisor ("已启动" )
108+ // 将 supervisor 自身的 PID 写入 PID 文件
109+ pidFile := GetPIDFile ()
110+ if err := os .WriteFile (pidFile , []byte (strconv .Itoa (os .Getpid ())), 0600 ); err != nil {
111+ fmt .Printf ("写入PID文件失败: %v\n " , err )
112+ return
113+ }
114+
115+ logSupervisor ("已启动 (PID: %d)" , os .Getpid ())
116+
117+ // 信号处理:收到 SIGTERM/SIGINT 时杀掉子进程并退出
118+ var currentChild * os.Process
119+ var childMu sync.Mutex
120+
121+ sigCh := make (chan os.Signal , 1 )
122+ signal .Notify (sigCh , syscall .SIGTERM , syscall .SIGINT )
123+ go func () {
124+ sig := <- sigCh
125+ logSupervisor ("收到信号 %v,正在停止..." , sig )
126+ childMu .Lock ()
127+ child := currentChild
128+ childMu .Unlock ()
129+ if child != nil {
130+ child .Signal (syscall .SIGTERM )
131+ // 等待子进程退出,最多 5 秒后强杀
132+ for i := 0 ; i < 50 ; i ++ {
133+ time .Sleep (100 * time .Millisecond )
134+ if err := child .Signal (syscall .Signal (0 )); err != nil {
135+ break // 子进程已退出
136+ }
137+ }
138+ // 如果还活着,强杀
139+ child .Kill ()
140+ }
141+ os .Remove (pidFile )
142+ os .Remove (filepath .Join (getHomeDir (), ".anssl-stop" ))
143+ os .Exit (0 )
144+ }()
107145
108146 restartDelay := 1 * time .Second
109147 maxRestartDelay := 30 * time .Second
@@ -112,6 +150,7 @@ func runSupervisor() {
112150 for {
113151 if shouldStopSupervisor () {
114152 logSupervisor ("停止" )
153+ os .Remove (pidFile )
115154 return
116155 }
117156
@@ -134,16 +173,17 @@ func runSupervisor() {
134173 continue
135174 }
136175
137- pidFile := GetPIDFile ()
138- if err := os .WriteFile (pidFile , []byte (strconv .Itoa (cmd .Process .Pid )), 0600 ); err != nil {
139- cmd .Process .Kill ()
140- logFile .Close ()
141- time .Sleep (restartDelay )
142- continue
143- }
176+ childMu .Lock ()
177+ currentChild = cmd .Process
178+ childMu .Unlock ()
144179
145180 err = cmd .Wait ()
146181 logFile .Close ()
182+
183+ childMu .Lock ()
184+ currentChild = nil
185+ childMu .Unlock ()
186+
147187 uptime := time .Since (startTime )
148188
149189 if err != nil {
@@ -180,30 +220,28 @@ func runSupervisor() {
180220
181221// shouldStopSupervisor 检查是否应该停止监控器
182222func shouldStopSupervisor () bool {
183- homeDir , err := os .UserHomeDir ()
184- if err != nil {
185- // 如果无法获取用户主目录,使用当前目录
186- homeDir = "."
187- }
188- stopMarker := filepath .Join (homeDir , ".anssl-stop" )
223+ stopMarker := filepath .Join (getHomeDir (), ".anssl-stop" )
189224 if _ , err := os .Stat (stopMarker ); err == nil {
190225 os .Remove (stopMarker )
191226 return true
192227 }
193228 return false
194229}
195230
196- // StopDaemon 停止守护进程
197- func StopDaemon () error {
// getHomeDir returns the current user's home directory as reported by
// os.UserHomeDir. If that lookup fails, it returns "." so callers resolve
// paths relative to the current directory instead.
func getHomeDir() string {
	if dir, err := os.UserHomeDir(); err == nil {
		return dir
	}
	return "."
}
239+
240+ // StopDaemon 停止守护进程
241+ func StopDaemon () error {
242+ // 保留 stop marker 作为备用停止机制
243+ stopMarker := filepath .Join (getHomeDir (), ".anssl-stop" )
244+ os .WriteFile (stopMarker , []byte ("stop" ), 0600 )
207245
208246 pidFile := GetPIDFile ()
209247 data , err := os .ReadFile (pidFile )
@@ -222,23 +260,28 @@ func StopDaemon() error {
222260 return fmt .Errorf ("查找进程失败: %w" , err )
223261 }
224262
263+ // 发送 SIGTERM,supervisor 的信号处理会级联杀掉 worker
225264 if err := process .Signal (syscall .SIGTERM ); err != nil {
226- return fmt .Errorf ("发送停止信号失败: %w" , err )
265+ // 进程可能已经退出
266+ os .Remove (pidFile )
267+ os .Remove (stopMarker )
268+ return nil
227269 }
228270
271+ // 等待 supervisor 退出
229272 for i := 0 ; i < 10 ; i ++ {
230- if ! IsRunning () {
231- break
232- }
233273 time .Sleep (1 * time .Second )
274+ if err := process .Signal (syscall .Signal (0 )); err != nil {
275+ // 进程已退出
276+ os .Remove (pidFile )
277+ os .Remove (stopMarker )
278+ return nil
279+ }
234280 }
235281
236- // 如果进程还在运行,强制杀死
237- if IsRunning () {
238- // 忽略错误,因为进程可能在检查和发送信号之间已经退出
239- process .Signal (syscall .SIGKILL )
240- time .Sleep (500 * time .Millisecond )
241- }
282+ // 超时,强制杀死
283+ process .Signal (syscall .SIGKILL )
284+ time .Sleep (500 * time .Millisecond )
242285
243286 os .Remove (pidFile )
244287 os .Remove (stopMarker )
0 commit comments