Skip to content

Commit 04d6e13

Browse files
author
ly
committed
Fix orphan process cleanup in manage workflows
1 parent a2f5c16 commit 04d6e13

2 files changed

Lines changed: 287 additions & 7 deletions

File tree

manage.sh

Lines changed: 96 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -167,9 +167,20 @@ function cleanup_on_exit() {
167167
local running_jobs=$(jobs -p)
168168
if [ -n "$running_jobs" ]; then
169169
kill $running_jobs 2>/dev/null || true
170+
wait $running_jobs 2>/dev/null || true
170171
fi
171172
}
173+
174+
# Exit path for trapped signals: run the cleanup exactly once, then leave the
# script with the status supplied by the trap (e.g. 143 for TERM, 130 for INT).
#
# Arguments:
#   $1  exit status to terminate the script with
function handle_signal_exit() {
    local status="$1"

    # Clear the EXIT trap before cleaning up; otherwise the `exit` below would
    # fire it and cleanup_on_exit would run a second time.
    trap - EXIT
    cleanup_on_exit
    exit "$status"
}
180+
172181
trap cleanup_on_exit EXIT
182+
trap 'handle_signal_exit 143' TERM
183+
trap 'handle_signal_exit 130' INT
173184

174185
# 确保执行权限
175186
chmod +x "$ROOT_DIR/agent-rule/scripts/"*.sh > /dev/null 2>&1 || true
@@ -228,6 +239,84 @@ function timeout_run() {
228239
fi
229240
}
230241

242+
# Run a shell command string under a Python supervisor that owns the command's
# process group, so a timeout or SIGTERM/SIGINT takes down every descendant.
#
# Arguments:
#   $1  timeout in whole seconds
#   $2  working directory for the command
#   $3  command line, executed via `bash -lc`
#
# Exit status: the command's own status; 124 on timeout; 128+N when the
# supervisor receives signal N or the command itself was killed by signal N.
#
# NOTE: `exec` replaces the calling shell, so invoke this from a subshell or a
# background job. The supervisor script is delivered on stdin through the
# quoted heredoc, so the supervised command inherits that stdin rather than
# the caller's.
function run_supervised_task() {
    local timeout_seconds="$1"
    local workdir="$2"
    local command_str="$3"

    exec "$PYTHON_CMD" - "$timeout_seconds" "$workdir" "$command_str" <<'PY'
import os
import signal
import subprocess
import sys
import time

timeout_seconds = int(sys.argv[1])
workdir = sys.argv[2]
command_str = sys.argv[3]
child = None


def group_is_gone():
    """Return True once no process is left in the child's process group."""
    # Reap the group leader first: an exited but unreaped leader is still a
    # member of the group, so the probe below would keep succeeding and the
    # caller would always burn its whole grace period.
    child.poll()
    try:
        os.killpg(child.pid, 0)
    except ProcessLookupError:
        return True
    return False


def terminate_group(sig, grace_seconds=1.0):
    """Send `sig` to the child's group; escalate to SIGKILL after the grace period."""
    if child is None:
        return
    try:
        os.killpg(child.pid, sig)
    except ProcessLookupError:
        return

    # Monotonic clock: the grace period must not stretch or shrink when the
    # wall clock is adjusted.
    deadline = time.monotonic() + grace_seconds
    while time.monotonic() < deadline:
        if group_is_gone():
            return
        time.sleep(0.05)

    if sig != signal.SIGKILL:
        try:
            os.killpg(child.pid, signal.SIGKILL)
        except ProcessLookupError:
            pass


def handle_signal(signum, _frame):
    """Tear down the whole group, then exit with the shell's 128+N convention."""
    terminate_group(signal.SIGTERM)
    sys.exit(128 + signum)


signal.signal(signal.SIGTERM, handle_signal)
signal.signal(signal.SIGINT, handle_signal)

# start_new_session=True makes the child a session (and process-group) leader,
# which is what lets killpg(child.pid, ...) reach all of its descendants.
child = subprocess.Popen(
    ["bash", "-lc", command_str],
    cwd=workdir,
    start_new_session=True,
)

try:
    return_code = child.wait(timeout=timeout_seconds)
except subprocess.TimeoutExpired:
    terminate_group(signal.SIGTERM)
    sys.exit(124)

# The command finished on its own; sweep up anything it left in its group.
terminate_group(signal.SIGTERM, grace_seconds=0.5)

# Popen reports death-by-signal as -N; translate it to 128+N so the exit
# status matches what handle_signal produces instead of a wrapped value.
if return_code < 0:
    return_code = 128 - return_code
sys.exit(return_code)
PY
}
307+
308+
# Run an argv-style command inside a directory under the process-group
# supervisor and return the supervisor's exit status.
#
# Arguments:
#   $1    timeout in seconds
#   $2    working directory
#   $3..  command and its arguments
function run_supervised_command_in_dir() {
    local timeout_seconds="$1" workdir="$2"
    shift 2

    # Declared separately from the assignment so `local` does not mask the
    # exit status of shell_join.
    local joined_command supervisor_pid
    joined_command="$(shell_join "$@")"

    # run_supervised_task exec()s the supervisor, so it runs as a background
    # job and this shell waits on it.
    run_supervised_task "$timeout_seconds" "$workdir" "$joined_command" &
    supervisor_pid=$!
    wait "$supervisor_pid"
}
319+
231320
function update_latest_log_pointer() {
232321
mkdir -p "$(dirname "$TEMP_LOG")"
233322
: > "$TEMP_LOG"
@@ -495,7 +584,7 @@ function run_task_in_dir() {
495584

496585
local command_str
497586
command_str="$(shell_join "$@")"
498-
run_task_with_progress "$title" "$success_msg" "$err_msg" timeout_run "$timeout_seconds" bash -c "cd \"$workdir\" && $command_str"
587+
run_task_with_progress "$title" "$success_msg" "$err_msg" run_supervised_task "$timeout_seconds" "$workdir" "$command_str"
499588
}
500589

501590
function begin_pipeline_execution() {
@@ -1010,9 +1099,9 @@ case "$cmd" in
10101099
break
10111100
done < "$selection_input" ;;
10121101
mcp:sync)
1013-
(cd "$ROOT_DIR/agent-mcp" && $PYTHON_CMD sync_mcp.py --config "$CONFIG_FILE" --def-file "$ROOT_DIR/config/mcp.json" $DRY_RUN_FLAG $OFFLINE_FLAG) ;;
1102+
run_supervised_command_in_dir 600 "$ROOT_DIR/agent-mcp" "$PYTHON_CMD" sync_mcp.py --config "$CONFIG_FILE" --def-file "$ROOT_DIR/config/mcp.json" $DRY_RUN_FLAG $OFFLINE_FLAG ;;
10141103
mcp:prepare)
1015-
(cd "$ROOT_DIR/agent-mcp" && $PYTHON_CMD sync_mcp.py --config "$CONFIG_FILE" --def-file "$ROOT_DIR/config/mcp.json" --prepare-runtime $DRY_RUN_FLAG $OFFLINE_FLAG) ;;
1104+
run_supervised_command_in_dir 600 "$ROOT_DIR/agent-mcp" "$PYTHON_CMD" sync_mcp.py --config "$CONFIG_FILE" --def-file "$ROOT_DIR/config/mcp.json" --prepare-runtime $DRY_RUN_FLAG $OFFLINE_FLAG ;;
10161105
rules:sync)
10171106
if ! target_names="$(list_active_rule_targets)"; then
10181107
exit 1
@@ -1025,9 +1114,9 @@ case "$cmd" in
10251114
while IFS= read -r target_name; do
10261115
[ -n "$target_name" ] && rule_args+=(--target "$target_name")
10271116
done <<< "$target_names"
1028-
(cd "$ROOT_DIR/agent-rule" && ./scripts/sync-agent-rules.sh "${rule_args[@]}" $DRY_RUN_FLAG) ;;
1117+
run_supervised_command_in_dir 600 "$ROOT_DIR/agent-rule" ./scripts/sync-agent-rules.sh "${rule_args[@]}" $DRY_RUN_FLAG ;;
10291118
skills:link)
1030-
(cd "$ROOT_DIR/agent-skill" && $PYTHON_CMD link_skills.py --config "$CONFIG_FILE" --def-file "$ROOT_DIR/config/skills.json" --yes $DRY_RUN_FLAG $OFFLINE_FLAG) ;;
1119+
run_supervised_command_in_dir 600 "$ROOT_DIR/agent-skill" "$PYTHON_CMD" link_skills.py --config "$CONFIG_FILE" --def-file "$ROOT_DIR/config/skills.json" --yes $DRY_RUN_FLAG $OFFLINE_FLAG ;;
10311120
skills:upgrade)
10321121
upgrade_args=(upgrade_skills.py --config "$CONFIG_FILE" --def-file "$ROOT_DIR/config/skills.json")
10331122
if [ "${#CMD_ARGS[@]}" -gt 0 ]; then
@@ -1037,9 +1126,9 @@ case "$cmd" in
10371126
fi
10381127
if [ -n "$DRY_RUN_FLAG" ]; then upgrade_args+=("$DRY_RUN_FLAG"); fi
10391128
if [ -n "$OFFLINE_FLAG" ]; then upgrade_args+=("$OFFLINE_FLAG"); fi
1040-
(cd "$ROOT_DIR/agent-skill" && $PYTHON_CMD "${upgrade_args[@]}") ;;
1129+
run_supervised_command_in_dir 600 "$ROOT_DIR/agent-skill" "$PYTHON_CMD" "${upgrade_args[@]}" ;;
10411130
tools:sync)
1042-
(cd "$ROOT_DIR/agent-tool" && $PYTHON_CMD sync_tools.py --config "$CONFIG_FILE" --def-file "$ROOT_DIR/config/tools.json" --yes $DRY_RUN_FLAG $OFFLINE_FLAG) ;;
1131+
run_supervised_command_in_dir 600 "$ROOT_DIR/agent-tool" "$PYTHON_CMD" sync_tools.py --config "$CONFIG_FILE" --def-file "$ROOT_DIR/config/tools.json" --yes $DRY_RUN_FLAG $OFFLINE_FLAG ;;
10431132
terminal:sync)
10441133
run_terminal_sync_task true ;;
10451134
fetch-models)

tests/test_manage.py

Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
import os
22
import json
33
import shutil
4+
import signal
45
import subprocess
56
import tempfile
7+
import time
68
import unittest
79
from getpass import getuser
810
from pathlib import Path
@@ -1895,6 +1897,109 @@ def test_atomic_lock_directory_blocks_new_run(self):
18951897
restore_path(self.lock_path, lock_backup)
18961898
restore_path(self.lock_dir, lock_dir_backup)
18971899

1900+
def test_termination_cleans_up_descendant_processes(self):
    """Terminating ``manage.sh update`` must not leave a task's child running.

    The ``agent-lifecycle/sync_agents.py`` stub spawns a 60-second sleeper,
    records its PID, signals readiness and then idles; the other task
    scripts are replaced with no-op stubs. After ``manage.sh`` receives
    SIGTERM and exits, the sleeper must be gone within a few seconds.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_root = Path(temp_dir) / "repo"
        shutil.copytree(ROOT, temp_root, ignore=shutil.ignore_patterns('repositories'))

        child_pid_file = Path(temp_dir) / "child.pid"
        ready_file = Path(temp_dir) / "ready"

        self._write_stub(
            temp_root / "agent-lifecycle" / "sync_agents.py",
            (
                "#!/usr/bin/env python3\n"
                "import os\n"
                "import signal\n"
                "import subprocess\n"
                "import sys\n"
                "import time\n"
                "child = subprocess.Popen([sys.executable, '-c', 'import time; time.sleep(60)'])\n"
                "with open(os.environ['CHILD_PID_FILE'], 'w', encoding='utf-8') as fh:\n"
                " fh.write(str(child.pid))\n"
                "Path = __import__('pathlib').Path\n"
                "Path(os.environ['READY_FILE']).write_text('ready\\n', encoding='utf-8')\n"
                "signal.signal(signal.SIGTERM, lambda *_args: sys.exit(143))\n"
                "while True:\n"
                " time.sleep(1)\n"
            ),
        )

        # Every other task script exits successfully straight away.
        noop_python = "#!/usr/bin/env python3\nimport sys\nsys.exit(0)\n"
        self._write_stub(temp_root / "agent-tool" / "sync_tools.py", noop_python)
        self._write_stub(temp_root / "agent-skill" / "link_skills.py", noop_python)
        self._write_stub(temp_root / "agent-mcp" / "sync_mcp.py", noop_python)
        self._write_stub(
            temp_root / "agent-rule" / "scripts" / "sync-agent-rules.sh",
            "#!/usr/bin/env bash\nexit 0\n",
        )
        self._write_stub(temp_root / "agent-terminal" / "sync_terminal.py", noop_python)

        env = self._managed_env()
        env["CHILD_PID_FILE"] = str(child_pid_file)
        env["READY_FILE"] = str(ready_file)

        proc = subprocess.Popen(
            [str(temp_root / "manage.sh"), "--offline", "update"],
            cwd=temp_root,
            env=env,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )

        child_pid = None
        try:
            # Wait for the stub to report the sleeper's PID, or for manage.sh
            # to exit early.
            deadline = time.time() + 10
            while time.time() < deadline:
                if ready_file.exists() and child_pid_file.exists():
                    child_pid = int(child_pid_file.read_text(encoding="utf-8").strip())
                    break
                if proc.poll() is not None:
                    break
                time.sleep(0.1)

            self.assertIsNotNone(child_pid, "生命周期任务未按预期启动子进程")
            # Raises OSError if the sleeper is not actually running yet.
            os.kill(child_pid, 0)

            proc.terminate()
            proc.wait(timeout=10)
            # Close the pipes instead of draining them: a leaked descendant
            # would still hold the write ends, and communicate() would time
            # out before the assertion below could report the leak.
            proc.stdout.close()
            proc.stderr.close()

            deadline = time.time() + 3
            child_alive = True
            while time.time() < deadline:
                try:
                    os.kill(child_pid, 0)
                except OSError:
                    child_alive = False
                    break
                time.sleep(0.1)

            self.assertFalse(child_alive, f"manage.sh 退出后子进程仍存活: {child_pid}")
        finally:
            try:
                if proc.poll() is None:
                    proc.kill()
                    proc.wait(timeout=5)
            finally:
                # Always release the pipes and the sleeper, even if reaping
                # manage.sh failed, so a failing run leaves nothing behind.
                proc.stdout.close()
                proc.stderr.close()
                if child_pid is not None:
                    try:
                        os.kill(child_pid, signal.SIGKILL)
                    except OSError:
                        pass
2002+
18982003
def test_unlock_removes_atomic_lock_artifacts(self):
18992004
lock_backup = move_aside(self.lock_path)
19002005
lock_dir_backup = move_aside(self.lock_dir)
@@ -2594,3 +2699,89 @@ def test_mcp_sync_defaults_to_config_only_mode(self):
25942699
combined = (result.stdout + result.stderr).decode("utf-8", errors="replace")
25952700
self.assertEqual(result.returncode, 0, combined)
25962701
self.assertTrue(agent_config_path.exists(), combined)
2702+
2703+
def test_atomic_mcp_sync_termination_cleans_up_descendant_processes(self):
    """Terminating ``manage.sh mcp:sync`` must not leave the stub's child running.

    ``agent-mcp/sync_mcp.py`` is replaced by a stub that spawns a 60-second
    sleeper, writes its PID, signals readiness and then idles. Once
    ``manage.sh`` has been sent SIGTERM and exited, the sleeper must
    disappear within a few seconds.
    """
    stub_source = (
        "#!/usr/bin/env python3\n"
        "import os\n"
        "import signal\n"
        "import subprocess\n"
        "import sys\n"
        "import time\n"
        "child = subprocess.Popen([sys.executable, '-c', 'import time; time.sleep(60)'])\n"
        "with open(os.environ['CHILD_PID_FILE'], 'w', encoding='utf-8') as fh:\n"
        " fh.write(str(child.pid))\n"
        "Path = __import__('pathlib').Path\n"
        "Path(os.environ['READY_FILE']).write_text('ready\\n', encoding='utf-8')\n"
        "signal.signal(signal.SIGTERM, lambda *_args: sys.exit(143))\n"
        "while True:\n"
        " time.sleep(1)\n"
    )

    with tempfile.TemporaryDirectory() as temp_dir:
        scratch_dir = Path(temp_dir)
        repo_copy = scratch_dir / "repo"
        shutil.copytree(ROOT, repo_copy, ignore=shutil.ignore_patterns('repositories'))

        pid_marker = scratch_dir / "mcp-child.pid"
        ready_marker = scratch_dir / "mcp-ready"

        self._write_stub(repo_copy / "agent-mcp" / "sync_mcp.py", stub_source)

        env = self._managed_env()
        env["CHILD_PID_FILE"] = str(pid_marker)
        env["READY_FILE"] = str(ready_marker)

        proc = subprocess.Popen(
            [str(repo_copy / "manage.sh"), "--offline", "mcp:sync"],
            cwd=repo_copy,
            env=env,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )

        child_pid = None
        try:
            # Phase 1: wait until the stub has published the sleeper's PID,
            # stopping early if manage.sh exits first.
            give_up_at = time.time() + 10
            while child_pid is None and time.time() < give_up_at:
                if ready_marker.exists() and pid_marker.exists():
                    child_pid = int(pid_marker.read_text(encoding="utf-8").strip())
                elif proc.poll() is not None:
                    break
                else:
                    time.sleep(0.1)

            self.assertIsNotNone(child_pid, "mcp:sync 未按预期启动子进程")
            # Raises OSError if the sleeper is not actually running.
            os.kill(child_pid, 0)

            # Phase 2: terminate manage.sh and drop our ends of its pipes.
            proc.terminate()
            proc.wait(timeout=10)
            proc.stdout.close()
            proc.stderr.close()

            # Phase 3: the sleeper must vanish within three seconds.
            give_up_at = time.time() + 3
            child_alive = True
            while child_alive and time.time() < give_up_at:
                try:
                    os.kill(child_pid, 0)
                except OSError:
                    child_alive = False
                else:
                    time.sleep(0.1)

            self.assertFalse(child_alive, f"mcp:sync 退出后子进程仍存活: {child_pid}")
        finally:
            if proc.poll() is None:
                proc.kill()
                proc.wait(timeout=5)
            proc.stdout.close()
            proc.stderr.close()
            if child_pid is not None:
                try:
                    os.kill(child_pid, signal.SIGKILL)
                except OSError:
                    pass

0 commit comments

Comments
 (0)