Skip to content

Commit 9680dcf

Browse files
hhe48203-ctrl, claude, SAKURA-CAT
authored
fix: add SIGINT signal handler for reliable Ctrl+C experiment abort (#1504)
* fix: add SIGINT signal handler for reliable Ctrl+C experiment abort

  sys.excepthook alone cannot reliably catch KeyboardInterrupt when the main thread is blocked in C extensions (NumPy/PyTorch). Register a signal.SIGINT handler as an additional safety net to ensure experiments are always marked as "aborted" on Ctrl+C.

  Fixes #1329

* fix: correct SIGINT handler chaining for SIG_IGN case

  When the original handler was SIG_IGN, the handler incorrectly raised KeyboardInterrupt instead of silently returning. Simplify the logic to explicitly handle SIG_IGN, callable handlers, and SIG_DFL separately. Add test for SIG_IGN scenario.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Call saved original excepthook and update tests

  Invoke the saved original excepthook (self._sys_origin_excepthook) instead of directly calling sys.__excepthook__, and make the KeyboardInterrupt log message clearer ("aborting run..."). Update unit tests to set and assert against the saved hook, rename tests for clarity, and add coverage to ensure an outer framework's hook is used (and the builtin is not called) as well as that internal errors still result in calling the saved hook.

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Kang Li <79990647+SAKURA-CAT@users.noreply.github.com>
1 parent 8ef2a43 commit 9680dcf

2 files changed

Lines changed: 99 additions & 22 deletions

File tree

swanlab/sdk/internal/run/__init__.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
"""
1010

1111
import atexit
12+
import signal
1213
import sys
1314
import threading
1415
import traceback
@@ -145,6 +146,10 @@ def __init__(self, ctx: RunContext):
145146
self._sys_origin_excepthook = sys.excepthook
146147
atexit.register(self._atexit_cleanup)
147148
sys.excepthook = self._excepthook
149+
# 注册 SIGINT handler,确保 Ctrl+C 能可靠地将实验标记为 aborted
150+
# sys.excepthook 在主线程阻塞于 C 扩展时可能无法触发
151+
self._original_sigint_handler = signal.getsignal(signal.SIGINT)
152+
signal.signal(signal.SIGINT, self._sigint_handler)
148153
# 绑定日志文件,运行正式开始
149154
if self._ctx.config.settings.mode != "disabled":
150155
log.bindfile(self._ctx.debug_dir)
@@ -153,6 +158,26 @@ def __init__(self, ctx: RunContext):
153158
# 私有钩子
154159
# ----------------------------------
155160

161+
def _sigint_handler(self, signum: int, frame: Any) -> None:
    """Handle SIGINT so that Ctrl+C reliably marks the run as aborted.

    ``sys.excepthook`` relies on a ``KeyboardInterrupt`` being raised at the
    Python level; when the main thread is blocked inside a C extension
    (NumPy, PyTorch, ...) that exception may not propagate to the hook.
    This handler is an additional line of defence that reacts at the signal
    level.

    After finishing the run, the handler that was active before this run
    installed its own is restored and then chained to, so the surrounding
    program keeps its usual Ctrl+C behaviour.

    :param signum: Signal number delivered by the interpreter.
    :param frame: Stack frame that was executing when the signal arrived.
    :raises KeyboardInterrupt: When the previous handler is not a Python
        callable and is not ``SIG_IGN`` (i.e. ``SIG_DFL`` or ``None``).
    """
    if self._state == "running":
        console.info("KeyboardInterrupt by user")
        try:
            self.finish(state="aborted", error="KeyboardInterrupt")
        except Exception as e:
            # Same policy as _excepthook: a failure while finishing must not
            # prevent the previous handler from being restored and chained.
            console.error(f"SwanLab failed to handle SIGINT: {e}")

    previous = self._original_sigint_handler
    # signal.getsignal() reports None when the previous handler was not
    # installed from Python; signal.signal() rejects None with a TypeError,
    # so there is nothing that can be restored in that case.
    if previous is not None:
        signal.signal(signal.SIGINT, previous)

    if previous is signal.SIG_IGN:
        return  # Signal was ignored before this run started; keep ignoring it.
    if callable(previous):
        previous(signum, frame)
        return
    # SIG_DFL (OS default action) or None cannot be invoked from Python.
    # Raise KeyboardInterrupt, which is what Python's own default SIGINT
    # handler does, so the interruption still reaches the main thread.
    raise KeyboardInterrupt
180+
156181
def _atexit_cleanup(self) -> None:
157182
"""程序正常退出时自动结束当前运行"""
158183
if self._state != "running":
@@ -172,7 +197,7 @@ def _excepthook(
172197
return
173198
state: FinishType = "crashed"
174199
if tp is KeyboardInterrupt:
175-
console.info("KeyboardInterrupt by user")
200+
console.info("KeyboardInterrupt by user, aborting run...")
176201
state = "aborted"
177202
else:
178203
console.info("Error happened while training")
@@ -181,7 +206,7 @@ def _excepthook(
181206
except Exception as e:
182207
console.error(f"SwanLab failed to handle excepthook: {e}")
183208
finally:
184-
sys.__excepthook__(tp, val, tb)
209+
self._sys_origin_excepthook(tp, val, tb)
185210

186211
def _cleanup(self):
187212
"""
@@ -191,6 +216,7 @@ def _cleanup(self):
191216
console.debug("Cleanup system hook...")
192217
atexit.unregister(self._atexit_cleanup)
193218
sys.excepthook = self._sys_origin_excepthook
219+
signal.signal(signal.SIGINT, self._original_sigint_handler)
194220
# 清理全局运行实例
195221
console.debug("Cleanup global instance...")
196222
clear_run()

tests/unit/sdk/internal/run/test_finish_hook.py

Lines changed: 71 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,16 @@
22
@author: cunyue
33
@file: test_finish_hook.py
44
@time: 2026/3/14
5-
@description: 测试 SwanLabRun._atexit_cleanup / _excepthook 的单元行为(均 mock 依赖,不启动真实 Run)
5+
@description: 测试 SwanLabRun._atexit_cleanup / _excepthook / _sigint_handler 的单元行为(均 mock 依赖,不启动真实 Run)
66
"""
77

8+
import signal
89
import sys
910
import threading
1011
from unittest.mock import ANY, MagicMock, patch
1112

13+
import pytest
14+
1215
from swanlab.sdk.internal.run import SwanLabRun
1316

1417

@@ -49,42 +52,90 @@ class TestExcepthook:
4952
def test_keyboard_interrupt_calls_aborted(self):
5053
"""KeyboardInterrupt → finish(state='aborted', ...)"""
5154
run = _make_mock_run()
52-
with patch("sys.__excepthook__"):
53-
tp, val, tb = _make_exc_info(KeyboardInterrupt())
54-
SwanLabRun._excepthook(run, tp, val, tb)
55+
run._sys_origin_excepthook = MagicMock()
56+
tp, val, tb = _make_exc_info(KeyboardInterrupt())
57+
SwanLabRun._excepthook(run, tp, val, tb)
5558
run.finish.assert_called_once_with(state="aborted", error=ANY)
5659

5760
def test_generic_exception_calls_crashed(self):
5861
"""普通异常 → finish(state='crashed'),error 包含完整 traceback"""
5962
run = _make_mock_run()
60-
with patch("sys.__excepthook__"):
61-
tp, val, tb = _make_exc_info(RuntimeError("boom"))
62-
SwanLabRun._excepthook(run, tp, val, tb)
63+
run._sys_origin_excepthook = MagicMock()
64+
tp, val, tb = _make_exc_info(RuntimeError("boom"))
65+
SwanLabRun._excepthook(run, tp, val, tb)
6366
call_kwargs = run.finish.call_args.kwargs
6467
assert call_kwargs["state"] == "crashed"
6568
assert "boom" in call_kwargs["error"]
6669

6770
def test_no_op_when_not_running(self):
6871
"""_state != 'running' 时不调用 finish"""
6972
run = _make_mock_run(state="success")
70-
with patch("sys.__excepthook__"):
71-
tp, val, tb = _make_exc_info(RuntimeError("no run"))
72-
SwanLabRun._excepthook(run, tp, val, tb)
73+
run._sys_origin_excepthook = MagicMock()
74+
tp, val, tb = _make_exc_info(RuntimeError("no run"))
75+
SwanLabRun._excepthook(run, tp, val, tb)
7376
run.finish.assert_not_called()
7477

75-
def test_always_calls_original_excepthook(self):
76-
"""无论是否有活跃 Run,sys.__excepthook__ 必须被调用一次"""
78+
def test_always_calls_saved_origin_excepthook(self):
79+
"""无论是否有活跃 Run,始终调用注册时保存的原始 hook,而非 sys.__excepthook__"""
7780
run = _make_mock_run(state="success")
78-
with patch("sys.__excepthook__") as mock_original:
79-
tp, val, tb = _make_exc_info(RuntimeError("test"))
81+
saved_hook = MagicMock()
82+
run._sys_origin_excepthook = saved_hook
83+
tp, val, tb = _make_exc_info(RuntimeError("test"))
84+
SwanLabRun._excepthook(run, tp, val, tb)
85+
saved_hook.assert_called_once_with(tp, val, tb)
86+
87+
def test_calls_saved_hook_not_builtin(self):
88+
"""当外层框架替换了 sys.excepthook 时,调用保存的外层 hook,而非内置默认 hook"""
89+
run = _make_mock_run()
90+
outer_framework_hook = MagicMock()
91+
run._sys_origin_excepthook = outer_framework_hook
92+
tp, val, tb = _make_exc_info(RuntimeError("outer"))
93+
with patch("sys.__excepthook__") as mock_builtin:
8094
SwanLabRun._excepthook(run, tp, val, tb)
81-
mock_original.assert_called_once_with(tp, val, tb)
95+
outer_framework_hook.assert_called_once_with(tp, val, tb)
96+
mock_builtin.assert_not_called()
8297

8398
def test_internal_error_doesnt_crash(self):
84-
"""excepthook 内部出错时不向上抛出,仍调用 sys.__excepthook__"""
99+
"""excepthook 内部出错时不向上抛出,仍调用保存的原始 hook"""
85100
run = _make_mock_run()
86101
run.finish.side_effect = Exception("internal boom")
87-
with patch("sys.__excepthook__") as mock_original:
88-
tp, val, tb = _make_exc_info(RuntimeError("outer"))
89-
SwanLabRun._excepthook(run, tp, val, tb)
90-
mock_original.assert_called_once_with(tp, val, tb)
102+
saved_hook = MagicMock()
103+
run._sys_origin_excepthook = saved_hook
104+
tp, val, tb = _make_exc_info(RuntimeError("outer"))
105+
SwanLabRun._excepthook(run, tp, val, tb)
106+
saved_hook.assert_called_once_with(tp, val, tb)
107+
108+
109+
class TestSigintHandler:
110+
def test_calls_finish_aborted_when_running(self):
111+
"""SIGINT handler 在实验运行中应调用 finish(state='aborted')"""
112+
run = _make_mock_run()
113+
run._original_sigint_handler = signal.SIG_DFL
114+
with pytest.raises(KeyboardInterrupt):
115+
SwanLabRun._sigint_handler(run, signal.SIGINT, None)
116+
run.finish.assert_called_once_with(state="aborted", error="KeyboardInterrupt")
117+
118+
def test_no_op_when_not_running(self):
119+
"""_state != 'running' 时不调用 finish,仍抛出 KeyboardInterrupt"""
120+
run = _make_mock_run(state="success")
121+
run._original_sigint_handler = signal.SIG_DFL
122+
with pytest.raises(KeyboardInterrupt):
123+
SwanLabRun._sigint_handler(run, signal.SIGINT, None)
124+
run.finish.assert_not_called()
125+
126+
def test_calls_original_callable_handler(self):
127+
"""如果原始 handler 是 callable,应调用它而非 raise KeyboardInterrupt"""
128+
run = _make_mock_run()
129+
original = MagicMock()
130+
run._original_sigint_handler = original
131+
SwanLabRun._sigint_handler(run, signal.SIGINT, None)
132+
run.finish.assert_called_once_with(state="aborted", error="KeyboardInterrupt")
133+
original.assert_called_once_with(signal.SIGINT, None)
134+
135+
def test_sig_ign_does_nothing(self):
136+
"""如果原始 handler 是 SIG_IGN,应静默返回,不抛出 KeyboardInterrupt"""
137+
run = _make_mock_run()
138+
run._original_sigint_handler = signal.SIG_IGN
139+
# Should not raise
140+
SwanLabRun._sigint_handler(run, signal.SIGINT, None)
141+
run.finish.assert_called_once_with(state="aborted", error="KeyboardInterrupt")

0 commit comments

Comments
 (0)