diff --git a/dimos/core/native_module.py b/dimos/core/native_module.py index f4a674cb5d..00cc2d77d0 100644 --- a/dimos/core/native_module.py +++ b/dimos/core/native_module.py @@ -183,11 +183,17 @@ def stop(self) -> None: ) self._process.kill() self._process.wait(timeout=5) - if self._watchdog is not None and self._watchdog is not threading.current_thread(): - self._watchdog.join(timeout=2) - self._watchdog = None self._process = None super().stop() + # Join the watchdog AFTER super().stop() so all module threads are + # cleaned up first. When the watchdog itself is the caller (crash + # path), it skips joining itself — but the thread exits naturally + # right after this returns. A second stop() from external code + # (e.g. test teardown) will reach here and join the now-finished + # watchdog thread, preventing monitor_threads from seeing a leak. + if self._watchdog is not None and self._watchdog is not threading.current_thread(): + self._watchdog.join(timeout=2) + self._watchdog = None def _watch_process(self) -> None: """Block until the native process exits; trigger stop() if it crashed.""" diff --git a/dimos/core/test_native_module.py b/dimos/core/test_native_module.py index e77b8f9a53..a7e6bd2b9a 100644 --- a/dimos/core/test_native_module.py +++ b/dimos/core/test_native_module.py @@ -107,6 +107,10 @@ def test_process_crash_triggers_stop() -> None: assert mod._process is None, f"Watchdog did not clean up after process {pid} died" + # Join the watchdog thread. stop() is idempotent but will now join the + # watchdog on the second call since the reference is preserved. + mod.stop() + @pytest.mark.slow def test_manual(dimos_cluster: ModuleCoordinator, args_file: str) -> None: