diff --git a/packages/cli/src/repowise/cli/commands/augment_cmd.py b/packages/cli/src/repowise/cli/commands/augment_cmd.py index db8988d6..23ac740c 100644 --- a/packages/cli/src/repowise/cli/commands/augment_cmd.py +++ b/packages/cli/src/repowise/cli/commands/augment_cmd.py @@ -1097,19 +1097,23 @@ def _read_in_flight_marker(repo_path: object) -> dict | None: repo_path = Path(repo_path) now = time.time() - lock_path = repo_path / ".repowise" / ".update.lock" - if lock_path.exists(): - try: - payload = json.loads(lock_path.read_text(encoding="utf-8")) - started = payload.get("started_at") - if isinstance(started, (int, float)) and now - started <= 30 * 60: - return { - "source": "lock", - "target_commit": payload.get("target_commit"), - "elapsed_seconds": now - started, - } - except (json.JSONDecodeError, OSError): - pass + # Delegate lock freshness to the canonical reader: it layers a live-PID + # probe on top of the wall-clock window, so a crashed update's leftover + # lock can't suppress real stale-wiki warnings until the window expires. + from repowise.cli.helpers import read_update_lock + + try: + payload = read_update_lock(repo_path) + except Exception: + payload = None + if payload is not None: + started = payload.get("started_at") + if isinstance(started, (int, float)): + return { + "source": "lock", + "target_commit": payload.get("target_commit"), + "elapsed_seconds": now - started, + } queued_path = repo_path / ".repowise" / ".update.queued" if queued_path.exists(): diff --git a/packages/cli/src/repowise/cli/editor_integrations/claude_config.py b/packages/cli/src/repowise/cli/editor_integrations/claude_config.py index c88b1434..a95c49e6 100644 --- a/packages/cli/src/repowise/cli/editor_integrations/claude_config.py +++ b/packages/cli/src/repowise/cli/editor_integrations/claude_config.py @@ -10,6 +10,7 @@ generate_mcp_config, load_existing_config, merge_mcp_entry, + resolve_repowise_command, ) from repowise.core.workspace.config import find_workspace_root @@ -71,7 +72,11 @@ def register_with_claude_desktop(repo_path: Path) -> Path | None: # Claude Desktop not installed return None target = _resolve_mcp_target(repo_path) - entry = generate_mcp_config(target)["mcpServers"] + # Per-user config: pin the absolute path of the running install so a + # PATH shadow install (conda, old pip, pipx) can't hijack the server. + # Refreshed on every re-registration, so a moved venv self-heals on the + # next `repowise init`/`update`. + entry = generate_mcp_config(target, command=resolve_repowise_command())["mcpServers"] return config_path if merge_mcp_entry(config_path, entry) else None @@ -86,7 +91,8 @@ def register_with_claude_code(repo_path: Path) -> Path | None: """ settings_path = _claude_code_settings_path() target = _resolve_mcp_target(repo_path) - entry = generate_mcp_config(target)["mcpServers"] + # Per-user config: absolute command, see register_with_claude_desktop. + entry = generate_mcp_config(target, command=resolve_repowise_command())["mcpServers"] return settings_path if merge_mcp_entry(settings_path, entry) else None diff --git a/packages/cli/src/repowise/cli/helpers.py b/packages/cli/src/repowise/cli/helpers.py index 4356aff6..5283b83d 100644 --- a/packages/cli/src/repowise/cli/helpers.py +++ b/packages/cli/src/repowise/cli/helpers.py @@ -211,16 +211,22 @@ def acquire_update_lock(repo_path: Path, target_commit: str | None) -> Path: """Write the update lock file. Returns its path. The lock contains the PID and target commit so the augment hook can - decide whether a stale-wiki warning is redundant. Best-effort: if write - fails (read-only fs, permissions), returns the path anyway — callers - must still call ``release_update_lock`` in a finally block. + decide whether a stale-wiki warning is redundant, plus the writing + process's creation-time token so ``read_update_lock`` can tell a live + lock owner apart from an unrelated process that recycled the PID. + Best-effort: if write fails (read-only fs, permissions), returns the + path anyway — callers must still call ``release_update_lock`` in a + finally block. """ import time + from repowise.core.procutils import process_create_token + ensure_repowise_dir(repo_path) lock_path = _update_lock_path(repo_path) payload = { "pid": os.getpid(), + "pid_create_token": process_create_token(os.getpid()), "target_commit": target_commit, "started_at": time.time(), } @@ -240,9 +246,21 @@ def release_update_lock(repo_path: Path) -> None: def read_update_lock(repo_path: Path) -> dict[str, Any] | None: - """Return the lock payload if present and not stale, else ``None``.""" + """Return the lock payload if present and not stale, else ``None``. + + A lock is stale when its wall-clock age exceeds + ``UPDATE_LOCK_STALE_AFTER_SECONDS`` (a hung-but-alive update must not + block forever) — or, much sooner, when its owning PID is positively + dead or has been recycled by an unrelated process. The PID probe means + a crashed/killed update (SIGKILL, power loss — paths atexit can't + cover) no longer blocks further updates for the full 30-minute window. + Probes that can't decide ("unknown") fall back to the wall clock, so a + live update is never treated as stale by mistake. + """ import time + from repowise.core.procutils import pid_alive, process_create_token + lock_path = _update_lock_path(repo_path) if not lock_path.exists(): return None @@ -256,6 +274,20 @@ def read_update_lock(repo_path: Path) -> dict[str, Any] | None: return None if time.time() - started > UPDATE_LOCK_STALE_AFTER_SECONDS: return None + + pid = payload.get("pid") + if isinstance(pid, int) and pid > 0: + alive = pid_alive(pid) + if alive is False: + return None + if alive is True: + stored_token = payload.get("pid_create_token") + # Legacy locks (pre-token) skip the identity check and rely on + # liveness + wall clock alone. + if isinstance(stored_token, str) and stored_token: + current_token = process_create_token(pid) + if current_token is not None and current_token != stored_token: + return None return payload diff --git a/packages/cli/src/repowise/cli/mcp_config.py b/packages/cli/src/repowise/cli/mcp_config.py index fe42e575..0856931e 100644 --- a/packages/cli/src/repowise/cli/mcp_config.py +++ b/packages/cli/src/repowise/cli/mcp_config.py @@ -6,22 +6,77 @@ import re import shutil import subprocess +import sys +import tempfile import tomllib from pathlib import Path import click -def generate_mcp_config(repo_path: Path) -> dict: +def _looks_transient(path: Path) -> bool: + """True when *path* lives somewhere that won't survive (temp, uvx cache). + + ``uvx repowise`` runs from an ephemeral cache environment; pinning a + registration to it would break on the next cache eviction. Same for + anything under the OS temp dir. + """ + try: + resolved = path.resolve() + except OSError: + return True + try: + if resolved.is_relative_to(Path(tempfile.gettempdir()).resolve()): + return True + except (OSError, ValueError): + pass + parts = {part.lower() for part in resolved.parts} + # uv tool-run cache: ~/.cache/uv/archive-v0/... (POSIX) or + # %LOCALAPPDATA%/uv/cache/archive-v0/... (Windows). + return "uv" in parts and ("cache" in parts or ".cache" in parts) + + +def resolve_repowise_command(script: str = "repowise") -> str: + """Absolute path of the *running* install's console script, or the bare name. + + Registrations that store the bare command name are resolved via PATH at + session start, so any shadow install (conda, old pip, pipx, uv tool) + silently hijacks the MCP server. For **per-user** config files we pin + the absolute path of the install that ran ``init`` instead. Repo-shared + files (``.mcp.json``, ``.codex/config.toml``) must keep the bare name — + they may be committed, and one contributor's absolute path would break + everyone else's checkout. + + The lookup is the running interpreter's scripts directory (``Scripts`` + on Windows, ``bin`` elsewhere) — i.e. the venv/conda/pipx/uv-tool + environment actually executing right now, never PATH. Falls back to the + bare name when the script isn't there (e.g. ``python -m`` from a source + checkout) or the location is transient (uvx cache, temp dir). + """ + suffix = ".exe" if sys.platform == "win32" else "" + try: + candidate = Path(sys.executable).parent / f"{script}{suffix}" + if candidate.is_file() and not _looks_transient(candidate): + return str(candidate.resolve()).replace("\\", "/") + except OSError: + pass + return script + + +def generate_mcp_config(repo_path: Path, *, command: str | None = None) -> dict: """Generate MCP config JSON for a repository. - Returns a dict in the standard mcpServers format. + Returns a dict in the standard mcpServers format. *command* defaults to + the bare ``repowise`` (PATH-resolved) — callers writing **per-user** + config files should pass ``resolve_repowise_command()`` to pin the + registration to the install that ran ``init``; repo-shared files keep + the default (see :func:`resolve_repowise_command`). """ abs_path = str(repo_path.resolve()).replace("\\", "/") return { "mcpServers": { "repowise": { - "command": "repowise", + "command": command or "repowise", "args": ["mcp", abs_path, "--transport", "stdio"], "description": "repowise: codebase intelligence — docs, graph, git signals, dead code, decisions", } diff --git a/packages/core/src/repowise/core/procutils.py b/packages/core/src/repowise/core/procutils.py new file mode 100644 index 00000000..4d305504 --- /dev/null +++ b/packages/core/src/repowise/core/procutils.py @@ -0,0 +1,373 @@ +"""Cross-platform process liveness and identity helpers (stdlib only). + +Used by two consumers: + +* The update lock (``repowise.cli.helpers``) — probe whether the PID that + wrote ``.repowise/.update.lock`` is still alive, so a crashed update does + not block further updates for the full wall-clock staleness window. +* The MCP stdio watchdog (``repowise.server.mcp_server``) — walk the + ancestor chain at startup and exit when the MCP client dies, so crashed + or force-quit agent sessions don't leak ``repowise mcp`` servers (which + hold wiki.db handles and contend with later updates). + +Design constraints, in order: + +* **Never signal on Windows.** ``os.kill(pid, sig)`` on Windows is not a + probe — any signal other than CTRL_C_EVENT/CTRL_BREAK_EVENT calls + ``TerminateProcess``. All Windows paths go through ``ctypes``/kernel32. +* **No new dependencies** (no psutil). Windows uses kernel32 via ctypes, + Linux uses ``/proc``, macOS/other POSIX fall back to one-shot ``ps`` + calls (startup / rare paths only, never on a per-tool-call hot path). +* **Best-effort, fail open.** Every helper returns ``None`` for "unknown" + rather than raising; callers must treat unknown conservatively (e.g. a + lock whose owner can't be probed is assumed alive). +""" + +from __future__ import annotations + +import os +import subprocess +import sys +from dataclasses import dataclass +from pathlib import Path + +_IS_WINDOWS = sys.platform == "win32" + +# --------------------------------------------------------------------------- +# Windows (ctypes / kernel32) +# --------------------------------------------------------------------------- + +if _IS_WINDOWS: + import ctypes + from ctypes import wintypes + + _kernel32 = ctypes.WinDLL("kernel32", use_last_error=True) + + _PROCESS_QUERY_LIMITED_INFORMATION = 0x1000 + _SYNCHRONIZE = 0x00100000 + _ERROR_INVALID_PARAMETER = 87 + _ERROR_ACCESS_DENIED = 5 + _WAIT_TIMEOUT = 0x102 + _WAIT_OBJECT_0 = 0x0 + _STILL_ACTIVE = 259 + _TH32CS_SNAPPROCESS = 0x2 + _INVALID_HANDLE_VALUE = wintypes.HANDLE(-1).value + + # Explicit signatures — the ctypes default return type is c_int, which + # truncates 64-bit HANDLEs. + _kernel32.OpenProcess.restype = wintypes.HANDLE + _kernel32.OpenProcess.argtypes = [wintypes.DWORD, wintypes.BOOL, wintypes.DWORD] + _kernel32.CloseHandle.restype = wintypes.BOOL + _kernel32.CloseHandle.argtypes = [wintypes.HANDLE] + _kernel32.WaitForSingleObject.restype = wintypes.DWORD + _kernel32.WaitForSingleObject.argtypes = [wintypes.HANDLE, wintypes.DWORD] + _kernel32.GetExitCodeProcess.restype = wintypes.BOOL + _kernel32.GetExitCodeProcess.argtypes = [wintypes.HANDLE, ctypes.POINTER(wintypes.DWORD)] + _kernel32.CreateToolhelp32Snapshot.restype = wintypes.HANDLE + _kernel32.CreateToolhelp32Snapshot.argtypes = [wintypes.DWORD, wintypes.DWORD] + + class _PROCESSENTRY32W(ctypes.Structure): + _fields_ = [ + ("dwSize", wintypes.DWORD), + ("cntUsage", wintypes.DWORD), + ("th32ProcessID", wintypes.DWORD), + ("th32DefaultHeapID", ctypes.POINTER(ctypes.c_ulong)), + ("th32ModuleID", wintypes.DWORD), + ("cntThreads", wintypes.DWORD), + ("th32ParentProcessID", wintypes.DWORD), + ("pcPriClassBase", ctypes.c_long), + ("dwFlags", wintypes.DWORD), + ("szExeFile", ctypes.c_wchar * 260), + ] + + def _win_open_process(pid: int, access: int = _PROCESS_QUERY_LIMITED_INFORMATION) -> int | None: + """OpenProcess with the given rights; None when it can't be opened.""" + handle = _kernel32.OpenProcess(access, False, pid) + return handle or None + + def _win_pid_alive(pid: int) -> bool | None: + # Preferred probe: a zero-timeout wait on the process handle. + # WaitForSingleObject requires SYNCHRONIZE access on the handle. + handle = _win_open_process(pid, _PROCESS_QUERY_LIMITED_INFORMATION | _SYNCHRONIZE) + if handle: + try: + res = _kernel32.WaitForSingleObject(handle, 0) + if res == _WAIT_TIMEOUT: + return True + if res == _WAIT_OBJECT_0: + return False + return None + finally: + _kernel32.CloseHandle(handle) + + err = ctypes.get_last_error() + if err == _ERROR_INVALID_PARAMETER: + # No such process. + return False + if err != _ERROR_ACCESS_DENIED: + return None + + # SYNCHRONIZE denied — retry with query-limited rights only and use + # the exit code (STILL_ACTIVE ⇒ alive; a real exit code of 259 is a + # documented theoretical ambiguity we accept). + handle = _win_open_process(pid) + if not handle: + # Exists (ACCESS_DENIED above proved that) but unreadable. + return True + try: + code = wintypes.DWORD() + if not _kernel32.GetExitCodeProcess(handle, ctypes.byref(code)): + return None + return code.value == _STILL_ACTIVE + finally: + _kernel32.CloseHandle(handle) + + def _win_create_token(pid: int) -> str | None: + handle = _win_open_process(pid) + if not handle: + return None + try: + creation = wintypes.FILETIME() + exit_t = wintypes.FILETIME() + kernel_t = wintypes.FILETIME() + user_t = wintypes.FILETIME() + ok = _kernel32.GetProcessTimes( + handle, + ctypes.byref(creation), + ctypes.byref(exit_t), + ctypes.byref(kernel_t), + ctypes.byref(user_t), + ) + if not ok: + return None + value = (creation.dwHighDateTime << 32) | creation.dwLowDateTime + return str(value) + finally: + _kernel32.CloseHandle(handle) + + def _win_process_table() -> dict[int, tuple[int, str]] | None: + """Snapshot of all processes: pid -> (ppid, exe_name).""" + snapshot = _kernel32.CreateToolhelp32Snapshot(_TH32CS_SNAPPROCESS, 0) + if snapshot == _INVALID_HANDLE_VALUE: + return None + try: + entry = _PROCESSENTRY32W() + entry.dwSize = ctypes.sizeof(_PROCESSENTRY32W) + table: dict[int, tuple[int, str]] = {} + if not _kernel32.Process32FirstW(snapshot, ctypes.byref(entry)): + return None + while True: + table[int(entry.th32ProcessID)] = ( + int(entry.th32ParentProcessID), + entry.szExeFile, + ) + if not _kernel32.Process32NextW(snapshot, ctypes.byref(entry)): + break + return table + finally: + _kernel32.CloseHandle(snapshot) + + +# --------------------------------------------------------------------------- +# Linux (/proc) +# --------------------------------------------------------------------------- + + +def _linux_stat_fields(pid: int) -> list[str] | None: + """Fields of /proc//stat *after* the comm field. + + The comm field is ``(name)`` and may itself contain spaces and + parentheses, so split on the *last* ``)``. + """ + try: + raw = Path(f"/proc/{pid}/stat").read_bytes().decode("ascii", errors="replace") + except OSError: + return None + idx = raw.rfind(")") + if idx == -1: + return None + return raw[idx + 1 :].split() + + +def _linux_create_token(pid: int) -> str | None: + fields = _linux_stat_fields(pid) + if fields is None: + return None + # stat(5): field 22 is starttime — index 19 after stripping pid + comm + # (fields here start at field 3, "state"). + try: + return fields[19] + except IndexError: + return None + + +def _linux_parent_pid(pid: int) -> int | None: + fields = _linux_stat_fields(pid) + if fields is None: + return None + try: + return int(fields[1]) # field 4, "ppid" + except (IndexError, ValueError): + return None + + +def _linux_process_name(pid: int) -> str | None: + try: + return Path(f"/proc/{pid}/comm").read_bytes().decode("utf-8", errors="replace").strip() + except OSError: + return None + + +# --------------------------------------------------------------------------- +# macOS / generic POSIX (one-shot ``ps``; startup / rare paths only) +# --------------------------------------------------------------------------- + + +def _ps_field(pid: int, field: str) -> str | None: + try: + out = subprocess.run( + ["ps", "-p", str(pid), "-o", f"{field}="], + capture_output=True, + text=True, + timeout=5, + ) + except (OSError, subprocess.TimeoutExpired): + return None + if out.returncode != 0: + return None + value = out.stdout.strip() + return value or None + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + + +def pid_alive(pid: int) -> bool | None: + """Whether *pid* refers to a live process. + + Returns ``True`` / ``False`` when known, ``None`` when it can't be + determined. Never sends a signal on Windows. + """ + if not isinstance(pid, int) or pid <= 0: + return None + if _IS_WINDOWS: + try: + return _win_pid_alive(pid) + except Exception: + return None + try: + os.kill(pid, 0) + return True + except ProcessLookupError: + return False + except PermissionError: + return True + except OSError: + return None + + +def process_create_token(pid: int) -> str | None: + """Opaque identity token for *pid* derived from its creation time. + + Two probes of the same live process return equal tokens; a recycled PID + yields a different token. Only token *equality* is meaningful — the + encoding differs per platform (FILETIME on Windows, starttime ticks on + Linux, ``lstart`` text on macOS). ``None`` when unavailable; callers + must skip the comparison in that case rather than treat it as reuse. + """ + if not isinstance(pid, int) or pid <= 0: + return None + try: + if _IS_WINDOWS: + return _win_create_token(pid) + if sys.platform.startswith("linux"): + return _linux_create_token(pid) + return _ps_field(pid, "lstart") + except Exception: + return None + + +def parent_pid(pid: int) -> int | None: + """Parent PID of *pid*, or ``None`` when unknown.""" + if not isinstance(pid, int) or pid <= 0: + return None + try: + if _IS_WINDOWS: + table = _win_process_table() + if table is None or pid not in table: + return None + return table[pid][0] + if sys.platform.startswith("linux"): + return _linux_parent_pid(pid) + value = _ps_field(pid, "ppid") + return int(value) if value else None + except Exception: + return None + + +def process_name(pid: int) -> str | None: + """Executable / command name of *pid*, or ``None`` when unknown.""" + if not isinstance(pid, int) or pid <= 0: + return None + try: + if _IS_WINDOWS: + table = _win_process_table() + if table is None or pid not in table: + return None + return table[pid][1] + if sys.platform.startswith("linux"): + return _linux_process_name(pid) + value = _ps_field(pid, "comm") + if value: + # ps comm may be a full path on macOS. + return value.rsplit("/", 1)[-1] + return None + except Exception: + return None + + +@dataclass(frozen=True) +class ProcInfo: + """Identity snapshot of one process: PID plus name and creation token.""" + + pid: int + name: str | None + create_token: str | None + + +def ancestor_chain(pid: int | None = None, max_depth: int = 12) -> list[ProcInfo]: + """Ancestors of *pid* (default: current process), nearest first. + + Excludes *pid* itself. Stops at PID 0/1 (idle/init), on cycles, or at + *max_depth*. On Windows the whole chain is resolved from a single + Toolhelp32 snapshot; on Linux from ``/proc``; on macOS via ``ps``. + Best-effort — returns whatever prefix of the chain could be resolved. + """ + current = pid if pid is not None else os.getpid() + + table: dict[int, tuple[int, str]] | None = None + if _IS_WINDOWS: + try: + table = _win_process_table() + except Exception: + table = None + if table is None: + return [] + + chain: list[ProcInfo] = [] + seen = {current} + for _ in range(max_depth): + if table is not None: + entry = table.get(current) + parent = entry[0] if entry else None + else: + parent = parent_pid(current) + if parent is None or parent <= 1 or parent in seen: + break + seen.add(parent) + name = table.get(parent, (None, None))[1] if table is not None else process_name(parent) + chain.append(ProcInfo(pid=parent, name=name, create_token=process_create_token(parent))) + current = parent + return chain diff --git a/packages/core/src/repowise/core/workspace/update.py b/packages/core/src/repowise/core/workspace/update.py index 79dc8ee8..6bf3427f 100644 --- a/packages/core/src/repowise/core/workspace/update.py +++ b/packages/core/src/repowise/core/workspace/update.py @@ -40,7 +40,15 @@ def _lock_path(repo_path: Path) -> Path: def _read_lock(repo_path: Path) -> dict[str, Any] | None: - """Return a live lock payload, or None when absent / stale / unreadable.""" + """Return a live lock payload, or None when absent / stale / unreadable. + + Mirrors ``cli/helpers.read_update_lock``: a lock is stale past the + wall-clock window, or immediately when its owning PID is positively + dead / recycled (so a crashed update can't block the repo for 30 min). + Unknown probe results fall back to the wall clock. + """ + from repowise.core.procutils import pid_alive, process_create_token + path = _lock_path(repo_path) if not path.exists(): return None @@ -53,16 +61,31 @@ def _read_lock(repo_path: Path) -> dict[str, Any] | None: return None if time.time() - started > _LOCK_STALE_AFTER_SECONDS: return None + + pid = payload.get("pid") + if isinstance(pid, int) and pid > 0: + alive = pid_alive(pid) + if alive is False: + return None + if alive is True: + stored_token = payload.get("pid_create_token") + if isinstance(stored_token, str) and stored_token: + current_token = process_create_token(pid) + if current_token is not None and current_token != stored_token: + return None return payload def _acquire_lock(repo_path: Path, target_commit: str | None) -> None: """Best-effort write of the lock file. Caller still must release.""" + from repowise.core.procutils import process_create_token + try: (repo_path / ".repowise").mkdir(parents=True, exist_ok=True) _lock_path(repo_path).write_text( _json.dumps({ "pid": os.getpid(), + "pid_create_token": process_create_token(os.getpid()), "target_commit": target_commit, "started_at": time.time(), }), diff --git a/packages/server/src/repowise/server/mcp_server/_server.py b/packages/server/src/repowise/server/mcp_server/_server.py index 968042f5..4ad7fb8f 100644 --- a/packages/server/src/repowise/server/mcp_server/_server.py +++ b/packages/server/src/repowise/server/mcp_server/_server.py @@ -412,4 +412,11 @@ def run_mcp( mcp.settings.port = port mcp.run(transport="sse") else: + # stdio servers are spawned per-session by the MCP client; when the + # client dies abnormally the stdio loop doesn't exit (and Windows + # never kills children), leaking servers that hold wiki.db handles. + # The watchdog exits this process once the client is gone. + from repowise.server.mcp_server._watchdog import start_parent_watchdog + + start_parent_watchdog() mcp.run(transport="stdio") diff --git a/packages/server/src/repowise/server/mcp_server/_watchdog.py b/packages/server/src/repowise/server/mcp_server/_watchdog.py new file mode 100644 index 00000000..3398220b --- /dev/null +++ b/packages/server/src/repowise/server/mcp_server/_watchdog.py @@ -0,0 +1,138 @@ +"""Parent-death watchdog for ``repowise mcp --transport stdio``. + +FastMCP's stdio loop does not reliably exit when the MCP client goes away +abnormally (crash, force-quit, killed terminal). On POSIX an orphaned +server at least reparents to init; on Windows nothing kills children when +the parent dies, so every abnormal session end leaks a ``repowise mcp`` +server holding wiki.db handles. They accumulate silently and contend with +later ``repowise update`` runs (WAL writer locks). + +A plain ``os.getppid()`` watchdog is NOT sufficient: console-script installs +launch through a chain of shims (``repowise.exe`` → venv ``python`` launcher +→ real interpreter on Windows; sometimes ``uv``/``uvx`` wrappers anywhere). +The server's immediate parent is a shim that waits on *us*, not the client — +when the client dies, the shims stay alive, so getppid never changes. The +watchdog instead walks the ancestor chain at startup, skipping past known +launcher processes to find the client, and watches every recorded ancestor +(identity-checked by creation time to defeat PID reuse). + +Failure policy is conservative: if the chain can't be resolved, or a probe +returns "unknown", the watchdog does nothing — a leaked server is better +than killing a live one. ``REPOWISE_MCP_NO_WATCHDOG=1`` disables it +entirely. +""" + +from __future__ import annotations + +import logging +import os +import threading + +from repowise.core.procutils import ProcInfo, ancestor_chain, pid_alive, process_create_token + +_log = logging.getLogger(__name__) + +# Process names that are launcher shims between the MCP client and us, not +# the client itself. Compared against the lowercased name with a trailing +# ``.exe`` stripped: by prefix for interpreter/shim families (covers +# ``python``, ``python3.12``, ``pythonw``, ``repowise``, ``uv``/``uvx``), +# and by exact match for shells — some clients spawn stdio servers through +# ``cmd /c`` / ``sh -c`` wrappers, which survive the client's death exactly +# like the shims do. (Prefix-matching "sh" would swallow unrelated names.) +_LAUNCHER_PREFIXES = ("python", "repowise", "uv") +_LAUNCHER_SHELLS = ("cmd", "sh", "bash", "zsh", "dash", "powershell", "pwsh") + +_DISABLE_ENV = "REPOWISE_MCP_NO_WATCHDOG" + +_POLL_INTERVAL_SECONDS = 5.0 + + +def _is_launcher(name: str | None) -> bool: + if not name: + # Unknown name: treat as a launcher so the walk continues to a + # nameable ancestor rather than anointing a mystery process as + # the client. + return True + normalized = name.lower() + if normalized.endswith(".exe"): + normalized = normalized[:-4] + return normalized.startswith(_LAUNCHER_PREFIXES) or normalized in _LAUNCHER_SHELLS + + +def compute_watch_set() -> list[ProcInfo]: + """Ancestors to watch: launcher shims plus the first non-launcher (client). + + Stops at the client so unrelated higher ancestors (shell, terminal, + desktop session) are never watched — those may die while the client + legitimately keeps running. Returns ``[]`` when nothing was resolvable. + """ + watch: list[ProcInfo] = [] + for info in ancestor_chain(os.getpid()): + watch.append(info) + if not _is_launcher(info.name): + break + return watch + + +def _ancestor_died(info: ProcInfo) -> bool: + """True only when *info*'s process is positively gone or replaced.""" + alive = pid_alive(info.pid) + if alive is False: + return True + if alive is True and info.create_token is not None: + current = process_create_token(info.pid) + if current is not None and current != info.create_token: + # PID recycled by an unrelated process — the ancestor is gone. + return True + return False + + +def start_parent_watchdog() -> threading.Thread | None: + """Start the daemon watchdog thread; returns it, or ``None`` when inactive. + + Exits the process with ``os._exit(0)`` when a watched ancestor dies. + ``_exit`` (not ``sys.exit``) is deliberate: the stdio event loop is + blocked on a dead pipe and would never unwind; SQLite/LanceDB handles + are released by the OS at process exit. + """ + if os.environ.get(_DISABLE_ENV, "").strip().lower() in ("1", "true", "yes"): + _log.debug("MCP watchdog disabled via %s", _DISABLE_ENV) + return None + + try: + watch = compute_watch_set() + except Exception: + _log.debug("MCP watchdog: failed to resolve ancestor chain", exc_info=True) + return None + if not watch: + _log.debug("MCP watchdog: no resolvable ancestors; watchdog inactive") + return None + + _log.info( + "MCP watchdog: watching %s", + ", ".join(f"{w.name or '?'}({w.pid})" for w in watch), + ) + + def _run() -> None: + # threading.Event.wait gives an interruptible sleep and lets tests + # poke the loop; the event is never set in production. + idle = threading.Event() + while True: + idle.wait(_POLL_INTERVAL_SECONDS) + try: + for info in watch: + if _ancestor_died(info): + _log.info( + "MCP watchdog: ancestor %s(%d) is gone — exiting", + info.name or "?", + info.pid, + ) + os._exit(0) + except Exception: + # Probe hiccup — never let the watchdog kill or crash the + # server on uncertainty. + _log.debug("MCP watchdog: probe failed", exc_info=True) + + thread = threading.Thread(target=_run, name="repowise-mcp-watchdog", daemon=True) + thread.start() + return thread diff --git a/tests/unit/cli/test_augment_staleness.py b/tests/unit/cli/test_augment_staleness.py index 3b4e76ae..3cf2f584 100644 --- a/tests/unit/cli/test_augment_staleness.py +++ b/tests/unit/cli/test_augment_staleness.py @@ -102,11 +102,14 @@ def test_lock_emits_in_flight_notice_not_stale_warning(self, repo): _state(repo_path, last_sync_commit=head, docs_enabled=True) new_head = _commit(repo_path) + import os import time + # A live PID: read_update_lock now probes liveness, so a fabricated + # dead PID would (correctly) be treated as a crashed update. (repo_path / ".repowise" / ".update.lock").write_text( json.dumps({ - "pid": 999999, + "pid": os.getpid(), "target_commit": new_head, "started_at": time.time(), }), @@ -118,6 +121,33 @@ def test_lock_emits_in_flight_notice_not_stale_warning(self, repo): assert "update in background" in msg assert "stale" not in msg.lower() + def test_lock_from_dead_pid_does_not_suppress(self, repo): + # A crashed update's leftover lock must NOT suppress the stale + # warning for the rest of the 30-min window — the dead owner makes + # it stale immediately. + import subprocess + import sys + import time + + repo_path, head = repo + _state(repo_path, last_sync_commit=head, docs_enabled=True) + _commit(repo_path) + + proc = subprocess.Popen([sys.executable, "-c", "pass"]) + proc.wait(timeout=30) + (repo_path / ".repowise" / ".update.lock").write_text( + json.dumps({ + "pid": proc.pid, + "target_commit": "x", + "started_at": time.time(), + }), + encoding="utf-8", + ) + + msg = _post(repo_path) + assert msg is not None + assert "stale" in msg.lower() + def test_stale_lock_does_not_suppress(self, repo): repo_path, head = repo _state(repo_path, last_sync_commit=head, docs_enabled=True) @@ -141,11 +171,12 @@ def test_lock_for_different_commit_still_signals_in_flight(self, repo): _state(repo_path, last_sync_commit=head, docs_enabled=True) _commit(repo_path) + import os import time (repo_path / ".repowise" / ".update.lock").write_text( json.dumps({ - "pid": 1, + "pid": os.getpid(), # live owner — see liveness probe note above "target_commit": head, # old HEAD, not the current one "started_at": time.time(), }), diff --git a/tests/unit/cli/test_mcp_command_resolution.py b/tests/unit/cli/test_mcp_command_resolution.py new file mode 100644 index 00000000..1406e69d --- /dev/null +++ b/tests/unit/cli/test_mcp_command_resolution.py @@ -0,0 +1,191 @@ +"""PATH-hijack fix: per-user MCP registrations pin the running install. + +Bare ``"command": "repowise"`` entries are resolved via PATH at session +start, so a shadow install (conda, old pip, pipx, uv tool) silently +hijacks the MCP server. Per-user config files now store the absolute path +of the install that ran ``init``. Repo-shared files (``.mcp.json``, +``.codex/config.toml``) intentionally keep the bare name — they may be +committed, and one contributor's absolute path would break every other +checkout. +""" + +from __future__ import annotations + +import json +import sys +import tomllib +from pathlib import Path + +import pytest + +from repowise.cli import mcp_config +from repowise.cli.editor_integrations import claude_config + +_SUFFIX = ".exe" if sys.platform == "win32" else "" + + +def _fake_install(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: + """Point sys.executable at a fake venv with a repowise script beside it. + + pytest's tmp_path usually lives under the OS temp dir, which the + transient-location check rejects — so the fake temp dir is moved away + to keep the fake install eligible. + """ + monkeypatch.setattr( + "tempfile.gettempdir", lambda: str(tmp_path / "elsewhere-tmp") + ) + bin_dir = tmp_path / "venv" / ("Scripts" if sys.platform == "win32" else "bin") + bin_dir.mkdir(parents=True) + fake_python = bin_dir / f"python{_SUFFIX}" + fake_python.write_text("", encoding="utf-8") + script = bin_dir / f"repowise{_SUFFIX}" + script.write_text("", encoding="utf-8") + monkeypatch.setattr(sys, "executable", str(fake_python)) + return script + + +# --------------------------------------------------------------------------- +# resolve_repowise_command +# --------------------------------------------------------------------------- + + +def test_resolves_script_next_to_interpreter( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + script = _fake_install(tmp_path, monkeypatch) + + resolved = mcp_config.resolve_repowise_command() + + assert resolved == str(script.resolve()).replace("\\", "/") + + +def test_falls_back_to_bare_name_when_script_missing( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + script = _fake_install(tmp_path, monkeypatch) + script.unlink() + + assert mcp_config.resolve_repowise_command() == "repowise" + + +def test_falls_back_to_bare_name_under_temp_dir( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + _fake_install(tmp_path, monkeypatch) + # Re-point the temp dir back over the fake install: now it's transient. + monkeypatch.setattr("tempfile.gettempdir", lambda: str(tmp_path)) + + assert mcp_config.resolve_repowise_command() == "repowise" + + +def test_falls_back_to_bare_name_in_uv_cache( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setattr( + "tempfile.gettempdir", lambda: str(tmp_path / "elsewhere-tmp") + ) + bin_dir = ( + tmp_path + / "uv" + / "cache" + / "archive-v0" + / "xyz" + / ("Scripts" if sys.platform == "win32" else "bin") + ) + bin_dir.mkdir(parents=True) + (bin_dir / f"repowise{_SUFFIX}").write_text("", encoding="utf-8") + monkeypatch.setattr(sys, "executable", str(bin_dir / f"python{_SUFFIX}")) + + assert mcp_config.resolve_repowise_command() == "repowise" + + +# --------------------------------------------------------------------------- +# Per-user configs get the absolute command +# --------------------------------------------------------------------------- + + +def test_register_with_claude_code_pins_absolute_command( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setattr(Path, "home", lambda: tmp_path / "home") + monkeypatch.setattr( + claude_config, "resolve_repowise_command", lambda: "/opt/venv/bin/repowise" + ) + repo = tmp_path / "repo" + repo.mkdir() + + settings_path = claude_config.register_with_claude_code(repo) + assert settings_path is not None + + saved = json.loads(settings_path.read_text(encoding="utf-8")) + assert saved["mcpServers"]["repowise"]["command"] == "/opt/venv/bin/repowise" + + +def test_register_with_claude_desktop_pins_absolute_command( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setattr(Path, "home", lambda: tmp_path / "home") + monkeypatch.setattr(sys, "platform", "darwin") + monkeypatch.setattr( + claude_config, "resolve_repowise_command", lambda: "/opt/venv/bin/repowise" + ) + desktop_parent = tmp_path / "home" / "Library" / "Application Support" / "Claude" + desktop_parent.mkdir(parents=True) + repo = tmp_path / "repo" + repo.mkdir() + + config_path = claude_config.register_with_claude_desktop(repo) + assert config_path is not None + + saved = json.loads(config_path.read_text(encoding="utf-8")) + assert saved["mcpServers"]["repowise"]["command"] == "/opt/venv/bin/repowise" + + +def test_reregistration_refreshes_stale_absolute_command( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """A moved venv self-heals: the next registration overwrites the path.""" + monkeypatch.setattr(Path, "home", lambda: tmp_path / "home") + repo = tmp_path / "repo" + repo.mkdir() + + monkeypatch.setattr( + claude_config, "resolve_repowise_command", lambda: "/old/venv/bin/repowise" + ) + claude_config.register_with_claude_code(repo) + monkeypatch.setattr( + claude_config, "resolve_repowise_command", lambda: "/new/venv/bin/repowise" + ) + settings_path = claude_config.register_with_claude_code(repo) + assert settings_path is not None + + saved = json.loads(settings_path.read_text(encoding="utf-8")) + assert saved["mcpServers"]["repowise"]["command"] == "/new/venv/bin/repowise" + + +# --------------------------------------------------------------------------- +# Repo-shared configs keep the bare name +# --------------------------------------------------------------------------- + + +def test_root_mcp_json_keeps_bare_command( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """``.mcp.json`` may be committed — never bake in a machine-local path.""" + _fake_install(tmp_path, monkeypatch) # absolute path IS resolvable... + + config_path = mcp_config.save_root_mcp_config(tmp_path) + + saved = json.loads(config_path.read_text(encoding="utf-8")) + assert saved["mcpServers"]["repowise"]["command"] == "repowise" # ...but unused + + +def test_codex_config_keeps_bare_command( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + _fake_install(tmp_path, monkeypatch) + + config_path = mcp_config.save_codex_mcp_config(tmp_path) + + saved = tomllib.loads(config_path.read_text(encoding="utf-8")) + assert saved["mcp_servers"]["repowise"]["command"] == "repowise" diff --git a/tests/unit/cli/test_update_lock.py b/tests/unit/cli/test_update_lock.py new file mode 100644 index 00000000..5f2bf7f0 --- /dev/null +++ b/tests/unit/cli/test_update_lock.py @@ -0,0 +1,173 @@ +"""Update-lock staleness: live-PID probe + PID-reuse identity check. + +A crashed/killed ``repowise update`` (SIGKILL, power loss — paths atexit +can't cover) used to block further updates for the full 30-minute +wall-clock window because ``read_update_lock`` never validated that the +lock's PID was still alive. These tests pin the new semantics for both +the canonical CLI lock and its workspace mirror in +``repowise.core.workspace.update``. +""" + +from __future__ import annotations + +import json +import os +import subprocess +import sys +import time +from pathlib import Path + +import pytest + +from repowise.cli import helpers +from repowise.core.procutils import process_create_token +from repowise.core.workspace import update as ws_update + + +def _dead_pid() -> int: + proc = subprocess.Popen([sys.executable, "-c", "pass"]) + proc.wait(timeout=30) + return proc.pid + + +def _write_lock(repo: Path, payload: dict) -> None: + (repo / ".repowise").mkdir(parents=True, exist_ok=True) + (repo / ".repowise" / ".update.lock").write_text(json.dumps(payload), encoding="utf-8") + + +# --------------------------------------------------------------------------- +# Canonical CLI lock (repowise.cli.helpers) +# --------------------------------------------------------------------------- + + +def test_acquire_records_pid_and_create_token(tmp_path: Path) -> None: + helpers.acquire_update_lock(tmp_path, "abc123") + + payload = json.loads( + (tmp_path / ".repowise" / ".update.lock").read_text(encoding="utf-8") + ) + assert payload["pid"] == os.getpid() + assert payload["target_commit"] == "abc123" + assert payload["pid_create_token"] == process_create_token(os.getpid()) + + +def test_fresh_lock_with_live_pid_is_honored(tmp_path: Path) -> None: + helpers.acquire_update_lock(tmp_path, "abc123") + + payload = helpers.read_update_lock(tmp_path) + assert payload is not None + assert payload["pid"] == os.getpid() + + +def test_fresh_lock_with_dead_pid_is_stale(tmp_path: Path) -> None: + """The headline fix: a crashed update's lock no longer blocks for 30 min.""" + _write_lock( + tmp_path, + {"pid": _dead_pid(), "target_commit": "abc", "started_at": time.time()}, + ) + + assert helpers.read_update_lock(tmp_path) is None + + +def test_lock_with_recycled_pid_is_stale(tmp_path: Path) -> None: + """Same PID, different creation token ⇒ an unrelated process — stale.""" + _write_lock( + tmp_path, + { + "pid": os.getpid(), + "pid_create_token": "definitely-not-our-create-token", + "target_commit": "abc", + "started_at": time.time(), + }, + ) + + assert helpers.read_update_lock(tmp_path) is None + + +def test_legacy_lock_without_token_still_honored(tmp_path: Path) -> None: + """Locks written by older repowise versions carry no token — the + identity check is skipped, liveness + wall clock still apply.""" + _write_lock( + tmp_path, + {"pid": os.getpid(), "target_commit": "abc", "started_at": time.time()}, + ) + + assert helpers.read_update_lock(tmp_path) is not None + + +def test_lock_without_pid_falls_back_to_wall_clock(tmp_path: Path) -> None: + _write_lock(tmp_path, {"target_commit": "abc", "started_at": time.time()}) + assert helpers.read_update_lock(tmp_path) is not None + + _write_lock( + tmp_path, + { + "target_commit": "abc", + "started_at": time.time() - helpers.UPDATE_LOCK_STALE_AFTER_SECONDS - 60, + }, + ) + assert helpers.read_update_lock(tmp_path) is None + + +def test_old_lock_is_stale_even_with_live_pid(tmp_path: Path) -> None: + """A hung-but-alive update must still hit the wall-clock ceiling.""" + _write_lock( + tmp_path, + { + "pid": os.getpid(), + "pid_create_token": process_create_token(os.getpid()), + "target_commit": "abc", + "started_at": time.time() - helpers.UPDATE_LOCK_STALE_AFTER_SECONDS - 60, + }, + ) + + assert helpers.read_update_lock(tmp_path) is None + + +def test_unknown_probe_results_fall_back_to_wall_clock( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """When liveness can't be determined, a fresh lock must stay honored.""" + _write_lock( + tmp_path, + {"pid": os.getpid(), "target_commit": "abc", "started_at": time.time()}, + ) + monkeypatch.setattr("repowise.core.procutils.pid_alive", lambda _pid: None) + + assert helpers.read_update_lock(tmp_path) is not None + + +# --------------------------------------------------------------------------- +# Workspace mirror (repowise.core.workspace.update) — format must stay in sync +# --------------------------------------------------------------------------- + + +def test_workspace_lock_round_trip_matches_cli_format(tmp_path: Path) -> None: + ws_update._acquire_lock(tmp_path, "abc123") + try: + cli_view = helpers.read_update_lock(tmp_path) + ws_view = ws_update._read_lock(tmp_path) + assert cli_view is not None + assert ws_view is not None + assert cli_view["pid"] == ws_view["pid"] == os.getpid() + assert cli_view["pid_create_token"] == ws_view["pid_create_token"] + finally: + ws_update._release_lock(tmp_path) + + +def test_workspace_lock_with_dead_pid_is_stale(tmp_path: Path) -> None: + _write_lock( + tmp_path, + {"pid": _dead_pid(), "target_commit": "abc", "started_at": time.time()}, + ) + + assert ws_update._read_lock(tmp_path) is None + + +def test_workspace_legacy_lock_without_token_still_honored(tmp_path: Path) -> None: + _write_lock( + tmp_path, + {"pid": os.getpid(), "target_commit": "abc", "started_at": time.time()}, + ) + + assert ws_update._read_lock(tmp_path) is not None diff --git a/tests/unit/server/test_mcp_watchdog.py b/tests/unit/server/test_mcp_watchdog.py new file mode 100644 index 00000000..6fcb54c9 --- /dev/null +++ b/tests/unit/server/test_mcp_watchdog.py @@ -0,0 +1,206 @@ +"""MCP stdio parent-death watchdog. + +The launcher-chain problem these tests pin down: console-script installs +run as ``client → repowise shim → python`` (three levels on Windows), so +``os.getppid()`` from the server is a shim that waits on *us* and never +dies when the client does. The watchdog walks past launcher-named +ancestors to the client and watches the whole recorded chain. +""" + +from __future__ import annotations + +import os +import subprocess +import sys +import textwrap +import time + +import pytest + +from repowise.core.procutils import ProcInfo, pid_alive, process_create_token +from repowise.server.mcp_server import _watchdog + +# --------------------------------------------------------------------------- +# _is_launcher +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "name", + [ + "python", + "python3.12", + "Python.exe", + "pythonw.exe", + "repowise", + "repowise.exe", + "uv", + "uvx", + # Shell wrappers (``cmd /c`` / ``sh -c`` spawn styles) — exact match. + "cmd.exe", + "sh", + "bash", + "powershell.exe", + "pwsh", + None, + ], +) +def test_launcher_names(name: str | None) -> None: + assert _watchdog._is_launcher(name) is True + + +@pytest.mark.parametrize( + "name", + # "sharepoint" pins exact (not prefix) matching for shell names. + ["claude", "Claude.exe", "node", "Cursor.exe", "codex", "sharepoint", "explorer.exe"], +) +def test_client_names(name: str) -> None: + assert _watchdog._is_launcher(name) is False + + +# --------------------------------------------------------------------------- +# compute_watch_set — stops at the first non-launcher ancestor +# --------------------------------------------------------------------------- + + +def test_watch_set_stops_at_client(monkeypatch: pytest.MonkeyPatch) -> None: + chain = [ + ProcInfo(pid=100, name="repowise.exe", create_token="a"), + ProcInfo(pid=101, name="python.exe", create_token="b"), + ProcInfo(pid=102, name="claude.exe", create_token="c"), + # Above the client — must never be watched: it can die while the + # client legitimately keeps running. + ProcInfo(pid=103, name="powershell.exe", create_token="d"), + ProcInfo(pid=104, name="explorer.exe", create_token="e"), + ] + monkeypatch.setattr(_watchdog, "ancestor_chain", lambda _pid: chain) + + watch = _watchdog.compute_watch_set() + + assert [w.pid for w in watch] == [100, 101, 102] + + +def test_watch_set_empty_chain(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(_watchdog, "ancestor_chain", lambda _pid: []) + assert _watchdog.compute_watch_set() == [] + + +def test_watch_set_all_launchers_keeps_whole_chain( + monkeypatch: pytest.MonkeyPatch, +) -> None: + chain = [ + ProcInfo(pid=100, name="python", create_token="a"), + ProcInfo(pid=101, name="python3", create_token="b"), + ] + monkeypatch.setattr(_watchdog, "ancestor_chain", lambda _pid: chain) + + assert [w.pid for w in _watchdog.compute_watch_set()] == [100, 101] + + +# --------------------------------------------------------------------------- +# _ancestor_died +# --------------------------------------------------------------------------- + + +def test_dead_pid_is_died() -> None: + proc = subprocess.Popen([sys.executable, "-c", "pass"]) + proc.wait(timeout=30) + info = ProcInfo(pid=proc.pid, name="python", create_token=None) + assert _watchdog._ancestor_died(info) is True + + +def test_live_self_with_matching_token_is_not_died() -> None: + info = ProcInfo( + pid=os.getpid(), name="python", create_token=process_create_token(os.getpid()) + ) + assert _watchdog._ancestor_died(info) is False + + +def test_live_pid_with_mismatched_token_is_died() -> None: + """PID recycled by an unrelated process counts as the ancestor dying.""" + info = ProcInfo(pid=os.getpid(), name="python", create_token="not-our-token") + assert _watchdog._ancestor_died(info) is True + + +def test_unknown_liveness_is_not_died(monkeypatch: pytest.MonkeyPatch) -> None: + """Fail open: probe uncertainty must never kill a live server.""" + monkeypatch.setattr(_watchdog, "pid_alive", lambda _pid: None) + info = ProcInfo(pid=os.getpid(), name="python", create_token="whatever") + assert _watchdog._ancestor_died(info) is False + + +# --------------------------------------------------------------------------- +# start_parent_watchdog +# --------------------------------------------------------------------------- + + +def test_disabled_via_env(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv(_watchdog._DISABLE_ENV, "1") + assert _watchdog.start_parent_watchdog() is None + + +def test_inactive_when_chain_unresolvable(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv(_watchdog._DISABLE_ENV, raising=False) + monkeypatch.setattr(_watchdog, "compute_watch_set", lambda: []) + assert _watchdog.start_parent_watchdog() is None + + +# --------------------------------------------------------------------------- +# End-to-end: the orphan scenario +# --------------------------------------------------------------------------- + +# The "server" loads _watchdog.py directly by file path (skipping the heavy +# mcp_server package import), starts the watchdog with a fast poll, signals +# readiness, then sleeps far past the test timeout. The 60s sleep doubles +# as cleanup if the watchdog ever fails to fire. +_SERVER_SCRIPT = textwrap.dedent( + """ + import importlib.util, time + spec = importlib.util.spec_from_file_location("rw_watchdog", {watchdog_path!r}) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + mod._POLL_INTERVAL_SECONDS = 0.2 + thread = mod.start_parent_watchdog() + assert thread is not None, "watchdog did not start" + print("READY", flush=True) + time.sleep(60) + """ +) + +# The "client" spawns the server, waits until its watchdog is armed, hands +# the server PID to the test, and exits — orphaning the server exactly the +# way a crashed MCP client does. +_CLIENT_SCRIPT = textwrap.dedent( + """ + import subprocess, sys + proc = subprocess.Popen( + [sys.executable, "-c", {server_script!r}], + stdout=subprocess.PIPE, text=True, + ) + line = proc.stdout.readline() + assert line.strip() == "READY", f"unexpected: {{line!r}}" + print(proc.pid, flush=True) + """ +) + + +def test_server_exits_when_client_dies() -> None: + server_script = _SERVER_SCRIPT.format(watchdog_path=_watchdog.__file__) + client_script = _CLIENT_SCRIPT.format(server_script=server_script) + + client = subprocess.run( + [sys.executable, "-c", client_script], + capture_output=True, + text=True, + timeout=120, + ) + assert client.returncode == 0, client.stderr + server_pid = int(client.stdout.strip().splitlines()[-1]) + + # The client is gone; the orphaned server must notice and exit. + deadline = time.time() + 30 + while time.time() < deadline: + if pid_alive(server_pid) is False: + return + time.sleep(0.25) + pytest.fail(f"orphaned server (pid {server_pid}) did not exit within 30s") diff --git a/tests/unit/test_procutils.py b/tests/unit/test_procutils.py new file mode 100644 index 00000000..81b183d1 --- /dev/null +++ b/tests/unit/test_procutils.py @@ -0,0 +1,128 @@ +"""Tests for repowise.core.procutils — cross-platform process probes. + +These run real probes against the test process itself and short-lived +child processes, so they exercise the platform-specific code path of +whatever OS the suite runs on (kernel32 on Windows, /proc on Linux, +``ps`` on macOS). +""" + +from __future__ import annotations + +import os +import subprocess +import sys + +import pytest + +from repowise.core.procutils import ( + ancestor_chain, + parent_pid, + pid_alive, + process_create_token, + process_name, +) + + +def _spawn_and_reap() -> int: + """Run a trivial child to completion and return its (now dead) PID.""" + proc = subprocess.Popen([sys.executable, "-c", "pass"]) + proc.wait(timeout=30) + return proc.pid + + +# --------------------------------------------------------------------------- +# pid_alive +# --------------------------------------------------------------------------- + + +def test_pid_alive_self() -> None: + assert pid_alive(os.getpid()) is True + + +def test_pid_alive_dead_child() -> None: + dead_pid = _spawn_and_reap() + assert pid_alive(dead_pid) is False + + +def test_pid_alive_rejects_garbage() -> None: + assert pid_alive(0) is None + assert pid_alive(-5) is None + assert pid_alive("123") is None # type: ignore[arg-type] + + +# --------------------------------------------------------------------------- +# process_create_token +# --------------------------------------------------------------------------- + + +def test_create_token_self_is_stable() -> None: + first = process_create_token(os.getpid()) + second = process_create_token(os.getpid()) + assert first is not None + assert first == second + + +def test_create_token_dead_pid_is_none_or_differs_from_live() -> None: + """A dead PID must never return the same token as a live process. + + (On most platforms it returns None; a racy PID reuse would return a + *different* token — both are acceptable, equality would be a bug.) + """ + dead_pid = _spawn_and_reap() + token = process_create_token(dead_pid) + assert token is None or token != process_create_token(os.getpid()) + + +def test_create_token_distinguishes_processes() -> None: + proc = subprocess.Popen([sys.executable, "-c", "import time; time.sleep(30)"]) + try: + child_token = process_create_token(proc.pid) + own_token = process_create_token(os.getpid()) + assert child_token is not None + assert child_token != own_token + finally: + proc.kill() + proc.wait(timeout=30) + + +# --------------------------------------------------------------------------- +# parent_pid / process_name / ancestor_chain +# --------------------------------------------------------------------------- + + +def test_parent_pid_of_child_is_us() -> None: + proc = subprocess.Popen([sys.executable, "-c", "import time; time.sleep(30)"]) + try: + assert parent_pid(proc.pid) == os.getpid() + finally: + proc.kill() + proc.wait(timeout=30) + + +def test_process_name_of_child_is_python() -> None: + proc = subprocess.Popen([sys.executable, "-c", "import time; time.sleep(30)"]) + try: + name = process_name(proc.pid) + assert name is not None + assert "python" in name.lower() + finally: + proc.kill() + proc.wait(timeout=30) + + +def test_ancestor_chain_starts_at_parent() -> None: + if os.getppid() <= 1: + pytest.skip("test process has no live parent to walk") + chain = ancestor_chain() + assert chain, "expected at least one resolvable ancestor" + assert chain[0].pid == os.getppid() + # Every entry carries a usable identity for the watchdog. + for info in chain: + assert info.pid > 1 + + +def test_ancestor_chain_excludes_self_and_has_no_duplicates() -> None: + chain = ancestor_chain() + pids = [info.pid for info in chain] + assert os.getpid() not in pids + assert len(pids) == len(set(pids))