From e4108c3372d90f6302b334878eb0f089132cc10a Mon Sep 17 00:00:00 2001 From: Ethan Ligon & Sue Coder Date: Mon, 29 Jun 2026 17:35:21 -0700 Subject: [PATCH 1/3] fix(remote): harden git-over-SSH transport and rebuild husk mirrors Two robustness fixes for `collaborate`/`bootstrap` against a remote mirror, prompted by a wedged PlayPen sync on the Savio DTN. Symptoms: the fetch reported `couldn't find remote ref main` and the push died with `Session open refused by peer` -> `disabling multiplexing` -> remote login-shell noise -> `the remote end hung up unexpectedly` / `No refs in common`. Root cause chain: 1. The remote mirror existed but was a husk from a prior failed bootstrap: `git init` ran, no push ever landed, so there was no `main` ref. Fetch correctly swallowed this; the push then tried to sync into a half-dead repo. 2. The DTN ControlMaster refused a session for the push, so ssh fell back to a fresh full dial that re-ran the remote login shell. A broken shared dotfile (allhands/.bashrc-el8 calling a missing lmod init on a non-el8 DTN) then killed git-receive-pack. Changes: - _remote_git_env: the GIT_SSH_COMMAND was the only SSH path missing the executor's standard guards. Add BatchMode/ConnectTimeout/ ServerAlive*/LogLevel=ERROR to both the scaffolding and fallback branches and their inner ProxyCommand, so a refused master session fails fast and quietly instead of dialing a contaminated fresh connection. Verbosity is preserved under debug_ssh. - ensure_remote_clone + _remote_repo_has_content: detect a remote repo that exists but has no commits and no base branch (a husk) and rebuild it (rm -rf + git init) rather than push into it. A husk has no commits, so nothing is lost. This makes the failure resilient and legible; it does not fix the underlying broken DTN dotfile or the master-refused-session condition. Tests: transport hardening (both hops), debug verbosity preserved, and husk rebuild. Full suite: 457 passed. Co-Authored-By: Claude Opus 4.8 (1M context) --- sucoder/mirror.py | 102 ++++++++++++++++++++++++++++++--- tests/test_remote.py | 130 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 224 insertions(+), 8 deletions(-) diff --git a/sucoder/mirror.py b/sucoder/mirror.py index 1558aa7..22fd810 100644 --- a/sucoder/mirror.py +++ b/sucoder/mirror.py @@ -535,6 +535,39 @@ def _remote_git_env(self, ctx: MirrorContext) -> tuple: gateway = remote.gateway debug_ssh = getattr(self.executor, "debug_ssh", False) + # Harden the git transport the same way the rest of the executor + # hardens its one-shot SSH calls (see _build_ssh_command / + # _build_login_node_command). Without this, a git push/fetch + # whose ControlMaster session is refused ("Session open refused + # by peer") silently falls back to a fresh, full SSH dial that + # re-runs the remote login shell. On clusters with a broken + # shared login script (e.g. an el8 bashrc sourced on a non-el8 + # DTN) that noise kills git-receive-pack and the push wedges with + # a cryptic "remote end hung up". BatchMode makes a + # credential-less fallback fail fast instead of hanging on a + # prompt that DEVNULL stdin can never answer; ConnectTimeout and + # ServerAlive* bound a dead/wedged link; LogLevel=ERROR silences + # the confusing mux warnings. + ex = self.executor + connect_timeout = getattr(ex, "CONNECT_TIMEOUT", 10) + keepalive_interval = getattr(ex, "KEEPALIVE_INTERVAL", 15) + keepalive_count = getattr(ex, "KEEPALIVE_COUNT_MAX", 3) + hardening = [ + "-o", "BatchMode=yes", + "-o", f"ConnectTimeout={connect_timeout}", + "-o", f"ServerAliveInterval={keepalive_interval}", + "-o", f"ServerAliveCountMax={keepalive_count}", + ] + if not debug_ssh: + # -vvv (debug) sets its own LogLevel; don't override it. + hardening.extend(["-o", "LogLevel=ERROR"]) + # Same guards for the inner ProxyCommand ssh (embedded as one -o + # arg, so spelled out as a string rather than a parts list). + proxy_quiet = "" if debug_ssh else "-o LogLevel=ERROR " + proxy_hardening = ( + f"-o BatchMode=yes {proxy_quiet}-o ConnectTimeout={connect_timeout} " + ) + # Prefer the scaffolding node (DTN) for git transport when # available. It sees the same Lustre filesystem and avoids # load on login nodes (or the fragile compute-node chain). @@ -544,6 +577,7 @@ def _remote_git_env(self, ctx: MirrorContext) -> tuple: ssh_cmd_parts = ["ssh"] if debug_ssh: ssh_cmd_parts.append("-vvv") + ssh_cmd_parts.extend(hardening) ssh_cmd_parts.extend([ "-o", "ControlMaster=auto", "-o", f"ControlPath={scaffolding_sock}", @@ -553,7 +587,7 @@ def _remote_git_env(self, ctx: MirrorContext) -> tuple: gw_socket = _gw_sock(gateway) ssh_cmd_parts.extend([ "-o", - f"ProxyCommand=ssh -o ControlMaster=auto " + f"ProxyCommand=ssh {proxy_hardening}-o ControlMaster=auto " f"-o ControlPath={gw_socket} " f"-W %h:%p {gateway}", ]) @@ -570,6 +604,7 @@ def _remote_git_env(self, ctx: MirrorContext) -> tuple: ssh_cmd_parts = ["ssh"] if debug_ssh: ssh_cmd_parts.append("-vvv") + ssh_cmd_parts.extend(hardening) if control_path: ssh_cmd_parts.extend([ "-o", "ControlMaster=auto", @@ -587,7 +622,7 @@ def _remote_git_env(self, ctx: MirrorContext) -> tuple: if proxy_node and proxy_sock: ssh_cmd_parts.extend([ "-o", - f"ProxyCommand=ssh -o ControlMaster=auto " + f"ProxyCommand=ssh {proxy_hardening}-o ControlMaster=auto " f"-o ControlPath={proxy_sock} " f"-W %h:%p {proxy_node}", ]) @@ -596,7 +631,7 @@ def _remote_git_env(self, ctx: MirrorContext) -> tuple: gw_socket = _gw_sock(gateway) ssh_cmd_parts.extend([ "-o", - f"ProxyCommand=ssh -o ControlMaster=auto " + f"ProxyCommand=ssh {proxy_hardening}-o ControlMaster=auto " f"-o ControlPath={gw_socket} " f"-W %h:%p {gateway}", ]) @@ -1093,6 +1128,37 @@ def _sync_remote(self, ctx: MirrorContext) -> None: timeout=self._GIT_REMOTE_TIMEOUT, ) + def _remote_repo_has_content( + self, + run: Callable, + remote_path: str, + base: str, + ) -> bool: + """Return ``True`` if the remote git repo has real content. + + A mirror that exists on disk but has neither a HEAD commit nor + the *base* branch is a husk left by a previously failed bootstrap + (``git init`` ran, but no push ever landed). Fetching from such + a repo fails with "couldn't find remote ref " and pushing + into it is fragile, so callers rebuild it from scratch rather + than sync into it. + """ + has_head = run( + ["git", "rev-parse", "--verify", "--quiet", "HEAD"], + check=False, + cwd=remote_path, + ).returncode == 0 + if has_head: + return True + # HEAD may be an unborn symbolic ref pointing at a branch that + # does exist (e.g. a non-default checkout); verify the base + # branch directly before declaring the repo empty. + return run( + ["git", "rev-parse", "--verify", "--quiet", f"refs/heads/{base}"], + check=False, + cwd=remote_path, + ).returncode == 0 + def ensure_remote_clone(self, ctx: MirrorContext) -> bool: """Ensure the mirror exists on the remote host. @@ -1125,16 +1191,37 @@ def ensure_remote_clone(self, ctx: MirrorContext) -> bool: # Use the login node for filesystem scaffolding when available. run = getattr(self.executor, "run_on_login_node", self.executor.run_agent) + base = ctx.settings.default_base_branch or "main" + # Check if remote mirror is a valid git repo. check = run( ["git", "rev-parse", "--git-dir"], check=False, cwd=abs_remote_path, ) - if check.returncode == 0: + repo_exists = check.returncode == 0 + # A repo can exist on disk yet be a husk from a previously failed + # bootstrap: `git init` ran but no push ever landed, so there are + # no commits and no base branch. That is exactly the state that + # produced the "couldn't find remote ref main" fetch failure + # followed by a wedged push. Treat such a husk as broken and + # rebuild it rather than syncing into it. + repo_usable = repo_exists and self._remote_repo_has_content( + run, abs_remote_path, base, + ) + if repo_exists and not repo_usable: + self.logger.warning( + "Remote mirror at %s exists but is empty/half-initialised " + "(no commits, no '%s' branch) — rebuilding it from scratch", + remote_path, base, + ) + + if repo_usable: self.logger.info("Remote mirror already exists at %s", remote_path) else: - # Clean up broken directory from a previously failed init. + # Clean up a missing/broken/half-initialised directory before + # a fresh init. Safe even when the repo merely existed-but- + # empty: a husk has no commits, so there is nothing to lose. run( ["rm", "-rf", abs_remote_path], check=False, @@ -1155,7 +1242,6 @@ def ensure_remote_clone(self, ctx: MirrorContext) -> bool: ["chmod", "700", mirrors_parent], check=False, # may not own the parent ) - base = ctx.settings.default_base_branch or "main" run( ["git", "init", "-b", base], check=True, @@ -1190,8 +1276,8 @@ def ensure_remote_clone(self, ctx: MirrorContext) -> bool: self._sync_remote(ctx) # Ensure HEAD points to the correct branch so that - # updateInstead keeps the working tree in sync. - base = ctx.settings.default_base_branch or "main" + # updateInstead keeps the working tree in sync. (`base` was + # resolved at the top of this method.) run( ["git", "symbolic-ref", "HEAD", f"refs/heads/{base}"], check=True, diff --git a/tests/test_remote.py b/tests/test_remote.py index ccc8830..6bf74b1 100644 --- a/tests/test_remote.py +++ b/tests/test_remote.py @@ -1308,6 +1308,136 @@ def fake_run_agent(args, **kwargs): assert sync_called +def _remote_exec_with_scaffolding(tmp_path: Path): + """A RemoteExecutor configured with a DTN scaffolding node + sockets.""" + import logging + + from sucoder.executor import RemoteExecutor + + logger = logging.getLogger("test.remote.hardening") + return RemoteExecutor( + human_user="ligon", + agent_user="ligon", + agent_group="ligon", + logger=logger, + dry_run=False, + use_sudo_for_agent=False, + gateway="gw.example.com", + login_node="ln001", + remote_mirror_root="~/mirrors", + local_mirror_root=str(tmp_path / "mirrors"), + control_socket_path="/tmp/test.sock", + scaffolding_node="dtn.example.com", + scaffolding_socket_path="/tmp/dtn.sock", + ) + + +def test_remote_git_env_hardens_transport( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """GIT_SSH_COMMAND carries fail-fast / quiet guards on both hops. + + Regression guard: a refused ControlMaster session must fall back + fast and silently instead of dialing a fresh, login-shell-polluted + connection that wedges git-receive-pack. + """ + manager = _build_remote_manager(tmp_path) + ctx = manager.context_for("rproj") + manager.executor = _remote_exec_with_scaffolding(tmp_path) + # Avoid a real SSH round-trip to resolve ~. + monkeypatch.setattr( + manager, "_resolve_remote_path", + lambda ctx: "/home/ligon/mirrors/rproj", + ) + + url, env = manager._remote_git_env(ctx) + + assert url == "dtn.example.com:/home/ligon/mirrors/rproj" + cmd = env["GIT_SSH_COMMAND"] + # Outer ssh hardening (matches the rest of the executor). + assert "BatchMode=yes" in cmd + assert "ConnectTimeout=10" in cmd + assert "ServerAliveInterval=15" in cmd + assert "ServerAliveCountMax=3" in cmd + assert "LogLevel=ERROR" in cmd + # Rides the DTN ControlMaster socket. + assert "ControlPath=/tmp/dtn.sock" in cmd + # The inner ProxyCommand ssh is hardened too — BatchMode + LogLevel + # appear a second time inside the quoted ProxyCommand string. + assert "ProxyCommand=ssh -o BatchMode=yes" in cmd + assert cmd.count("BatchMode=yes") >= 2 + assert cmd.count("LogLevel=ERROR") >= 2 + + +def test_remote_git_env_debug_preserves_verbosity( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """With debug_ssh, -vvv is kept and LogLevel=ERROR is not forced.""" + manager = _build_remote_manager(tmp_path) + ctx = manager.context_for("rproj") + executor = _remote_exec_with_scaffolding(tmp_path) + executor.debug_ssh = True + manager.executor = executor + monkeypatch.setattr( + manager, "_resolve_remote_path", + lambda ctx: "/home/ligon/mirrors/rproj", + ) + + _url, env = manager._remote_git_env(ctx) + cmd = env["GIT_SSH_COMMAND"] + + assert "-vvv" in cmd + assert "LogLevel=ERROR" not in cmd + # Fail-fast guards still apply in debug mode. + assert "BatchMode=yes" in cmd + assert "ConnectTimeout=10" in cmd + + +def test_ensure_remote_clone_rebuilds_empty_mirror( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """A husk repo (exists, but no commits/base branch) is rebuilt. + + Regression guard for the PlayPen/DTN failure: a remote mirror that + was `git init`'d by a prior failed bootstrap but never received a + push has no `main` ref. ensure_remote_clone must rebuild it rather + than sync into the half-dead repo. + """ + from sucoder.executor import CommandResult + + manager = _build_remote_manager(tmp_path) + ctx = manager.context_for("rproj") + + agent_calls: list = [] + + def fake_run_agent(args, **kwargs): + agent_calls.append(list(args)) + s = " ".join(str(a) for a in args) + if "echo" in s: + return CommandResult(list(args), list(args), "/home/ligon\n", "", 0) + if "rev-parse" in s and "--git-dir" in s: + # Repo exists on disk. + return CommandResult(list(args), list(args), ".git\n", "", 0) + if "rev-parse" in s and ("HEAD" in s or "refs/heads/" in s): + # No commits, no base branch → husk. + return CommandResult(list(args), list(args), "", "fatal", 1) + return CommandResult(list(args), list(args), "", "", 0) + + monkeypatch.setattr(manager.executor, "run_agent", fake_run_agent) + # Isolate the rebuild decision: don't touch the network. + monkeypatch.setattr(manager, "_pull_from_remote", lambda ctx: None) + sync_called: list = [] + monkeypatch.setattr(manager, "_sync_remote", lambda ctx: sync_called.append(True)) + + manager.ensure_remote_clone(ctx) + + cmds = [" ".join(str(a) for a in c) for c in agent_calls] + # Husk detected → wiped and re-initialised before syncing. + assert any("rm -rf" in c for c in cmds) + assert any("git init" in c for c in cmds) + assert sync_called + + def test_ensure_remote_mirror_exists_success( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: From 8f0ac1de206be8870609719dbffbe1a0958243a1 Mon Sep 17 00:00:00 2001 From: Ethan Ligon & Sue Coder Date: Mon, 29 Jun 2026 17:50:23 -0700 Subject: [PATCH 2/3] feat(remote): fail over git transport from DTN to login node MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a scaffolding node (DTN) is configured, git push/fetch prefer it for its fat pipes and spare CPU. But transfer nodes commonly cap concurrent SSH sessions, and when the DTN ControlMaster refuses a session ("Session open refused by peer") the transport had nowhere to go but a contaminated fresh dial -> wedged push. The filesystem scaffolding path already fails over off the DTN (executor _DTN_FALLBACK_RETURNCODES); git transport did not. This closes that asymmetry. - _remote_git_env(use_scaffolding=False): build transport against the target/login node, skipping the DTN. - _git_transports(): ordered [DTN, login node] transports, deduped to one when there is no distinct fallback. - _is_transport_failure(): classify a git/ssh result as a broken connection (255/-1, "session open refused", "remote end hung up", etc.) vs. a real answer like an empty mirror's "couldn't find remote ref" — the latter would repeat on any node, so it is not retried. - _sync_remote: try each transport; fail over only on a transport fault, raise a genuine rejection immediately. - _pull_from_url: now takes an ordered transports list and fails over the fetch. This is a correctness fix, not just convenience: a silently-failed DTN fetch would skip the pull and let the next push force-overwrite agent commits, so we fail over to the login node before giving up. _pull_from_local passes a single local transport. Tests: transport ordering/dedup, push failover + no-failover-on-real- error, fetch failover + no-failover-on-empty-mirror. Full suite: 463 passed. Co-Authored-By: Claude Opus 4.8 (1M context) --- sucoder/mirror.py | 191 ++++++++++++++++++++++++++++++++++--------- tests/test_remote.py | 181 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 335 insertions(+), 37 deletions(-) diff --git a/sucoder/mirror.py b/sucoder/mirror.py index 22fd810..e4dcc7b 100644 --- a/sucoder/mirror.py +++ b/sucoder/mirror.py @@ -27,7 +27,7 @@ MirrorSettings, RemoteConfig, ) -from .executor import CommandError, CommandExecutor, RemoteExecutor +from .executor import CommandError, CommandExecutor, CommandResult, RemoteExecutor from .permissions import ( apply_agent_repo_permissions, check_parent_traversable, @@ -518,7 +518,9 @@ def _resolve_remote_path(self, ctx: MirrorContext) -> str: return raw.replace("~", cached, 1) - def _remote_git_env(self, ctx: MirrorContext) -> tuple: + def _remote_git_env( + self, ctx: MirrorContext, *, use_scaffolding: bool = True, + ) -> tuple: """Return ``(url, env)`` for git operations against the remote mirror. Builds the SSH transport command using the ControlMaster sockets @@ -527,6 +529,12 @@ def _remote_git_env(self, ctx: MirrorContext) -> tuple: When a scaffolding node (DTN) is configured, git transport is routed through it — fat pipes, spare CPU, and the mirror lives on shared Lustre visible from any cluster node. + + Pass ``use_scaffolding=False`` to deliberately skip the DTN and + build transport against the target/login node instead. That is + the failover path when the DTN's ControlMaster refuses a session + (a common limit on transfer nodes): the login node is already + authenticated, session-capable, and sees the same Lustre. """ remote = ctx.settings.remote assert remote is not None @@ -573,7 +581,7 @@ def _remote_git_env(self, ctx: MirrorContext) -> tuple: # load on login nodes (or the fragile compute-node chain). scaffolding_node = getattr(self.executor, "scaffolding_node", "") scaffolding_sock = getattr(self.executor, "scaffolding_socket_path", "") - if scaffolding_node and scaffolding_sock: + if use_scaffolding and scaffolding_node and scaffolding_sock: ssh_cmd_parts = ["ssh"] if debug_ssh: ssh_cmd_parts.append("-vvv") @@ -644,6 +652,76 @@ def _remote_git_env(self, ctx: MirrorContext) -> tuple: env["GIT_SSH_COMMAND"] = git_ssh_cmd return url, env + def _git_transports( + self, ctx: MirrorContext, + ) -> List[Tuple[str, str, Mapping[str, str]]]: + """Ordered ``(label, url, env)`` git transports to try. + + Primary is the DTN (scaffolding node) when one is configured; + the fallback is the target/login node. The DTN has fat pipes + and spare CPU, but transfer nodes commonly cap concurrent SSH + sessions, so a refused master session must not be fatal — the + login node sees the same shared filesystem and can carry the + push/fetch instead. Without a scaffolding node the two collapse + to one transport (no pointless retry). + """ + transports: List[Tuple[str, str, Mapping[str, str]]] = [] + scaffolding_node = getattr(self.executor, "scaffolding_node", "") + scaffolding_sock = getattr(self.executor, "scaffolding_socket_path", "") + if scaffolding_node and scaffolding_sock: + url, env = self._remote_git_env(ctx, use_scaffolding=True) + transports.append(("DTN", url, env)) + + fallback_label = ( + "compute node" + if getattr(self.executor, "is_compute_node", False) + else "login node" + ) + url, env = self._remote_git_env(ctx, use_scaffolding=False) + # Skip a duplicate when there is no distinct fallback (e.g. no + # scaffolding node, or both resolve to the same host). + if not transports or transports[0][1] != url: + transports.append((fallback_label, url, env)) + return transports + + @staticmethod + def _is_transport_failure(result: "CommandResult") -> bool: + """True if a git result looks like an SSH transport fault. + + Distinguishes "the connection broke" (worth failing over to + another node) from "the remote answered, but…" (e.g. an empty + mirror's ``couldn't find remote ref`` — a real answer that would + repeat identically on any transport, so not worth a retry). + """ + # -1 is our timeout sentinel; 255 is SSH's own connection error. + if result.returncode in (-1, 255): + return True + stderr = (result.stderr or "").lower() + markers = ( + "session open refused", # DTN master refused the session + "the remote end hung up", # receive/upload-pack died mid-stream + "connection closed", + "connection refused", + "connection timed out", + "connection reset", + "broken pipe", + "could not resolve hostname", + "no route to host", + "kex_exchange_identification", # sshd dropped us before banner + ) + return any(m in stderr for m in markers) + + @staticmethod + def _short_git_error(result: "CommandResult") -> str: + """A one-line summary of a git/ssh failure for log messages.""" + skip = ("mux_client", "controlsocket", "warning:") + last = "" + for line in (result.stderr or "").splitlines(): + stripped = line.strip() + if stripped and not stripped.lower().startswith(skip): + last = stripped + return last or f"rc={result.returncode}" + # -- Interactive helpers --------------------------------------------- @staticmethod @@ -679,11 +757,9 @@ def _unique_branch_name( def _pull_from_remote(self, ctx: MirrorContext) -> None: """Fetch agent commits from the remote mirror into canonical.""" - url, env = self._remote_git_env(ctx) self._pull_from_url( ctx, - url, - env=env, + self._git_transports(ctx), source_label="remote mirror", timeout=self._GIT_REMOTE_TIMEOUT, ) @@ -706,26 +782,31 @@ def _pull_from_local(self, ctx: MirrorContext) -> None: ) self._pull_from_url( ctx, - str(mirror_path), - env=None, + [("local mirror", str(mirror_path), None)], source_label=f"local mirror at {mirror_path}", ) def _pull_from_url( self, ctx: MirrorContext, - url: str, + transports: Sequence[Tuple[str, str, Optional[Mapping[str, str]]]], *, - env: Optional[Mapping[str, str]] = None, source_label: str = "mirror", timeout: Optional[int] = None, ) -> None: - """Fetch agent commits from *url* into canonical and reconcile. + """Fetch agent commits into canonical and reconcile. + + Shared by ``_pull_from_remote`` (SSH transports, DTN→login-node + failover) and ``_pull_from_local`` (a single filesystem path). + Must run *before* ``_sync_remote`` so that work the agent + committed on the mirror is not lost when the canonical repo + force-pushes over it. - Shared by ``_pull_from_remote`` (SSH/tunnel URL) and - ``_pull_from_local`` (filesystem path). Must run *before* - ``_sync_remote`` so that work the agent committed on the mirror - is not lost when the canonical repo force-pushes over it. + *transports* is an ordered list of ``(label, url, env)``; each is + tried until one connects. This matters for correctness, not just + convenience: if the DTN transport silently fails we would skip + the pull and the next push could force-overwrite agent commits, + so we fail over to the login node before giving up. Strategy: 1. Fetch the mirror's branch into a temporary ref — always safe. @@ -742,20 +823,38 @@ def _pull_from_url( tmp_ref = "refs/sucoder/mirror-head" self.logger.info("Fetching agent commits from %s", source_label) - result = self.executor.run_human( - ["git", "fetch", url, f"{base}:{tmp_ref}"], - check=False, - cwd=str(ctx.canonical_path), - env=env, - timeout=timeout, - ) - if result.returncode != 0: + + # Try each transport until one connects. A non-transport failure + # (e.g. an empty mirror's "couldn't find remote ref") is a real + # answer that would repeat on every node, so it stops the + # failover; only a broken connection rolls to the next transport. + result: Optional[CommandResult] = None + url = transports[0][1] if transports else "" + for idx, (label, t_url, t_env) in enumerate(transports): + url = t_url + result = self.executor.run_human( + ["git", "fetch", t_url, f"{base}:{tmp_ref}"], + check=False, + cwd=str(ctx.canonical_path), + env=t_env, + timeout=timeout, + ) + if result.returncode == 0 or not self._is_transport_failure(result): + break + if idx + 1 < len(transports): + self.logger.warning( + "Fetch from %s via %s failed (%s); retrying via %s", + source_label, label, self._short_git_error(result), + transports[idx + 1][0], + ) + + if result is None or result.returncode != 0: # Mirror may be empty (first run) or unreachable. self.logger.warning( "Could not fetch from %s (rc=%d): %s", source_label, - result.returncode, - (result.stderr or "").strip(), + result.returncode if result is not None else -1, + (result.stderr or "").strip() if result is not None else "", ) return @@ -1114,19 +1213,37 @@ def _exists(name: str) -> bool: def _sync_remote(self, ctx: MirrorContext) -> None: """Push local canonical commits to the remote mirror. - Uses the login node ControlMaster for git transport — no - tunnel needed when the login node has internet access. + Prefers the DTN ControlMaster for git transport, but fails over + to the login node if the DTN refuses the session (transfer nodes + commonly cap concurrent SSH sessions). A genuine git error + (e.g. a rejected ref) is *not* retried — it would fail the same + way on the login node and the original message is clearer. """ - url, env = self._remote_git_env(ctx) - - self.logger.info("Pushing to remote mirror %s", url) - self.executor.run_human( - ["git", "push", url, "--all", "--force"], - check=True, - cwd=str(ctx.canonical_path), - env=env, - timeout=self._GIT_REMOTE_TIMEOUT, - ) + transports = self._git_transports(ctx) + for idx, (label, url, env) in enumerate(transports): + is_last = idx + 1 >= len(transports) + self.logger.info( + "Pushing to remote mirror %s (via %s)", url, label, + ) + try: + self.executor.run_human( + ["git", "push", url, "--all", "--force"], + check=True, + cwd=str(ctx.canonical_path), + env=env, + timeout=self._GIT_REMOTE_TIMEOUT, + ) + return + except CommandError as exc: + # Only fail over when the connection itself broke; a real + # git rejection should surface immediately. + if is_last or not self._is_transport_failure(exc.result): + raise + self.logger.warning( + "Push via %s failed (%s); retrying via %s", + label, self._short_git_error(exc.result), + transports[idx + 1][0], + ) def _remote_repo_has_content( self, diff --git a/tests/test_remote.py b/tests/test_remote.py index 6bf74b1..612e138 100644 --- a/tests/test_remote.py +++ b/tests/test_remote.py @@ -1438,6 +1438,187 @@ def fake_run_agent(args, **kwargs): assert sync_called +def test_git_transports_dtn_then_login( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """With a DTN, transports are [DTN, login node] with distinct urls.""" + manager = _build_remote_manager(tmp_path) + ctx = manager.context_for("rproj") + manager.executor = _remote_exec_with_scaffolding(tmp_path) + monkeypatch.setattr( + manager, "_resolve_remote_path", + lambda ctx: "/home/ligon/mirrors/rproj", + ) + + transports = manager._git_transports(ctx) + + assert [t[0] for t in transports] == ["DTN", "login node"] + dtn_url, login_url = transports[0][1], transports[1][1] + assert dtn_url.startswith("dtn.example.com:") + assert login_url.startswith("ln001:") + # DTN rides the scaffolding socket; login node rides its own. + assert "ControlPath=/tmp/dtn.sock" in transports[0][2]["GIT_SSH_COMMAND"] + assert "ControlPath=/tmp/test.sock" in transports[1][2]["GIT_SSH_COMMAND"] + + +def test_git_transports_single_without_scaffolding( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """Without a DTN there is one transport — no pointless failover.""" + from sucoder.executor import RemoteExecutor + + manager = _build_remote_manager(tmp_path) + ctx = manager.context_for("rproj") + import logging + manager.executor = RemoteExecutor( + human_user="ligon", agent_user="ligon", agent_group="ligon", + logger=logging.getLogger("test.remote.single"), + dry_run=False, use_sudo_for_agent=False, + gateway="gw.example.com", login_node="ln001", + remote_mirror_root="~/mirrors", + local_mirror_root=str(tmp_path / "mirrors"), + control_socket_path="/tmp/test.sock", + ) + monkeypatch.setattr( + manager, "_resolve_remote_path", + lambda ctx: "/home/ligon/mirrors/rproj", + ) + + transports = manager._git_transports(ctx) + assert len(transports) == 1 + assert transports[0][0] == "login node" + + +def _transport_result(args, returncode, stderr=""): + from sucoder.executor import CommandResult + return CommandResult(list(args), list(args), "", stderr, returncode) + + +def test_sync_remote_fails_over_to_login_node( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """A DTN push that hits a transport fault retries on the login node.""" + from sucoder.executor import CommandError + + manager = _build_remote_manager(tmp_path) + ctx = manager.context_for("rproj") + manager.executor = _remote_exec_with_scaffolding(tmp_path) + monkeypatch.setattr( + manager, "_resolve_remote_path", + lambda ctx: "/home/ligon/mirrors/rproj", + ) + + pushes: list = [] + + def fake_run_human(args, **kwargs): + url = args[2] + pushes.append(url) + if url.startswith("dtn"): + res = _transport_result( + args, 1, "fatal: the remote end hung up unexpectedly") + raise CommandError("boom", res) + return _transport_result(args, 0) + + monkeypatch.setattr(manager.executor, "run_human", fake_run_human) + + manager._sync_remote(ctx) # must not raise + + assert len(pushes) == 2 + assert pushes[0].startswith("dtn.example.com:") + assert pushes[1].startswith("ln001:") + + +def test_sync_remote_no_failover_on_real_error( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """A genuine git rejection surfaces immediately — no login-node retry.""" + from sucoder.executor import CommandError + + manager = _build_remote_manager(tmp_path) + ctx = manager.context_for("rproj") + manager.executor = _remote_exec_with_scaffolding(tmp_path) + monkeypatch.setattr( + manager, "_resolve_remote_path", + lambda ctx: "/home/ligon/mirrors/rproj", + ) + + pushes: list = [] + + def fake_run_human(args, **kwargs): + pushes.append(args[2]) + res = _transport_result( + args, 1, "! [rejected] main -> main (non-fast-forward)") + raise CommandError("rejected", res) + + monkeypatch.setattr(manager.executor, "run_human", fake_run_human) + + with pytest.raises(CommandError): + manager._sync_remote(ctx) + + # Failed on the DTN and did NOT retry on the login node. + assert len(pushes) == 1 + assert pushes[0].startswith("dtn.example.com:") + + +def test_pull_fails_over_to_login_node( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """A DTN fetch transport fault retries on the login node before giving up.""" + manager = _build_remote_manager(tmp_path) + ctx = manager.context_for("rproj") + manager.executor = _remote_exec_with_scaffolding(tmp_path) + monkeypatch.setattr( + manager, "_resolve_remote_path", + lambda ctx: "/home/ligon/mirrors/rproj", + ) + + fetches: list = [] + + def fake_run_human(args, **kwargs): + url = args[2] + fetches.append(url) + if url.startswith("dtn"): + return _transport_result( + args, 128, "fatal: the remote end hung up unexpectedly") + return _transport_result(args, 0) # login node connects + + monkeypatch.setattr(manager.executor, "run_human", fake_run_human) + + manager._pull_from_remote(ctx) # reconciliation no-ops (tmp ref absent) + + assert len(fetches) == 2 + assert fetches[0].startswith("dtn.example.com:") + assert fetches[1].startswith("ln001:") + + +def test_pull_no_failover_on_empty_mirror( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """An empty remote ('couldn't find remote ref') is a real answer, no retry.""" + manager = _build_remote_manager(tmp_path) + ctx = manager.context_for("rproj") + manager.executor = _remote_exec_with_scaffolding(tmp_path) + monkeypatch.setattr( + manager, "_resolve_remote_path", + lambda ctx: "/home/ligon/mirrors/rproj", + ) + + fetches: list = [] + + def fake_run_human(args, **kwargs): + fetches.append(args[2]) + return _transport_result( + args, 128, "fatal: couldn't find remote ref main") + + monkeypatch.setattr(manager.executor, "run_human", fake_run_human) + + manager._pull_from_remote(ctx) # swallowed as a warning + + # Connected fine (just an empty mirror) → no login-node retry. + assert len(fetches) == 1 + assert fetches[0].startswith("dtn.example.com:") + + def test_ensure_remote_mirror_exists_success( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: From fe20f3dfc27568fc7ea01a1bf21f5bdb30b5d78a Mon Sep 17 00:00:00 2001 From: Ethan Ligon & Sue Coder Date: Mon, 29 Jun 2026 18:30:42 -0700 Subject: [PATCH 3/3] feat(remote): prefer login node over DTN for git transport MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Diagnostics against Savio's DTN confirmed it caps concurrent SSH sessions to ~1: a 12-way concurrent probe over the DTN ControlMaster refused 11 of 12. That is why the original push was refused — its DTN channel overlapped another DTN op and lost the single session slot. The DTN's fat pipes aren't worth that fragility for git's small delta transfers, so flip _git_transports ordering: the target/login node is now primary, and a configured DTN is kept only as a secondary fallback (preserving a route home if the login master is down, at no cost in the common path). Filesystem scaffolding still uses the DTN directly via run_on_login_node — those ops are sequential and fine under a 1-session cap. The login-node-first push/fetch now avoids the cap entirely in the normal case. No new config: this changes the default ordering only. The failover machinery (transport classification, retry) is unchanged; only the order of the transport list flips. Tests updated for the new ordering, plus a prefers-login-node case. Full suite: 464 passed. Co-Authored-By: Claude Opus 4.8 (1M context) --- sucoder/mirror.py | 60 +++++++++++++++++++++--------------- tests/test_remote.py | 72 ++++++++++++++++++++++++++++++-------------- 2 files changed, 84 insertions(+), 48 deletions(-) diff --git a/sucoder/mirror.py b/sucoder/mirror.py index e4dcc7b..7fa6e5e 100644 --- a/sucoder/mirror.py +++ b/sucoder/mirror.py @@ -657,31 +657,40 @@ def _git_transports( ) -> List[Tuple[str, str, Mapping[str, str]]]: """Ordered ``(label, url, env)`` git transports to try. - Primary is the DTN (scaffolding node) when one is configured; - the fallback is the target/login node. The DTN has fat pipes - and spare CPU, but transfer nodes commonly cap concurrent SSH - sessions, so a refused master session must not be fatal — the - login node sees the same shared filesystem and can carry the - push/fetch instead. Without a scaffolding node the two collapse - to one transport (no pointless retry). + Primary is the **target/login node** — it is session-capable and + reliable. A scaffolding node (DTN), if configured, is kept only + as a *secondary* fallback: transfer nodes have fat pipes but + commonly cap concurrent SSH sessions to ~1 (Savio's DTN refuses + the 2nd of 12 concurrent sessions), so routing git through them + is fragile under any overlap. Filesystem scaffolding still uses + the DTN directly (see ``run_on_login_node``); only git push/fetch + prefer the login node here. Keeping the DTN last preserves a + route home if the login-node master is ever down, at no cost in + the common path. Without a scaffolding node only one transport + is returned (no pointless retry). + + Note: for a SLURM compute-node target the "primary" is the + compute node (where the agent runs and the executor is pinned), + which is likewise session-capable and sees the same Lustre. """ transports: List[Tuple[str, str, Mapping[str, str]]] = [] - scaffolding_node = getattr(self.executor, "scaffolding_node", "") - scaffolding_sock = getattr(self.executor, "scaffolding_socket_path", "") - if scaffolding_node and scaffolding_sock: - url, env = self._remote_git_env(ctx, use_scaffolding=True) - transports.append(("DTN", url, env)) - fallback_label = ( + primary_label = ( "compute node" if getattr(self.executor, "is_compute_node", False) else "login node" ) url, env = self._remote_git_env(ctx, use_scaffolding=False) - # Skip a duplicate when there is no distinct fallback (e.g. no - # scaffolding node, or both resolve to the same host). - if not transports or transports[0][1] != url: - transports.append((fallback_label, url, env)) + transports.append((primary_label, url, env)) + + scaffolding_node = getattr(self.executor, "scaffolding_node", "") + scaffolding_sock = getattr(self.executor, "scaffolding_socket_path", "") + if scaffolding_node and scaffolding_sock: + dtn_url, dtn_env = self._remote_git_env(ctx, use_scaffolding=True) + # Skip the DTN when it resolves to the same host as the + # primary (no distinct fallback). + if dtn_url != url: + transports.append(("DTN", dtn_url, dtn_env)) return transports @staticmethod @@ -804,9 +813,9 @@ def _pull_from_url( *transports* is an ordered list of ``(label, url, env)``; each is tried until one connects. This matters for correctness, not just - convenience: if the DTN transport silently fails we would skip - the pull and the next push could force-overwrite agent commits, - so we fail over to the login node before giving up. + convenience: if the primary transport silently fails we would + skip the pull and the next push could force-overwrite agent + commits, so we fall over to the next transport before giving up. Strategy: 1. Fetch the mirror's branch into a temporary ref — always safe. @@ -1213,11 +1222,12 @@ def _exists(name: str) -> bool: def _sync_remote(self, ctx: MirrorContext) -> None: """Push local canonical commits to the remote mirror. - Prefers the DTN ControlMaster for git transport, but fails over - to the login node if the DTN refuses the session (transfer nodes - commonly cap concurrent SSH sessions). A genuine git error - (e.g. a rejected ref) is *not* retried — it would fail the same - way on the login node and the original message is clearer. + Pushes over the login node (the reliable, session-capable + transport); if a configured DTN is present it is tried only as a + fallback should the login-node transport break. A genuine git + error (e.g. a rejected ref) is *not* retried — it would fail the + same way on the other transport and the original message is + clearer. See ``_git_transports`` for the ordering rationale. """ transports = self._git_transports(ctx) for idx, (label, url, env) in enumerate(transports): diff --git a/tests/test_remote.py b/tests/test_remote.py index 612e138..807961c 100644 --- a/tests/test_remote.py +++ b/tests/test_remote.py @@ -1438,10 +1438,10 @@ def fake_run_agent(args, **kwargs): assert sync_called -def test_git_transports_dtn_then_login( +def test_git_transports_login_first_then_dtn( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - """With a DTN, transports are [DTN, login node] with distinct urls.""" + """Login node is primary; the DTN is kept only as a fallback.""" manager = _build_remote_manager(tmp_path) ctx = manager.context_for("rproj") manager.executor = _remote_exec_with_scaffolding(tmp_path) @@ -1452,13 +1452,13 @@ def test_git_transports_dtn_then_login( transports = manager._git_transports(ctx) - assert [t[0] for t in transports] == ["DTN", "login node"] - dtn_url, login_url = transports[0][1], transports[1][1] - assert dtn_url.startswith("dtn.example.com:") + assert [t[0] for t in transports] == ["login node", "DTN"] + login_url, dtn_url = transports[0][1], transports[1][1] assert login_url.startswith("ln001:") - # DTN rides the scaffolding socket; login node rides its own. - assert "ControlPath=/tmp/dtn.sock" in transports[0][2]["GIT_SSH_COMMAND"] - assert "ControlPath=/tmp/test.sock" in transports[1][2]["GIT_SSH_COMMAND"] + assert dtn_url.startswith("dtn.example.com:") + # Login node rides its own socket; DTN rides the scaffolding socket. + assert "ControlPath=/tmp/test.sock" in transports[0][2]["GIT_SSH_COMMAND"] + assert "ControlPath=/tmp/dtn.sock" in transports[1][2]["GIT_SSH_COMMAND"] def test_git_transports_single_without_scaffolding( @@ -1494,10 +1494,36 @@ def _transport_result(args, returncode, stderr=""): return CommandResult(list(args), list(args), "", stderr, returncode) -def test_sync_remote_fails_over_to_login_node( +def test_sync_remote_prefers_login_node( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """The push goes to the login node first; the DTN is not touched.""" + manager = _build_remote_manager(tmp_path) + ctx = manager.context_for("rproj") + manager.executor = _remote_exec_with_scaffolding(tmp_path) + monkeypatch.setattr( + manager, "_resolve_remote_path", + lambda ctx: "/home/ligon/mirrors/rproj", + ) + + pushes: list = [] + + def fake_run_human(args, **kwargs): + pushes.append(args[2]) + return _transport_result(args, 0) + + monkeypatch.setattr(manager.executor, "run_human", fake_run_human) + + manager._sync_remote(ctx) + + assert len(pushes) == 1 + assert pushes[0].startswith("ln001:") + + +def test_sync_remote_fails_over_to_dtn( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - """A DTN push that hits a transport fault retries on the login node.""" + """A login-node push that hits a transport fault retries on the DTN.""" from sucoder.executor import CommandError manager = _build_remote_manager(tmp_path) @@ -1513,7 +1539,7 @@ def test_sync_remote_fails_over_to_login_node( def fake_run_human(args, **kwargs): url = args[2] pushes.append(url) - if url.startswith("dtn"): + if url.startswith("ln001"): res = _transport_result( args, 1, "fatal: the remote end hung up unexpectedly") raise CommandError("boom", res) @@ -1524,8 +1550,8 @@ def fake_run_human(args, **kwargs): manager._sync_remote(ctx) # must not raise assert len(pushes) == 2 - assert pushes[0].startswith("dtn.example.com:") - assert pushes[1].startswith("ln001:") + assert pushes[0].startswith("ln001:") + assert pushes[1].startswith("dtn.example.com:") def test_sync_remote_no_failover_on_real_error( @@ -1555,15 +1581,15 @@ def fake_run_human(args, **kwargs): with pytest.raises(CommandError): manager._sync_remote(ctx) - # Failed on the DTN and did NOT retry on the login node. + # Failed on the login node and did NOT fall over to the DTN. assert len(pushes) == 1 - assert pushes[0].startswith("dtn.example.com:") + assert pushes[0].startswith("ln001:") -def test_pull_fails_over_to_login_node( +def test_pull_fails_over_to_dtn( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - """A DTN fetch transport fault retries on the login node before giving up.""" + """A login-node fetch transport fault retries on the DTN before giving up.""" manager = _build_remote_manager(tmp_path) ctx = manager.context_for("rproj") manager.executor = _remote_exec_with_scaffolding(tmp_path) @@ -1577,18 +1603,18 @@ def test_pull_fails_over_to_login_node( def fake_run_human(args, **kwargs): url = args[2] fetches.append(url) - if url.startswith("dtn"): + if url.startswith("ln001"): return _transport_result( args, 128, "fatal: the remote end hung up unexpectedly") - return _transport_result(args, 0) # login node connects + return _transport_result(args, 0) # DTN connects monkeypatch.setattr(manager.executor, "run_human", fake_run_human) manager._pull_from_remote(ctx) # reconciliation no-ops (tmp ref absent) assert len(fetches) == 2 - assert fetches[0].startswith("dtn.example.com:") - assert fetches[1].startswith("ln001:") + assert fetches[0].startswith("ln001:") + assert fetches[1].startswith("dtn.example.com:") def test_pull_no_failover_on_empty_mirror( @@ -1614,9 +1640,9 @@ def fake_run_human(args, **kwargs): manager._pull_from_remote(ctx) # swallowed as a warning - # Connected fine (just an empty mirror) → no login-node retry. + # Connected fine (just an empty mirror) → no DTN retry. assert len(fetches) == 1 - assert fetches[0].startswith("dtn.example.com:") + assert fetches[0].startswith("ln001:") def test_ensure_remote_mirror_exists_success(