From 053210accdc3a69a74d78cd26f7c70ccd4a99528 Mon Sep 17 00:00:00 2001 From: "Daniel A. Wozniak" Date: Fri, 12 Jun 2026 17:13:56 -0700 Subject: [PATCH] Cap minion AsyncAuth retry loop with auth_tries The minion's AsyncAuth._authenticate() outer loop on 3006.x and 3007.x keeps calling sign_in() forever whenever the master answers with the "retry" sentinel (key not yet accepted, master AES rotation in flight, multi-master probe). The minion sleeps acceptance_wait_time between attempts, doubling up to acceptance_wait_time_max, and never surfaces an error: no log, no traceback, just a stuck minion. 3008.x already caps this loop using the existing auth_tries option (default 7); backport the same guard so the minion bails out of _authenticate() with SaltClientError("Failed to authenticate with the master after N attempts") once auth_tries iterations have been spent returning "retry". auth_tries=0 keeps the old "loop forever" behavior for operators who actually want it. The synchronous SAuth.authenticate() path is intentionally left unchanged: that is a separate code path used by salt-call and other single-shot CLI flows, and its existing semantics are out of scope for this fix. Fixes #69442 --- changelog/69442.fixed.md | 1 + salt/crypt.py | 8 +++++ tests/pytests/unit/test_crypt.py | 57 ++++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+) create mode 100644 changelog/69442.fixed.md diff --git a/changelog/69442.fixed.md b/changelog/69442.fixed.md new file mode 100644 index 000000000000..64f4dcf983ec --- /dev/null +++ b/changelog/69442.fixed.md @@ -0,0 +1 @@ +Fixed `AsyncAuth._authenticate()` on the minion to honour `auth_tries` as an outer-loop cap, so a minion that keeps getting `retry` responses from `sign_in()` bails out with `SaltClientError` instead of looping silently forever. diff --git a/salt/crypt.py b/salt/crypt.py index ac8667589c08..a1905579a17d 100644 --- a/salt/crypt.py +++ b/salt/crypt.py @@ -773,7 +773,10 @@ def _authenticate(self): self.opts, crypt="clear", io_loop=self.io_loop ) as channel: error = None + attempts = 0 + auth_tries = self.opts.get("auth_tries", 0) while True: + attempts += 1 try: creds = yield self.sign_in(channel=channel) except SaltClientError as exc: @@ -783,6 +786,11 @@ def _authenticate(self): if self.opts.get("detect_mode") is True: error = SaltClientError("Detect mode is on") break + if auth_tries > 0 and attempts >= auth_tries: + error = SaltClientError( + f"Failed to authenticate with the master after {attempts} attempts" + ) + break if self.opts.get("caller"): # We have a list of masters, so we should break # and try the next one in the list. diff --git a/tests/pytests/unit/test_crypt.py b/tests/pytests/unit/test_crypt.py index 2ac1a3cee2bd..e52c71f38750 100644 --- a/tests/pytests/unit/test_crypt.py +++ b/tests/pytests/unit/test_crypt.py @@ -5,6 +5,7 @@ import salt.crypt as crypt import salt.exceptions +import salt.ext.tornado.gen from tests.support.mock import patch @@ -350,3 +351,59 @@ def test_verify_master_caches_clean_key_on_first_contact( assert result == "aes-key" assert m_pub_fn.read_text() == cached_pub_key + + +async def test_authenticate_caps_retry_loop_with_auth_tries_69442(minion_root, io_loop): + """ + Regression test for https://github.com/saltstack/salt/issues/69442 + + When ``sign_in()`` keeps returning ``"retry"`` (for example because the + master has not yet accepted the minion key, the master AES key is in + flux, or the master is reachable but rejecting auth), the outer + ``AsyncAuth._authenticate()`` loop must bail out after ``auth_tries`` + attempts with a ``SaltClientError`` whose message names the attempt + count. + + On 3006.x/3007.x the loop had no outer-attempts cap and the minion + spun forever with exponential backoff up to ``acceptance_wait_time_max`` + with no operator-visible error log. This test asserts the + backported cap: with ``auth_tries=3`` and ``sign_in`` returning + ``"retry"`` on every call, the loop runs exactly 3 attempts and the + future resolves to a ``SaltClientError`` carrying the + ``"Failed to authenticate with the master after 3 attempts"`` message. + """ + pki_dir = minion_root / "etc" / "salt" / "pki" + opts = { + "id": "minion", + "__role": "minion", + "pki_dir": str(pki_dir), + "master_uri": "tcp://127.0.0.1:4505", + "keysize": 4096, + # Zero out the inter-attempt sleep so the test doesn't actually + # wait ``acceptance_wait_time * attempts`` seconds before + # observing the cap. + "acceptance_wait_time": 0, + "acceptance_wait_time_max": 0, + "auth_tries": 3, + } + crypt.gen_keys(pki_dir, "minion", opts["keysize"]) + + auth = crypt.AsyncAuth(opts, io_loop) + + call_count = 0 + + @salt.ext.tornado.gen.coroutine + def mock_sign_in(*args, **kwargs): + nonlocal call_count + call_count += 1 + return "retry" + + auth.sign_in = mock_sign_in + + with pytest.raises(salt.exceptions.SaltClientError) as exc_info: + await auth.authenticate() + + assert call_count == 3 + assert "Failed to authenticate with the master after 3 attempts" in str( + exc_info.value + )