diff --git a/changelog/69442.fixed.md b/changelog/69442.fixed.md new file mode 100644 index 00000000000..64f4dcf983e --- /dev/null +++ b/changelog/69442.fixed.md @@ -0,0 +1 @@ +Fixed `AsyncAuth._authenticate()` on the minion to honour `auth_tries` as an outer-loop cap, so a minion that keeps getting `retry` responses from `sign_in()` bails out with `SaltClientError` instead of looping silently forever. diff --git a/salt/crypt.py b/salt/crypt.py index ac8667589c0..a1905579a17 100644 --- a/salt/crypt.py +++ b/salt/crypt.py @@ -773,7 +773,10 @@ def _authenticate(self): self.opts, crypt="clear", io_loop=self.io_loop ) as channel: error = None + attempts = 0 + auth_tries = self.opts.get("auth_tries", 0) while True: + attempts += 1 try: creds = yield self.sign_in(channel=channel) except SaltClientError as exc: @@ -783,6 +786,11 @@ def _authenticate(self): if self.opts.get("detect_mode") is True: error = SaltClientError("Detect mode is on") break + if auth_tries > 0 and attempts >= auth_tries: + error = SaltClientError( + f"Failed to authenticate with the master after {attempts} attempts" + ) + break if self.opts.get("caller"): # We have a list of masters, so we should break # and try the next one in the list. diff --git a/tests/pytests/unit/test_crypt.py b/tests/pytests/unit/test_crypt.py index 2ac1a3cee2b..e52c71f3875 100644 --- a/tests/pytests/unit/test_crypt.py +++ b/tests/pytests/unit/test_crypt.py @@ -5,6 +5,7 @@ import salt.crypt as crypt import salt.exceptions +import salt.ext.tornado.gen from tests.support.mock import patch @@ -350,3 +351,59 @@ def test_verify_master_caches_clean_key_on_first_contact( assert result == "aes-key" assert m_pub_fn.read_text() == cached_pub_key + + +async def test_authenticate_caps_retry_loop_with_auth_tries_69442(minion_root, io_loop): + """ + Regression test for https://github.com/saltstack/salt/issues/69442 + + When ``sign_in()`` keeps returning ``"retry"`` (for example because the + master has not yet accepted the minion key, the master AES key is in + flux, or the master is reachable but rejecting auth), the outer + ``AsyncAuth._authenticate()`` loop must bail out after ``auth_tries`` + attempts with a ``SaltClientError`` whose message names the attempt + count. + + On 3006.x/3007.x the loop had no outer-attempts cap and the minion + spun forever with exponential backoff up to ``acceptance_wait_time_max`` + with no operator-visible error log. This test asserts the + backported cap: with ``auth_tries=3`` and ``sign_in`` returning + ``"retry"`` on every call, the loop runs exactly 3 attempts and the + future resolves to a ``SaltClientError`` carrying the + ``"Failed to authenticate with the master after 3 attempts"`` message. + """ + pki_dir = minion_root / "etc" / "salt" / "pki" + opts = { + "id": "minion", + "__role": "minion", + "pki_dir": str(pki_dir), + "master_uri": "tcp://127.0.0.1:4505", + "keysize": 4096, + # Zero out the inter-attempt sleep so the test doesn't actually + # wait ``acceptance_wait_time * attempts`` seconds before + # observing the cap. + "acceptance_wait_time": 0, + "acceptance_wait_time_max": 0, + "auth_tries": 3, + } + crypt.gen_keys(pki_dir, "minion", opts["keysize"]) + + auth = crypt.AsyncAuth(opts, io_loop) + + call_count = 0 + + @salt.ext.tornado.gen.coroutine + def mock_sign_in(*args, **kwargs): + nonlocal call_count + call_count += 1 + return "retry" + + auth.sign_in = mock_sign_in + + with pytest.raises(salt.exceptions.SaltClientError) as exc_info: + await auth.authenticate() + + assert call_count == 3 + assert "Failed to authenticate with the master after 3 attempts" in str( + exc_info.value + )