diff --git a/changelog/68326.fixed.md b/changelog/68326.fixed.md new file mode 100644 index 000000000000..59817e576fea --- /dev/null +++ b/changelog/68326.fixed.md @@ -0,0 +1 @@ +Fixed the `salt.state` orchestrate state silently reporting only `Run failed on minions: ` when a targeted minion returned `False`, no return at all, or a list of error strings. The orchestrate comment now includes the per-minion failure detail (the minion's actual return value or "did not return a state result") so operators can diagnose `salt-run state.orchestrate` failures without re-running with extra logging. diff --git a/salt/states/saltmod.py b/salt/states/saltmod.py index fdd4692c0f4d..488341c3e48b 100644 --- a/salt/states/saltmod.py +++ b/salt/states/saltmod.py @@ -98,6 +98,48 @@ def run_thread(): return outputs +def _format_failure_detail(minion, m_ret, m_no_return): + """ + Build a one-line human-readable detail string explaining a + minion-level failure that did *not* produce a state-result dict. + + The orchestrate state historically reported these failures as a bare + ``Run failed on minions: `` line with an opaque ``False`` (or + list of error strings) buried in ``changes``. That left operators + with no clue what went wrong (issue #68326). This helper produces + the per-minion line we append to that comment. + + The contract: + + * ``m_no_return=True`` means the minion's response had no ``ret`` + key at all (the master flagged it ``failed: True``, or otherwise + lost the return). We say so explicitly. + * A list/tuple ``m_ret`` is treated as a sequence of error strings + (state.sls returns this shape for pillar errors, queue conflicts, + and similar pre-compilation failures) and joined. + * Any other non-dict ``m_ret`` (``False`` is the case that triggered + the report) is reproduced literally so the operator can see what + came back. + * A dict ``m_ret`` reaching this helper means + ``state.check_result`` rejected the high-level shape; we omit a + detail line and let the existing ``changes`` payload speak for + itself rather than dumping the full dict into the comment. + + Returns an empty string when there is no useful one-line summary to + add; callers should treat empty as "nothing to append". + """ + if m_no_return: + return f"Minion {minion} did not return a state result." + if isinstance(m_ret, (list, tuple)): + parts = [str(item) for item in m_ret if item] + if not parts: + return f"Minion {minion} returned an empty error list." + return "Minion {} returned errors: {}".format(minion, "; ".join(parts)) + if isinstance(m_ret, dict): + return "" + return f"Minion {minion} returned {m_ret!r} instead of a state result." + + def state( name, tgt, @@ -364,6 +406,12 @@ def state( changes = {} fail = set() + # Per-minion human-readable detail collected for failures whose minion + # return value isn't a state-result dict (e.g. False, a list of error + # strings, or no return at all). Surfaced in the final comment so users + # see *why* their orchestrate run failed instead of only seeing the + # opaque minion-id list (issue #68326). + fail_detail = {} no_change = set() if fail_minions is None: @@ -386,24 +434,41 @@ def state( log.warning("Output from salt state not highstate") m_ret = False + # Track whether the minion had no usable return at all, separate + # from "the return was present but indicated failure" so we can + # explain the difference in the comment. + m_no_return = False if "return" in mdata and "ret" not in mdata: mdata["ret"] = mdata.pop("return") m_state = True + # Capture whatever the minion actually returned (if anything) for + # use in failure-detail formatting. The legacy code only consumes + # ``mdata["ret"]`` on the non-failed path; for `failed: True` + # responses with a useful ``ret`` payload we still want to show + # the user what came back rather than reporting a bare ``False``. + m_raw_ret = mdata.get("ret", mdata.get("return", None)) if mdata.get("failed", False): m_state = False + m_no_return = "ret" not in mdata and "return" not in mdata else: try: m_ret = mdata["ret"] except KeyError: m_state = False + m_no_return = True if m_state: m_state = __utils__["state.check_result"](m_ret, recurse=True) if not m_state: if minion not in fail_minions: fail.add(minion) + fail_detail[minion] = _format_failure_detail( + minion, + m_raw_ret if m_raw_ret is not None else m_ret, + m_no_return, + ) changes[minion] = m_ret continue try: @@ -423,6 +488,17 @@ def state( if len(fail) > allow_fail: state_ret["result"] = False state_ret["comment"] = "Run failed on minions: {}".format(", ".join(fail)) + # Append per-minion detail for any failures whose minion return + # was not a state-result dict — without this, the orchestrate + # output is just an opaque ``False`` in ``changes`` and gives the + # operator no clue what went wrong (issue #68326). + details = [ + fail_detail[minion] + for minion in sorted(fail) + if minion in fail_detail and fail_detail[minion] + ] + if details: + state_ret["comment"] += "\n" + "\n".join(details) else: state_ret["comment"] = "States ran successfully." if changes: diff --git a/tests/pytests/unit/states/saltmod/test_state.py b/tests/pytests/unit/states/saltmod/test_state.py index 44100617338a..c9915ed7e60f 100644 --- a/tests/pytests/unit/states/saltmod/test_state.py +++ b/tests/pytests/unit/states/saltmod/test_state.py @@ -740,6 +740,118 @@ def test_state_failed_and_expected_minions(): assert ret == expected +def test_state_minion_ret_false_includes_detail_in_comment(): + """ + When a targeted minion's state return value is the scalar ``False`` + (for example because the minion timed out at the master level or + because state.sls on the minion failed before any state ran), the + ``salt.state`` orchestrate state must surface that detail in its + comment instead of leaving the user with only ``Run failed on + minions: `` and an opaque ``False`` in ``changes``. + + Regression test for #68326. + """ + name = "state" + tgt = "minion1" + + cmd_ret = { + "10.0.0.1": { + "jid": "20170406104341210934", + "retcode": 1, + "ret": False, + "out": "highstate", + }, + } + + with patch.dict(saltmod.__opts__, {"test": False}): + mock = MagicMock(return_value=cmd_ret) + with patch.dict(saltmod.__salt__, {"saltutil.cmd": mock}): + ret = saltmod.state(name, tgt, highstate=True) + + assert ret["result"] is False + assert ret["changes"] == {"out": "highstate", "ret": {"10.0.0.1": False}} + # The bare "Run failed on minions: ..." comment from before the fix + # is exactly what the issue complains about. The comment must now + # include the minion identifier *and* useful detail about what came + # back so that the user can begin to diagnose the failure. + assert "10.0.0.1" in ret["comment"] + assert ret["comment"] != "Run failed on minions: 10.0.0.1" + lowered = ret["comment"].lower() + assert "false" in lowered or "no return" in lowered or "did not return" in lowered + + +def test_state_minion_failed_flag_includes_detail_in_comment(): + """ + When ``saltutil.cmd`` flags a minion's response with ``failed: True`` + but no useful ``ret`` payload was attached, the orchestrate state + must say so in its comment instead of returning a bare + "Run failed on minions: " line. + + Regression test for #68326. + """ + name = "state" + tgt = "minion1" + + cmd_ret = { + "10.0.0.1": { + "jid": "20170406104341210934", + "retcode": 1, + "failed": True, + "out": "highstate", + }, + } + + with patch.dict(saltmod.__opts__, {"test": False}): + mock = MagicMock(return_value=cmd_ret) + with patch.dict(saltmod.__salt__, {"saltutil.cmd": mock}): + ret = saltmod.state(name, tgt, highstate=True) + + assert ret["result"] is False + assert "10.0.0.1" in ret["comment"] + assert ret["comment"] != "Run failed on minions: 10.0.0.1" + assert ( + "did not return" in ret["comment"].lower() + or "no return" in ret["comment"].lower() + ) + + +def test_state_minion_ret_error_list_includes_detail_in_comment(): + """ + When a minion's state return value is a list of error strings (for + example pillar render errors, or a state.sls queue conflict), the + orchestrate state must include those error messages in its comment + so the user can see why the run failed. + + Regression test for #68326. + """ + name = "state" + tgt = "minion1" + + error_msg = ( + 'The function "state.highstate" is running as PID 1234 and was ' + "started at 2025, Sep 12 22:23:04.123456 with jid 20250912142302454180" + ) + cmd_ret = { + "10.0.0.1": { + "jid": "20170406104341210934", + "retcode": 1, + "ret": [error_msg], + "out": "highstate", + }, + } + + with patch.dict(saltmod.__opts__, {"test": False}): + mock = MagicMock(return_value=cmd_ret) + with patch.dict(saltmod.__salt__, {"saltutil.cmd": mock}): + ret = saltmod.state(name, tgt, highstate=True) + + assert ret["result"] is False + assert "10.0.0.1" in ret["comment"] + # The error message text must appear in the orchestrate comment, + # not just be hidden inside `changes`. + assert error_msg in ret["comment"] + + def test_state_allow_fail(): name = "state"