diff --git a/src/serving/api/alerts/dispatcher.py b/src/serving/api/alerts/dispatcher.py index 6dd432e..14930ad 100644 --- a/src/serving/api/alerts/dispatcher.py +++ b/src/serving/api/alerts/dispatcher.py @@ -218,7 +218,13 @@ def next_escalation_step( if step.level > alert.last_escalation_level and elapsed_minutes >= step.after_minutes ] if due_steps: - return due_steps[-1] + # Advance exactly one level per evaluation tick — the lowest level above + # the current one — so every intermediate escalation target is paged. + # Returning the highest due step (due_steps[-1]) silently skipped the + # on-call recipients of intervening levels whenever two or more became + # due between ticks (sparse polling, restart catch-up). + # (audit_28_06_26.md §5 medium: escalation skips intermediate levels) + return min(due_steps, key=lambda step: step.level) if ( len(alert.escalation) == 1 and alert.last_escalation_level == alert.escalation[0].level diff --git a/tests/unit/test_alert_escalation_delivery.py b/tests/unit/test_alert_escalation_delivery.py index b34d45f..14b0c48 100644 --- a/tests/unit/test_alert_escalation_delivery.py +++ b/tests/unit/test_alert_escalation_delivery.py @@ -13,7 +13,11 @@ import pytest import src.serving.api.alerts.escalation as escalation -from src.serving.api.alerts.dispatcher import AlertEscalationStep, AlertRule +from src.serving.api.alerts.dispatcher import ( + AlertEscalationStep, + AlertRule, + next_escalation_step, +) _NOW = datetime(2026, 6, 28, 12, 0, tzinfo=UTC) @@ -107,3 +111,55 @@ async def test_escalation_level_not_advanced_on_delivery_failure( assert result.last_escalation_level == 1 # not advanced to 2 on failed delivery assert triggered == 0 + + +# --- next_escalation_step: no intermediate-level skip (audit_28_06_26.md §5 medium) --- + +_FIRED = datetime(2026, 6, 28, 11, 0, tzinfo=UTC) # 60 min before _NOW + + +def _three_level_alert(last_level: int) -> AlertRule: + return _alert( + escalation=[ + AlertEscalationStep(level=1, after_minutes=0, webhook_url="https://h/l1"), + AlertEscalationStep(level=2, after_minutes=10, webhook_url="https://h/l2"), + AlertEscalationStep(level=3, after_minutes=20, webhook_url="https://h/l3"), + ], + fired_at=_FIRED, + state="firing", + last_escalation_level=last_level, + ) + + +def test_next_escalation_step_advances_one_level_at_a_time() -> None: + # All three levels are due at _NOW (60 min elapsed >= 0/10/20), but a single + # tick must page the *next* level (2), not jump to the highest (3) and skip + # level 2's on-call recipient. + step = next_escalation_step(_three_level_alert(last_level=1), _NOW) + assert step is not None + assert step.level == 2 + assert step.webhook_url == "https://h/l2" + + # After level 2 is recorded, the following tick advances to level 3. + step = next_escalation_step(_three_level_alert(last_level=2), _NOW) + assert step is not None + assert step.level == 3 + + # Once the top level is reached there is nothing further to escalate to. + assert next_escalation_step(_three_level_alert(last_level=3), _NOW) is None + + +def test_next_escalation_step_picks_lowest_due_regardless_of_list_order() -> None: + # Robust to an escalation list that is not sorted by level. + alert = _alert( + escalation=[ + AlertEscalationStep(level=3, after_minutes=20, webhook_url="https://h/l3"), + AlertEscalationStep(level=2, after_minutes=10, webhook_url="https://h/l2"), + ], + fired_at=_FIRED, + state="firing", + last_escalation_level=1, + ) + step = next_escalation_step(alert, _NOW) + assert step is not None + assert step.level == 2