From 2f286d8387200687be0c781cb02c28e3a287f3c5 Mon Sep 17 00:00:00 2001 From: JuliaEdom Date: Mon, 29 Jun 2026 13:45:24 +0300 Subject: [PATCH] =?UTF-8?q?fix(alerts):=20page=20every=20intermediate=20es?= =?UTF-8?q?calation=20level=20(audit=5F28=20=C2=A75)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit next_escalation_step returned the highest *due* step (due_steps[-1]). When two or more escalation levels became due between evaluation ticks (sparse polling, restart catch-up), the on-call recipients of the intervening levels were silently skipped — only the top level's webhook fired. Advance exactly one level per tick (the lowest due level above the current one, robust to unsorted escalation lists) so each tier is paged. Adds next_escalation_step regression tests (3-level all-due -> level 2 then 3, never jumping straight to 3; lowest-due-regardless-of-order). No public contract change; no-Docker verified (ruff/format/mypy clean, 75 alert/dispatcher unit tests pass). Co-Authored-By: Claude Opus 4.8 (1M context) --- src/serving/api/alerts/dispatcher.py | 8 ++- tests/unit/test_alert_escalation_delivery.py | 58 +++++++++++++++++++- 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/src/serving/api/alerts/dispatcher.py b/src/serving/api/alerts/dispatcher.py index 6dd432e..14930ad 100644 --- a/src/serving/api/alerts/dispatcher.py +++ b/src/serving/api/alerts/dispatcher.py @@ -218,7 +218,13 @@ def next_escalation_step( if step.level > alert.last_escalation_level and elapsed_minutes >= step.after_minutes ] if due_steps: - return due_steps[-1] + # Advance exactly one level per evaluation tick — the lowest level above + # the current one — so every intermediate escalation target is paged. + # Returning the highest due step (due_steps[-1]) silently skipped the + # on-call recipients of intervening levels whenever two or more became + # due between ticks (sparse polling, restart catch-up). + # (audit_28_06_26.md §5 medium: escalation skips intermediate levels) + return min(due_steps, key=lambda step: step.level) if ( len(alert.escalation) == 1 and alert.last_escalation_level == alert.escalation[0].level diff --git a/tests/unit/test_alert_escalation_delivery.py b/tests/unit/test_alert_escalation_delivery.py index b34d45f..14b0c48 100644 --- a/tests/unit/test_alert_escalation_delivery.py +++ b/tests/unit/test_alert_escalation_delivery.py @@ -13,7 +13,11 @@ import pytest import src.serving.api.alerts.escalation as escalation -from src.serving.api.alerts.dispatcher import AlertEscalationStep, AlertRule +from src.serving.api.alerts.dispatcher import ( + AlertEscalationStep, + AlertRule, + next_escalation_step, +) _NOW = datetime(2026, 6, 28, 12, 0, tzinfo=UTC) @@ -107,3 +111,55 @@ async def test_escalation_level_not_advanced_on_delivery_failure( assert result.last_escalation_level == 1 # not advanced to 2 on failed delivery assert triggered == 0 + + +# --- next_escalation_step: no intermediate-level skip (audit_28_06_26.md §5 medium) --- + +_FIRED = datetime(2026, 6, 28, 11, 0, tzinfo=UTC) # 60 min before _NOW + + +def _three_level_alert(last_level: int) -> AlertRule: + return _alert( + escalation=[ + AlertEscalationStep(level=1, after_minutes=0, webhook_url="https://h/l1"), + AlertEscalationStep(level=2, after_minutes=10, webhook_url="https://h/l2"), + AlertEscalationStep(level=3, after_minutes=20, webhook_url="https://h/l3"), + ], + fired_at=_FIRED, + state="firing", + last_escalation_level=last_level, + ) + + +def test_next_escalation_step_advances_one_level_at_a_time() -> None: + # All three levels are due at _NOW (60 min elapsed >= 0/10/20), but a single + # tick must page the *next* level (2), not jump to the highest (3) and skip + # level 2's on-call recipient. + step = next_escalation_step(_three_level_alert(last_level=1), _NOW) + assert step is not None + assert step.level == 2 + assert step.webhook_url == "https://h/l2" + + # After level 2 is recorded, the following tick advances to level 3. + step = next_escalation_step(_three_level_alert(last_level=2), _NOW) + assert step is not None + assert step.level == 3 + + # Once the top level is reached there is nothing further to escalate to. + assert next_escalation_step(_three_level_alert(last_level=3), _NOW) is None + + +def test_next_escalation_step_picks_lowest_due_regardless_of_list_order() -> None: + # Robust to an escalation list that is not sorted by level. + alert = _alert( + escalation=[ + AlertEscalationStep(level=3, after_minutes=20, webhook_url="https://h/l3"), + AlertEscalationStep(level=2, after_minutes=10, webhook_url="https://h/l2"), + ], + fired_at=_FIRED, + state="firing", + last_escalation_level=1, + ) + step = next_escalation_step(alert, _NOW) + assert step is not None + assert step.level == 2