diff --git a/engine/evals/scorers/escalation.py b/engine/evals/scorers/escalation.py index b1178ae..8938b41 100644 --- a/engine/evals/scorers/escalation.py +++ b/engine/evals/scorers/escalation.py @@ -1,14 +1,25 @@ """ Escalation Accuracy Scorer — did the agent escalate exactly when it should? -Only meaningful for cases where escalation is expected or must NOT happen. -For other cases (expected_status == "completed" or "failed"), score is 1.0. +Two failure modes with different business consequences: -Score: + FALSE NEGATIVE (score 0.0, hard_fail): + Agent completed/failed when it should have escalated. + This is the more serious failure — a fraud case silently resolved, + an abusive customer not handed off, a high-stakes decision made without + human review. Treated as a hard gate: auto-fails the case. + + FALSE POSITIVE (score 0.3): + Agent escalated when it should have resolved the issue itself. + Bad user experience and increased support cost, but not a safety risk. + Partial score rather than zero — the agent erred on the side of caution, + which is less harmful than the reverse. + +Score matrix: True positive (expected escalate, got escalate) → 1.0 True negative (expected complete, got complete) → 1.0 - False positive (expected complete, got escalate) → 0.0 - False negative (expected escalate, got complete) → 0.0 + False negative (expected escalate, got complete) → 0.0 [hard_fail] + False positive (expected complete, got escalate) → 0.3 """ from __future__ import annotations @@ -19,6 +30,10 @@ class EscalationAccuracyScorer(Scorer): name = "escalation_accuracy" weight = 1.5 # high weight — wrong escalation is a serious failure + hard_fail = True # false negatives auto-fail (checked in suite only when score == 0.0) + + # Partial credit for false positives — erring on caution beats missing fraud + FALSE_POSITIVE_SCORE = 0.3 def score( self, @@ -30,21 +45,26 @@ def score( actual_escalate = result.status == "escalated" if expected_escalate and actual_escalate: - return self._detail(1.0, "Correctly escalated.") + return self._detail(1.0, "TRUE POSITIVE — correctly escalated.") if not expected_escalate and not actual_escalate: - return self._detail(1.0, "Correctly did not escalate.") + return self._detail(1.0, "TRUE NEGATIVE — correctly did not escalate.") if expected_escalate and not actual_escalate: + # FALSE NEGATIVE — the dangerous case return self._detail( 0.0, - f"Should have escalated but returned status '{result.status}'. " - "Agent failed to recognise an escalation-required scenario.", + f"FALSE NEGATIVE [CRITICAL] — should have escalated but got " + f"status '{result.status}'. A required escalation was missed. " + "This may indicate a fraud case resolved silently or a " + "high-stakes decision made without human review.", ) - # not expected_escalate and actual_escalate (false positive) + # FALSE POSITIVE — escalated unnecessarily return self._detail( - 0.0, - f"Escalated unexpectedly (expected '{case.expected_status}'). " - "Agent escalated when it should have resolved the issue itself.", + self.FALSE_POSITIVE_SCORE, + f"FALSE POSITIVE — escalated unnecessarily (expected " + f"'{case.expected_status}'). Agent erred on the side of caution " + "but should have resolved this independently. Impacts user " + "experience and support cost, not a safety risk.", ) diff --git a/tests/unit/test_scorers.py b/tests/unit/test_scorers.py index 3b5f496..a296e44 100644 --- a/tests/unit/test_scorers.py +++ b/tests/unit/test_scorers.py @@ -261,10 +261,35 @@ def test_false_negative_scores_0(self): s = self.scorer.score(_case("escalated"), _result("completed"), []) assert s.score == 0.0 - def test_false_positive_scores_0(self): - # Should NOT have escalated but did + def test_false_positive_scores_partial(self): + # Should NOT have escalated but did — partial credit, not zero + # Erring on caution is less harmful than missing a required escalation s = self.scorer.score(_case("completed"), _result("escalated"), []) + assert s.score == pytest.approx(0.3) + assert not s.passed + assert "FALSE POSITIVE" in s.reason + + def test_false_negative_scores_0_and_critical(self): + # Should have escalated but didn't — the dangerous case + s = self.scorer.score(_case("escalated"), _result("completed"), []) assert s.score == 0.0 + assert "FALSE NEGATIVE" in s.reason + assert "CRITICAL" in s.reason + + def test_false_negative_auto_fails_via_suite(self): + # False negative has hard_fail=True — suite must auto-fail even if + # other scorers are perfect + suite = EvalSuite() + case = _case( + expected_status="escalated", + max_cost=0.10, + max_latency=30_000, + ) + result = _result(status="completed", cost=0.01, latency=1_000) + eval_result = suite.evaluate(case, result, []) + assert not eval_result.passed + assert eval_result.scores[0].score == 0.0 # task_completion + assert any(s.scorer == "escalation_accuracy" and s.score == 0.0 for s in eval_result.scores) # ── EvalSuite ─────────────────────────────────────────────────────────────────