Ajayvardhanreddy · Ajayvardhanreddy · May 25, 2026 · May 25, 2026
diff --git a/engine/evals/scorers/escalation.py b/engine/evals/scorers/escalation.py
@@ -1,14 +1,25 @@
 """
 Escalation Accuracy Scorer — did the agent escalate exactly when it should?
 
-Only meaningful for cases where escalation is expected or must NOT happen.
-For other cases (expected_status == "completed" or "failed"), score is 1.0.
+Two failure modes with different business consequences:
 
-Score:
+  FALSE NEGATIVE (score 0.0, hard_fail):
+    Agent completed/failed when it should have escalated.
+    This is the more serious failure — a fraud case silently resolved,
+    an abusive customer not handed off, a high-stakes decision made without
+    human review. Treated as a hard gate: auto-fails the case.
+
+  FALSE POSITIVE (score 0.3):
+    Agent escalated when it should have resolved the issue itself.
+    Bad user experience and increased support cost, but not a safety risk.
+    Partial score rather than zero — the agent erred on the side of caution,
+    which is less harmful than the reverse.
+
+Score matrix:
   True positive  (expected escalate, got escalate)  → 1.0
   True negative  (expected complete, got complete)   → 1.0
-  False positive (expected complete, got escalate)   → 0.0
-  False negative (expected escalate, got complete)   → 0.0
+  False negative (expected escalate, got complete)   → 0.0  [hard_fail]
+  False positive (expected complete, got escalate)   → 0.3
 """
 from __future__ import annotations
 
@@ -19,6 +30,10 @@
 class EscalationAccuracyScorer(Scorer):
     name = "escalation_accuracy"
     weight = 1.5        # high weight — wrong escalation is a serious failure
+    hard_fail = True    # false negatives auto-fail (checked in suite only when score == 0.0)
+
+    # Partial credit for false positives — excess caution beats a missed escalation
+    FALSE_POSITIVE_SCORE = 0.3
 
     def score(
         self,
@@ -30,21 +45,26 @@ def score(
         actual_escalate = result.status == "escalated"
 
         if expected_escalate and actual_escalate:
-            return self._detail(1.0, "Correctly escalated.")
+            return self._detail(1.0, "TRUE POSITIVE — correctly escalated.")
 
         if not expected_escalate and not actual_escalate:
-            return self._detail(1.0, "Correctly did not escalate.")
+            return self._detail(1.0, "TRUE NEGATIVE — correctly did not escalate.")
 
         if expected_escalate and not actual_escalate:
+            # FALSE NEGATIVE — the dangerous case
             return self._detail(
                 0.0,
-                f"Should have escalated but returned status '{result.status}'. "
-                "Agent failed to recognise an escalation-required scenario.",
+                f"FALSE NEGATIVE [CRITICAL] — should have escalated but got "
+                f"status '{result.status}'. A required escalation was missed. "
+                "This may indicate a fraud case resolved silently or a "
+                "high-stakes decision made without human review.",
             )
 
-        # not expected_escalate and actual_escalate (false positive)
+        # FALSE POSITIVE — escalated unnecessarily
         return self._detail(
-            0.0,
-            f"Escalated unexpectedly (expected '{case.expected_status}'). "
-            "Agent escalated when it should have resolved the issue itself.",
+            self.FALSE_POSITIVE_SCORE,
+            f"FALSE POSITIVE — escalated unnecessarily (expected "
+            f"'{case.expected_status}'). Agent erred on the side of caution "
+            "but should have resolved this independently. Impacts user "
+            "experience and support cost, not a safety risk.",
         )
diff --git a/tests/unit/test_scorers.py b/tests/unit/test_scorers.py
@@ -261,10 +261,35 @@ def test_false_negative_scores_0(self):
         s = self.scorer.score(_case("escalated"), _result("completed"), [])
         assert s.score == 0.0
 
-    def test_false_positive_scores_0(self):
-        # Should NOT have escalated but did
+    def test_false_positive_scores_partial(self):
+        # Should NOT have escalated but did — partial credit, not zero.
+        # Excess caution is less harmful than missing a required escalation.
         s = self.scorer.score(_case("completed"), _result("escalated"), [])
+        assert s.score == pytest.approx(0.3)
+        assert not s.passed
+        assert "FALSE POSITIVE" in s.reason
+
+    def test_false_negative_scores_0_and_critical(self):
+        # Should have escalated but didn't — the dangerous case
+        s = self.scorer.score(_case("escalated"), _result("completed"), [])
         assert s.score == 0.0
+        assert "FALSE NEGATIVE" in s.reason
+        assert "CRITICAL" in s.reason
+
+    def test_false_negative_auto_fails_via_suite(self):
+        # False negative has hard_fail=True, so the suite must auto-fail. NOTE:
+        # task_completion also scores 0.0 here, so the gate is not isolated.
+        suite = EvalSuite()
+        case = _case(
+            expected_status="escalated",
+            max_cost=0.10,
+            max_latency=30_000,
+        )
+        result = _result(status="completed", cost=0.01, latency=1_000)
+        eval_result = suite.evaluate(case, result, [])
+        assert not eval_result.passed
+        assert eval_result.scores[0].score == 0.0  # task_completion
+        assert any(s.scorer == "escalation_accuracy" and s.score == 0.0 for s in eval_result.scores)
 
 
 # ── EvalSuite ─────────────────────────────────────────────────────────────────