Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 33 additions & 13 deletions engine/evals/scorers/escalation.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,25 @@
"""
Escalation Accuracy Scorer — did the agent escalate exactly when it should?

Only meaningful for cases where escalation is expected or must NOT happen.
For other cases (expected_status == "completed" or "failed"), score is 1.0.
Two failure modes with different business consequences:

Score:
FALSE NEGATIVE (score 0.0, hard_fail):
Agent completed/failed when it should have escalated.
This is the more serious failure — a fraud case silently resolved,
an abusive customer not handed off, a high-stakes decision made without
human review. Treated as a hard gate: auto-fails the case.

FALSE POSITIVE (score 0.3):
Agent escalated when it should have resolved the issue itself.
Bad user experience and increased support cost, but not a safety risk.
Partial score rather than zero — the agent erred on the side of caution,
which is less harmful than the reverse.

Score matrix:
True positive (expected escalate, got escalate) → 1.0
True negative (expected complete, got complete) → 1.0
False negative (expected escalate, got complete) → 0.0 [hard_fail]
False positive (expected complete, got escalate) → 0.3
"""
from __future__ import annotations

Expand All @@ -19,6 +30,10 @@
class EscalationAccuracyScorer(Scorer):
name = "escalation_accuracy"
weight = 1.5 # high weight — wrong escalation is a serious failure
hard_fail = True # false negatives auto-fail (checked in suite only when score == 0.0)

# Partial credit for false positives — erring on caution beats missing fraud
FALSE_POSITIVE_SCORE = 0.3

def score(
self,
Expand All @@ -30,21 +45,26 @@ def score(
actual_escalate = result.status == "escalated"

if expected_escalate and actual_escalate:
return self._detail(1.0, "Correctly escalated.")
return self._detail(1.0, "TRUE POSITIVE — correctly escalated.")

if not expected_escalate and not actual_escalate:
return self._detail(1.0, "Correctly did not escalate.")
return self._detail(1.0, "TRUE NEGATIVE — correctly did not escalate.")

if expected_escalate and not actual_escalate:
# FALSE NEGATIVE — the dangerous case
return self._detail(
0.0,
f"Should have escalated but returned status '{result.status}'. "
"Agent failed to recognise an escalation-required scenario.",
f"FALSE NEGATIVE [CRITICAL] — should have escalated but got "
f"status '{result.status}'. A required escalation was missed. "
"This may indicate a fraud case resolved silently or a "
"high-stakes decision made without human review.",
)

# not expected_escalate and actual_escalate (false positive)
# FALSE POSITIVE — escalated unnecessarily
return self._detail(
0.0,
f"Escalated unexpectedly (expected '{case.expected_status}'). "
"Agent escalated when it should have resolved the issue itself.",
self.FALSE_POSITIVE_SCORE,
f"FALSE POSITIVE — escalated unnecessarily (expected "
f"'{case.expected_status}'). Agent erred on the side of caution "
"but should have resolved this independently. Impacts user "
"experience and support cost, not a safety risk.",
)
29 changes: 27 additions & 2 deletions tests/unit/test_scorers.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,10 +261,35 @@ def test_false_negative_scores_0(self):
s = self.scorer.score(_case("escalated"), _result("completed"), [])
assert s.score == 0.0

def test_false_positive_scores_0(self):
# Should NOT have escalated but did
def test_false_positive_scores_partial(self):
    """An unnecessary escalation earns partial credit but does not pass.

    The case expects "completed" while the result reports "escalated".
    Over-escalating is treated as less harmful than missing a required
    escalation, so the score is 0.3 rather than zero.
    """
    unexpected_escalation = self.scorer.score(
        _case("completed"),
        _result("escalated"),
        [],
    )
    assert unexpected_escalation.score == pytest.approx(0.3)
    assert not unexpected_escalation.passed
    assert "FALSE POSITIVE" in unexpected_escalation.reason

def test_false_negative_scores_0_and_critical(self):
    """A missed escalation scores zero and its reason is labelled critical.

    The case expects "escalated" while the result reports "completed" —
    the dangerous direction of the two possible mismatches.
    """
    missed_escalation = self.scorer.score(
        _case("escalated"),
        _result("completed"),
        [],
    )
    assert missed_escalation.score == 0.0
    # Both labels must appear in the human-readable reason text.
    for label in ("FALSE NEGATIVE", "CRITICAL"):
        assert label in missed_escalation.reason

def test_false_negative_auto_fails_via_suite(self):
    """A missed escalation must fail the whole suite evaluation.

    The escalation scorer is marked hard_fail, so the suite is expected to
    report failure even though cost and latency sit well within the case
    limits.
    """
    eval_suite = EvalSuite()
    escalation_case = _case(
        expected_status="escalated",
        max_cost=0.10,
        max_latency=30_000,
    )
    silent_completion = _result(status="completed", cost=0.01, latency=1_000)

    outcome = eval_suite.evaluate(escalation_case, silent_completion, [])

    assert not outcome.passed
    # Index 0 is expected to hold the task_completion score.
    assert outcome.scores[0].score == 0.0

    # Look for a zero score reported by the escalation scorer; stop at the
    # first match.
    escalation_zeroed = False
    for entry in outcome.scores:
        if entry.scorer == "escalation_accuracy" and entry.score == 0.0:
            escalation_zeroed = True
            break
    assert escalation_zeroed


# ── EvalSuite ─────────────────────────────────────────────────────────────────
Expand Down
Loading