deepeval.py
from typing import Any, Dict, List, Optional

from eval_protocol.models import EvaluateResult, MetricResult
from eval_protocol.typed_interface import reward_function

__all__ = ["adapt_metric"]

try:
    from deepeval.metrics.base_metric import BaseConversationalMetric, BaseMetric
    from deepeval.test_case import ConversationalTestCase, LLMTestCase
except Exception:  # pragma: no cover - deepeval is optional
    BaseMetric = None
    BaseConversationalMetric = None
    LLMTestCase = None
    ConversationalTestCase = None
def _metric_name(metric: Any) -> str:
    """Return a human-readable name for a deepeval metric instance."""
    # Prefer the metric's ``__name__``, skipping the generic base-class
    # placeholders deepeval uses.
    name = getattr(metric, "__name__", None)
    if name and name not in {
        "Base Metric",
        "Base Conversational Metric",
        "Base Multimodal Metric",
    }:
        return str(name)
    name = getattr(metric, "name", None)
    if name:
        return str(name)
    return metric.__class__.__name__
def adapt_metric(metric: Any):
    """Adapt a deepeval metric object into an Eval Protocol reward function."""

    @reward_function
    def wrapped(
        messages: List[Dict[str, Any]],
        ground_truth: Optional[str] = None,
        **kwargs: Any,
    ) -> EvaluateResult:
        if BaseMetric is None or LLMTestCase is None:
            raise ImportError("deepeval must be installed to use this integration")
        if not messages:
            return EvaluateResult(score=0.0, reason="No messages", metrics={})

        # Treat the last message as the model output and the message before
        # it, when present, as the input prompt.
        output = messages[-1].get("content", "")
        input_msg = ""
        if len(messages) >= 2:
            input_msg = messages[-2].get("content", "")
        def _build_case_kwargs(case_input: str, case_output: str) -> Dict[str, Any]:
            """Build LLMTestCase kwargs, honoring the metric's declared evaluation_params."""
            case_kwargs: Dict[str, Any] = {}
            params = getattr(metric, "evaluation_params", None)
            if params:
                for param in params:
                    if param.value == "input":
                        case_kwargs["input"] = case_input
                    elif param.value == "actual_output":
                        case_kwargs["actual_output"] = case_output
                    elif param.value == "expected_output":
                        case_kwargs["expected_output"] = ground_truth
                    elif param.value == "context":
                        case_kwargs["context"] = kwargs.get("context")
                    elif param.value == "retrieval_context":
                        case_kwargs["retrieval_context"] = kwargs.get("retrieval_context")
                    elif param.value == "tools_called":
                        case_kwargs["tools_called"] = kwargs.get("tools_called")
                    elif param.value == "expected_tools":
                        case_kwargs["expected_tools"] = kwargs.get("expected_tools")
            else:
                case_kwargs = {
                    "input": case_input,
                    "actual_output": case_output,
                    "expected_output": ground_truth,
                }
            # LLMTestCase always needs input and actual_output, even when the
            # metric does not declare them.
            if "input" not in case_kwargs:
                case_kwargs["input"] = case_input
            if "actual_output" not in case_kwargs:
                case_kwargs["actual_output"] = case_output
            return case_kwargs
        if BaseConversationalMetric is not None and isinstance(metric, BaseConversationalMetric):
            # Narrow types for optional imports to satisfy the type checker.
            assert LLMTestCase is not None
            assert ConversationalTestCase is not None
            # Build one turn per message, pairing each message with the one
            # before it as that turn's input.
            turns = []
            for i, msg in enumerate(messages):
                turn_input = messages[i - 1].get("content", "") if i > 0 else ""
                turn_output = msg.get("content", "")
                turns.append(LLMTestCase(**_build_case_kwargs(turn_input, turn_output)))
            test_case = ConversationalTestCase(turns=turns)
        else:
            # Narrow types for optional imports to satisfy the type checker.
            assert LLMTestCase is not None
            test_case = LLMTestCase(**_build_case_kwargs(input_msg, output))
        # Guard against metric.measure being None or non-callable.
        measure_fn = getattr(metric, "measure", None)
        if not callable(measure_fn):
            raise TypeError("Provided metric does not have a callable 'measure' method")
        measure_fn(test_case, **kwargs)

        score = float(metric.score or 0.0)
        reason = getattr(metric, "reason", None)
        name = _metric_name(metric)
        metrics = {name: MetricResult(score=score, reason=reason or "", is_score_valid=True)}
        return EvaluateResult(score=score, reason=reason, metrics=metrics)

    return wrapped
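

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the adapter). Assumes deepeval is
# installed and its default judge model is configured (e.g. OPENAI_API_KEY is
# set), and that the @reward_function-decorated callable accepts a plain list
# of role/content dicts. AnswerRelevancyMetric is a real deepeval metric; the
# sample messages below are made up.
if __name__ == "__main__":
    from deepeval.metrics import AnswerRelevancyMetric

    relevancy_reward = adapt_metric(AnswerRelevancyMetric(threshold=0.7))
    result = relevancy_reward(
        messages=[
            {"role": "user", "content": "What is the capital of France?"},
            {"role": "assistant", "content": "Paris is the capital of France."},
        ]
    )
    print(result.score, result.reason)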