43 changes: 39 additions & 4 deletions examples/frontdesk/frontdesk_agent.py
@@ -25,6 +25,17 @@
function_tool,
inference,
)
from livekit.agents.evals import (
JudgeGroup,
accuracy_judge,
coherence_judge,
conciseness_judge,
handoff_judge,
relevancy_judge,
safety_judge,
task_completion_judge,
tool_use_judge,
)
from livekit.plugins import silero
from livekit.plugins.turn_detector.multilingual import MultilingualModel

@@ -34,6 +45,7 @@
@dataclass
class Userdata:
cal: Calendar
appointment_booked: bool = False


logger = logging.getLogger("front-desk")
@@ -96,6 +108,8 @@ async def schedule_appointment(
# Tell the LLM this slot isn't available anymore
raise ToolError("This slot isn't available anymore") from None

ctx.userdata.appointment_booked = True

local = slot.start_time.astimezone(self.tz)
return f"The appointment was successfully scheduled for {local.strftime('%A, %B %d, %Y at %H:%M %Z')}."

@@ -160,12 +174,33 @@ async def list_available_slots(


async def on_session_end(ctx: JobContext) -> None:
# import json
report = ctx.make_session_report()

# Skip evaluation for very short conversations
chat = report.chat_history.copy(exclude_function_call=True, exclude_instructions=True)
if len(chat.items) < 3:
return

judges = JudgeGroup(
llm="openai/gpt-4o-mini",
judges=[
task_completion_judge(),
accuracy_judge(),
tool_use_judge(),
handoff_judge(),
safety_judge(),
relevancy_judge(),
coherence_judge(),
conciseness_judge(),
],
)

# report = ctx.make_session_report()
# report_json = json.dumps(report.to_cloud_data(), indent=2)
await judges.evaluate(report.chat_history)

pass
if ctx.primary_session.userdata.appointment_booked:
ctx.tagger.success()
else:
ctx.tagger.fail(reason="Appointment was not booked")

Comment on lines 176 to 204
@coderabbitai bot (Contributor) commented on Jan 21, 2026:

⚠️ Potential issue | 🟠 Major

Short-session early return skips tagging outcomes.

At Line 180–182, returning early prevents ctx.tagger.success() / fail(...) from being called, so short sessions never get tagged—even if a booking completed. Consider skipping evaluation but still tagging.

🔧 Suggested restructure
-    if len(chat.items) < 3:
-        return
-
-    judges = JudgeGroup(
-        llm="openai/gpt-4o-mini",
-        judges=[
-            task_completion_judge(),
-            accuracy_judge(),
-            tool_use_judge(),
-            handoff_judge(),
-            safety_judge(),
-            relevancy_judge(),
-            coherence_judge(),
-            conciseness_judge(),
-        ],
-    )
-
-    await judges.evaluate(report.chat_history)
+    if len(chat.items) >= 3:
+        judges = JudgeGroup(
+            llm="openai/gpt-4o-mini",
+            judges=[
+                task_completion_judge(),
+                accuracy_judge(),
+                tool_use_judge(),
+                handoff_judge(),
+                safety_judge(),
+                relevancy_judge(),
+                coherence_judge(),
+                conciseness_judge(),
+            ],
+        )
+
+        await judges.evaluate(report.chat_history)
 
     if ctx.primary_session.userdata.appointment_booked:
         ctx.tagger.success()
     else:
         ctx.tagger.fail(reason="Appointment was not booked")
🤖 Prompt for AI Agents
In `@examples/frontdesk/frontdesk_agent.py` around lines 176 - 204, The early
return in on_session_end prevents tagging short sessions; instead of returning
when len(chat.items) < 3, skip running JudgeGroup evaluation but still call the
tagging logic: create the report (ctx.make_session_report()), skip or bypass
judges.evaluate(...) when the chat is short, then call ctx.tagger.success() if
ctx.primary_session.userdata.appointment_booked is true or ctx.tagger.fail(...)
otherwise, ensuring the tagging runs regardless of whether judges were executed.

@theomonnom (Member, Author) replied:

it's fine bro

@coderabbitai bot (Contributor) replied:

@theomonnom, understood! I'll leave the implementation as is. 👍


@server.rtc_session(on_session_end=on_session_end)
4 changes: 4 additions & 0 deletions livekit-agents/livekit/agents/__init__.py
@@ -37,6 +37,8 @@
get_job_context,
)
from .llm.chat_context import (
AgentConfigUpdate,
AgentHandoff,
ChatContent,
ChatContext,
ChatItem,
@@ -149,6 +151,8 @@ def __getattr__(name: str) -> typing.Any:
"FunctionToolsExecutedEvent",
"FunctionCall",
"FunctionCallOutput",
"AgentConfigUpdate",
"AgentHandoff",
"StopResponse",
"ToolError",
"RunContext",
38 changes: 38 additions & 0 deletions livekit-agents/livekit/agents/evals/__init__.py
@@ -0,0 +1,38 @@
from .evaluation import (
EvaluationResult,
Evaluator,
JudgeGroup,
)
from .judge import (
Judge,
JudgmentResult,
Verdict,
accuracy_judge,
coherence_judge,
conciseness_judge,
handoff_judge,
relevancy_judge,
safety_judge,
task_completion_judge,
tool_use_judge,
)

__all__ = [
# Evaluation
"EvaluationResult",
"Evaluator",
"JudgeGroup",
# Core types
"Judge",
"JudgmentResult",
"Verdict",
# Built-in judges
"accuracy_judge",
"coherence_judge",
"conciseness_judge",
"handoff_judge",
"relevancy_judge",
"safety_judge",
"task_completion_judge",
"tool_use_judge",
]
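
For orientation, a minimal sketch of how these exports fit together, mirroring the frontdesk example above; the model string, the judge selection, and the `score_conversation` wrapper are illustrative rather than part of this PR:

```python
from livekit.agents import ChatContext
from livekit.agents.evals import JudgeGroup, accuracy_judge, task_completion_judge


async def score_conversation(chat_history: ChatContext) -> float:
    # Build a judge group from a model string; JudgeGroup resolves it through
    # the LiveKit inference gateway, or accepts an LLM instance directly.
    judges = JudgeGroup(
        llm="openai/gpt-4o-mini",  # illustrative model choice
        judges=[task_completion_judge(), accuracy_judge()],
    )
    result = await judges.evaluate(chat_history)
    # EvaluationResult aggregates per-judge verdicts: pass=1.0, maybe=0.5, fail=0.0.
    return result.score
```

Judges that raise are logged and dropped from `judgments`, so `score` only reflects the judges that completed (see `evaluation.py` below).
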
189 changes: 189 additions & 0 deletions livekit-agents/livekit/agents/evals/evaluation.py
@@ -0,0 +1,189 @@
from __future__ import annotations

import asyncio
import os
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Protocol

from ..llm import LLM, ChatContext
from .judge import JudgmentResult

_evals_verbose = int(os.getenv("LIVEKIT_EVALS_VERBOSE", 0))

if TYPE_CHECKING:
from ..inference import LLMModels


class Evaluator(Protocol):
"""Protocol for any object that can evaluate a conversation."""

@property
def name(self) -> str:
"""Name identifying this evaluator."""
...

async def evaluate(
self,
*,
chat_ctx: ChatContext,
reference: ChatContext | None = None,
llm: LLM | None = None,
) -> JudgmentResult: ...


@dataclass
class EvaluationResult:
"""Result of evaluating a conversation with a group of judges."""

judgments: dict[str, JudgmentResult] = field(default_factory=dict)
"""Individual judgment results keyed by judge name."""

@property
def score(self) -> float:
"""Score from 0.0 to 1.0. Pass=1, maybe=0.5, fail=0."""
if not self.judgments:
return 0.0
total = 0.0
for j in self.judgments.values():
if j.passed:
total += 1.0
elif j.uncertain:
total += 0.5
return total / len(self.judgments)

@property
def all_passed(self) -> bool:
"""True if all judgments passed. Maybes count as not passed."""
return all(j.passed for j in self.judgments.values())

@property
def any_passed(self) -> bool:
"""True if at least one judgment passed."""
return any(j.passed for j in self.judgments.values())

@property
def majority_passed(self) -> bool:
"""True if more than half of the judgments passed."""
if not self.judgments:
return True
return self.score > len(self.judgments) / 2
Comment on lines +64 to +69
@coderabbitai bot (Contributor) commented:

⚠️ Potential issue | 🟠 Major

majority_passed is mathematically incorrect.

self.score is in [0,1], but it’s compared against len(self.judgments)/2 (≥1 for 2+ judges), so this almost always returns False.

🐛 Suggested fix
     def majority_passed(self) -> bool:
         """True if more than half of the judgments passed."""
         if not self.judgments:
             return True
-        return self.score > len(self.judgments) / 2
+        passed = sum(1 for j in self.judgments.values() if j.passed)
+        return passed > len(self.judgments) / 2
🤖 Prompt for AI Agents
In `@livekit-agents/livekit/agents/evals/evaluation.py` around lines 61 - 66, The
majority_passed property compares a fractional score to a count, which is wrong;
update the logic in majority_passed (in evaluation.py) to compare like-for-like:
either check if self.score > 0.5 (since score is in [0,1]) or compute
passed_count = self.score * len(self.judgments) and compare passed_count >
len(self.judgments)/2; keep the existing empty-judgments shortcut (return True
when not self.judgments).
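
To make the scale mismatch above concrete, a standalone check with hypothetical boolean verdicts in place of real `JudgmentResult` objects:

```python
# Hypothetical: three judges, all passing.
judgments = {"task_completion": True, "accuracy": True, "safety": True}

score = sum(judgments.values()) / len(judgments)     # 1.0, the [0, 1] scale used by `score`
print(score > len(judgments) / 2)                    # False: 1.0 is never greater than 1.5
print(sum(judgments.values()) > len(judgments) / 2)  # True: 3 passes out of 3 is a majority
```
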


@property
def none_failed(self) -> bool:
"""True if no judgments explicitly failed. Maybes are allowed."""
return not any(j.failed for j in self.judgments.values())

class JudgeGroup:
"""A group of judges that evaluate conversations together.

Automatically tags the session with judgment results when called within a job context.

Example:
```python
async def on_session_end(ctx: JobContext) -> None:
judges = JudgeGroup(
llm="openai/gpt-4o-mini",
judges=[
task_completion_judge(),
accuracy_judge(),
],
)

report = ctx.make_session_report()
result = await judges.evaluate(report.chat_history)
# Results are automatically tagged to the session
```
"""

def __init__(
self,
*,
llm: LLM | LLMModels | str,
judges: list[Evaluator] | None = None,
) -> None:
"""Initialize a JudgeGroup.

Args:
llm: The LLM to use for evaluation. Can be an LLM instance or a model
string like "openai/gpt-4o-mini" (uses LiveKit inference gateway).
judges: The judges to run during evaluation.
"""
if isinstance(llm, str):
from ..inference import LLM as InferenceLLM

self._llm: LLM = InferenceLLM(llm)
else:
self._llm = llm

self._judges = judges or []

@property
def llm(self) -> LLM:
"""The LLM used for evaluation."""
return self._llm

@property
def judges(self) -> list[Evaluator]:
"""The judges to run during evaluation."""
return self._judges

async def evaluate(
self,
chat_ctx: ChatContext,
*,
reference: ChatContext | None = None,
) -> EvaluationResult:
"""Evaluate a conversation with all judges.

Automatically tags the session with results when called within a job context.

Args:
chat_ctx: The conversation to evaluate.
reference: Optional reference conversation for comparison.

Returns:
EvaluationResult containing all judgment results.
"""
from ..job import get_job_context
from ..log import logger

# Run all judges concurrently
async def run_judge(judge: Evaluator) -> tuple[str, JudgmentResult | BaseException]:
try:
result = await judge.evaluate(
chat_ctx=chat_ctx,
reference=reference,
llm=self._llm,
)
return judge.name, result
except Exception as e:
logger.warning(f"Judge '{judge.name}' failed: {e}")
return judge.name, e

results = await asyncio.gather(*[run_judge(j) for j in self._judges])

# Filter out failed judges
judgments: dict[str, JudgmentResult] = {}
for name, result in results:
if isinstance(result, JudgmentResult):
judgments[name] = result

evaluation_result = EvaluationResult(judgments=judgments)

if _evals_verbose:
print("\n+ JudgeGroup evaluation results:")
for name, result in results:
if isinstance(result, JudgmentResult):
print(f" [{name}] verdict={result.verdict}")
print(f" reasoning: {result.reasoning}\n")
else:
print(f" [{name}] ERROR: {result}\n")

# Auto-tag if running within a job context
try:
ctx = get_job_context()
ctx.tagger._evaluation(evaluation_result)
except RuntimeError:
pass # Not in a job context, skip tagging

return evaluation_result
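
Finally, a small sketch of inspecting an `EvaluationResult` directly, for instance when evaluating outside a job context where the auto-tagging above is skipped; the `summarize` helper is illustrative:

```python
from livekit.agents.evals import EvaluationResult


def summarize(result: EvaluationResult) -> None:
    # Each JudgmentResult carries the judge's verdict and its reasoning.
    for name, judgment in result.judgments.items():
        print(f"[{name}] verdict={judgment.verdict}")
        print(f"  reasoning: {judgment.reasoning}")
    # Aggregate views defined above: score, all_passed, majority_passed, none_failed.
    print(f"score={result.score:.2f} all_passed={result.all_passed}")
```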