43 changes: 39 additions & 4 deletions examples/frontdesk/frontdesk_agent.py
@@ -25,6 +25,17 @@
function_tool,
inference,
)
from livekit.agents.evals import (
JudgeGroup,
accuracy_judge,
coherence_judge,
conciseness_judge,
handoff_judge,
relevancy_judge,
safety_judge,
task_completion_judge,
tool_use_judge,
)
from livekit.plugins import silero
from livekit.plugins.turn_detector.multilingual import MultilingualModel

@@ -34,6 +45,7 @@
@dataclass
class Userdata:
cal: Calendar
appointment_booked: bool = False


logger = logging.getLogger("front-desk")
@@ -96,6 +108,8 @@ async def schedule_appointment(
# Tell the LLM this slot isn't available anymore
raise ToolError("This slot isn't available anymore") from None

ctx.userdata.appointment_booked = True

local = slot.start_time.astimezone(self.tz)
return f"The appointment was successfully scheduled for {local.strftime('%A, %B %d, %Y at %H:%M %Z')}."

@@ -160,12 +174,33 @@ async def list_available_slots(


async def on_session_end(ctx: JobContext) -> None:
# import json
report = ctx.make_session_report()

# Skip evaluation for very short conversations
chat = report.chat_history.copy(exclude_function_call=True, exclude_instructions=True)
if len(chat.items) < 3:
return

judges = JudgeGroup(
llm="openai/gpt-4o-mini",
judges=[
task_completion_judge(),
accuracy_judge(),
tool_use_judge(),
handoff_judge(),
safety_judge(),
relevancy_judge(),
coherence_judge(),
conciseness_judge(),
],
)

# report = ctx.make_session_report()
# report_json = json.dumps(report.to_cloud_data(), indent=2)
await judges.evaluate(report.chat_history)

pass
if ctx.primary_session.userdata.appointment_booked:
ctx.tagger.success()
else:
ctx.tagger.fail(reason="Appointment was not booked")

Comment on lines 176 to 204
@coderabbitai bot (Contributor) commented on Jan 21, 2026:

⚠️ Potential issue | 🟠 Major

Short-session early return skips tagging outcomes.

At Line 180–182, returning early prevents ctx.tagger.success() / fail(...) from being called, so short sessions never get tagged—even if a booking completed. Consider skipping evaluation but still tagging.

🔧 Suggested restructure
-    if len(chat.items) < 3:
-        return
-
-    judges = JudgeGroup(
-        llm="openai/gpt-4o-mini",
-        judges=[
-            task_completion_judge(),
-            accuracy_judge(),
-            tool_use_judge(),
-            handoff_judge(),
-            safety_judge(),
-            relevancy_judge(),
-            coherence_judge(),
-            conciseness_judge(),
-        ],
-    )
-
-    await judges.evaluate(report.chat_history)
+    if len(chat.items) >= 3:
+        judges = JudgeGroup(
+            llm="openai/gpt-4o-mini",
+            judges=[
+                task_completion_judge(),
+                accuracy_judge(),
+                tool_use_judge(),
+                handoff_judge(),
+                safety_judge(),
+                relevancy_judge(),
+                coherence_judge(),
+                conciseness_judge(),
+            ],
+        )
+
+        await judges.evaluate(report.chat_history)
 
     if ctx.primary_session.userdata.appointment_booked:
         ctx.tagger.success()
     else:
         ctx.tagger.fail(reason="Appointment was not booked")
🤖 Prompt for AI Agents
In `@examples/frontdesk/frontdesk_agent.py` around lines 176 - 204, The early
return in on_session_end prevents tagging short sessions; instead of returning
when len(chat.items) < 3, skip running JudgeGroup evaluation but still call the
tagging logic: create the report (ctx.make_session_report()), skip or bypass
judges.evaluate(...) when the chat is short, then call ctx.tagger.success() if
ctx.primary_session.userdata.appointment_booked is true or ctx.tagger.fail(...)
otherwise, ensuring the tagging runs regardless of whether judges were executed.

@theomonnom (Member, Author) replied:

it's fine bro

@coderabbitai bot (Contributor) replied:

@theomonnom, understood! I'll leave the implementation as is. 👍


@server.rtc_session(on_session_end=on_session_end)
4 changes: 4 additions & 0 deletions livekit-agents/livekit/agents/__init__.py
@@ -37,6 +37,8 @@
get_job_context,
)
from .llm.chat_context import (
AgentConfigUpdate,
AgentHandoff,
ChatContent,
ChatContext,
ChatItem,
@@ -149,6 +151,8 @@ def __getattr__(name: str) -> typing.Any:
"FunctionToolsExecutedEvent",
"FunctionCall",
"FunctionCallOutput",
"AgentConfigUpdate",
"AgentHandoff",
"StopResponse",
"ToolError",
"RunContext",
38 changes: 38 additions & 0 deletions livekit-agents/livekit/agents/evals/__init__.py
@@ -0,0 +1,38 @@
from .evaluation import (
EvaluationResult,
Evaluator,
JudgeGroup,
)
from .judge import (
Judge,
JudgmentResult,
Verdict,
accuracy_judge,
coherence_judge,
conciseness_judge,
handoff_judge,
relevancy_judge,
safety_judge,
task_completion_judge,
tool_use_judge,
)

__all__ = [
# Evaluation
"EvaluationResult",
"Evaluator",
"JudgeGroup",
# Core types
"Judge",
"JudgmentResult",
"Verdict",
# Built-in judges
"accuracy_judge",
"coherence_judge",
"conciseness_judge",
"handoff_judge",
"relevancy_judge",
"safety_judge",
"task_completion_judge",
"tool_use_judge",
]
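
For orientation, a minimal sketch of how these exports fit together, mirroring the frontdesk example above; the model string, the judge selection, and the `score_conversation` wrapper are illustrative rather than part of this PR:

```python
from livekit.agents import ChatContext
from livekit.agents.evals import JudgeGroup, accuracy_judge, task_completion_judge


async def score_conversation(chat_history: ChatContext) -> float:
    # Build a judge group from a model string; JudgeGroup resolves it through
    # the LiveKit inference gateway, or accepts an LLM instance directly.
    judges = JudgeGroup(
        llm="openai/gpt-4o-mini",  # illustrative model choice
        judges=[task_completion_judge(), accuracy_judge()],
    )
    result = await judges.evaluate(chat_history)
    # EvaluationResult aggregates per-judge verdicts: pass=1.0, maybe=0.5, fail=0.0.
    return result.score
```

Judges that raise are logged and dropped from `judgments`, so `score` only reflects the judges that completed (see `evaluation.py` below).
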
189 changes: 189 additions & 0 deletions livekit-agents/livekit/agents/evals/evaluation.py
@@ -0,0 +1,189 @@
from __future__ import annotations

import asyncio
import os
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Protocol

from ..llm import LLM, ChatContext
from .judge import JudgmentResult

_evals_verbose = int(os.getenv("LIVEKIT_EVALS_VERBOSE", 0))

if TYPE_CHECKING:
from ..inference import LLMModels


class Evaluator(Protocol):
"""Protocol for any object that can evaluate a conversation."""

@property
def name(self) -> str:
"""Name identifying this evaluator."""
...

async def evaluate(
self,
*,
chat_ctx: ChatContext,
reference: ChatContext | None = None,
llm: LLM | None = None,
) -> JudgmentResult: ...


@dataclass
class EvaluationResult:
"""Result of evaluating a conversation with a group of judges."""

judgments: dict[str, JudgmentResult] = field(default_factory=dict)
"""Individual judgment results keyed by judge name."""

@property
def score(self) -> float:
"""Score from 0.0 to 1.0. Pass=1, maybe=0.5, fail=0."""
if not self.judgments:
return 0.0
total = 0.0
for j in self.judgments.values():
if j.passed:
total += 1.0
elif j.uncertain:
total += 0.5
return total / len(self.judgments)

@property
def all_passed(self) -> bool:
"""True if all judgments passed. Maybes count as not passed."""
return all(j.passed for j in self.judgments.values())

@property
def any_passed(self) -> bool:
"""True if at least one judgment passed."""
return any(j.passed for j in self.judgments.values())

@property
def majority_passed(self) -> bool:
"""True if more than half of the judgments passed."""
if not self.judgments:
return True
return self.score > len(self.judgments) / 2
Comment on lines +64 to +69
@coderabbitai bot (Contributor) commented:

⚠️ Potential issue | 🟠 Major

majority_passed is mathematically incorrect.

self.score is in [0,1], but it’s compared against len(self.judgments)/2 (≥1 for 2+ judges), so this almost always returns False.

🐛 Suggested fix
     def majority_passed(self) -> bool:
         """True if more than half of the judgments passed."""
         if not self.judgments:
             return True
-        return self.score > len(self.judgments) / 2
+        passed = sum(1 for j in self.judgments.values() if j.passed)
+        return passed > len(self.judgments) / 2
🤖 Prompt for AI Agents
In `@livekit-agents/livekit/agents/evals/evaluation.py` around lines 61 - 66, The
majority_passed property compares a fractional score to a count, which is wrong;
update the logic in majority_passed (in evaluation.py) to compare like-for-like:
either check if self.score > 0.5 (since score is in [0,1]) or compute
passed_count = self.score * len(self.judgments) and compare passed_count >
len(self.judgments)/2; keep the existing empty-judgments shortcut (return True
when not self.judgments).
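
To make the scale mismatch above concrete, a standalone check with hypothetical boolean verdicts in place of real `JudgmentResult` objects:

```python
# Hypothetical: three judges, all passing.
judgments = {"task_completion": True, "accuracy": True, "safety": True}

score = sum(judgments.values()) / len(judgments)     # 1.0, the [0, 1] scale used by `score`
print(score > len(judgments) / 2)                    # False: 1.0 is never greater than 1.5
print(sum(judgments.values()) > len(judgments) / 2)  # True: 3 passes out of 3 is a majority
```
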


@property
def none_failed(self) -> bool:
"""True if no judgments explicitly failed. Maybes are allowed."""
return not any(j.failed for j in self.judgments.values())

class JudgeGroup:
"""A group of judges that evaluate conversations together.

Automatically tags the session with judgment results when called within a job context.

Example:
```python
async def on_session_end(ctx: JobContext) -> None:
judges = JudgeGroup(
llm="openai/gpt-4o-mini",
judges=[
task_completion_judge(),
accuracy_judge(),
],
)

report = ctx.make_session_report()
result = await judges.evaluate(report.chat_history)
# Results are automatically tagged to the session
```
"""

def __init__(
self,
*,
llm: LLM | LLMModels | str,
judges: list[Evaluator] | None = None,
) -> None:
"""Initialize a JudgeGroup.

Args:
llm: The LLM to use for evaluation. Can be an LLM instance or a model
string like "openai/gpt-4o-mini" (uses LiveKit inference gateway).
judges: The judges to run during evaluation.
"""
if isinstance(llm, str):
from ..inference import LLM as InferenceLLM

self._llm: LLM = InferenceLLM(llm)
else:
self._llm = llm

self._judges = judges or []

@property
def llm(self) -> LLM:
"""The LLM used for evaluation."""
return self._llm

@property
def judges(self) -> list[Evaluator]:
"""The judges to run during evaluation."""
return self._judges

async def evaluate(
self,
chat_ctx: ChatContext,
*,
reference: ChatContext | None = None,
) -> EvaluationResult:
"""Evaluate a conversation with all judges.

Automatically tags the session with results when called within a job context.

Args:
chat_ctx: The conversation to evaluate.
reference: Optional reference conversation for comparison.

Returns:
EvaluationResult containing all judgment results.
"""
from ..job import get_job_context
from ..log import logger

# Run all judges concurrently
async def run_judge(judge: Evaluator) -> tuple[str, JudgmentResult | BaseException]:
try:
result = await judge.evaluate(
chat_ctx=chat_ctx,
reference=reference,
llm=self._llm,
)
return judge.name, result
except Exception as e:
logger.warning(f"Judge '{judge.name}' failed: {e}")
return judge.name, e

results = await asyncio.gather(*[run_judge(j) for j in self._judges])

# Filter out failed judges
judgments: dict[str, JudgmentResult] = {}
for name, result in results:
if isinstance(result, JudgmentResult):
judgments[name] = result

evaluation_result = EvaluationResult(judgments=judgments)

if _evals_verbose:
print("\n+ JudgeGroup evaluation results:")
for name, result in results:
if isinstance(result, JudgmentResult):
print(f" [{name}] verdict={result.verdict}")
print(f" reasoning: {result.reasoning}\n")
else:
print(f" [{name}] ERROR: {result}\n")

# Auto-tag if running within a job context
try:
ctx = get_job_context()
ctx.tagger._evaluation(evaluation_result)
except RuntimeError:
pass # Not in a job context, skip tagging

return evaluation_result
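
Finally, a small sketch of inspecting an `EvaluationResult` directly, for instance when evaluating outside a job context where the auto-tagging above is skipped; the `summarize` helper is illustrative:

```python
from livekit.agents.evals import EvaluationResult


def summarize(result: EvaluationResult) -> None:
    # Each JudgmentResult carries the judge's verdict and its reasoning.
    for name, judgment in result.judgments.items():
        print(f"[{name}] verdict={judgment.verdict}")
        print(f"  reasoning: {judgment.reasoning}")
    # Aggregate views defined above: score, all_passed, majority_passed, none_failed.
    print(f"score={result.score:.2f} all_passed={result.all_passed}")
```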