Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
62 commits
Select commit Hold shift + click to select a range
87068d5
add interface draft
chenghao-mou Feb 5, 2026
e0d5ec1
Merge branch 'main' into feat/AGT-2520-multimodal-EOU
chenghao-mou Mar 6, 2026
8eebccc
draft
chenghao-mou Mar 11, 2026
f92fbc0
fix type issues
chenghao-mou Mar 11, 2026
d1086ff
refactor stream to support turn detector protocol
chenghao-mou Mar 12, 2026
0a02bb1
minor fixes
chenghao-mou Mar 12, 2026
168d0d7
minor fixes
chenghao-mou Mar 12, 2026
277db6e
WIP: use only ws stream
chenghao-mou Mar 24, 2026
03c0e2e
Merge branch 'main' into feat/AGT-2520-multimodal-EOU
chenghao-mou Mar 24, 2026
56b4796
fix uv.lock bad merge
chenghao-mou Mar 24, 2026
be9a550
WIP: more refactoring
chenghao-mou Mar 25, 2026
601229c
fix mypy
chenghao-mou Mar 25, 2026
c4d92f8
remove temp url
chenghao-mou Mar 25, 2026
e963d85
disable turn detection when agent is still speaking
chenghao-mou Mar 25, 2026
c529d79
minor refactoring
chenghao-mou Mar 29, 2026
09baed8
fix type issues
chenghao-mou Mar 29, 2026
3830638
wip
chenghao-mou Apr 10, 2026
f214aa0
clean up encoder
chenghao-mou Apr 20, 2026
c922f44
wip
chenghao-mou Apr 20, 2026
f94a0dd
Merge branch 'main' into feat/AGT-2520-multimodal-EOU
chenghao-mou Apr 20, 2026
604bfdc
update protos
chenghao-mou Apr 21, 2026
f9ec64a
minor fixes
chenghao-mou Apr 21, 2026
ddbf594
address comments
chenghao-mou Apr 21, 2026
d465564
add text fallback
chenghao-mou Apr 22, 2026
6e7d6bf
add text fallback
chenghao-mou Apr 22, 2026
200d634
fix threshold
chenghao-mou Apr 22, 2026
dbd11b0
remove temp deps
chenghao-mou Apr 22, 2026
60004dd
support realtime model
chenghao-mou Apr 22, 2026
6de53f4
fix type issues
chenghao-mou Apr 22, 2026
4ed8a82
add id in logs
chenghao-mou Apr 23, 2026
0db57ea
use threaded audio encoder
chenghao-mou Apr 24, 2026
bbcfc3a
close encoder
chenghao-mou Apr 24, 2026
7e04332
update dep
chenghao-mou Apr 27, 2026
04db92f
address comments
chenghao-mou Apr 30, 2026
46fd3bf
add cloud agent worker token
chenghao-mou Apr 30, 2026
e4e8ef6
Merge branch 'main' into feat/AGT-2520-multimodal-EOU
chenghao-mou Apr 30, 2026
fc94068
fix type issues
chenghao-mou Apr 30, 2026
999edd5
add token in header instead
chenghao-mou Apr 30, 2026
cde90de
Merge branch 'main' into feat/AGT-2520-multimodal-EOU
chenghao-mou Apr 30, 2026
3603f04
wip
chenghao-mou May 13, 2026
6272402
Merge branch 'main' into feat/AGT-2520-multimodal-EOU
chenghao-mou May 13, 2026
3bc3ff3
refactor for the cloud model
chenghao-mou May 14, 2026
a08b624
add support for both v1 and v1-mini
chenghao-mou May 14, 2026
f435571
fix example
chenghao-mou May 15, 2026
8e75d60
address comments
chenghao-mou May 15, 2026
cf54cbe
Merge branch 'main' into feat/AGT-2520-multimodal-EOU
chenghao-mou May 15, 2026
4f10a69
address comments
chenghao-mou May 15, 2026
e96f1be
clean up session _on_error annotation
chenghao-mou May 15, 2026
97400d2
Merge branch 'main' into feat/AGT-2520-multimodal-EOU
chenghao-mou May 15, 2026
b1e9294
merge inference and local eot code
chenghao-mou May 15, 2026
49f0de0
update tests
chenghao-mou May 17, 2026
7fe2bfb
Merge branch 'main' into feat/AGT-2520-multimodal-EOU
chenghao-mou May 17, 2026
8b150aa
clean up
chenghao-mou May 17, 2026
28af3f5
minor refactor and clean up
chenghao-mou May 18, 2026
75ddae6
refactor
chenghao-mou May 19, 2026
76cec5d
Merge branch 'main' into feat/AGT-2520-multimodal-EOU
chenghao-mou May 19, 2026
2ccf54d
refactor
chenghao-mou May 19, 2026
7fbca08
clean up
chenghao-mou May 19, 2026
82c599a
refactor
chenghao-mou May 19, 2026
4b6fdb5
clean up
chenghao-mou May 19, 2026
7500160
more refactoring
chenghao-mou May 19, 2026
efe8d5c
fix makefile indentation
chenghao-mou May 19, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ To run the examples, you'll need:

- A [LiveKit Cloud](https://cloud.livekit.io) account or a local [LiveKit server](https://github.com/livekit/livekit)
- API keys for the model providers you want to use in a `.env` file
- Python 3.9 or higher
- Python 3.10 or higher
- [uv](https://docs.astral.sh/uv/)

### Environment file
Expand Down
4 changes: 2 additions & 2 deletions examples/voice_agents/basic_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from livekit.agents.beta import EndCallTool
from livekit.agents.llm import function_tool
from livekit.plugins import silero
from livekit.plugins.turn_detector.multilingual import MultilingualModel
from livekit.plugins.turn_detector.audio import AudioTurnDetector

# uncomment to enable Krisp background voice/noise cancellation
# from livekit.plugins import noise_cancellation
Expand Down Expand Up @@ -98,7 +98,7 @@ async def entrypoint(ctx: JobContext) -> None:
turn_handling=TurnHandlingOptions(
# VAD and turn detection are used to determine when the user is speaking and when the agent should respond
# See more at https://docs.livekit.io/agents/build/turns
turn_detection=MultilingualModel(),
turn_detection=AudioTurnDetector(),
interruption={
# sometimes background noise could interrupt the agent session, these are considered false positive interruptions
# when it's detected, you may resume the agent's speech
Expand Down
3 changes: 3 additions & 0 deletions livekit-agents/livekit/agents/inference/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
HEADER_USER_AGENT = "User-Agent"
HEADER_ROOM_ID = "X-LiveKit-Room-ID"
HEADER_JOB_ID = "X-LiveKit-Job-ID"
HEADER_WORKER_TOKEN = "X-LiveKit-Worker-Token"
HEADER_INFERENCE_PROVIDER = "X-LiveKit-Inference-Provider"
HEADER_INFERENCE_PRIORITY = "X-LiveKit-Inference-Priority"

Expand Down Expand Up @@ -55,6 +56,8 @@ def get_inference_headers() -> dict[str, str]:
headers[HEADER_ROOM_ID] = ctx.job.room.sid
if ctx.job.id:
headers[HEADER_JOB_ID] = ctx.job.id
if worker_token := os.getenv("LIVEKIT_WORKER_TOKEN"):
headers[HEADER_WORKER_TOKEN] = worker_token
except RuntimeError:
pass
return headers
Expand Down
4 changes: 4 additions & 0 deletions livekit-agents/livekit/agents/metrics/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from .base import (
AgentMetrics,
EOTInferenceMetrics,
EOUMetrics,
InterruptionMetrics,
LLMMetrics,
Expand All @@ -10,6 +11,7 @@
)
from .usage import (
AgentSessionUsage,
EOTModelUsage,
InterruptionModelUsage,
LLMModelUsage,
ModelUsage,
Expand All @@ -25,6 +27,7 @@
"AgentMetrics",
"VADMetrics",
"EOUMetrics",
"EOTInferenceMetrics",
"STTMetrics",
"TTSMetrics",
"RealtimeModelMetrics",
Expand All @@ -34,6 +37,7 @@
"TTSModelUsage",
"STTModelUsage",
"InterruptionModelUsage",
"EOTModelUsage",
"ModelUsage",
"AgentSessionUsage",
"ModelUsageCollector",
Expand Down
17 changes: 17 additions & 0 deletions livekit-agents/livekit/agents/metrics/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,22 @@ class EOUMetrics(_BaseMetrics):
metadata: Metadata | None = None


class EOTInferenceMetrics(_BaseMetrics):
"""Per-inference metrics emitted by the EOT model on each prediction."""

type: Literal["eot_inference_metrics"] = "eot_inference_metrics"
timestamp: float
total_duration: float
"""Earliest audio creation time in an inference to response receive time."""
detection_delay: float
"""Latest audio creation time in an inference to response receive time."""
prediction_duration: float
"""Server side model inference time."""
num_requests: int = 1
"""Number of inference requests made during one inference."""
metadata: Metadata | None = None


class RealtimeModelMetrics(_BaseMetrics):
class CachedTokenDetails(BaseModel):
audio_tokens: int = 0
Expand Down Expand Up @@ -199,6 +215,7 @@ class AvatarMetrics(_BaseMetrics):
| TTSMetrics
| VADMetrics
| EOUMetrics
| EOTInferenceMetrics
| RealtimeModelMetrics
| InterruptionMetrics
| AvatarMetrics
Expand Down
35 changes: 33 additions & 2 deletions livekit-agents/livekit/agents/metrics/usage.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from .base import (
AgentMetrics,
EOTInferenceMetrics,
InterruptionMetrics,
LLMMetrics,
RealtimeModelMetrics,
Expand Down Expand Up @@ -108,7 +109,19 @@ class InterruptionModelUsage(_BaseModelUsage):
"""Total number of requests sent to the interruption detection model."""


ModelUsage = LLMModelUsage | TTSModelUsage | STTModelUsage | InterruptionModelUsage
class EOTModelUsage(_BaseModelUsage):
"""Usage summary for end-of-turn detection models."""

type: Literal["eot_usage"] = "eot_usage"
provider: str
"""The provider name (e.g., 'livekit')."""
model: str
"""The model name (e.g., 'eot-audio')."""
total_requests: int = 0
"""Total number of inference requests sent to the EOT model."""


ModelUsage = LLMModelUsage | TTSModelUsage | STTModelUsage | InterruptionModelUsage | EOTModelUsage
"""Union type for all model usage types."""


Expand All @@ -125,13 +138,19 @@ def __init__(self) -> None:
self._tts_usage: dict[tuple[str, str], TTSModelUsage] = {}
self._stt_usage: dict[tuple[str, str], STTModelUsage] = {}
self._interruption_usage: dict[tuple[str, str], InterruptionModelUsage] = {}
self._eot_usage: dict[tuple[str, str], EOTModelUsage] = {}

def __call__(self, metrics: AgentMetrics) -> None:
self.collect(metrics)

def _extract_provider_model(
self,
metrics: LLMMetrics | STTMetrics | TTSMetrics | RealtimeModelMetrics | InterruptionMetrics,
metrics: LLMMetrics
| STTMetrics
| TTSMetrics
| RealtimeModelMetrics
| InterruptionMetrics
| EOTInferenceMetrics,
) -> tuple[str, str]:
"""Extract provider and model from metrics metadata."""
provider = ""
Expand Down Expand Up @@ -169,6 +188,13 @@ def _get_interruption_usage(self, provider: str, model: str) -> InterruptionMode
self._interruption_usage[key] = InterruptionModelUsage(provider=provider, model=model)
return self._interruption_usage[key]

def _get_eot_usage(self, provider: str, model: str) -> EOTModelUsage:
"""Get or create an EOTModelUsage for the given provider/model combination."""
key = (provider, model)
if key not in self._eot_usage:
self._eot_usage[key] = EOTModelUsage(provider=provider, model=model)
return self._eot_usage[key]

def collect(self, metrics: AgentMetrics) -> None:
if isinstance(metrics, LLMMetrics):
provider, model = self._extract_provider_model(metrics)
Expand Down Expand Up @@ -225,6 +251,10 @@ def collect(self, metrics: AgentMetrics) -> None:
provider, model = self._extract_provider_model(metrics)
interruption_usage = self._get_interruption_usage(provider, model)
interruption_usage.total_requests += metrics.num_requests
elif isinstance(metrics, EOTInferenceMetrics):
provider, model = self._extract_provider_model(metrics)
eot_usage = self._get_eot_usage(provider, model)
eot_usage.total_requests += metrics.num_requests

def flatten(self) -> list[ModelUsage]:
"""Returns a list of usage summaries, one per model/provider combination."""
Expand All @@ -233,4 +263,5 @@ def flatten(self) -> list[ModelUsage]:
result.extend(u.model_copy(deep=True) for u in self._tts_usage.values())
result.extend(u.model_copy(deep=True) for u in self._stt_usage.values())
result.extend(u.model_copy(deep=True) for u in self._interruption_usage.values())
result.extend(u.model_copy(deep=True) for u in self._eot_usage.values())
return result
2 changes: 2 additions & 0 deletions livekit-agents/livekit/agents/telemetry/trace_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@
ATTR_TRANSCRIPT_CONFIDENCE = "lk.transcript_confidence"
ATTR_TRANSCRIPTION_DELAY = "lk.transcription_delay"
ATTR_END_OF_TURN_DELAY = "lk.end_of_turn_delay"
ATTR_EOU_SOURCE = "lk.eou.source"
ATTR_EOU_DETECTION_DELAY = "lk.eou.detection_delay"

# metrics
ATTR_LLM_METRICS = "lk.llm_metrics"
Expand Down
2 changes: 2 additions & 0 deletions livekit-agents/livekit/agents/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from .audio import AudioArrayBuffer, AudioBuffer, combine_frames, merge_frames
from .bounded_dict import BoundedDict
from .connection_pool import ConnectionPool
from .env import resolve_env_var
from .exp_filter import ExpFilter
from .log import log_exceptions
from .misc import is_dev_mode, is_given, is_hosted, nodename, shortuuid, time_ms
Expand Down Expand Up @@ -39,6 +40,7 @@
"wait_for_agent",
"wait_for_participant",
"wait_for_track_publication",
"resolve_env_var",
]

# Cleanup docs of unexported modules
Expand Down
33 changes: 33 additions & 0 deletions livekit-agents/livekit/agents/utils/env.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import os

from ..types import NotGivenOr
from .misc import is_given


def resolve_env_var(val: NotGivenOr[str], *env_vars: str, default: str = "") -> str:
"""
Resolve an environment variable from a list of potential sources.

Args:
val: The value to resolve.
*env_vars: The environment variables to check. Order matters, the first non-None value will be returned.
default: The default value to return if no environment variables are set.

Returns:
The resolved environment variable.

Examples:
>>> resolve_env_var(
... NOT_GIVEN,
... "ABC_URL",
... default="https://agent-gateway.livekit.cloud/v1",
... )
"https://agent-gateway.livekit.cloud/v1"
"""
if is_given(val):
return val
for env_var in env_vars:
curr_val = os.getenv(env_var, None)
if curr_val is not None and curr_val != "":
return curr_val
return default
13 changes: 12 additions & 1 deletion livekit-agents/livekit/agents/voice/agent_activity.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@
update_instructions,
)
from .speech_handle import DEFAULT_INPUT_DETAILS, InputDetails, SpeechHandle
from .turn import EndpointingOptions, TurnDetectionMode
from .turn import EndpointingOptions, TurnDetectionMode, _AudioTurnDetector

if TYPE_CHECKING:
from ..llm import mcp
Expand Down Expand Up @@ -232,6 +232,11 @@ def _validate_turn_detection(
self, turn_detection: TurnDetectionMode | None
) -> TurnDetectionMode | None:
if turn_detection is not None and not isinstance(turn_detection, str):
if isinstance(turn_detection, _AudioTurnDetector) and self.vad is None:
raise ValueError(
"AudioTurnDetector requires a VAD model; pass vad=silero.VAD.load() "
"(or another VAD) to AgentSession/Agent."
)
# return directly if turn_detection is _TurnDetector
return turn_detection

Expand Down Expand Up @@ -693,6 +698,9 @@ async def _start_session(self, *, reuse_resources: _ReusableResources | None = N
self._interruption_detector.on("error", self._on_error)
self._interruption_detector.on("overlapping_speech", self._on_overlap_speech_ended)

if isinstance(self._turn_detection, _AudioTurnDetector):
self._turn_detection.on("metrics_collected", self._on_metrics_collected)

if self.mcp_servers:
from ..llm.mcp import MCPToolset

Expand Down Expand Up @@ -963,6 +971,9 @@ async def _close_session(self) -> None:
self._interruption_detector.off("error", self._on_error)
self._interruption_detector.off("overlapping_speech", self._on_overlap_speech_ended)

if isinstance(self._turn_detection, _AudioTurnDetector):
self._turn_detection.off("metrics_collected", self._on_metrics_collected)

if self._rt_session is not None:
await self._rt_session.aclose()

Expand Down
Loading
Loading