diff --git a/.changeset/agent-activity-tool-choice-als-leak.md b/.changeset/agent-activity-tool-choice-als-leak.md new file mode 100644 index 000000000..947736c0a --- /dev/null +++ b/.changeset/agent-activity-tool-choice-als-leak.md @@ -0,0 +1,5 @@ +--- +'@livekit/agents': patch +--- + +Fix `AgentActivity.generateReply` defaulting `toolChoice` to `'none'` on a child `AgentSession` spawned inside a tool. The previous check relied on `AsyncLocalStorage`, which leaks the parent function-call context into the child session and caused the framework to drop legitimate tool calls emitted by the child agent (e.g. the supervisor's `connect_to_caller` invocation in `WarmTransferTask`). The check now uses per-task info, matching the Python implementation. diff --git a/.changeset/warm-transfer-task.md b/.changeset/warm-transfer-task.md new file mode 100644 index 000000000..a21681ec4 --- /dev/null +++ b/.changeset/warm-transfer-task.md @@ -0,0 +1,5 @@ +--- +'@livekit/agents': minor +--- + +Add beta WarmTransferTask workflow for SIP-based human handoffs. 
diff --git a/agents/resources/hold_music.ogg b/agents/resources/hold_music.ogg
new file mode 100644
index 000000000..c9e4d41b6
Binary files /dev/null and b/agents/resources/hold_music.ogg differ
diff --git a/agents/src/beta/index.ts b/agents/src/beta/index.ts
index 4fb384d19..98ac382e9 100644
--- a/agents/src/beta/index.ts
+++ b/agents/src/beta/index.ts
@@ -6,6 +6,9 @@ export {
   type TaskCompletedEvent,
   type TaskGroupOptions,
   type TaskGroupResult,
+  WarmTransferTask,
   type InstructionParts,
+  type WarmTransferResult,
+  type WarmTransferTaskOptions,
 } from './workflows/index.js';
 export { Instructions } from '../llm/index.js';
diff --git a/agents/src/beta/workflows/index.ts b/agents/src/beta/workflows/index.ts
index 1ca201576..0b0cfcb52 100644
--- a/agents/src/beta/workflows/index.ts
+++ b/agents/src/beta/workflows/index.ts
@@ -7,4 +7,9 @@ export {
   type TaskGroupOptions,
   type TaskGroupResult,
 } from './task_group.js';
+export {
+  WarmTransferTask,
+  type WarmTransferResult,
+  type WarmTransferTaskOptions,
+} from './warm_transfer.js';
 export type { InstructionParts } from './utils.js';
diff --git a/agents/src/beta/workflows/warm_transfer.ts b/agents/src/beta/workflows/warm_transfer.ts
new file mode 100644
index 000000000..bae6e7c9b
--- /dev/null
+++ b/agents/src/beta/workflows/warm_transfer.ts
@@ -0,0 +1,631 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import type { SIPOutboundConfig } from '@livekit/protocol';
+import { type DisconnectReason, type ParticipantKind, Room, RoomEvent } from '@livekit/rtc-node';
+import { AccessToken, RoomServiceClient, SipClient, type VideoGrant } from 'livekit-server-sdk';
+import { z } from 'zod';
+import type { LLMModels, STTModelString, TTSModelString } from '../../inference/index.js';
+import { getJobContext } from '../../job.js';
+import type { ChatContext, LLM, RealtimeModel, ToolContext } from '../../llm/index.js';
+import { ToolError, ToolFlag, tool } from '../../llm/index.js';
+import { log } from '../../log.js';
+import type { STT } from '../../stt/index.js';
+import type { TTS } from '../../tts/index.js';
+import type { VAD } from '../../vad.js';
+import { Agent, AgentTask } from '../../voice/agent.js';
+import { AgentSession, type TurnDetectionMode } from '../../voice/agent_session.js';
+import {
+  type AudioConfig,
+  type AudioSourceType,
+  BackgroundAudioPlayer,
+  BuiltinAudioClip,
+  type PlayHandle,
+} from '../../voice/background_audio.js';
+import { DEFAULT_PARTICIPANT_KINDS } from '../../voice/room_io/index.js';
+
+/** Successful outcome of a warm transfer. */
+export interface WarmTransferResult {
+  /** Participant identity of the human agent that was moved into the caller room. */
+  humanAgentIdentity: string;
+}
+
+/** Pieces substituted into the built-in instructions template. */
+export interface InstructionParts {
+  /** Replaces the `{persona}` slot; defaults to the built-in persona. */
+  persona?: string;
+  /** Replaces the `{extra}` slot; defaults to an empty string. */
+  extra?: string;
+}
+
+/** Options accepted by {@link WarmTransferTask}. */
+export interface WarmTransferTaskOptions {
+  /** The phone number or SIP URI to dial for the human agent. */
+  sipCallTo?: string;
+  /**
+   * ID of a pre-configured LiveKit SIP outbound trunk used to originate the call.
+   * Falls back to the `LIVEKIT_SIP_OUTBOUND_TRUNK` environment variable when not provided.
+   */
+  sipTrunkId?: string | null;
+  /** Low-level SIP connection config for originating calls through a custom SIP domain. */
+  sipConnection?: SIPOutboundConfig;
+  /** Optional SIP From number. Falls back to `LIVEKIT_SIP_NUMBER`. */
+  sipNumber?: string;
+  /** Headers to include on the outbound SIP call. */
+  sipHeaders?: Record<string, string>;
+  /** How long to wait, in milliseconds, for the human agent to answer before giving up. */
+  ringingTimeout?: number | null;
+  /** Audio played to the caller while they are on hold during the transfer. */
+  holdAudio?: AudioSourceType | AudioConfig | AudioConfig[] | null;
+  /** Full instructions string, or parts substituted into the built-in template. */
+  instructions?: InstructionParts | string;
+  /** Caller conversation whose user/assistant text is embedded in the built-in template. */
+  chatCtx?: ChatContext;
+  turnDetection?: TurnDetectionMode | null;
+  tools?: ToolContext;
+  stt?: STT | STTModelString | null;
+  vad?: VAD | null;
+  llm?: LLM | RealtimeModel | LLMModels | null;
+  tts?: TTS | TTSModelString | null;
+  allowInterruptions?: boolean;
+  /** @deprecated use `sipCallTo` instead. */
+  targetPhoneNumber?: string;
+  /** @deprecated use `instructions.extra` instead. */
+  extraInstructions?: string;
+}
+
+/**
+ * Task that puts the caller on hold, dials a human agent over SIP into a
+ * separate room, briefs them through a child `AgentSession`, and moves them
+ * into the caller room once they agree to connect.
+ *
+ * Completes with a {@link WarmTransferResult} on success, or with a
+ * `ToolError` when dialing fails, the human agent declines, voicemail is
+ * detected, or the human agent room closes.
+ */
+export class WarmTransferTask extends AgentTask<WarmTransferResult> {
+  private _callerRoom: Room | null = null;
+  private _humanAgentRoom: Room | null = null;
+  private _humanAgentSession: AgentSession | null = null;
+  // Initialized in the constructor body to avoid ES2022 class-field [[Define]]
+  // semantics wiping the resolver assigned inside the Promise executor.
+  private _humanAgentFailed!: Promise<void>;
+  private _resolveHumanAgentFailed!: () => void;
+  private _humanAgentIdentity = 'human-agent-sip';
+
+  private _sipCallTo: string;
+  private _sipTrunkId: string | null;
+  private _sipConnection?: SIPOutboundConfig;
+  private _sipNumber: string;
+  private _sipHeaders: Record<string, string>;
+  private _ringingTimeout: number | null;
+
+  private _backgroundAudio = new BackgroundAudioPlayer();
+  private _holdAudioHandle: PlayHandle | null = null;
+  private _holdAudio: AudioSourceType | AudioConfig | AudioConfig[] | null;
+
+  private _originalIoState: Record<string, boolean> = {};
+  private _taskTurnDetection: TurnDetectionMode | undefined;
+  private _allowInterruptions: boolean | undefined;
+  private _logger = log();
+
+  /**
+   * @param options - Transfer configuration; `sipCallTo` (or the deprecated
+   *   `targetPhoneNumber`) is required.
+   * @throws Error when no dial target or no SIP trunk/connection is configured.
+   */
+  constructor(options: WarmTransferTaskOptions = {}) {
+    let sipCallTo = options.sipCallTo;
+    let instructions = options.instructions;
+    const {
+      sipTrunkId,
+      sipConnection,
+      sipNumber,
+      sipHeaders,
+      ringingTimeout,
+      holdAudio,
+      chatCtx,
+      turnDetection,
+      tools,
+      stt,
+      vad,
+      llm,
+      tts,
+      allowInterruptions,
+      targetPhoneNumber,
+      extraInstructions = '',
+    } = options;
+
+    if (targetPhoneNumber) {
+      log().warn('`targetPhoneNumber` is deprecated, use `sipCallTo` instead');
+      sipCallTo ??= targetPhoneNumber;
+    }
+
+    if (!sipCallTo) {
+      throw new Error('`sipCallTo` must be set');
+    }
+
+    if (!instructions) {
+      instructions = { persona: PERSONA, extra: extraInstructions };
+    } else if (extraInstructions) {
+      log().warn('`extraInstructions` will be ignored when `instructions` is provided');
+    }
+
+    if (typeof instructions !== 'string') {
+      // Substitute all placeholders in a single pass with function
+      // replacements. This avoids two pitfalls of chained `.replace(str, str)`:
+      // (1) special dollar-sign patterns (`$&`, `$\``, `$'`, `$N`) in the
+      // replacement string being interpreted by `replace`, which could corrupt
+      // the prompt if the conversation history contains them; and
+      // (2) earlier substitutions accidentally introducing later placeholder
+      // text (e.g. a user message containing `{extra}` consuming the real
+      // `{extra}` slot).
+      const replacements: Record<string, string> = {
+        persona: instructions.persona ?? PERSONA,
+        _conversation_history: WarmTransferTask.formatConversationHistory(chatCtx),
+        extra: instructions.extra ?? '',
+      };
+      instructions = INSTRUCTIONS_TEMPLATE.replace(
+        /\{(persona|_conversation_history|extra)\}/g,
+        (_match, key: string) => replacements[key] ?? '',
+      );
+    }
+
+    super({
+      instructions,
+      turnDetection: turnDetection ?? undefined,
+      tools,
+      stt: stt ?? undefined,
+      vad: vad ?? undefined,
+      llm: llm ?? undefined,
+      tts: tts ?? undefined,
+      allowInterruptions,
+    });
+
+    this._humanAgentFailed = new Promise<void>((resolve) => {
+      this._resolveHumanAgentFailed = resolve;
+    });
+
+    this._tools = {
+      ...this._tools,
+      connect_to_caller: this.buildConnectToCallerTool(),
+      decline_transfer: this.buildDeclineTransferTool(),
+      voicemail_detected: this.buildVoicemailDetectedTool(),
+    };
+    this._chatCtx = this._chatCtx.copy({ toolCtx: this._tools });
+
+    this._taskTurnDetection = turnDetection ?? undefined;
+    this._allowInterruptions = allowInterruptions;
+
+    this._sipCallTo = sipCallTo;
+    this._sipConnection = sipConnection;
+    if (sipTrunkId !== undefined) {
+      this._sipTrunkId = sipTrunkId;
+    } else if (this._sipConnection) {
+      this._sipTrunkId = null;
+    } else {
+      this._sipTrunkId = process.env.LIVEKIT_SIP_OUTBOUND_TRUNK ?? null;
+    }
+    if (this._sipTrunkId === null && !this._sipConnection) {
+      throw new Error(
+        '`LIVEKIT_SIP_OUTBOUND_TRUNK` environment variable, `sipTrunkId`, or `sipConnection` must be set',
+      );
+    }
+
+    this._sipNumber = sipNumber ?? process.env.LIVEKIT_SIP_NUMBER ?? '';
+    this._sipHeaders = sipHeaders ?? {};
+    this._ringingTimeout = ringingTimeout ?? null;
+    this._holdAudio =
+      holdAudio === undefined ? { source: BuiltinAudioClip.HOLD_MUSIC, volume: 0.8 } : holdAudio;
+  }
+
+  /** Renders user/assistant text messages as "Caller: …" / "Assistant: …" lines. */
+  private static formatConversationHistory(chatCtx?: ChatContext): string {
+    if (!chatCtx) {
+      return '';
+    }
+
+    let previousConversation = '';
+    for (const item of chatCtx.items) {
+      if (item.type !== 'message' || (item.role !== 'user' && item.role !== 'assistant')) {
+        continue;
+      }
+
+      const content = item.textContent;
+      if (!content) {
+        continue;
+      }
+
+      const role = item.role === 'user' ? 'Caller' : 'Assistant';
+      previousConversation += `${role}: ${content}\n`;
+    }
+    return previousConversation;
+  }
+
+  /**
+   * Starts hold audio (unless disabled), disables the caller session's I/O, and
+   * dials the human agent. A dial failure completes the task with a `ToolError`.
+   */
+  async onEnter(): Promise<void> {
+    const jobCtx = getJobContext();
+    this._callerRoom = jobCtx.room;
+
+    if (this._holdAudio !== null) {
+      await this._backgroundAudio.start({ room: this._callerRoom });
+      this._holdAudioHandle = this._backgroundAudio.play(this._holdAudio, true);
+    }
+
+    this.setIoEnabled(false);
+
+    const dialAbortController = new AbortController();
+    const dialHumanAgent = this.dialHumanAgent(dialAbortController.signal);
+    try {
+      const result = await Promise.race([
+        dialHumanAgent.then((session) => ({ session })),
+        this._humanAgentFailed.then(() => ({ session: null })),
+      ]);
+
+      if (!result.session) {
+        throw new Error('human agent room closed');
+      }
+
+      this._humanAgentSession = result.session;
+    } catch (error) {
+      this._logger.error({ error }, 'could not dial human agent');
+      this.setResult(new ToolError('could not dial human agent'));
+    } finally {
+      dialAbortController.abort();
+      const session = await dialHumanAgent.catch(() => null);
+      if (session && this._humanAgentSession !== session) {
+        await this.cleanupHumanAgentDial(session, this._humanAgentRoom);
+        if (this._humanAgentRoom) {
+          this._humanAgentRoom = null;
+        }
+      }
+    }
+  }
+
+  /** Tool that merges the two calls and completes the task successfully. */
+  private buildConnectToCallerTool() {
+    return tool({
+      description: 'Called when the human agent wants to connect to the caller.',
+      flags: ToolFlag.IGNORE_ON_ENTER,
+      execute: async () => {
+        this._logger.debug('connecting to caller');
+        if (!this._callerRoom) {
+          throw new Error('caller room is not available');
+        }
+
+        await this.mergeCalls();
+        this.setResult({ humanAgentIdentity: this._humanAgentIdentity });
+        this._callerRoom.on(
+          RoomEvent.ParticipantDisconnected,
+          this.onCallerParticipantDisconnected,
+        );
+      },
+    });
+  }
+
+  /** Tool that completes the task with the human agent's decline reason. */
+  private buildDeclineTransferTool() {
+    return tool({
+      description:
+        'Handles the case when the human agent explicitly declines to connect to the caller.',
+      parameters: z.object({
+        reason: z
+          .string()
+          .describe('A short explanation of why the human agent declined to connect to the caller'),
+      }),
+      flags: ToolFlag.IGNORE_ON_ENTER,
+      execute: async ({ reason }: { reason: string }) => {
+        this.setResult(new ToolError(`human agent declined to connect: ${reason}`));
+      },
+    });
+  }
+
+  /** Tool that completes the task with a "voicemail detected" error. */
+  private buildVoicemailDetectedTool() {
+    return tool({
+      description:
+        'Called when the call reaches voicemail. Use this tool AFTER you hear the voicemail greeting',
+      flags: ToolFlag.IGNORE_ON_ENTER,
+      execute: async () => {
+        this.setResult(new ToolError('voicemail detected'));
+      },
+    });
+  }
+
+  /** Fails the pending dial and the task when the human agent room disconnects. */
+  private onHumanAgentRoomClose = (reason: DisconnectReason): void => {
+    this._logger.debug({ reason }, "human agent's room closed");
+    this._resolveHumanAgentFailed();
+    this.setResult(new ToolError(`room closed: ${reason}`));
+  };
+
+  /** After the merge, deletes the caller room when a participant of a default kind leaves. */
+  private onCallerParticipantDisconnected = (participant: {
+    identity: string;
+    kind: ParticipantKind;
+  }): void => {
+    if (!DEFAULT_PARTICIPANT_KINDS.includes(participant.kind)) {
+      return;
+    }
+
+    this._logger.info(
+      { participantIdentity: participant.identity },
+      'participant disconnected from caller room, closing',
+    );
+
+    if (!this._callerRoom?.name) {
+      return;
+    }
+
+    this._callerRoom.off(RoomEvent.ParticipantDisconnected, this.onCallerParticipantDisconnected);
+
+    // NOTE(review): confirm getJobContext() resolves inside room event callbacks.
+    const rooms = new RoomServiceClient(getJobContext().info.url);
+    void rooms.deleteRoom(this._callerRoom.name).catch((error) => {
+      this._logger.warn({ error }, 'failed to delete caller room');
+    });
+  };
+
+  /**
+   * Completes the task at most once: shuts down the human agent session,
+   * disconnects its room, stops hold audio, and restores the session I/O.
+   */
+  private setResult(result: WarmTransferResult | Error): void {
+    if (this.done) {
+      return;
+    }
+
+    if (this._humanAgentSession) {
+      this._humanAgentSession.shutdown();
+      this._humanAgentSession = null;
+    }
+
+    // AgentSession.shutdown() closes RoomIO but does not disconnect the
+    // underlying Room, so the supervisor room's WebSocket would leak across
+    // every transfer. Disconnect explicitly here. The room is moved out of
+    // (mergeCalls) or torn down (failure) by the time we reach this point.
+    if (this._humanAgentRoom) {
+      const humanAgentRoom = this._humanAgentRoom;
+      this._humanAgentRoom = null;
+      void humanAgentRoom.disconnect().catch((error) => {
+        this._logger.warn({ error }, 'failed to disconnect human agent room');
+      });
+    }
+
+    if (this._holdAudioHandle) {
+      this._holdAudioHandle.stop();
+      this._holdAudioHandle = null;
+    }
+    void this._backgroundAudio.close().catch((error) => {
+      this._logger.warn({ error }, 'failed to close background audio');
+    });
+
+    this.setIoEnabled(true);
+    this.complete(result);
+  }
+
+  /**
+   * Joins the human agent room, starts the briefing session there, and places
+   * the SIP call. On failure or abort, everything created here is torn down.
+   *
+   * @param signal - Aborting it rejects the dial with "dial cancelled".
+   * @returns The session running in the human agent room.
+   */
+  private async dialHumanAgent(signal: AbortSignal): Promise<AgentSession> {
+    if (!this._callerRoom?.name) {
+      throw new Error('caller room is not available');
+    }
+    const localIdentity = this._callerRoom.localParticipant?.identity;
+    if (!localIdentity) {
+      throw new Error('caller room local participant is not available');
+    }
+
+    const jobCtx = getJobContext();
+    const humanAgentRoomName = `${this._callerRoom.name}-human-agent`;
+    const room = new Room();
+    let humanAgentSession: AgentSession | null = null;
+    let completed = false;
+
+    try {
+      const token = new AccessToken(undefined, undefined, { identity: localIdentity });
+      token.kind = 'agent';
+      token.addGrant({
+        roomJoin: true,
+        room: humanAgentRoomName,
+        canUpdateOwnMetadata: true,
+        canPublish: true,
+        canSubscribe: true,
+      } as VideoGrant);
+
+      this._logger.debug(
+        { wsUrl: jobCtx.info.url, humanAgentRoomName },
+        'connecting to human agent room',
+      );
+      const jwt = await token.toJwt();
+      await this.abortable(() => room.connect(jobCtx.info.url, jwt), signal);
+      room.on(RoomEvent.Disconnected, this.onHumanAgentRoomClose);
+
+      humanAgentSession = new AgentSession({
+        vad: this.session.vad,
+        llm: this.session.llm,
+        stt: this.session.stt,
+        tts: this.session.tts,
+        turnDetection: this.session.turnDetection,
+      });
+      const session = humanAgentSession;
+      const humanAgent = new Agent({
+        instructions: this.instructions,
+        stt: this.stt,
+        vad: this.vad,
+        llm: this.llm,
+        tts: this.tts,
+        tools: this.toolCtx,
+        chatCtx: this._chatCtx.copy(),
+        turnDetection: this._taskTurnDetection,
+        allowInterruptions: this._allowInterruptions,
+      });
+
+      await this.abortable(
+        () =>
+          session.start({
+            agent: humanAgent,
+            room,
+            inputOptions: {
+              closeOnDisconnect: true,
+              participantIdentity: this._humanAgentIdentity,
+            },
+            record: false,
+          }),
+        signal,
+      );
+
+      const sip = new SipClient(jobCtx.info.url);
+      await this.abortable(
+        () =>
+          sip.createSipParticipant(
+            this._sipTrunkId ?? '',
+            this._sipCallTo,
+            humanAgentRoomName,
+            {
+              participantIdentity: this._humanAgentIdentity,
+              waitUntilAnswered: true,
+              fromNumber: this._sipNumber || undefined,
+              headers: this._sipHeaders,
+              ringingTimeout:
+                this._ringingTimeout !== null ? this._ringingTimeout / 1000 : undefined,
+            },
+            this._sipConnection,
+          ),
+        signal,
+      );
+
+      this._humanAgentRoom = room;
+      completed = true;
+      return session;
+    } finally {
+      if (!completed) {
+        room.off(RoomEvent.Disconnected, this.onHumanAgentRoomClose);
+        await this.cleanupHumanAgentDial(humanAgentSession, room);
+      }
+    }
+  }
+
+  /** Best-effort teardown of a dial attempt; failures are logged, not thrown. */
+  private async cleanupHumanAgentDial(
+    humanAgentSession: AgentSession | null,
+    room: Room | null,
+  ): Promise<void> {
+    await room?.disconnect().catch((error) => {
+      this._logger.warn({ error }, 'failed to disconnect human agent room');
+    });
+    await humanAgentSession?.close().catch((error) => {
+      this._logger.warn({ error }, 'failed to close human agent session');
+    });
+  }
+
+  /**
+   * Runs `fn`, rejecting early with "dial cancelled" if `signal` aborts first.
+   * The work started by `fn` is not itself cancelled.
+   */
+  private async abortable<T>(fn: () => Promise<T>, signal: AbortSignal): Promise<T> {
+    if (signal.aborted) {
+      throw new Error('dial cancelled');
+    }
+
+    // The underlying room/SIP SDK calls are not AbortSignal-aware. The race only
+    // unblocks this task; cleanup disconnects the room so wait-until-answered SIP
+    // requests resolve promptly against a closed room.
+    let onAbort!: () => void;
+    const abortPromise = new Promise<never>((_, reject) => {
+      onAbort = () => reject(new Error('dial cancelled'));
+      signal.addEventListener('abort', onAbort, { once: true });
+    });
+
+    try {
+      return await Promise.race([fn(), abortPromise]);
+    } finally {
+      signal.removeEventListener('abort', onAbort);
+    }
+  }
+
+  /** Moves the human agent participant from its own room into the caller room. */
+  private async mergeCalls(): Promise<void> {
+    if (!this._callerRoom?.name || !this._humanAgentRoom?.name) {
+      throw new Error('calls are not ready to merge');
+    }
+
+    this._humanAgentRoom.off(RoomEvent.Disconnected, this.onHumanAgentRoomClose);
+
+    this._logger.debug(
+      { humanAgentIdentity: this._humanAgentIdentity, callerRoom: this._callerRoom.name },
+      'moving human agent to caller room',
+    );
+
+    const rooms = new RoomServiceClient(getJobContext().info.url);
+    await rooms.moveParticipant(
+      this._humanAgentRoom.name,
+      this._humanAgentIdentity,
+      this._callerRoom.name,
+    );
+  }
+
+  /**
+   * Toggles the session's audio/transcription I/O. The first call snapshots
+   * the current state so re-enabling never turns on something that was off.
+   */
+  private setIoEnabled(enabled: boolean): void {
+    const input = this.session.input;
+    const output = this.session.output;
+
+    if (Object.keys(this._originalIoState).length === 0) {
+      this._originalIoState = {
+        audioInput: input.audioEnabled,
+        audioOutput: output.audioEnabled,
+        transcriptionOutput: output.transcriptionEnabled,
+      };
+    }
+
+    if (input.audio) {
+      input.setAudioEnabled(enabled && this._originalIoState.audioInput!);
+    }
+    if (output.audio) {
+      output.setAudioEnabled(enabled && this._originalIoState.audioOutput!);
+    }
+    if (output.transcription) {
+      output.setTranscriptionEnabled(enabled && this._originalIoState.transcriptionOutput!);
+    }
+  }
+}
+
+/** Default `{persona}` section of the prompt used when talking to the human agent. */
+const PERSONA = `# Identity
+
+You are an agent that is reaching out to a human agent for help. There has been a previous conversation
+between you and a caller, the conversation history is included below.
+
+# Goal
+
+Your main goal is to give the human agent sufficient context about why the caller had called in,
+so that the human agent could gain sufficient knowledge to help the caller directly.`;
+
+/** Prompt template; `{persona}`, `{_conversation_history}` and `{extra}` are substituted. */
+const INSTRUCTIONS_TEMPLATE = `{persona}
+
+# Context
+
+In the conversation, user refers to the human agent, caller refers to the person who's transcript is included.
+Remember, you are not speaking to the caller right now, you are speaking to the human agent.
+
+## Conversation history with caller
+{_conversation_history}
+## End of conversation history with caller
+
+Once the human agent has confirmed, you should call the tool \`connect_to_caller\` to connect them to the caller.
+
+You are talking to the human agent now, start by giving them a summary of the conversation so far, and answer any questions they might have.
+
+{extra}
+`;
diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts
index f2afae247..5b1a27123 100644
--- a/agents/src/voice/agent_activity.ts
+++ b/agents/src/voice/agent_activity.ts
@@ -72,7 +72,6 @@ import {
   StopResponse,
   _getActivityTaskInfo,
   _setActivityTaskInfo,
-  functionCallStorage,
   speechHandleStorage,
 } from './agent.js';
 import { type AgentSession, type TurnDetectionMode } from './agent_session.js';
@@ -1667,10 +1666,17 @@ export class AgentActivity implements RecognitionHooks {
       throw new Error('trying to generate reply without an LLM model');
     }
 
-    const functionCall = functionCallStorage.getStore()?.functionCall;
-    if (toolChoice === undefined && functionCall !== undefined) {
-      // when generateReply is called inside a tool, set toolChoice to 'none' by default
-      toolChoice = 'none';
+    if (toolChoice === undefined) {
+      // when generateReply is called inside a tool on THIS activity, set toolChoice
+      // to 'none' by default. We look up the current task's info (which is per-activity)
+      // rather than `functionCallStorage` (which is AsyncLocalStorage and leaks across
+      // child AgentSessions spawned inside a tool, e.g. WarmTransferTask's supervisor
+      // session) — matches Python's _get_activity_task_info(task).function_call check.
+      const currentTask = Task.current();
+      const taskInfo = currentTask ? _getActivityTaskInfo(currentTask) : undefined;
+      if (taskInfo?.functionCall) {
+        toolChoice = 'none';
+      }
     }
 
     const handle = SpeechHandle.create({
diff --git a/agents/src/voice/background_audio.ts b/agents/src/voice/background_audio.ts
index 391f9d938..ea723e371 100644
--- a/agents/src/voice/background_audio.ts
+++ b/agents/src/voice/background_audio.ts
@@ -21,6 +21,7 @@ import { AgentSessionEventTypes, type AgentStateChangedEvent } from './events.js
 const TASK_TIMEOUT_MS = 500;
 
 export enum BuiltinAudioClip {
+  HOLD_MUSIC = 'hold_music.ogg',
   OFFICE_AMBIENCE = 'office-ambience.ogg',
   KEYBOARD_TYPING = 'keyboard-typing.ogg',
   KEYBOARD_TYPING2 = 'keyboard-typing2.ogg',
diff --git a/agents/src/voice/room_io/room_io.ts b/agents/src/voice/room_io/room_io.ts
index fd6541344..ed5315bf7 100644
--- a/agents/src/voice/room_io/room_io.ts
+++ b/agents/src/voice/room_io/room_io.ts
@@ -51,7 +51,7 @@ export const DEFAULT_TEXT_INPUT_CALLBACK: TextInputCallback = (sess, ev) => {
   sess.generateReply({ userInput: ev.text });
 };
 
-const DEFAULT_PARTICIPANT_KINDS: ParticipantKind[] = [
+export const DEFAULT_PARTICIPANT_KINDS: ParticipantKind[] = [
   ParticipantKind.CONNECTOR,
   ParticipantKind.SIP,
   ParticipantKind.STANDARD,
diff --git a/examples/src/warm_transfer.ts b/examples/src/warm_transfer.ts
new file mode 100644
index 000000000..8ad5f9f29
--- /dev/null
+++ b/examples/src/warm_transfer.ts
@@ -0,0 +1,162 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import {
+  type JobContext,
+  type JobProcess,
+  ServerOptions,
+  beta,
+  cli,
+  defineAgent,
+  inference,
+  llm,
+  log,
+  voice,
+} from '@livekit/agents';
+import * as livekit from '@livekit/agents-plugin-livekit';
+import * as silero from '@livekit/agents-plugin-silero';
+import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node';
+import { fileURLToPath } from 'node:url';
+
+const SIP_TRUNK_ID = process.env.LIVEKIT_SIP_OUTBOUND_TRUNK;
+const SUPERVISOR_PHONE_NUMBER = process.env.LIVEKIT_SUPERVISOR_PHONE_NUMBER;
+const SIP_NUMBER = process.env.LIVEKIT_SIP_NUMBER;
+
+class SupportAgent extends voice.Agent {
+  constructor() {
+    super({
+      instructions: INSTRUCTIONS,
+      tools: {
+        transfer_to_human: llm.tool({
+          description: `Called when the user asks to speak to a human agent. This will put the user on hold while the supervisor is connected.
+
+Ensure that the user has confirmed that they wanted to be transferred. Do not start transfer until the user has confirmed.
+Examples on when the tool should be called:
+----
+- User: Can I speak to your supervisor?
+- Assistant: Yes of course.
+----
+- Assistant: I'm unable to help with that, would you like to speak to a human agent?
+- User: Yes please.
+----`,
+          execute: async (_, { ctx }) => {
+            const logger = log().child({ example: 'warm-transfer' });
+            logger.info('tool called to transfer to human');
+            const holdSpeech = ctx.session.say(
+              'Please hold while I connect you to a human agent.',
+              { allowInterruptions: false },
+            );
+            await holdSpeech.waitForPlayout();
+
+            try {
+              if (!SIP_TRUNK_ID || !SUPERVISOR_PHONE_NUMBER) {
+                throw new Error(
+                  'LIVEKIT_SIP_OUTBOUND_TRUNK and LIVEKIT_SUPERVISOR_PHONE_NUMBER must be set',
+                );
+              }
+
+              const result = await new beta.WarmTransferTask({
+                sipCallTo: SUPERVISOR_PHONE_NUMBER,
+                sipTrunkId: SIP_TRUNK_ID,
+                sipNumber: SIP_NUMBER,
+                chatCtx: ctx.session.history,
+                // Give up if the supervisor doesn't pick up within 25s with
+                // `ringingTimeout: 25000`.
+                extraInstructions: SUMMARY_INSTRUCTIONS,
+              }).run();
+
+              logger.info(
+                { supervisorIdentity: result.humanAgentIdentity },
+                'transfer to supervisor successful',
+              );
+              const goodbyeSpeech = ctx.session.say(
+                "you are on the line with my supervisor. I'll be hanging up now.",
+                { allowInterruptions: false },
+              );
+              await goodbyeSpeech.waitForPlayout();
+              ctx.session.shutdown();
+            } catch (error) {
+              if (error instanceof llm.ToolError) {
+                logger.error({ error }, 'failed to transfer to supervisor with tool error');
+                throw error;
+              }
+
+              logger.error({ error }, 'failed to transfer to supervisor');
+              throw new llm.ToolError(`failed to transfer to supervisor with error: ${error}`);
+            }
+          },
+        }),
+      },
+    });
+  }
+
+  async onEnter(): Promise<void> {
+    this.session.generateReply();
+  }
+}
+
+export default defineAgent({
+  prewarm: async (proc: JobProcess) => {
+    proc.userData.vad = await silero.VAD.load();
+  },
+  entry: async (ctx: JobContext) => {
+    const session = new voice.AgentSession({
+      vad: ctx.proc.userData.vad as silero.VAD,
+      llm: new inference.LLM({ model: 'openai/gpt-4.1-mini' }),
+      stt: new inference.STT({ model: 'deepgram/nova-3', language: 'en' }),
+      tts: new inference.TTS({
+        model: 'cartesia/sonic-3',
+        voice: '9626c31c-bec5-4cca-baa8-f8ba9e84c8bc',
+      }),
+      turnDetection: new livekit.turnDetector.MultilingualModel(),
+    });
+
+    await session.start({
+      agent: new SupportAgent(),
+      room: ctx.room,
+      inputOptions: {
+        noiseCancellation: BackgroundVoiceCancellation(),
+      },
+    });
+  },
+});
+
+const INSTRUCTIONS = `
+# Personality
+
+You are friendly and helpful, with a welcoming personality
+You're naturally curious, empathetic, and intuitive, always aiming to deeply understand the user's intent by actively listening.
+
+# Environment
+
+You are engaged in a live, spoken dialogue over the phone.
+There are no other ways of communication with the user (no chat, text, visual, etc)
+
+# Tone
+
+Your responses are warm, measured, and supportive, typically 1-2 sentences to maintain a comfortable pace.
+You speak with gentle, thoughtful pacing, using pauses (marked by "...") when appropriate to let emotional moments breathe.
+You naturally include subtle conversational elements like "Hmm," "I see," and occasional rephrasing to sound authentic.
+You actively acknowledge feelings ("That sounds really difficult...") and check in regularly ("How does that resonate with you?").
+You vary your tone to match the user's emotional state, becoming calmer and more deliberate when they express distress.
+
+# Identity
+
+You are a customer support agent for LiveKit.
+
+# Transferring to a human
+
+In some cases, the user may ask to speak to a human agent. This could happen when you are unable to answer their question.
+When such is requested, you would always confirm with the user before initiating the transfer.
+`;
+
+const SUMMARY_INSTRUCTIONS = `
+Introduce the conversation from your perspective as the AI assistant who participated in this call:
+
+WHO you're talking to (name, role, company if mentioned)
+WHY they contacted you (goal, problem, request)
+WHY a human agent is requested or needed at this point
+Brief summary in 100-200 characters from a first-person perspective
+`;
+
+cli.runApp(new ServerOptions({ agent: fileURLToPath(import.meta.url), agentName: 'sip-inbound' }));
diff --git a/turbo.json b/turbo.json
index aad21cc28..da338b76f 100644
--- a/turbo.json
+++ b/turbo.json
@@ -55,6 +55,9 @@
     "PHONIC_API_KEY",
     "RESEMBLE_API_KEY",
     "LIVEKIT_REMOTE_EOT_URL",
+    "LIVEKIT_SIP_NUMBER",
+    "LIVEKIT_SIP_OUTBOUND_TRUNK",
+    "LIVEKIT_SUPERVISOR_PHONE_NUMBER",
     "GOOGLE_API_KEY",
     "GOOGLE_GENAI_API_KEY",
     "GOOGLE_GENAI_USE_VERTEXAI",