diff --git a/examples/network/services/chataudioio/main.js b/examples/network/services/chataudioio/main.js index 789e22d45..b93ab77cf 100644 --- a/examples/network/services/chataudioio/main.js +++ b/examples/network/services/chataudioio/main.js @@ -16,7 +16,7 @@ import ChatAudioIO from "ChatAudioIO"; const chat = new ChatAudioIO({ specifier: "humeAIEVI", - voiceName: "Sunny", + voiceID: "Sunny", instructions: "You're a hostile fisherman with a salty sense of humor. You dislike people and care even less for fish.", onStateChanged(state) { trace(`State: ${ChatAudioIO.states[state]} ${this.error ?? ""}\n`); diff --git a/modules/network/services/chatAudioIO/ChatAudioIO.js b/modules/network/services/chatAudioIO/ChatAudioIO.js index 28f6152cd..5a24f0e0e 100644 --- a/modules/network/services/chatAudioIO/ChatAudioIO.js +++ b/modules/network/services/chatAudioIO/ChatAudioIO.js @@ -78,7 +78,7 @@ class ChatAudioIO { this.onOutputTranscript = options.onOutputTranscript ?? callback; this.onStateChanged = options.onStateChanged ?? callback; - this.createWorker(options.specifier, options.instructions, options.functions, options.voiceID, options.providerID, options.modelID); + this.createWorker(options.specifier, options.instructions, options.functions, options.voiceID, options.providerID, options.modelID, options.apiKey); } close() { this.worker?.terminate(); @@ -96,7 +96,7 @@ class ChatAudioIO { if (this.output) this.output.volume = volume; } - createWorker(specifier, instructions, functions, voiceID, providerID, modelID) { + createWorker(specifier, instructions, functions, voiceID, providerID, modelID, apiKey) { this.worker = new Worker(specifier, { static: 512 * 1024, chunk: { @@ -118,7 +118,8 @@ class ChatAudioIO { functions, voiceID, providerID, - modelID + modelID, + apiKey }) this.ensureInput(); } diff --git a/modules/network/services/chatAudioIO/readme.md b/modules/network/services/chatAudioIO/readme.md index be4e6c43f..66c524220 100644 --- a/modules/network/services/chatAudioIO/readme.md +++ b/modules/network/services/chatAudioIO/readme.md @@ -4,11 +4,11 @@ Updated December 3, 2025 ## Architecture -The conversation module uses a JavaScript [worker](https://moddable.com/documentation/base/worker). The worker is in charge of networks protocols, communicating with the AI cloud services, and encoding/decoding audio samples. +The conversation module uses a JavaScript [worker](https://moddable.com/documentation/base/worker). The worker is in charge of networks protocols, communicating with the AI cloud services, and encoding/decoding audio samples. The conversation module and its worker communicate with [marshalled messages](https://www.moddable.com/documentation/xs/XS%20Marshalling). They share input and output audio buffers for efficiency. This document describes the [messages](#Messages). -Because audio samples are transmitted as Base64 encoded data embedded in JSON, workers use a special parser to optimize memory usage and throughput. This document also describes the [`JSONBase64Parser`](#JSONBase64Parser). +Because audio samples are transmitted as Base64 encoded data embedded in JSON, workers use a special parser to optimize memory usage and throughput. This document also describes the [`JSONBase64Parser`](#JSONBase64Parser). The conversation library implements support for various AI cloud services using this worker architecture: @@ -32,6 +32,7 @@ The `options` object selects and configures a service. Its properties are: - `voiceID`: *string*, the identifier of the voice, optional - `providerID`: *string*, the identifier of the language model provider, optional - `modelID`: *string*, the identifier of the language model, optional +- `apiKey`: *string*, the API key of the AI cloud services, optional The `options` object can also provides callbacks. All callbacks are optional. @@ -49,7 +50,7 @@ The `options` object can also provides callbacks. All callbacks are optional. static CONNECTING = 2; static CONNECTED = 3; static SPEAKING = 4; // user is speaking (sending audio to cloud) - static LISTENING = 5; // user is listening (receiving audio from cloud) + static LISTENING = 5; // user is listening (receiving audio from cloud) static WAITING = 6; ``` @@ -94,14 +95,15 @@ Use `sendText` to inform the service about user interactions that did not involv - `voiceID`: *string*, the identifier of the voice, optional - `providerID`: *string*, the identifier of the language model provider, optional - `modelID`: *string*, the identifier of the language model, optional +- `apiKey`: *string*, the API key of the AI cloud services, optional The `voiceID`, `providerID` and `modelID` are specific to each service. Look at [ConversationalAI assets](https://github.com/Moddable-OpenSource/moddable/blob/public/contributed/conversationalAI/assets.js) to get voice, provider and model identifiers, names and descriptions by service. The format of function descriptions is a JSON schema that is more or less common to all services. ```javascript -{ - id:"configure", +{ + id:"configure", instructions: "You are a helpful lighting system bot. You can turn lights on and off. Do not perform any other tasks.", functions: [ { @@ -117,7 +119,7 @@ The format of function descriptions is a JSON schema that is more or less common }, required: ["light_name"], } - }, + }, { name: "turn_light_off", description: "Turn the light off. Call this whenever you need to turn off a light, for example when a customer tells 'turn bedroom light off.'", @@ -131,7 +133,7 @@ The format of function descriptions is a JSON schema that is more or less common }, required: ["light_name"], } - } + } ] } ``` @@ -261,7 +263,7 @@ Tell the application that the worker is connected and has configured the service #### disconnected -Tell the application that the worker is disconnected from the service. +Tell the application that the worker is disconnected from the service. ```javascript { @@ -293,7 +295,7 @@ Tell the application that the worker is receiving audio samples from the service } ``` -The application creates an audio output object which reads audio samples from the output buffer. +The application creates an audio output object which reads audio samples from the output buffer. #### receiveAudio diff --git a/modules/network/services/chatAudioIO/workers/deepgramAgent.js b/modules/network/services/chatAudioIO/workers/deepgramAgent.js index 92593d003..f4ddd1be5 100644 --- a/modules/network/services/chatAudioIO/workers/deepgramAgent.js +++ b/modules/network/services/chatAudioIO/workers/deepgramAgent.js @@ -28,9 +28,6 @@ class DeepgramVoiceAgentModel extends ChatWebSocketWorker { super(options); this.host = "agent.deepgram.com"; this.path = `/v1/agent/converse`; - this.headers = [ - ["Authorization", `Token ${config.deepgramKey}`], - ]; this.keepAliveTimer = null; } close() { @@ -40,6 +37,10 @@ class DeepgramVoiceAgentModel extends ChatWebSocketWorker { configure(message) { const prompt = message.instructions ?? ""; const functions = message.functions ?? []; + const apiKey = message.apiKey ?? config.deepgramKey; + this.headers = [ + ["Authorization", `Token ${apiKey}`], + ]; this.setup = { type: "Settings", experimental: true, diff --git a/modules/network/services/chatAudioIO/workers/elevenLabsAgent.js b/modules/network/services/chatAudioIO/workers/elevenLabsAgent.js index 3711c4d71..a0808c85b 100644 --- a/modules/network/services/chatAudioIO/workers/elevenLabsAgent.js +++ b/modules/network/services/chatAudioIO/workers/elevenLabsAgent.js @@ -45,6 +45,11 @@ class ElevenLabsModel extends ChatWebSocketWorker { this.setup = { type: "conversation_initiation_client_data", } + const apiKey = message.apiKey ?? config.elevenLabsKey; + this.headers = new Map([ + [ "xi-api-key", apiKey ], + [ "Content-Type", "application/json" ], + ]); this.body = { conversation_config: { asr: { @@ -80,10 +85,7 @@ class ElevenLabsModel extends ChatWebSocketWorker { ...device.network.https, host: this.host }); - const headers = new Map([ - [ "xi-api-key", config.elevenLabsKey ], - [ "Content-Type", "application/json" ], - ]); + const headers = this.headers const request = (method, path, body) => { let buffer = null; let length = 0; @@ -150,6 +152,8 @@ class ElevenLabsModel extends ChatWebSocketWorker { case 3: client.close(); if (json?.agent_id) { + this.headers.delete("content-length"); + this.headers.delete("Content-Type"); this.path = `/v1/convai/conversation?agent_id=${json.agent_id}`; super.connect(message); } diff --git a/modules/network/services/chatAudioIO/workers/googleGeminiLive.js b/modules/network/services/chatAudioIO/workers/googleGeminiLive.js index 5c510b1a2..297897ffa 100644 --- a/modules/network/services/chatAudioIO/workers/googleGeminiLive.js +++ b/modules/network/services/chatAudioIO/workers/googleGeminiLive.js @@ -28,7 +28,6 @@ class GoogleGeminiLiveModel extends ChatWebSocketWorker { constructor(options) { super(options); this.host = "generativelanguage.googleapis.com"; - this.path = `/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent?key=${config.geminiAPIKey}`; this.headers = null; this.audioPrefix = audioPrefix; this.audioSuffix = audioSuffix; @@ -39,6 +38,8 @@ class GoogleGeminiLiveModel extends ChatWebSocketWorker { const tools = message.functions ?? []; const voiceName = message.voiceID ?? "aoede"; const model = message.modelID ?? "gemini-2.5-flash-native-audio-preview-12-2025"; + const apiKey = message.apiKey ?? config.geminiAPIKey; + this.path = `/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent?key=${apiKey}`; this.setup = { model: `models/${model}`, generationConfig: { diff --git a/modules/network/services/chatAudioIO/workers/humeAIEVI.js b/modules/network/services/chatAudioIO/workers/humeAIEVI.js index 9ecaef183..792b0731a 100644 --- a/modules/network/services/chatAudioIO/workers/humeAIEVI.js +++ b/modules/network/services/chatAudioIO/workers/humeAIEVI.js @@ -28,7 +28,6 @@ class HumeAIEVIModel extends ChatWebSocketWorker { constructor(options) { super(options); this.host = "api.hume.ai"; - this.path = `/v0/evi/chat?api_key=${config.humeAIKey}`; this.headers = null; this.audioPrefix = audioPrefix; this.audioSuffix = audioSuffix; @@ -39,10 +38,7 @@ class HumeAIEVIModel extends ChatWebSocketWorker { ...device.network.https, host: this.host }); - const headers = new Map([ - [ "X-Hume-Api-Key", config.humeAIKey ], - [ "Content-Type", "application/json" ], - ]); + const headers = this.headers const request = (method, path, body) => { let buffer = null; let length = 0; @@ -105,6 +101,8 @@ class HumeAIEVIModel extends ChatWebSocketWorker { case 3: client.close(); this.path += `&config_id=${json.id}`; + this.headers.delete("content-length"); + this.headers.delete("Content-Type"); super.connect(message); return; } @@ -129,6 +127,12 @@ class HumeAIEVIModel extends ChatWebSocketWorker { system_prompt: instructions, tools, }; + const apiKey = message.apiKey ?? config.humeAIKey; + this.path = `/v0/evi/chat?api_key=${apiKey}`; + this.headers = new Map([ + [ "X-Hume-Api-Key", apiKey ], + [ "Content-Type", "application/json" ], + ]); this.body = { evi_version: "4-mini", name: "Moddable", diff --git a/modules/network/services/chatAudioIO/workers/openAIRealtime.js b/modules/network/services/chatAudioIO/workers/openAIRealtime.js index fc89a6212..6ca4eb12f 100644 --- a/modules/network/services/chatAudioIO/workers/openAIRealtime.js +++ b/modules/network/services/chatAudioIO/workers/openAIRealtime.js @@ -29,9 +29,6 @@ class OpenAIRealTimeModel extends ChatWebSocketWorker { constructor(options) { super(options); this.host = "api.openai.com"; - this.headers = [ - ["Authorization", `Bearer ${config.openAIKey}`] - ]; this.audioPrefix = audioPrefix; this.audioSuffix = audioSuffix; } @@ -45,6 +42,10 @@ class OpenAIRealTimeModel extends ChatWebSocketWorker { tool.type = "function"; tool.parameters.additionalProperties = false; }); + const apiKey = message.apiKey ?? config.openAIKey; + this.headers = [ + ["Authorization", `Bearer ${apiKey}`] + ]; this.session = { type: 'realtime', audio: { diff --git a/typings/ChatAudioIO.d.ts b/typings/ChatAudioIO.d.ts index 5435e7617..6f49c832d 100644 --- a/typings/ChatAudioIO.d.ts +++ b/typings/ChatAudioIO.d.ts @@ -20,9 +20,10 @@ declare module "ChatAudioIO" { type ChatAudioIOOptions = { - specifier: "googleGeminiLive" | "openAIRealtime" | "humeAIEVI" | "elevenLabsAgent"; + specifier: "googleGeminiLive" | "openAIRealtime" | "humeAIEVI" | "elevenLabsAgent" | "deepgramAgent"; voiceName?: string; instructions?: string; + apiKey?: string; onStateChanged?: (this: ChatAudioIO, state: number) => void; onInputTranscript?: (this: ChatAudioIO, text: string, more: boolean) => void;