From 9cc971c3ea47df57a86d41bde1d12f33c78081e4 Mon Sep 17 00:00:00 2001 From: karan-dhir Date: Thu, 14 May 2026 19:33:07 -0400 Subject: [PATCH 1/4] feat(tts): add Gradium TTS plugin Adds @livekit/agents-plugin-gradium, a new TTS plugin that connects directly to the Gradium API (https://gradium.ai). - WebSocket streaming via SynthesizeStream with sentence tokenization and readyFuture handshake before text transmission - REST one-shot synthesis via ChunkedStream (only_audio: true) - PCM output at 48 kHz, 16-bit signed LE mono - Word-level timed transcripts from Gradium text segment messages - Zod-validated WebSocket message parsing - Chunk timeout + IPv4 dual-stack fallback matching Cartesia patterns - GRADIUM_API_KEY added to turbo.json globalEnv --- .changeset/gradium-tts-plugin.md | 5 + plugins/gradium/api-extractor.json | 5 + plugins/gradium/package.json | 52 +++ plugins/gradium/src/index.ts | 19 + plugins/gradium/src/models.ts | 22 ++ plugins/gradium/src/tts.ts | 576 +++++++++++++++++++++++++++++ plugins/gradium/src/types.ts | 59 +++ plugins/gradium/tsconfig.json | 14 + plugins/gradium/tsup.config.ts | 7 + pnpm-lock.yaml | 53 ++- turbo.json | 1 + 11 files changed, 801 insertions(+), 12 deletions(-) create mode 100644 .changeset/gradium-tts-plugin.md create mode 100644 plugins/gradium/api-extractor.json create mode 100644 plugins/gradium/package.json create mode 100644 plugins/gradium/src/index.ts create mode 100644 plugins/gradium/src/models.ts create mode 100644 plugins/gradium/src/tts.ts create mode 100644 plugins/gradium/src/types.ts create mode 100644 plugins/gradium/tsconfig.json create mode 100644 plugins/gradium/tsup.config.ts diff --git a/.changeset/gradium-tts-plugin.md b/.changeset/gradium-tts-plugin.md new file mode 100644 index 000000000..733e77694 --- /dev/null +++ b/.changeset/gradium-tts-plugin.md @@ -0,0 +1,5 @@ +--- +"@livekit/agents-plugin-gradium": minor +--- + +Add Gradium TTS plugin diff --git a/plugins/gradium/api-extractor.json b/plugins/gradium/api-extractor.json new file mode 100644 index 000000000..32c90f0fa --- /dev/null +++ b/plugins/gradium/api-extractor.json @@ -0,0 +1,5 @@ +{ + "$schema": "https://developer.microsoft.com/json-schemas/api-extractor/v7/api-extractor.schema.json", + "extends": "../../api-extractor-shared.json", + "mainEntryPointFilePath": "./dist/index.d.ts" +} diff --git a/plugins/gradium/package.json b/plugins/gradium/package.json new file mode 100644 index 000000000..8fe7311ba --- /dev/null +++ b/plugins/gradium/package.json @@ -0,0 +1,52 @@ +{ + "name": "@livekit/agents-plugin-gradium", + "version": "0.1.0", + "description": "Gradium plugin for LiveKit Node Agents", + "main": "dist/index.js", + "require": "dist/index.cjs", + "types": "dist/index.d.ts", + "exports": { + "import": { + "types": "./dist/index.d.ts", + "default": "./dist/index.js" + }, + "require": { + "types": "./dist/index.d.cts", + "default": "./dist/index.cjs" + } + }, + "author": "LiveKit", + "type": "module", + "repository": "git@github.com:livekit/agents-js.git", + "license": "Apache-2.0", + "files": [ + "dist", + "src", + "README.md" + ], + "scripts": { + "build": "tsup --onSuccess \"pnpm build:types\"", + "build:types": "tsc --declaration --emitDeclarationOnly && node ../../scripts/copyDeclarationOutput.js", + "clean": "rm -rf dist", + "clean:build": "pnpm clean && pnpm build", + "lint": "eslint -f unix \"src/**/*.{ts,js}\"", + "api:check": "api-extractor run --typescript-compiler-folder ../../node_modules/typescript", + "api:update": "api-extractor run --local --typescript-compiler-folder ../../node_modules/typescript --verbose" + }, + "devDependencies": { + "@livekit/agents": "workspace:*", + "@livekit/rtc-node": "catalog:", + "@microsoft/api-extractor": "^7.35.0", + "@types/ws": "catalog:", + "tsup": "^8.3.5", + "typescript": "^5.0.0" + }, + "dependencies": { + "ws": "catalog:" + }, + "peerDependencies": { + "@livekit/agents": "workspace:*", + "@livekit/rtc-node": "catalog:", + "zod": "^3.25.76 || ^4.1.8" + } +} diff --git a/plugins/gradium/src/index.ts b/plugins/gradium/src/index.ts new file mode 100644 index 000000000..d680b0f5b --- /dev/null +++ b/plugins/gradium/src/index.ts @@ -0,0 +1,19 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { Plugin } from '@livekit/agents'; + +export * from './models.js'; +export * from './tts.js'; + +class GradiumPlugin extends Plugin { + constructor() { + super({ + title: 'gradium', + version: __PACKAGE_VERSION__, + package: __PACKAGE_NAME__, + }); + } +} + +Plugin.registerPlugin(new GradiumPlugin()); diff --git a/plugins/gradium/src/models.ts b/plugins/gradium/src/models.ts new file mode 100644 index 000000000..dcd54c46d --- /dev/null +++ b/plugins/gradium/src/models.ts @@ -0,0 +1,22 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +/** Supported Gradium TTS models */ +export type TTSModels = 'default' | string; + +/** + * Supported audio output formats. + * - `wav`: 48 kHz, 16-bit signed mono (includes WAV header) + * - `pcm`: 48 kHz, 16-bit signed little-endian mono, raw + * - `opus`: Ogg-wrapped Opus + * - `pcm_8000`–`pcm_48000`: Raw PCM at a specific sample rate + * - `ulaw_8000`, `alaw_8000`: Telephony codecs at 8 kHz + */ +export type TTSOutputFormat = 'wav' | 'pcm' | 'opus' | 'ulaw_8000' | 'alaw_8000' | `pcm_${number}`; + +/** Languages supported by Gradium TTS */ +export type TTSLanguage = 'en' | 'fr' | 'de' | 'es' | 'pt'; + +/** Default voice ID (Emma, English feminine) */ +export const TTSDefaultVoiceId = 'YTpq7expH9539ERJ'; diff --git a/plugins/gradium/src/tts.ts b/plugins/gradium/src/tts.ts new file mode 100644 index 000000000..9d11dcfd4 --- /dev/null +++ b/plugins/gradium/src/tts.ts @@ -0,0 +1,576 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { + type APIConnectOptions, + APIConnectionError, + APITimeoutError, + AudioByteStream, + Future, + type TimedString, + asError, + createTimedString, + log, + shortuuid, + stream, + tokenize, + tts, +} from '@livekit/agents'; +import type { AudioFrame } from '@livekit/rtc-node'; +import { type RawData, WebSocket } from 'ws'; +import { TTSDefaultVoiceId, type TTSModels, type TTSOutputFormat } from './models.js'; +import { + type GradiumServerMessage, + gradiumServerMessageSchema, + isAudioMessage, + isEosMessage, + isErrorMessage, + isReadyMessage, + isTextSegmentMessage, +} from './types.js'; + +const GRADIUM_API_KEY_HEADER = 'x-api-key'; +const GRADIUM_SAMPLE_RATE = 48000; +const GRADIUM_CHANNELS = 1; +const BUFFERED_WORDS_COUNT = 8; + +/** Advanced voice generation settings */ +export interface JsonConfig { + /** Sampling temperature (0.0–1.4, default 0.7). Higher values produce more varied output. */ + temp?: number; + /** Voice similarity coefficient (1.0–4.0, default 2.0). */ + cfg_coef?: number; + /** Speech speed control (−4.0–4.0, default 0.0). Negative values are faster. */ + padding_bonus?: number; + /** Language alias for text normalization rules (e.g. `"en"`, `"fr"`). */ + rewrite_rules?: string; +} + +/** Configuration options for Gradium TTS */ +export interface TTSOptions { + /** Voice ID from the Gradium voice library or a custom cloned voice. */ + voiceId: string; + /** TTS model name. */ + modelName: TTSModels | string; + /** + * Audio output format. + * Must be a raw PCM variant (`"pcm"` or `"pcm_"`) when using the streaming voice pipeline, + * since the framework feeds raw samples into `AudioByteStream`. + * @defaultValue "pcm" + */ + outputFormat: TTSOutputFormat; + apiKey?: string; + baseUrl: string; + /** Optional pronunciation dictionary ID. */ + pronunciationId?: string; + /** + * Whether to include word-level timing in `SynthesizedAudio.timedTranscripts`. + * @defaultValue true + */ + wordTimestamps: boolean; + /** + * Milliseconds to wait for the next audio chunk before closing the WebSocket. + * @defaultValue 5000 + */ + chunkTimeout: number; + /** Advanced generation settings forwarded as `json_config`. */ + jsonConfig?: JsonConfig; +} + +const defaultTTSOptions: TTSOptions = { + voiceId: TTSDefaultVoiceId, + modelName: 'default', + outputFormat: 'pcm', + apiKey: process.env.GRADIUM_API_KEY, + baseUrl: 'https://api.gradium.ai/api', + wordTimestamps: true, + chunkTimeout: 5000, +}; + +export class TTS extends tts.TTS { + #opts: TTSOptions; + label = 'gradium.TTS'; + + get model(): string { + return this.#opts.modelName; + } + + get provider(): string { + return 'Gradium'; + } + + /** + * Create a new Gradium TTS instance. + * + * @remarks + * `apiKey` must be set to your Gradium API key, either via the argument or the + * `GRADIUM_API_KEY` environment variable. + */ + constructor(opts: Partial = {}) { + const resolvedOpts = { ...defaultTTSOptions, ...opts }; + super(GRADIUM_SAMPLE_RATE, GRADIUM_CHANNELS, { + streaming: true, + alignedTranscript: resolvedOpts.wordTimestamps, + }); + this.#opts = resolvedOpts; + + if (!this.#opts.apiKey) { + throw new Error('Gradium API key is required, either as an argument or via $GRADIUM_API_KEY'); + } + } + + /** Update options after construction. */ + updateOptions(opts: Partial): void { + this.#opts = { ...this.#opts, ...opts }; + } + + synthesize( + text: string, + connOptions?: APIConnectOptions, + abortSignal?: AbortSignal, + ): ChunkedStream { + return new ChunkedStream(this, text, this.#opts, connOptions, abortSignal); + } + + stream(options?: { connOptions?: APIConnectOptions }): SynthesizeStream { + return new SynthesizeStream(this, this.#opts, options?.connOptions); + } +} + +export class ChunkedStream extends tts.ChunkedStream { + label = 'gradium.ChunkedStream'; + #opts: TTSOptions; + #text: string; + + constructor( + tts: TTS, + text: string, + opts: TTSOptions, + connOptions?: APIConnectOptions, + abortSignal?: AbortSignal, + ) { + super(text, tts, connOptions, abortSignal); + this.#text = text; + this.#opts = opts; + } + + protected async run(): Promise { + const requestId = shortuuid(); + const bstream = new AudioByteStream(GRADIUM_SAMPLE_RATE, GRADIUM_CHANNELS); + + const body: Record = { + text: this.#text, + voice_id: this.#opts.voiceId, + output_format: this.#opts.outputFormat, + only_audio: true, + model_name: this.#opts.modelName, + }; + if (this.#opts.pronunciationId) body.pronunciation_id = this.#opts.pronunciationId; + if (this.#opts.jsonConfig) body.json_config = this.#opts.jsonConfig; + + const response = await fetch(`${this.#opts.baseUrl}/post/speech/tts`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + [GRADIUM_API_KEY_HEADER]: this.#opts.apiKey!, + }, + body: JSON.stringify(body), + signal: this.abortSignal, + }); + + if (!response.ok) { + throw new Error(`Gradium TTS request failed: ${response.status} ${response.statusText}`); + } + + if (!response.body) { + throw new Error('Gradium TTS response body is empty'); + } + + let lastFrame: AudioFrame | undefined; + const sendLastFrame = (segmentId: string, final: boolean) => { + if (lastFrame) { + this.queue.put({ requestId, segmentId, frame: lastFrame, final }); + lastFrame = undefined; + } + }; + + const reader = response.body.getReader(); + try { + while (true) { + const { done, value } = await reader.read(); + if (done) break; + for (const frame of bstream.write(value)) { + sendLastFrame(requestId, false); + lastFrame = frame; + } + } + for (const frame of bstream.flush()) { + if (frame.samplesPerChannel === 0) continue; + sendLastFrame(requestId, false); + lastFrame = frame; + } + sendLastFrame(requestId, true); + } finally { + reader.releaseLock(); + this.queue.close(); + } + } +} + +export class SynthesizeStream extends tts.SynthesizeStream { + #opts: TTSOptions; + #logger = log(); + #tokenizer = new tokenize.basic.SentenceTokenizer({ + minSentenceLength: BUFFERED_WORDS_COUNT, + }).stream(); + label = 'gradium.SynthesizeStream'; + + constructor(tts: TTS, opts: TTSOptions, connOptions?: APIConnectOptions) { + super(tts, connOptions); + this.#opts = opts; + } + + /** Update options after construction. */ + updateOptions(opts: Partial): void { + this.#opts = { ...this.#opts, ...opts }; + } + + protected async run(): Promise { + const requestId = shortuuid(); + // Resolved when the server confirms the setup message with a `ready` response. + const readyFuture = new Future(); + let closing = false; + + const inputTask = async () => { + for await (const data of this.input) { + if (data === SynthesizeStream.FLUSH_SENTINEL) { + this.#tokenizer.flush(); + continue; + } + this.#tokenizer.pushText(data); + } + this.#tokenizer.endInput(); + this.#tokenizer.close(); + }; + + const sentenceTask = async (ws: WebSocket) => { + // Wait for the server to confirm setup before sending text. + await readyFuture.await; + for await (const event of this.#tokenizer) { + ws.send( + JSON.stringify({ type: 'text', text: event.token + ' ', client_req_id: requestId }), + ); + } + ws.send(JSON.stringify({ type: 'end_of_stream', client_req_id: requestId })); + }; + + const recvTask = async (ws: WebSocket) => { + const bstream = new AudioByteStream(GRADIUM_SAMPLE_RATE, GRADIUM_CHANNELS); + const eventChannel = stream.createStreamChannel(); + + let lastFrame: AudioFrame | undefined; + let pendingTimedTranscripts: TimedString[] = []; + + const sendLastFrame = (segmentId: string, final: boolean) => { + if (lastFrame && !this.queue.closed) { + this.queue.put({ + requestId, + segmentId, + frame: lastFrame, + final, + timedTranscripts: + pendingTimedTranscripts.length > 0 ? pendingTimedTranscripts : undefined, + }); + lastFrame = undefined; + pendingTimedTranscripts = []; + } + }; + + let chunkTimeout: NodeJS.Timeout | null = null; + const clearChunkTimeout = () => { + if (chunkTimeout) { + clearTimeout(chunkTimeout); + chunkTimeout = null; + } + }; + + const onMessage = (data: RawData) => { + void eventChannel.write(data).catch((err: unknown) => { + this.#logger.debug({ err }, 'Failed to write Gradium event to channel (likely closed)'); + }); + }; + const onClose = (code: number, reason: Buffer) => { + if (!closing) { + this.#logger.debug(`Gradium WebSocket closed: ${code} ${reason.toString()}`); + } + clearChunkTimeout(); + void eventChannel.close(); + }; + const onError = (err: Error) => { + this.#logger.error({ err }, 'Gradium WebSocket error'); + void eventChannel.close(); + }; + + ws.on('message', onMessage); + ws.on('close', onClose); + ws.on('error', onError); + + try { + const reader = eventChannel.stream().getReader(); + while (!this.closed && !this.abortController.signal.aborted) { + const result = await reader.read(); + if (result.done) break; + + let serverMsg: GradiumServerMessage; + try { + serverMsg = gradiumServerMessageSchema.parse(JSON.parse(result.value.toString())); + } catch (parseErr) { + this.#logger.warn({ parseErr }, 'Failed to parse Gradium WebSocket message'); + continue; + } + + if (isReadyMessage(serverMsg)) { + if (!readyFuture.done) readyFuture.resolve(); + continue; + } + + if (isErrorMessage(serverMsg)) { + this.#logger.error( + { message: serverMsg.message, code: serverMsg.code }, + 'Gradium TTS error', + ); + // Unblock sentenceTask if still waiting for ready. + if (!readyFuture.done) readyFuture.reject(new Error(serverMsg.message)); + break; + } + + if (isTextSegmentMessage(serverMsg) && this.#opts.wordTimestamps) { + pendingTimedTranscripts.push( + createTimedString({ + text: serverMsg.text, + startTime: serverMsg.start_s ?? 0, + endTime: serverMsg.stop_s ?? serverMsg.start_s ?? 0, + }), + ); + continue; + } + + if (isAudioMessage(serverMsg)) { + const buf = Buffer.from(serverMsg.data, 'base64'); + const audioData = buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength); + for (const frame of bstream.write(audioData)) { + sendLastFrame(requestId, false); + lastFrame = frame; + } + clearChunkTimeout(); + chunkTimeout = setTimeout(() => { + this.#logger.debug(`Gradium chunk stream timeout after ${this.#opts.chunkTimeout}ms`); + ws.close(); + }, this.#opts.chunkTimeout); + continue; + } + + if (isEosMessage(serverMsg)) { + for (const frame of bstream.flush()) { + sendLastFrame(requestId, false); + lastFrame = frame; + } + sendLastFrame(requestId, true); + if (!this.queue.closed) { + this.queue.put(SynthesizeStream.END_OF_STREAM); + } + clearChunkTimeout(); + closing = true; + ws.close(); + break; + } + } + } catch (err) { + if (err instanceof Error && !err.message.includes('WebSocket closed')) { + if ( + err.message.includes('Queue is closed') || + err.message.includes('Channel is closed') + ) { + this.#logger.warn({ err }, 'Channel closed during processing (expected on disconnect)'); + } else { + this.#logger.error({ err }, 'Error in recvTask from Gradium WebSocket'); + } + } + } finally { + ws.off('message', onMessage); + ws.off('close', onClose); + ws.off('error', onError); + clearChunkTimeout(); + } + }; + + // wss://api.gradium.ai/api/speech/tts + const wsUrl = `${this.#opts.baseUrl.replace(/^http/, 'ws')}/speech/tts`; + let ws: WebSocket | undefined; + + try { + ws = await connectGradiumWebSocket({ + url: wsUrl, + apiKey: this.#opts.apiKey!, + timeoutMs: this.connOptions.timeoutMs, + abortSignal: this.abortSignal, + }); + + // Send setup before starting tasks; recvTask will resolve readyFuture on `ready`. + const setupMsg: Record = { + type: 'setup', + voice_id: this.#opts.voiceId, + output_format: this.#opts.outputFormat, + model_name: this.#opts.modelName, + close_ws_on_eos: true, + }; + if (this.#opts.pronunciationId) setupMsg.pronunciation_id = this.#opts.pronunciationId; + if (this.#opts.jsonConfig) setupMsg.json_config = this.#opts.jsonConfig; + ws.send(JSON.stringify(setupMsg)); + + await Promise.all([inputTask(), sentenceTask(ws), recvTask(ws)]); + } catch (e) { + if (this.abortSignal.aborted) return; + throw toRetryableConnectionError(e); + } finally { + if (ws && ws.readyState !== WebSocket.CLOSED) { + safeTerminateWebSocket(ws); + } + } + } +} + +// ─── Helpers ──────────────────────────────────────────────────────────────── + +const transientNetworkCodes = new Set([ + 'ETIMEDOUT', + 'ECONNRESET', + 'EAI_AGAIN', + 'ENETUNREACH', + 'ECONNREFUSED', + 'EHOSTUNREACH', +]); + +const isRecord = (v: unknown): v is Record => v !== null && typeof v === 'object'; + +const isAggregateErrorLike = (e: unknown): e is { errors: unknown[]; name?: string } => + isRecord(e) && e.name === 'AggregateError' && Array.isArray(e.errors); + +const hasAnyTransientCode = (e: unknown): boolean => { + if (isRecord(e) && typeof e.code === 'string') return transientNetworkCodes.has(e.code); + if (isAggregateErrorLike(e)) return e.errors.some((inner) => hasAnyTransientCode(inner)); + return false; +}; + +const toRetryableConnectionError = (e: unknown): APIConnectionError => { + const err = asError(e); + const isTimeout = + (isRecord(e) && e.code === 'ETIMEDOUT') || + (typeof err.message === 'string' && err.message.includes('ETIMEDOUT')); + const msg = isTimeout + ? 'Gradium connection timed out' + : `Gradium connection failed: ${err.message || 'unknown error'}`; + return isTimeout + ? new APITimeoutError({ message: msg }) + : new APIConnectionError({ message: msg }); +}; + +const safeTerminateWebSocket = (ws: WebSocket): void => { + try { + ws.on('error', () => {}); + } catch { + // ignore + } + try { + if (ws.readyState === WebSocket.CONNECTING) { + ws.close(); + } else { + ws.terminate(); + } + } catch { + // ignore + } +}; + +const connectGradiumWebSocket = async ({ + url, + apiKey, + timeoutMs, + abortSignal, +}: { + url: string; + apiKey: string; + timeoutMs: number; + abortSignal: AbortSignal; +}): Promise => { + const connectOnce = async (family?: number): Promise => { + const ws = new WebSocket(url, { + handshakeTimeout: timeoutMs, + family, + headers: { [GRADIUM_API_KEY_HEADER]: apiKey }, + }); + + if (abortSignal.aborted) { + safeTerminateWebSocket(ws); + throw new Error('aborted'); + } + + const fut = new Future(); + let timeout: NodeJS.Timeout | undefined; + + const cleanup = () => { + if (timeout) clearTimeout(timeout); + ws.off('open', onOpen); + ws.off('error', onError); + ws.off('close', onClose); + abortSignal.removeEventListener('abort', onAbort); + }; + + const onOpen = () => { + cleanup(); + fut.resolve(); + }; + const onError = (err: Error) => { + cleanup(); + fut.reject(asError(err)); + }; + const onClose = (code: number, reason: Buffer) => { + cleanup(); + fut.reject( + new Error(`WebSocket closed before open (code=${code}, reason=${reason.toString()})`), + ); + }; + const onAbort = () => { + cleanup(); + safeTerminateWebSocket(ws); + fut.reject(new Error('aborted')); + }; + + ws.on('open', onOpen); + ws.on('error', onError); + ws.on('close', onClose); + abortSignal.addEventListener('abort', onAbort, { once: true }); + + if (timeoutMs > 0) { + timeout = setTimeout(() => fut.reject(new Error('connect timeout')), timeoutMs); + } + + try { + await fut.await; + return ws; + } catch (e) { + safeTerminateWebSocket(ws); + throw e; + } + }; + + try { + return await connectOnce(); + } catch (e) { + // One retry forcing IPv4 to work around Node.js dual-stack happy-eyeballs flakiness. + if (hasAnyTransientCode(e) || isAggregateErrorLike(e)) { + return await connectOnce(4); + } + throw e; + } +}; diff --git a/plugins/gradium/src/types.ts b/plugins/gradium/src/types.ts new file mode 100644 index 000000000..e6009d4a2 --- /dev/null +++ b/plugins/gradium/src/types.ts @@ -0,0 +1,59 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { z } from 'zod'; + +export const readyMessageSchema = z.object({ + type: z.literal('ready'), +}); + +export const audioMessageSchema = z.object({ + type: z.literal('audio'), + data: z.string(), + client_req_id: z.string().optional(), +}); + +export const textSegmentMessageSchema = z.object({ + type: z.literal('text'), + text: z.string(), + start_s: z.number().optional(), + stop_s: z.number().optional(), + client_req_id: z.string().optional(), +}); + +export const eosMessageSchema = z.object({ + type: z.literal('end_of_stream'), + client_req_id: z.string().optional(), +}); + +export const errorMessageSchema = z.object({ + type: z.literal('error'), + message: z.string(), + code: z.number().optional(), +}); + +export const gradiumServerMessageSchema = z.discriminatedUnion('type', [ + readyMessageSchema, + audioMessageSchema, + textSegmentMessageSchema, + eosMessageSchema, + errorMessageSchema, +]); + +export type GradiumReadyMessage = z.infer; +export type GradiumAudioMessage = z.infer; +export type GradiumTextSegmentMessage = z.infer; +export type GradiumEosMessage = z.infer; +export type GradiumErrorMessage = z.infer; +export type GradiumServerMessage = z.infer; + +export const isReadyMessage = (msg: GradiumServerMessage): msg is GradiumReadyMessage => + msg.type === 'ready'; +export const isAudioMessage = (msg: GradiumServerMessage): msg is GradiumAudioMessage => + msg.type === 'audio'; +export const isTextSegmentMessage = (msg: GradiumServerMessage): msg is GradiumTextSegmentMessage => + msg.type === 'text'; +export const isEosMessage = (msg: GradiumServerMessage): msg is GradiumEosMessage => + msg.type === 'end_of_stream'; +export const isErrorMessage = (msg: GradiumServerMessage): msg is GradiumErrorMessage => + msg.type === 'error'; diff --git a/plugins/gradium/tsconfig.json b/plugins/gradium/tsconfig.json new file mode 100644 index 000000000..d5c19e6e3 --- /dev/null +++ b/plugins/gradium/tsconfig.json @@ -0,0 +1,14 @@ +{ + "extends": "../../tsconfig.json", + "include": ["./src"], + "compilerOptions": { + "rootDir": "./src", + "declarationDir": "./dist", + "outDir": "./dist" + }, + "typedocOptions": { + "name": "plugins/agents-plugin-gradium", + "entryPointStrategy": "resolve", + "entryPoints": ["src/index.ts"] + } +} diff --git a/plugins/gradium/tsup.config.ts b/plugins/gradium/tsup.config.ts new file mode 100644 index 000000000..8ca20961f --- /dev/null +++ b/plugins/gradium/tsup.config.ts @@ -0,0 +1,7 @@ +import { defineConfig } from 'tsup'; + +import defaults from '../../tsup.config'; + +export default defineConfig({ + ...defaults, +}); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index e6e63cef8..cdf293382 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -704,6 +704,34 @@ importers: specifier: ^5.0.0 version: 5.9.3 + plugins/gradium: + dependencies: + ws: + specifier: 'catalog:' + version: 8.20.0 + zod: + specifier: ^3.25.76 || ^4.1.8 + version: 4.3.6 + devDependencies: + '@livekit/agents': + specifier: workspace:* + version: link:../../agents + '@livekit/rtc-node': + specifier: 'catalog:' + version: 0.13.27 + '@microsoft/api-extractor': + specifier: ^7.35.0 + version: 7.43.7(@types/node@25.6.0) + '@types/ws': + specifier: 'catalog:' + version: 8.18.1 + tsup: + specifier: ^8.3.5 + version: 8.4.0(@microsoft/api-extractor@7.43.7(@types/node@25.6.0))(postcss@8.5.9)(tsx@4.21.0)(typescript@5.9.3) + typescript: + specifier: ^5.0.0 + version: 5.9.3 + plugins/hedra: dependencies: livekit-server-sdk: @@ -745,10 +773,10 @@ importers: version: 0.13.27 '@microsoft/api-extractor': specifier: ^7.35.0 - version: 7.43.7(@types/node@22.19.1) + version: 7.43.7(@types/node@25.6.0) tsup: specifier: ^8.3.5 - version: 8.4.0(@microsoft/api-extractor@7.43.7(@types/node@22.19.1))(postcss@8.5.9)(tsx@4.21.0)(typescript@5.9.3) + version: 8.4.0(@microsoft/api-extractor@7.43.7(@types/node@25.6.0))(postcss@8.5.9)(tsx@4.21.0)(typescript@5.9.3) typescript: specifier: ^5.0.0 version: 5.9.3 @@ -2793,6 +2821,7 @@ packages: '@ungap/structured-clone@1.2.0': resolution: {integrity: sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ==} + deprecated: Potential CWE-502 - Update to 1.3.1 or higher '@vitest/coverage-v8@4.0.17': resolution: {integrity: sha512-/6zU2FLGg0jsd+ePZcwHRy3+WpNTBBhDY56P4JTRqUN/Dp6CvOEa9HrikcQ4KfV2b2kAHUFB4dl1SuocWXSFEw==} @@ -5960,7 +5989,7 @@ snapshots: '@eslint/eslintrc@2.1.4': dependencies: ajv: 6.12.6 - debug: 4.4.1 + debug: 4.4.3 espree: 9.6.1 globals: 13.24.0 ignore: 5.3.1 @@ -6040,7 +6069,7 @@ snapshots: '@humanwhocodes/config-array@0.11.14': dependencies: '@humanwhocodes/object-schema': 2.0.3 - debug: 4.4.1 + debug: 4.4.3 minimatch: 3.1.2 transitivePeerDependencies: - supports-color @@ -6885,7 +6914,7 @@ snapshots: dependencies: '@typescript-eslint/typescript-estree': 6.21.0(typescript@5.9.3) '@typescript-eslint/utils': 6.21.0(eslint@8.57.0)(typescript@5.9.3) - debug: 4.4.1 + debug: 4.4.3 eslint: 8.57.0 ts-api-utils: 1.3.0(typescript@5.9.3) optionalDependencies: @@ -6899,7 +6928,7 @@ snapshots: dependencies: '@typescript-eslint/types': 6.21.0 '@typescript-eslint/visitor-keys': 6.21.0 - debug: 4.4.1 + debug: 4.4.3 globby: 11.1.0 is-glob: 4.0.3 minimatch: 9.0.3 @@ -7731,7 +7760,7 @@ snapshots: eslint-import-resolver-typescript@3.6.1(@typescript-eslint/parser@6.21.0(eslint@8.57.0)(typescript@5.9.3))(eslint-import-resolver-node@0.3.9)(eslint-plugin-import@2.29.1)(eslint@8.57.0): dependencies: - debug: 4.4.1 + debug: 4.4.3 enhanced-resolve: 5.16.1 eslint: 8.57.0 eslint-module-utils: 2.8.1(@typescript-eslint/parser@6.21.0(eslint@8.57.0)(typescript@5.9.3))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.6.1(@typescript-eslint/parser@6.21.0(eslint@8.57.0)(typescript@5.9.3))(eslint-import-resolver-node@0.3.9)(eslint-plugin-import@2.29.1)(eslint@8.57.0))(eslint@8.57.0) @@ -9117,7 +9146,7 @@ snapshots: require-in-the-middle@7.5.2: dependencies: - debug: 4.4.1 + debug: 4.4.3 module-details-from-path: 1.0.4 resolve: 1.22.8 transitivePeerDependencies: @@ -9524,7 +9553,7 @@ snapshots: cac: 6.7.14 chokidar: 4.0.3 consola: 3.4.2 - debug: 4.4.1 + debug: 4.4.3 esbuild: 0.25.2 joycon: 3.1.1 picocolors: 1.1.1 @@ -9552,7 +9581,7 @@ snapshots: cac: 6.7.14 chokidar: 4.0.3 consola: 3.4.2 - debug: 4.4.1 + debug: 4.4.3 esbuild: 0.25.2 joycon: 3.1.1 picocolors: 1.1.1 @@ -9689,7 +9718,7 @@ snapshots: vite-node@1.6.0(@types/node@22.19.1): dependencies: cac: 6.7.14 - debug: 4.4.1 + debug: 4.4.3 pathe: 1.1.2 picocolors: 1.1.1 vite: 5.4.21(@types/node@22.19.1) @@ -9707,7 +9736,7 @@ snapshots: vite-node@3.2.2(@types/node@22.19.1)(tsx@4.21.0): dependencies: cac: 6.7.14 - debug: 4.4.1 + debug: 4.4.3 es-module-lexer: 1.7.0 pathe: 2.0.3 vite: 7.3.2(@types/node@22.19.1)(tsx@4.21.0) diff --git a/turbo.json b/turbo.json index aad21cc28..096d4405b 100644 --- a/turbo.json +++ b/turbo.json @@ -77,6 +77,7 @@ "LK_GOOGLE_DEBUG", "LIVEKIT_EVALS_VERBOSE", "OVHCLOUD_API_KEY", + "GRADIUM_API_KEY", "INWORLD_API_KEY", "TRUGEN_API_KEY", "TRUGEN_API_URL", From 53c0d83420860a69a2a75beb67cad489e259623d Mon Sep 17 00:00:00 2001 From: karan-dhir Date: Fri, 15 May 2026 10:50:04 -0400 Subject: [PATCH 2/4] fix(gradium): reject readyFuture on WebSocket close/error before ready If the server closes the connection before sending 'ready', onClose/onError previously only closed the eventChannel, leaving sentenceTask deadlocked on readyFuture.await forever. Reject the future in both handlers so Promise.all unblocks and the retry loop can take over. --- plugins/gradium/src/tts.ts | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/plugins/gradium/src/tts.ts b/plugins/gradium/src/tts.ts index 9d11dcfd4..170caaf37 100644 --- a/plugins/gradium/src/tts.ts +++ b/plugins/gradium/src/tts.ts @@ -304,10 +304,16 @@ export class SynthesizeStream extends tts.SynthesizeStream { this.#logger.debug(`Gradium WebSocket closed: ${code} ${reason.toString()}`); } clearChunkTimeout(); + if (!readyFuture.done) { + readyFuture.reject(new Error(`WebSocket closed before ready (code=${code})`)); + } void eventChannel.close(); }; const onError = (err: Error) => { this.#logger.error({ err }, 'Gradium WebSocket error'); + if (!readyFuture.done) { + readyFuture.reject(err); + } void eventChannel.close(); }; From 62aa4d0d79d0471556287a8172527d011d7cd546 Mon Sep 17 00:00:00 2001 From: karan-dhir Date: Fri, 15 May 2026 11:30:30 -0400 Subject: [PATCH 3/4] fix(gradium): fix abort hang and ChunkedStream retry on transient errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes from Devin review: 1. SynthesizeStream hung permanently on abort while waiting for server 'ready' — recvTask blocked on reader.read() and sentenceTask blocked on readyFuture.await with nothing to unblock them. Added onAbort listener in recvTask that closes the eventChannel and rejects readyFuture, removed in finally. 2. ChunkedStream threw plain Error on HTTP failure/empty body, which the base-class retry loop doesn't recognise and won't retry. Changed to APIConnectionError so transient 5xx responses are retried. --- plugins/gradium/src/tts.ts | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/plugins/gradium/src/tts.ts b/plugins/gradium/src/tts.ts index 170caaf37..997ad247b 100644 --- a/plugins/gradium/src/tts.ts +++ b/plugins/gradium/src/tts.ts @@ -179,11 +179,13 @@ export class ChunkedStream extends tts.ChunkedStream { }); if (!response.ok) { - throw new Error(`Gradium TTS request failed: ${response.status} ${response.statusText}`); + throw new APIConnectionError({ + message: `Gradium TTS request failed: ${response.status} ${response.statusText}`, + }); } if (!response.body) { - throw new Error('Gradium TTS response body is empty'); + throw new APIConnectionError({ message: 'Gradium TTS response body is empty' }); } let lastFrame: AudioFrame | undefined; @@ -317,9 +319,15 @@ export class SynthesizeStream extends tts.SynthesizeStream { void eventChannel.close(); }; + const onAbort = () => { + if (!readyFuture.done) readyFuture.reject(new Error('aborted')); + void eventChannel.close(); + }; + ws.on('message', onMessage); ws.on('close', onClose); ws.on('error', onError); + this.abortController.signal.addEventListener('abort', onAbort, { once: true }); try { const reader = eventChannel.stream().getReader(); @@ -406,6 +414,7 @@ export class SynthesizeStream extends tts.SynthesizeStream { ws.off('message', onMessage); ws.off('close', onClose); ws.off('error', onError); + this.abortController.signal.removeEventListener('abort', onAbort); clearChunkTimeout(); } }; From ba259d6bc82b303f08d3a1d59b433c851bbf753e Mon Sep 17 00:00:00 2001 From: karan-dhir Date: Fri, 15 May 2026 12:40:03 -0400 Subject: [PATCH 4/4] fix(gradium): correct sample rate for pcm_ formats and wrap fetch errors Two fixes from Devin review: 1. AudioByteStream and the TTS base class were always initialized with GRADIUM_SAMPLE_RATE (48000) even when outputFormat is e.g. pcm_24000, causing garbled audio at incorrect playback speed. Added sampleRateFromFormat() helper that parses the rate from pcm_ format strings and falls back to 48000 for plain pcm/non-pcm formats. Applied in TTS constructor, ChunkedStream.run(), and recvTask. 2. ChunkedStream.run() did not catch TypeError thrown by fetch() on network failure, so transient errors were non-retryable. Wrapped the fetch call in try/catch -> toRetryableConnectionError(). --- plugins/gradium/src/tts.ts | 41 +++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/plugins/gradium/src/tts.ts b/plugins/gradium/src/tts.ts index 997ad247b..90b62931c 100644 --- a/plugins/gradium/src/tts.ts +++ b/plugins/gradium/src/tts.ts @@ -34,6 +34,12 @@ const GRADIUM_SAMPLE_RATE = 48000; const GRADIUM_CHANNELS = 1; const BUFFERED_WORDS_COUNT = 8; +const sampleRateFromFormat = (format: TTSOutputFormat): number => { + if (format === 'pcm') return GRADIUM_SAMPLE_RATE; + const m = /^pcm_(\d+)$/.exec(format); + return m ? parseInt(m[1]!, 10) : GRADIUM_SAMPLE_RATE; +}; + /** Advanced voice generation settings */ export interface JsonConfig { /** Sampling temperature (0.0–1.4, default 0.7). Higher values produce more varied output. */ @@ -108,7 +114,7 @@ export class TTS extends tts.TTS { */ constructor(opts: Partial = {}) { const resolvedOpts = { ...defaultTTSOptions, ...opts }; - super(GRADIUM_SAMPLE_RATE, GRADIUM_CHANNELS, { + super(sampleRateFromFormat(resolvedOpts.outputFormat), GRADIUM_CHANNELS, { streaming: true, alignedTranscript: resolvedOpts.wordTimestamps, }); @@ -156,7 +162,8 @@ export class ChunkedStream extends tts.ChunkedStream { protected async run(): Promise { const requestId = shortuuid(); - const bstream = new AudioByteStream(GRADIUM_SAMPLE_RATE, GRADIUM_CHANNELS); + const sampleRate = sampleRateFromFormat(this.#opts.outputFormat); + const bstream = new AudioByteStream(sampleRate, GRADIUM_CHANNELS); const body: Record = { text: this.#text, @@ -168,15 +175,22 @@ export class ChunkedStream extends tts.ChunkedStream { if (this.#opts.pronunciationId) body.pronunciation_id = this.#opts.pronunciationId; if (this.#opts.jsonConfig) body.json_config = this.#opts.jsonConfig; - const response = await fetch(`${this.#opts.baseUrl}/post/speech/tts`, { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - [GRADIUM_API_KEY_HEADER]: this.#opts.apiKey!, - }, - body: JSON.stringify(body), - signal: this.abortSignal, - }); + let response: Response; + try { + response = await fetch(`${this.#opts.baseUrl}/post/speech/tts`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + [GRADIUM_API_KEY_HEADER]: this.#opts.apiKey!, + }, + body: JSON.stringify(body), + signal: this.abortSignal, + }); + } catch (e) { + if (this.abortSignal.aborted) return; + this.queue.close(); + throw toRetryableConnectionError(e); + } if (!response.ok) { throw new APIConnectionError({ @@ -267,7 +281,10 @@ export class SynthesizeStream extends tts.SynthesizeStream { }; const recvTask = async (ws: WebSocket) => { - const bstream = new AudioByteStream(GRADIUM_SAMPLE_RATE, GRADIUM_CHANNELS); + const bstream = new AudioByteStream( + sampleRateFromFormat(this.#opts.outputFormat), + GRADIUM_CHANNELS, + ); const eventChannel = stream.createStreamChannel(); let lastFrame: AudioFrame | undefined;