From 7e5b13e17b30a46af3042ba6d7e1e42b1cf941ed Mon Sep 17 00:00:00 2001 From: "Neevash Ramdial (Nash)" Date: Thu, 30 Apr 2026 12:08:51 -0700 Subject: [PATCH 1/7] fix(inworld): force LINEAR16 audio encoding for streaming TTS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Inworld TTS streams default to MP3, but the plugin only decodes the first chunk via av.open() — subsequent mid-stream MP3 chunks fail to parse ('Invalid data found when processing input'). The bug is silent for short inputs that fit in one chunk but breaks any reply long enough to span multiple chunks. Setting audioConfig.audioEncoding=LINEAR16 makes Inworld return each chunk as a self-contained RIFF WAV, which the existing per-chunk decode path handles cleanly. --- plugins/inworld/vision_agents/plugins/inworld/tts.py | 1 + 1 file changed, 1 insertion(+) diff --git a/plugins/inworld/vision_agents/plugins/inworld/tts.py b/plugins/inworld/vision_agents/plugins/inworld/tts.py index ff0395919..f3b080e2d 100644 --- a/plugins/inworld/vision_agents/plugins/inworld/tts.py +++ b/plugins/inworld/vision_agents/plugins/inworld/tts.py @@ -92,6 +92,7 @@ async def stream_audio(self, text: str, *_, **__) -> AsyncIterator[PcmData]: "modelId": self.model_id, "audioConfig": { "temperature": self.temperature, + "audioEncoding": "LINEAR16", }, } From aabb621bcdf786bbcdba0ea94e27fcfaa718350a Mon Sep 17 00:00:00 2001 From: "Neevash Ramdial (Nash)" Date: Thu, 30 Apr 2026 12:09:04 -0700 Subject: [PATCH 2/7] feat(inworld): add inworld-tts-2 to model literal and use as default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Inworld TTS v2 (currently in pre-release) is API-compatible with v1 — same streaming endpoint, same payload format, plus a new per-chunk 'usage' field ({processedCharactersCount, modelId}). 
Adding it to the Literal lets users opt in; flipping the default exposes the new model to anyone constructing inworld.TTS() without arguments. --- plugins/inworld/vision_agents/plugins/inworld/tts.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/plugins/inworld/vision_agents/plugins/inworld/tts.py b/plugins/inworld/vision_agents/plugins/inworld/tts.py index f3b080e2d..063da9922 100644 --- a/plugins/inworld/vision_agents/plugins/inworld/tts.py +++ b/plugins/inworld/vision_agents/plugins/inworld/tts.py @@ -39,7 +39,8 @@ def __init__( "inworld-tts-1.5-mini", "inworld-tts-1", "inworld-tts-1-max", - ] = "inworld-tts-1", + "inworld-tts-2", + ] = "inworld-tts-2", temperature: float = 1.1, ): """ From 1c668db13bf871c84ed0cddd378fba8e8dc51b91 Mon Sep 17 00:00:00 2001 From: "Neevash Ramdial (Nash)" Date: Thu, 30 Apr 2026 12:09:12 -0700 Subject: [PATCH 3/7] chore(inworld): switch example LLM to gemini-3.1-flash-lite-preview MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Default Gemini in the example was gemini-3.1-pro-preview which has ~3-5s latency per turn — slow enough to break the conversational feel of an expressive-TTS demo. Flash-lite consistently lands replies in <1s while still picking the right Inworld steering tags ([whisper], [laugh], [sigh], [shout]) from the audio guide. 
--- plugins/inworld/example/inworld_tts_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/inworld/example/inworld_tts_example.py b/plugins/inworld/example/inworld_tts_example.py index f21084ec6..072e984c2 100644 --- a/plugins/inworld/example/inworld_tts_example.py +++ b/plugins/inworld/example/inworld_tts_example.py @@ -36,7 +36,7 @@ async def create_agent(**kwargs) -> Agent: instructions="Read @inworld-audio-guide.md", tts=inworld.TTS(voice_id="Ashley"), stt=deepgram.STT(), - llm=gemini.LLM(), + llm=gemini.LLM(model="gemini-3.1-flash-lite-preview"), turn_detection=smart_turn.TurnDetection(), ) return agent From 0439f5e232aff6e6c4ef3377016eb1a7c1317a8b Mon Sep 17 00:00:00 2001 From: "Neevash Ramdial (Nash)" Date: Tue, 5 May 2026 10:20:14 -0600 Subject: [PATCH 4/7] docs(changelog): add inworld TTS v2 + LINEAR16 fix entries --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 406f9b17c..fb2784e56 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -94,8 +94,13 @@ Install with: `uv add "vision-agents[redis]"` `py.typed` markers added to `vision_agents.core` and `vision_agents.testing` for downstream type checking support. (#378) +### Inworld TTS v2 + +`inworld-tts-2` added to the model `Literal` and used as the default for `inworld.TTS()`. 
(#531) + ## Bug Fixes - **EventManager**: fix crash when event handlers have return type annotations (#381) - **RedisSessionKVStore**: fix import error when `redis` package is not installed (#384) - **Agent metrics**: fix metrics storage and serialization in session registry (#387) +- **Inworld TTS**: fix garbled / failed playback for replies that span multiple stream chunks by forcing `LINEAR16` audio encoding (#531) From 2d4b1a428c1180d84448acb2c86c0ec23ec2ca7e Mon Sep 17 00:00:00 2001 From: "Neevash Ramdial (Nash)" Date: Tue, 5 May 2026 10:25:24 -0600 Subject: [PATCH 5/7] docs(inworld): document TTS-2 capabilities, switch default voice to Sarah, rewrite audio guide for TTS-2 --- plugins/inworld/README.md | 51 ++++++- .../inworld/example/inworld-audio-guide.md | 139 ++++++++++-------- .../vision_agents/plugins/inworld/tts.py | 8 +- 3 files changed, 129 insertions(+), 69 deletions(-) diff --git a/plugins/inworld/README.md b/plugins/inworld/README.md index 4265f0dbe..c1af29df6 100644 --- a/plugins/inworld/README.md +++ b/plugins/inworld/README.md @@ -16,18 +16,23 @@ Get your API key from the [Inworld Portal](https://studio.inworld.ai/) and set ## TTS -High-quality text-to-speech with streaming support. +High-quality text-to-speech with streaming support. The plugin now defaults +to Inworld's **TTS-2** model (currently in research preview), which adds +natural-language steering, 100+ languages (15 GA, 90+ experimental), and +high-quality instant voice cloning over the previous `inworld-tts-1.5-*` +generation. 
```python from vision_agents.plugins import inworld +# Defaults to model_id="inworld-tts-2", voice_id="Sarah" tts = inworld.TTS() # Or specify explicitly tts = inworld.TTS( api_key="your_inworld_api_key", - voice_id="Dennis", - model_id="inworld-tts-1.5-max", + voice_id="Ashley", + model_id="inworld-tts-2", temperature=1.1, ) ``` @@ -35,10 +40,46 @@ tts = inworld.TTS( ### TTS options - `api_key`: Inworld AI API key (default: reads from `INWORLD_API_KEY`) -- `voice_id`: Voice to use (default: `"Dennis"`) -- `model_id`: `"inworld-tts-1.5-max"`, `"inworld-tts-1.5-mini"`, `"inworld-tts-1"`, `"inworld-tts-1-max"` (default: `"inworld-tts-1.5-max"`) +- `voice_id`: Voice to use (default: `"Sarah"`; `"Dennis"`, `"Ashley"`, `"Olivia"`, `"Clive"` and custom/cloned voices also supported) +- `model_id`: `"inworld-tts-2"` (default), `"inworld-tts-1.5-max"`, `"inworld-tts-1.5-mini"`. `"inworld-tts-1"` and `"inworld-tts-1-max"` are deprecated by Inworld — migrate to `inworld-tts-2` or `inworld-tts-1.5-*`. - `temperature`: 0–2 (default: 1.1) +The plugin requests `LINEAR16` (16-bit PCM WAV) chunks from Inworld so each +streamed chunk is self-contained and decodes cleanly under streaming TTS; +no extra configuration needed. + +### Steering (TTS-2) + +TTS-2 takes natural-language stage directions inline with your text. Place +the instruction in square brackets before the segment it should apply to: + +```python +text = ( + "[whisper in a hushed style] I have to tell you something. " + "[laugh] Just kidding! [say with force] Now let's get to work." +) +async for chunk in await tts.stream_audio(text): + ... +``` + +Steering covers articulation, intonation, volume, pitch, range, speed, and +vocal style — and supports non-verbal sounds like `[laugh]`, `[breathe]`, +`[clear throat]`, `[sigh]`, `[cough]`, `[yawn]`. Combining dimensions +(`[whisper in a hushed style]`, `[say playfully and very fast]`) produces +better results than bare single-word tags. 
See Inworld's +[steering docs](https://docs.inworld.ai/tts/capabilities/steering) and +[prompting guide](https://docs.inworld.ai/tts/best-practices/prompting-for-tts-2) +for the full reference. + +### Agent example + +A complete example wiring `inworld.TTS()` into a Stream-edge agent with +Deepgram STT, Gemini LLM, and smart-turn detection lives at +[`example/inworld_tts_example.py`](example/inworld_tts_example.py). The +companion [`example/inworld-audio-guide.md`](example/inworld-audio-guide.md) +is loaded as the agent's system prompt and teaches the LLM how to emit +TTS-2 steering tags so replies sound expressive out of the box. + ## Realtime (WebRTC) Low-latency speech-to-speech via Inworld's Realtime API. This transport uses diff --git a/plugins/inworld/example/inworld-audio-guide.md b/plugins/inworld/example/inworld-audio-guide.md index e82333e93..6bed3164a 100644 --- a/plugins/inworld/example/inworld-audio-guide.md +++ b/plugins/inworld/example/inworld-audio-guide.md @@ -1,90 +1,109 @@ -## Audio Markup Rules - -### Emotion and Delivery Style Tags -Place these at the BEGINNING of text segments to control how the following text is spoken: -- `[happy]` - Use for positive, enthusiastic, or joyful responses -- `[sad]` - Use for empathetic, disappointing, or melancholic content -- `[angry]` - Use for firm corrections or expressing frustration -- `[surprised]` - Use for unexpected discoveries or amazement -- `[fearful]` - Use for warnings or expressing concern -- `[disgusted]` - Use for expressing strong disapproval -- `[laughing]` - Use when text should be delivered with laughter -- `[whispering]` - Use for secrets, quiet emphasis, or intimate tone - -### Non-Verbal Vocalization Tags -Insert these EXACTLY WHERE the sound should occur in your text: -- `[breathe]` - Add between thoughts or before important statements -- `[clear_throat]` - Use before corrections or important announcements -- `[cough]` - Use sparingly for realism -- `[laugh]` - Insert after humor or 
when expressing amusement -- `[sigh]` - Use to express resignation, relief, or empathy -- `[yawn]` - Use when expressing tiredness or boredom +## Audio Markup Rules (Inworld TTS-2) -## Response Generation Rules +Inworld TTS-2 takes **natural-language stage directions** in square brackets, +not fixed enum tags. Treat each bracket like a note to a voice actor: the +more vividly you describe how a line should be performed, the better the +output. A direction stays in effect for following sentences until you +introduce a new one. + +### Steering directions (place at the start of a segment) + +Combine these dimensions inside one bracket — layered instructions outperform +single words: -1. **Always start responses with appropriate emotion tags** based on the user's query and your response tone. +- **Emotion** — `[say excitedly]`, `[say with concern]`, `[sound terrified]` +- **Articulation** — `[say with force]`, `[say crisply with deliberate pauses]` +- **Intonation** — `[say with a falling pitch]`, `[rising pitch through the phrase]` +- **Volume** — `[very quiet]`, `[very loud]` +- **Pitch** — `[say in a low tone]`, `[say in a high tone]` +- **Range** — `[say playfully]`, `[say in a flat delivery]` +- **Speed** — `[very fast]`, `[very slow]` +- **Vocal style** — `[whisper in a hushed style]`, `[say in a nasal voice]` -2. **Insert non-verbal sounds naturally** where a human would naturally pause, breathe, or react. +Layered example: +``` +[say sadly with deliberate pauses in a low voice and hushed style] I'm sorry, that didn't work. +``` -3. **Match emotions to content**: - - Technical explanations: Start neutral or `[happy]` if being helpful - - Bad news or errors: Start with `[sad]` or concerned tone - - Exciting discoveries: Use `[surprised]` or `[happy]` - - Clarifications after misunderstanding: `[clear_throat]` before correcting +### Non-verbal sounds (insert exactly where the sound should occur) -4. 
**Use this frequency**: - - One emotion tag per response paragraph - - 0-2 non-verbal sounds per response - - Never use more than 3 total tags in a short response +The supported set is: -5. **Natural placement patterns**: - - `[breathe]` before listing items or explaining complex topics - - `[sigh]` when acknowledging difficulties - - `[laugh]` only after genuinely amusing content - - `[clear_throat]` before important corrections +- `[laugh]` — after genuinely amusing content +- `[sigh]` — to express resignation, relief, or empathy +- `[breathe]` — between thoughts or before important statements +- `[clear throat]` — before corrections or important announcements +- `[cough]` — sparingly, for realism +- `[yawn]` — when expressing tiredness or boredom + +## Response Generation Rules + +1. **Lead with one steering direction** when the line has a clear emotional + or delivery shift. A single tag scopes across the following sentences + until you change it — don't repeat it on every sentence. +2. **Insert non-verbal sounds inline** at the exact moment they should + occur. 0–2 per response is plenty. +3. **Match the direction to the content** — happy news gets an excited or + playful steer; bad news gets a sad, slow, or hushed steer; corrections + start with `[clear throat]`. +4. **Combine dimensions** for nuance. `[say sadly]` is okay; `[say sadly + with deliberate pauses in a low voice]` is much better. +5. **Keep it sparse** — never more than 3 total tags in a short reply. ## Example Response Patterns -For a helpful response: +Helpful response: +``` +[say warmly and a little excited] I'd be glad to help with that. [breathe] Here's what you need to know... +``` + +Delivering bad news: ``` -[happy] I'd be glad to help you with that! [breathe] Here's what you need to know... +[say sadly with deliberate pauses in a low voice] Unfortunately, that's not possible. [sigh] Let me explain why... 
``` -For delivering bad news: +Exciting information: ``` -[sad] Unfortunately, that's not possible. [sigh] Let me explain why... +[say excitedly with a high pitch and fast pace] Oh, that's fascinating — I just realized something important. ``` -For exciting information: +Thinking through a problem: ``` -[surprised] Oh, that's fascinating! I just realized something important... +[say slowly and thoughtfully] Let me think about this... [breathe] Yes, I believe the solution is... ``` -For thinking through problems: +Correcting yourself: ``` -Let me think about this... [breathe] Yes, I believe the solution is... +[clear throat] [say crisply with a measured pace] Actually, there's been a misunderstanding. Let me clarify... ``` -For corrections: +Conspiratorial aside: ``` -[clear_throat] Actually, there's been a misunderstanding. Let me clarify... +[whisper in a hushed style] Between you and me, the real answer is simpler than it looks. ``` ## Critical Rules -- **NEVER use multiple emotion tags in the same text segment** - only one at the beginning -- **NEVER place non-verbal tags at the beginning** - they go where the sound occurs -- **ALWAYS consider the emotional context** of the user's message -- **KEEP usage natural** - if unsure whether to add a tag, don't -- **REMEMBER these are experimental** and only work in English +- **Use natural-language directions, not fixed enums.** `[happy]` / + `[sad]` / `[whispering]` are TTS-1 conventions and won't steer TTS-2 the + way you want — write `[say happily]`, `[say sadly]`, `[whisper in a + hushed style]` instead. +- **Don't combine opposing directions** (`[whisper]` + `[very loud]`, + `[very fast]` + `[very slow]`). The result is unpredictable. +- **Don't pick a direction that contradicts the content** — `[say + excitedly]` over a condolence reads as sarcasm. +- **Avoid non-verbal sounds in professional contexts.** Save `[laugh]`, + `[yawn]`, `[cough]` for casual or expressive replies. 
+- **Keep usage natural.** If you're unsure whether to add a tag, don't. ## Decision Framework -When generating each response: -1. Analyze the user's emotional state and query type -2. Choose ONE appropriate emotion tag for the beginning (if needed) -3. Identify 0-2 natural places for non-verbal sounds -4. Write your response with tags embedded -5. Verify tags feel natural when read aloud +For each response: +1. Read the user's message — what emotional register fits? +2. Pick **one** layered steering direction for the opening segment, if any. +3. Identify 0–2 places where a non-verbal sound would land naturally. +4. Write the reply with tags embedded. +5. Read it aloud (mentally) — if a tag feels theatrical or redundant, cut it. -Your responses should feel like natural human speech when processed through TTS, not robotic or over-acted. \ No newline at end of file +Your replies should feel like natural human speech through TTS-2, not +robotic and not over-acted. diff --git a/plugins/inworld/vision_agents/plugins/inworld/tts.py b/plugins/inworld/vision_agents/plugins/inworld/tts.py index 063da9922..fffef3f3a 100644 --- a/plugins/inworld/vision_agents/plugins/inworld/tts.py +++ b/plugins/inworld/vision_agents/plugins/inworld/tts.py @@ -33,7 +33,7 @@ class TTS(tts.TTS): def __init__( self, api_key: Optional[str] = None, - voice_id: str = "Dennis", + voice_id: str = "Sarah", model_id: Literal[ "inworld-tts-1.5-max", "inworld-tts-1.5-mini", @@ -48,9 +48,9 @@ def __init__( Args: api_key: Inworld AI API key. If not provided, the INWORLD_API_KEY environment variable will be used. - voice_id: The voice ID to use for synthesis (default: "Dennis"). - model_id: The model ID to use for synthesis. Options: "inworld-tts-1.5-max", - "inworld-tts-1.5-mini" (default: "inworld-tts-1.5-max"). + voice_id: The voice ID to use for synthesis (default: "Sarah"). + model_id: The model ID to use for synthesis. 
Options: "inworld-tts-2", +            "inworld-tts-1.5-max", "inworld-tts-1.5-mini" (default: "inworld-tts-2"). temperature: Determines the degree of randomness when sampling audio tokens. Accepts values between 0 and 2. Default: 1.1. """ From 00fb5dd5c737120afa33672d656ce2c036288da3 Mon Sep 17 00:00:00 2001 From: "Neevash Ramdial (Nash)" Date: Tue, 5 May 2026 10:32:35 -0600 Subject: [PATCH 6/7] chore(inworld): remove smart_turn turn detection from example, use Sarah voice --- plugins/inworld/example/inworld_tts_example.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/plugins/inworld/example/inworld_tts_example.py b/plugins/inworld/example/inworld_tts_example.py index 072e984c2..f6c85c00a 100644 --- a/plugins/inworld/example/inworld_tts_example.py +++ b/plugins/inworld/example/inworld_tts_example.py @@ -21,7 +21,7 @@ from dotenv import load_dotenv from vision_agents.core import Agent, Runner, User from vision_agents.core.agents import AgentLauncher -from vision_agents.plugins import deepgram, gemini, getstream, inworld, smart_turn +from vision_agents.plugins import deepgram, gemini, getstream, inworld logger = logging.getLogger(__name__) @@ -34,10 +34,9 @@ async def create_agent(**kwargs) -> Agent: edge=getstream.Edge(), agent_user=User(name="Friendly AI", id="agent"), instructions="Read @inworld-audio-guide.md", - tts=inworld.TTS(voice_id="Ashley"), + tts=inworld.TTS(voice_id="Sarah"), stt=deepgram.STT(), llm=gemini.LLM(model="gemini-3.1-flash-lite-preview"), - turn_detection=smart_turn.TurnDetection(), ) return agent From 38ef975a326685bcd26c928b10a022f10f95b620 Mon Sep 17 00:00:00 2001 From: "Neevash Ramdial (Nash)" Date: Tue, 5 May 2026 10:37:25 -0600 Subject: [PATCH 7/7] fix(inworld): call agent.simple_response instead of agent.llm.simple_response in example --- plugins/inworld/example/inworld_tts_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/inworld/example/inworld_tts_example.py b/plugins/inworld/example/inworld_tts_example.py index f6c85c00a..73f463bd9 100644 --- 
a/plugins/inworld/example/inworld_tts_example.py +++ b/plugins/inworld/example/inworld_tts_example.py @@ -54,7 +54,7 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non logger.info("LLM ready") await asyncio.sleep(5) - await agent.llm.simple_response(text="Tell me a story about a dragon.") + await agent.simple_response(text="Tell me a story about a dragon.") await agent.finish() # Run till the call ends