|
| 1 | +--- |
| 2 | +title: useTextToSpeech |
| 3 | +keywords: [ |
| 4 | + text to speech |
| 5 | + tts, |
| 6 | + voice synthesizer, |
| 7 | + transcription, |
| 8 | + kokoro, |
| 9 | + react native, |
| 10 | + executorch, |
| 11 | + ai, |
| 12 | + machine learning, |
| 13 | + on-device, |
| 14 | + mobile ai, |
| 15 | + ] |
| 16 | +description: "Learn how to use text-to-speech models in your React Native applications with React Native ExecuTorch's useTextToSpeech hook." |
| 17 | +--- |
| 18 | + |
| 19 | +Text to speech is a task that transforms written text into spoken language. It is commonly used to implement features such as voice assistants, accessibility tools, or audiobooks.
| 20 | + |
| 21 | +:::warning |
| 22 | +It is recommended to use models provided by us, which are available at our [Hugging Face repository](https://huggingface.co/software-mansion/react-native-executorch-kokoro). You can also use [constants](https://github.com/software-mansion/react-native-executorch/blob/main/packages/react-native-executorch/src/constants/modelUrls.ts) shipped with our library. |
| 23 | +::: |
| 24 | + |
| 25 | +## Reference |
| 26 | + |
| 27 | +You can play the generated waveform in whichever way suits you best; however, in the snippet below we utilize the react-native-audio-api library to play the synthesized speech.
| 28 | + |
| 29 | +```typescript |
| 30 | +import { |
| 31 | + useTextToSpeech, |
| 32 | + KOKORO_MEDIUM, |
| 33 | + KOKORO_VOICE_AF_HEART, |
| 34 | +} from 'react-native-executorch'; |
| 35 | +import { AudioContext } from 'react-native-audio-api'; |
| 36 | + |
| 37 | +const model = useTextToSpeech({ |
| 38 | + model: KOKORO_MEDIUM, |
| 39 | + voice: KOKORO_VOICE_AF_HEART, |
| 40 | +}); |
| 41 | + |
| 42 | +const audioContext = new AudioContext({ sampleRate: 24000 }); |
| 43 | + |
| 44 | +const handleSpeech = async (text: string) => { |
| 45 | + const speed = 1.0; |
| 46 | + const waveform = await model.forward(text, speed); |
| 47 | + |
| 48 | + const audioBuffer = audioContext.createBuffer(1, waveform.length, 24000); |
| 49 | + audioBuffer.getChannelData(0).set(waveform); |
| 50 | + |
| 51 | + const source = audioContext.createBufferSource(); |
| 52 | + source.buffer = audioBuffer; |
| 53 | + source.connect(audioContext.destination); |
| 54 | + source.start(); |
| 55 | +}; |
| 56 | +``` |
| 57 | + |
| 58 | +### Arguments |
| 59 | + |
| 60 | +**`model`** (`KokoroConfig`) - Object specifying the source files for the Kokoro TTS model (duration predictor, synthesizer). |
| 61 | + |
| 62 | +**`voice`** (`VoiceConfig`) - Object specifying the voice data and phonemizer assets (tagger and lexicon). |
| 63 | + |
| 64 | +**`preventLoad?`** - Boolean that can prevent automatic model loading after running the hook. |
| 65 | + |
| 66 | +For more information on loading resources, take a look at [loading models](../../01-fundamentals/02-loading-models.md) page. |
| 67 | + |
| 68 | +### Returns |
| 69 | + |
| 70 | +| Field | Type | Description | |
| 71 | +| ------------------ | --------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | |
| 72 | +| `forward` | `(text: string, speed?: number) => Promise<Float32Array>` | Synthesizes a full text into speech. Returns a promise resolving to the full audio waveform as a `Float32Array`. | |
| 73 | +| `stream`           | `(input: TextToSpeechStreamingInput) => Promise<void>`    | Starts a streaming synthesis session. Takes a text input and callbacks to handle audio chunks as they are generated. Ideal for reducing the "time to first audio" for long sentences.  |
| 74 | +| `streamStop`       | `() => void`                                              | Stops the streaming process if there is any ongoing.                                                                                                                                   |
| 75 | +| `error` | `RnExecutorchError \| null` | Contains the error message if the model failed to load or synthesis failed. | |
| 76 | +| `isGenerating` | `boolean` | Indicates whether the model is currently processing a synthesis. | |
| 77 | +| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for synthesis. | |
| 78 | +| `downloadProgress` | `number` | Tracks the progress of the model and voice assets download process. | |
| 79 | + |
| 80 | +<details> |
| 81 | +<summary>Type definitions</summary> |
| 82 | + |
| 83 | +```typescript |
| 84 | +interface TextToSpeechStreamingInput { |
| 85 | + text: string; |
| 86 | + speed?: number; |
| 87 | + onBegin?: () => void | Promise<void>; |
| 88 | + onNext?: (chunk: Float32Array) => Promise<void> | void; |
| 89 | + onEnd?: () => Promise<void> | void; |
| 90 | +} |
| 91 | + |
| 92 | +interface KokoroConfig { |
| 93 | + durationSource: ResourceSource; |
| 94 | + synthesizerSource: ResourceSource; |
| 95 | +} |
| 96 | + |
| 97 | +interface VoiceConfig { |
| 98 | + voiceSource: ResourceSource; |
| 99 | + extra: { |
| 100 | + taggerSource: ResourceSource; |
| 101 | + lexiconSource: ResourceSource; |
| 102 | + }; |
| 103 | +} |
| 104 | +``` |
| 105 | + |
| 106 | +</details> |
| 107 | + |
| 108 | +## Running the model |
| 109 | + |
| 110 | +The module provides two ways to generate speech: |
| 111 | + |
| 112 | +1. **`forward(text, speed)`**: Generates the complete audio waveform at once. Returns a promise resolving to a `Float32Array`. |
| 113 | + |
| 114 | +:::note |
| 115 | +Since it processes the entire text at once, it might take a significant amount of time to produce audio for long text inputs.
| 116 | +::: |
| 117 | + |
| 118 | +2. **`stream({ text, speed, onNext })`**: Starts a streaming session that delivers chunks of audio to the provided callbacks as they are computed.
| 119 | +   This is ideal for reducing the "time to first audio" for long sentences.
| 120 | + |
| 121 | +## Example |
| 122 | + |
| 123 | +### Speech Synthesis |
| 124 | + |
| 125 | +```tsx |
| 126 | +import React from 'react'; |
| 127 | +import { Button, View } from 'react-native'; |
| 128 | +import { |
| 129 | + useTextToSpeech, |
| 130 | + KOKORO_MEDIUM, |
| 131 | + KOKORO_VOICE_AF_HEART, |
| 132 | +} from 'react-native-executorch'; |
| 133 | +import { AudioContext } from 'react-native-audio-api'; |
| 134 | + |
| 135 | +export default function App() { |
| 136 | + const tts = useTextToSpeech({ |
| 137 | + model: KOKORO_MEDIUM, |
| 138 | + voice: KOKORO_VOICE_AF_HEART, |
| 139 | + }); |
| 140 | + |
| 141 | + const generateAudio = async () => { |
| 142 | +    const audioData = await tts.forward(
| 143 | +      'Hello world! This is a sample text.'
| 144 | +    );
| 145 | + |
| 146 | + // Playback example |
| 147 | + const ctx = new AudioContext({ sampleRate: 24000 }); |
| 148 | + const buffer = ctx.createBuffer(1, audioData.length, 24000); |
| 149 | + buffer.getChannelData(0).set(audioData); |
| 150 | + |
| 151 | + const source = ctx.createBufferSource(); |
| 152 | + source.buffer = buffer; |
| 153 | + source.connect(ctx.destination); |
| 154 | + source.start(); |
| 155 | + }; |
| 156 | + |
| 157 | + return ( |
| 158 | + <View style={{ flex: 1, justifyContent: 'center', alignItems: 'center' }}> |
| 159 | + <Button title="Speak" onPress={generateAudio} disabled={!tts.isReady} /> |
| 160 | + </View> |
| 161 | + ); |
| 162 | +} |
| 163 | +``` |
| 164 | + |
| 165 | +### Streaming Synthesis |
| 166 | + |
| 167 | +```tsx |
| 168 | +import React, { useRef } from 'react'; |
| 169 | +import { Button, View } from 'react-native'; |
| 170 | +import { |
| 171 | + useTextToSpeech, |
| 172 | + KOKORO_MEDIUM, |
| 173 | + KOKORO_VOICE_AF_HEART, |
| 174 | +} from 'react-native-executorch'; |
| 175 | +import { AudioContext } from 'react-native-audio-api'; |
| 176 | + |
| 177 | +export default function App() { |
| 178 | + const tts = useTextToSpeech({ |
| 179 | + model: KOKORO_MEDIUM, |
| 180 | + voice: KOKORO_VOICE_AF_HEART, |
| 181 | + }); |
| 182 | + |
| 183 | + const contextRef = useRef(new AudioContext({ sampleRate: 24000 })); |
| 184 | + |
| 185 | + const generateStream = async () => { |
| 186 | + const ctx = contextRef.current; |
| 187 | + |
| 188 | + await tts.stream({ |
| 189 | + text: "This is a longer text, which is being streamed chunk by chunk. Let's see how it works!", |
| 190 | + onNext: async (chunk) => { |
| 191 | + return new Promise((resolve) => { |
| 192 | + const buffer = ctx.createBuffer(1, chunk.length, 24000); |
| 193 | + buffer.getChannelData(0).set(chunk); |
| 194 | + |
| 195 | + const source = ctx.createBufferSource(); |
| 196 | + source.buffer = buffer; |
| 197 | + source.connect(ctx.destination); |
| 198 | + source.onEnded = () => resolve(); |
| 199 | + source.start(); |
| 200 | + }); |
| 201 | + }, |
| 202 | + }); |
| 203 | + }; |
| 204 | + |
| 205 | + return ( |
| 206 | + <View style={{ flex: 1, justifyContent: 'center', alignItems: 'center' }}> |
| 207 | + <Button title="Stream" onPress={generateStream} disabled={!tts.isReady} /> |
| 208 | + </View> |
| 209 | + ); |
| 210 | +} |
| 211 | +``` |
| 212 | + |
| 213 | +## Supported models |
| 214 | + |
| 215 | +| Model | Language | |
| 216 | +| -------------------------------------------------------------------------------- | :------: | |
| 217 | +| [Kokoro](https://huggingface.co/software-mansion/react-native-executorch-kokoro) | English | |
0 commit comments