Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.egg-info/
16 changes: 2 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,13 @@ Kitten TTS is an open-source realistic text-to-speech model with just 15 million

[Join our Discord](https://discord.gg/upcyF5s6)


## ✨ Features

- **Ultra-lightweight**: Model size less than 25MB
- **CPU-optimized**: Runs without GPU on any device
- **High-quality voices**: Several premium voice options available
- **Fast inference**: Optimized for real-time speech synthesis



## 🚀 Quick Start

### Installation
Expand All @@ -24,15 +21,13 @@ Kitten TTS is an open-source realistic text-to-speech model with just 15 million
pip install https://github.com/KittenML/KittenTTS/releases/download/0.1/kittentts-0.1.0-py3-none-any.whl
```



### Basic Usage
### Basic Usage

```python
from kittentts import KittenTTS
m = KittenTTS("KittenML/kitten-tts-nano-0.1")

audio = m.generate("This high quality TTS model works without a GPU", voice='expr-voice-2-f' )
audio = m.generate("This high quality TTS model works without a GPU", voice='expr-voice-2-f')

# available_voices : [ 'expr-voice-2-m', 'expr-voice-2-f', 'expr-voice-3-m', 'expr-voice-3-f', 'expr-voice-4-m', 'expr-voice-4-f', 'expr-voice-5-m', 'expr-voice-5-f' ]

Expand All @@ -42,20 +37,13 @@ sf.write('output.wav', audio, 24000)

```





## 💻 System Requirements

Works on virtually any device — no GPU required.



## Checklist

- [x] Release a preview model
- [ ] Release the fully trained model weights
- [ ] Release mobile SDK
- [ ] Release web version

1 change: 1 addition & 0 deletions kittentts/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__pycache__/*
17 changes: 10 additions & 7 deletions kittentts/get_model.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import os
import numpy as np
from huggingface_hub import hf_hub_download
from .onnx_model import KittenTTS_1_Onnx

Expand All @@ -22,8 +23,8 @@ def __init__(self, model_name="KittenML/kitten-tts-nano-0.1", cache_dir=None):
repo_id = model_name

self.model = download_from_huggingface(repo_id=repo_id, cache_dir=cache_dir)
def generate(self, text, voice="expr-voice-5-m", speed=1.0):

def generate(self, text: str, voice: str = "expr-voice-5-m", speed: float = 1.0) -> np.ndarray:
"""Generate audio from text.

Args:
Expand All @@ -34,9 +35,11 @@ def generate(self, text, voice="expr-voice-5-m", speed=1.0):
Returns:
Audio data as numpy array
"""
if not text:
raise ValueError("Input text cannot be empty.")
return self.model.generate(text, voice=voice, speed=speed)
def generate_to_file(self, text, output_path, voice="expr-voice-5-m", speed=1.0, sample_rate=24000):

def generate_to_file(self, text: str, output_path: str, voice: str = "expr-voice-5-m", speed: float = 1.0, sample_rate: int = 24000):
"""Generate audio from text and save to file.

Args:
Expand All @@ -46,15 +49,15 @@ def generate_to_file(self, text, output_path, voice="expr-voice-5-m", speed=1.0,
speed: Speech speed (1.0 = normal)
sample_rate: Audio sample rate
"""
return self.model.generate_to_file(text, output_path, voice=voice, speed=speed, sample_rate=sample_rate)
self.model.generate_to_file(text, output_path, voice=voice, speed=speed, sample_rate=sample_rate)

@property
def available_voices(self):
"""Get list of available voices."""
return self.model.available_voices


def download_from_huggingface(repo_id="KittenML/kitten-tts-nano-0.1", cache_dir=None):
def download_from_huggingface(repo_id: str="KittenML/kitten-tts-nano-0.1", cache_dir=None) -> KittenTTS_1_Onnx:
"""Download model files from Hugging Face repository.

Args:
Expand Down Expand Up @@ -97,6 +100,6 @@ def download_from_huggingface(repo_id="KittenML/kitten-tts-nano-0.1", cache_dir=
return model


def get_model(repo_id="KittenML/kitten-tts-nano-0.1", cache_dir=None):
def get_model(repo_id: str="KittenML/kitten-tts-nano-0.1", cache_dir=None) -> KittenTTS:
"""Get a KittenTTS model (legacy function for backward compatibility)."""
return KittenTTS(repo_id, cache_dir)
18 changes: 6 additions & 12 deletions kittentts/onnx_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import onnxruntime as ort


def basic_english_tokenize(text):
def basic_english_tokenize(text: str) -> list:
"""Basic English tokenizer that splits on whitespace and punctuation."""
import re
tokens = re.findall(r"\w+|[^\w\s]", text)
Expand All @@ -27,14 +27,9 @@ def __init__(self, dummy=None):

self.word_index_dictionary = dicts

def __call__(self, text):
indexes = []
for char in text:
try:
indexes.append(self.word_index_dictionary[char])
except KeyError:
pass
return indexes
def __call__(self, text: str) -> list:
dicts = self.word_index_dictionary
return [dicts[char] for char in text if char in dicts]


class KittenTTS_1_Onnx:
Expand All @@ -48,7 +43,6 @@ def __init__(self, model_path="kitten_tts_nano_preview.onnx", voices_path="voice
self.model_path = model_path
self.voices = np.load(voices_path)
self.session = ort.InferenceSession(model_path)

self.phonemizer = phonemizer.backend.EspeakBackend(
language="en-us", preserve_punctuation=True, with_stress=True
)
Expand Down Expand Up @@ -124,10 +118,10 @@ def generate_to_file(self, text: str, output_path: str, voice: str = "expr-voice

# Example usage
if __name__ == "__main__":
tts = KittenTTS()
tts = KittenTTS_1_Onnx()

text = """
It begins with an "Ugh!" Another mysterious stain appears on a favorite shirt. Every trick has been tried, but the stain persists.
"""

tts.generate_to_file(text, "inference_output25.wav", voice="expr-voice-5-m")
tts.generate_to_file(text, "inference_output25.wav", voice="expr-voice-5-m")