The official Python SDK for Shunyalabs Speech AI APIs — ASR (speech-to-text) and TTS (text-to-speech).
Supports HTTP batch and WebSocket streaming modes with a fully async client.
pip install shunyalabs[all]
Install only what you need:
pip install shunyalabs[ASR] # Speech-to-text only
pip install shunyalabs[TTS] # Text-to-speech only
pip install shunyalabs[extras] # Audio playback helpers (sounddevice)
All API calls use Authorization: Bearer <api_key> header authentication.
from shunyalabs import AsyncShunyaClient
client = AsyncShunyaClient(api_key="your-api-key")
Or set the SHUNYALABS_API_KEY environment variable and omit api_key=.
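For example:
# No api_key argument; the client reads SHUNYALABS_API_KEY from the environment
client = AsyncShunyaClient()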
import asyncio
from shunyalabs import AsyncShunyaClient
from shunyalabs.tts import TTSConfig
async def main():
    async with AsyncShunyaClient(api_key="your-api-key") as client:
        result = await client.tts.synthesize(
            "Hello, world!",
            config=TTSConfig(model="zero-indic", voice="Varun"),
        )
        result.save("output.mp3")
        print(f"{len(result.audio_data)} bytes saved")

asyncio.run(main())

import asyncio
from shunyalabs import AsyncShunyaClient
from shunyalabs.tts import TTSConfig
async def main():
    async with AsyncShunyaClient(api_key="your-api-key") as client:
        chunks = []
        async for audio in await client.tts.stream(
            "Hello, world!",
            config=TTSConfig(model="zero-indic", voice="Varun"),
        ):
            chunks.append(audio)
        print(f"{len(chunks)} chunks, {sum(len(c) for c in chunks)} bytes")

asyncio.run(main())

import asyncio
from shunyalabs import AsyncShunyaClient
from shunyalabs.asr import TranscriptionConfig
async def main():
    async with AsyncShunyaClient(api_key="your-api-key") as client:
        result = await client.asr.transcribe(
            "audio.wav",
            config=TranscriptionConfig(model="zero-indic"),
        )
        print(result.text)

asyncio.run(main())

import asyncio, subprocess
from shunyalabs import AsyncShunyaClient
from shunyalabs.asr import StreamingConfig, StreamingMessageType
async def main():
    async with AsyncShunyaClient(api_key="your-api-key") as client:
        conn = await client.asr.stream(
            config=StreamingConfig(language="en", sample_rate=16000),
        )

        @conn.on(StreamingMessageType.FINAL_SEGMENT)
        def on_seg(msg):
            print(f"[seg] {msg.text}")

        @conn.on(StreamingMessageType.FINAL)
        def on_final(msg):
            print(f"[final] {msg.text}")

        # Convert audio to 16kHz mono PCM and stream
        pcm = subprocess.run(
            ["ffmpeg", "-i", "audio.wav", "-ar", "16000", "-ac", "1", "-f", "s16le", "-"],
            capture_output=True,
        ).stdout
        for i in range(0, len(pcm), 4096):
            await conn.send_audio(pcm[i : i + 4096])
        await conn.end()
        await conn.close()

asyncio.run(main())

| Parameter | Type | Default | Description |
|---|---|---|---|
| `api_key` | `str` | `None` | API key. Falls back to SHUNYALABS_API_KEY env var. |
| `timeout` | `float` | `60.0` | Request timeout in seconds. |
| `max_retries` | `int` | `2` | Retries for failed requests (5xx, connection errors). |
| `asr_url` | `str` | https://asr.shunyalabs.ai | ASR batch API base URL. |
| `asr_ws_url` | `str` | wss://asr.shunyalabs.ai/ws | ASR streaming WebSocket URL. |
| `tts_url` | `str` | https://tts.shunyalabs.ai | TTS batch API base URL. |
| `tts_ws_url` | `str` | wss://tts.shunyalabs.ai/ws | TTS streaming WebSocket URL. |
All URL parameters can also be set via environment variables: SHUNYALABS_ASR_URL, SHUNYALABS_ASR_WS_URL, SHUNYALABS_TTS_URL, SHUNYALABS_TTS_WS_URL.
Examples:
# Default — uses production endpoints
client = AsyncShunyaClient(api_key="your-api-key")
# Custom timeout and retries
client = AsyncShunyaClient(api_key="your-api-key", timeout=120.0, max_retries=5)
# Self-hosted endpoints
client = AsyncShunyaClient(
    api_key="your-api-key",
    asr_url="https://my-asr-server.example.com",
    tts_url="https://my-tts-server.example.com",
    tts_ws_url="wss://my-tts-server.example.com/ws",
)
Configuration for synthesis requests. Passed as config= to synthesize() and stream().
| Parameter | Type | Default | Description |
|---|---|---|---|
| `model` | `str` | required | Model name (e.g. "zero-indic"). |
| `voice` | `str` | required | Speaker voice name. See Available Speakers. |
| `response_format` | `OutputFormat` | `"mp3"` | Output audio format. See Output Formats. |
| `speed` | `float` | `1.0` | Speaking speed multiplier (0.25–4.0). |
| `trim_silence` | `bool` | `False` | Trim leading/trailing silence from audio. |
| `volume_normalization` | `str` | `None` | `"peak"` or `"loudness"`. |
| `background_audio` | `str` | `None` | Preset name or base64-encoded background audio. |
| `background_volume` | `float` | `0.1` | Background volume relative to speech (0.0–1.0). |
model — Select the TTS model
# Currently available: "zero-indic"
config = TTSConfig(model="zero-indic", voice="Rajesh")
result = await client.tts.synthesize("Hello!", config=config)
# Output: 48000 bytes saved to output.mp3
voice — Choose a speaker
# Male English speaker
config = TTSConfig(model="zero-indic", voice="Varun")
# Female Hindi speaker
config = TTSConfig(model="zero-indic", voice="Sunita")
# Any speaker can speak any language — voice only controls vocal characteristics
config = TTSConfig(model="zero-indic", voice="Murugan") # Tamil-native male speaking English
result = await client.tts.synthesize("Good morning, how are you?", config=config)response_format — Output audio format
Values: "pcm", "wav", "mp3", "ogg_opus", "flac", "mulaw", "alaw"
# MP3 (default) — compressed, good for storage
config = TTSConfig(model="zero-indic", voice="Varun", response_format="mp3")
result = await client.tts.synthesize("Hello!", config=config)
result.save("output.mp3")
# Output: 12480 bytes (compressed)
# WAV — uncompressed, good for processing
config = TTSConfig(model="zero-indic", voice="Varun", response_format="wav")
result = await client.tts.synthesize("Hello!", config=config)
result.save("output.wav")
# Output: 96044 bytes (uncompressed with header)
# PCM — raw samples, for real-time pipelines
config = TTSConfig(model="zero-indic", voice="Varun", response_format="pcm")
result = await client.tts.synthesize("Hello!", config=config)
# Output: 96000 bytes (raw 16-bit samples)
# OGG Opus — compressed, good for web streaming
config = TTSConfig(model="zero-indic", voice="Varun", response_format="ogg_opus")
# mu-law / A-law — for telephony systems
config = TTSConfig(model="zero-indic", voice="Varun", response_format="mulaw")
config = TTSConfig(model="zero-indic", voice="Varun", response_format="alaw")speed — Speaking speed multiplier
Range: 0.25 (very slow) to 4.0 (very fast). Default: 1.0.
# Slow — good for language learning
config = TTSConfig(model="zero-indic", voice="Nisha", speed=0.75)
result = await client.tts.synthesize("Take your time to understand this.", config=config)
# Output: longer audio, ~33% slower than normal
# Normal speed (default)
config = TTSConfig(model="zero-indic", voice="Nisha", speed=1.0)
# Fast — good for notifications or summaries
config = TTSConfig(model="zero-indic", voice="Nisha", speed=1.5)
result = await client.tts.synthesize("Quick update: your order has shipped.", config=config)
# Output: shorter audio, ~50% faster than normal
# Very fast
config = TTSConfig(model="zero-indic", voice="Nisha", speed=2.0)trim_silence — Remove silence padding
# Without trim (default) — audio may have leading/trailing silence
config = TTSConfig(model="zero-indic", voice="Rajesh", trim_silence=False)
result = await client.tts.synthesize("Hello.", config=config)
# Output: 64000 bytes (includes silence padding)
# With trim — tighter audio, no dead air
config = TTSConfig(model="zero-indic", voice="Rajesh", trim_silence=True)
result = await client.tts.synthesize("Hello.", config=config)
# Output: 48000 bytes (silence stripped)
volume_normalization — Normalize audio loudness
Values: None (off), "peak", "loudness"
# No normalization (default)
config = TTSConfig(model="zero-indic", voice="Rajesh")
# Peak normalization — scale so the loudest sample hits 0 dBFS
config = TTSConfig(model="zero-indic", voice="Rajesh", volume_normalization="peak")
result = await client.tts.synthesize("This audio will have consistent peak levels.", config=config)
# Loudness normalization — perceptually even loudness (EBU R128)
config = TTSConfig(model="zero-indic", voice="Rajesh", volume_normalization="loudness")
result = await client.tts.synthesize("This audio will sound equally loud regardless of content.", config=config)background_audio + background_volume — Add background music
import base64
# Using a preset name
config = TTSConfig(
    model="zero-indic",
    voice="Nisha",
    background_audio="cafe-ambience",
    background_volume=0.15,  # 15% volume relative to speech
)
result = await client.tts.synthesize("Welcome to our podcast.", config=config)
# Using custom audio (base64-encoded)
with open("background.mp3", "rb") as f:
bg_b64 = base64.b64encode(f.read()).decode()
config = TTSConfig(
    model="zero-indic",
    voice="Nisha",
    background_audio=bg_b64,
    background_volume=0.1,  # 10% volume (subtle background)
)
result = await client.tts.synthesize("Welcome to our podcast.", config=config)
result.save("podcast_intro.mp3")Each speaker has a native language listed below, but every speaker can speak in any language — the native language only indicates the speaker's voice characteristics and accent.
| Language | Male Speaker | Female Speaker |
|---|---|---|
| Assamese | Bimal | Anjana |
| Bengali | Arjun | Priyanka |
| Bodo | Daimalu | Hasina |
| Dogri | Vishal | Neelam |
| English | Varun | Nisha |
| Gujarati | Rakesh | Pooja |
| Hindi | Rajesh | Sunita |
| Kannada | Kiran | Shreya |
| Kashmiri | Farooq | Habba |
| Konkani | Mohan | Sarita |
| Maithili | Suresh | Meera |
| Malayalam | Krishnan | Deepa |
| Manipuri | Tomba | Ibemhal |
| Marathi | Siddharth | Ananya |
| Nepali | Bikash | Sapana |
| Odia | Bijay | Sujata |
| Punjabi | Gurpreet | Simran |
| Sanskrit | Vedant | Gayatri |
| Santali | Chandu | Roshni |
| Sindhi | Amjad | Kavita |
| Tamil | Murugan | Thangam |
| Telugu | Vishnu | Lakshmi |
| Urdu | Salman | Fatima |
23 languages, 46 speakers (1 male + 1 female per language).
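For example, to audition a few voices from the table with the same line (a quick sketch; the output filenames are arbitrary):
voices = ["Varun", "Nisha", "Murugan", "Sunita"]
for voice in voices:
    config = TTSConfig(model="zero-indic", voice=voice)
    result = await client.tts.synthesize("Welcome to Shunyalabs.", config=config)
    result.save(f"sample_{voice.lower()}.mp3")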
Control the emotional tone by passing a style tag before the text (e.g. "<Happy> Hello!").
| Style Tag | Description |
|---|---|
| `<Happy>` | Joyful, upbeat tone |
| `<Sad>` | Somber, melancholic tone |
| `<Angry>` | Forceful, intense tone |
| `<Fearful>` | Anxious, trembling tone |
| `<Surprised>` | Exclamatory, astonished tone |
| `<Disgust>` | Repulsed, disapproving tone |
| `<News>` | Formal news-anchor style |
| `<Conversational>` | Casual, everyday speech |
| `<Narrative>` | Storytelling / audiobook style |
| `<Enthusiastic>` | Energetic, passionate tone |
| `<Neutral>` | Clean read-speech (default, no tag needed) |
Expression style examples:
# Happy greeting
config = TTSConfig(model="zero-indic", voice="Rajesh")
result = await client.tts.synthesize("<Happy> Welcome aboard! We're thrilled to have you.", config=config)
# News anchor reading
config = TTSConfig(model="zero-indic", voice="Nisha")
result = await client.tts.synthesize("<News> Breaking news: the markets rallied today.", config=config)
# Storytelling
config = TTSConfig(model="zero-indic", voice="Krishnan")
result = await client.tts.synthesize("<Narrative> Once upon a time, in a land far away...", config=config)
# Conversational chatbot
config = TTSConfig(model="zero-indic", voice="Simran")
result = await client.tts.synthesize("<Conversational> Hey! How's it going?", config=config)
# Neutral (default — no tag needed)
config = TTSConfig(model="zero-indic", voice="Varun")
result = await client.tts.synthesize("Your account balance is five thousand rupees.", config=config)| Format | Value |
|---|---|
| PCM (raw) | "pcm" |
| WAV | "wav" |
| MP3 | "mp3" |
| OGG Opus | "ogg_opus" |
| FLAC | "flac" |
| mu-law | "mulaw" |
| A-law | "alaw" |
Batch (HTTP)
result = await client.tts.synthesize("text", config=TTSConfig(...))
result.save("output.mp3") # Save to file
result.audio_data # Raw bytes
result.sample_rate # Sample rate (Hz)
Streaming (WebSocket)
# Iterate audio chunks
async for audio_bytes in await client.tts.stream("text", config=TTSConfig(...)):
    play(audio_bytes)
# With chunk metadata
async for chunk_meta, audio_bytes in await client.tts.stream("text", config=TTSConfig(...), detailed=True):
    print(chunk_meta.chunk_index, len(audio_bytes))
# Collect all and return combined bytes
audio = await client.tts.synthesize_stream("text", config=TTSConfig(...))
# Stream directly to file
await client.tts.stream_to_file("text", "output.pcm", config=TTSConfig(...))
Returned by synthesize().
| Attribute | Type | Description |
|---|---|---|
| `audio_data` | `bytes` | Decoded audio bytes. |
| `sample_rate` | `int` | Audio sample rate in Hz. |
| `format` | `str` | Audio format string. |
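With the extras installed, raw PCM output can be played back locally. A minimal sketch, assuming the PCM stream is 16-bit mono (not confirmed here) and that sounddevice and numpy are available:
import numpy as np
import sounddevice as sd  # installed via pip install shunyalabs[extras]

config = TTSConfig(model="zero-indic", voice="Varun", response_format="pcm")
result = await client.tts.synthesize("Hello!", config=config)

samples = np.frombuffer(result.audio_data, dtype=np.int16)  # assumes 16-bit mono PCM
sd.play(samples, samplerate=result.sample_rate)
sd.wait()  # block until playback completes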
The following formats are natively supported. For video inputs, the audio track is automatically extracted before transcription begins.
Audio Formats
| Format | Extension | Notes |
|---|---|---|
| WAV | .wav | Recommended — lossless, best accuracy |
| FLAC | .flac | Lossless compression |
| MP3 | .mp3 | Widely supported lossy format |
| M4A / AAC | .m4a, .aac | Common mobile/Apple format |
| OGG / OPUS | .ogg, .opus | Open container / low-latency codec |
| AMR / AMR-WB | .amr | Telephony standard format |
| AIFF | .aiff | Apple lossless audio format |
Video Formats
| Format | Extension |
|---|---|
| MP4 | .mp4 |
| WebM | .webm |
| MOV | .mov |
| MKV | .mkv |
✓ For best transcription accuracy, use WAV files with PCM 16-bit encoding at 16,000 Hz mono.
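If your source is in another format, one way to produce that encoding is ffmpeg (a sketch; assumes ffmpeg is installed and on your PATH):
import subprocess

# Convert any supported input to 16 kHz, mono, 16-bit PCM WAV
subprocess.run(
    ["ffmpeg", "-i", "input.m4a", "-ar", "16000", "-ac", "1", "-acodec", "pcm_s16le", "recording.wav"],
    check=True,
)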
The API supports a range of sample rates to accommodate diverse audio sources including telephony recordings. Files provided at unlisted rates are automatically resampled during processing.
| Sample Rate | Common Use Case |
|---|---|
| 8,000 Hz | Telephony / phone call recordings |
| 16,000 Hz | Standard speech recognition (recommended) |
| 22,050 Hz | General audio |
| 44,100 Hz | CD-quality audio |
| 48,000 Hz | Professional broadcast audio |
ℹ Audio provided at sample rates not listed above is automatically resampled to 16,000 Hz during processing. No manual conversion is required.
Configuration for batch transcription. Passed as config= to transcribe().
Models:
| Model | Use for |
|---|---|
| `zero-indic` | General Indian languages (Hindi, Tamil, Telugu, Kannada, Marathi, Bengali, etc.) |
| `zero-med` | Medical/clinical audio — auto-applies medical terminology correction |
| `zero-codeswitch` | Code-switched speech (Hinglish, Tanglish, etc.) — auto-restores mixed text |
Core Parameters:
| Parameter | Type | Default | Description |
|---|---|---|---|
| `model` | `str` | required | Model name ("zero-indic", "zero-med", "zero-codeswitch"). |
| `language_code` | `str` | `"auto"` | Language hint — ISO code (hi, ta, en) or name (Hindi, Tamil). |
| `output_script` | `str` | `"auto"` | Transliterate output script ("Latin", "Devanagari", "Tamil", etc.). |
| `word_timestamps` | `bool` | `False` | Per-word start/end times and alignment confidence scores. |
Diarization & Speaker ID:
| Parameter | Type | Default | Description |
|---|---|---|---|
| `enable_diarization` | `bool` | `False` | Identify who spoke when (SPEAKER_00, SPEAKER_01). |
| `enable_speaker_identification` | `bool` | `False` | Resolve speakers to registered names (requires diarization). |
| `enable_emotion_diarization` | `bool` | `False` | Detect dominant emotion per segment. |
| `project` | `str` | `None` | Speaker library namespace for speaker identification. |
NLP Features:
| Parameter | Type | Default | Description |
|---|---|---|---|
| `enable_intent_detection` | `bool` | `False` | Classify transcript intent via Gemini. |
| `intent_choices` | `list[str]` | `None` | Constrain to specific intents. |
| `enable_summarization` | `bool` | `False` | Generate concise summary via Gemini. |
| `summary_max_length` | `int` | `150` | Max words in summary. |
| `enable_sentiment_analysis` | `bool` | `False` | Sentiment label, score, and explanation. |
| `enable_keyterm_normalization` | `bool` | `False` | Normalize domain-specific terms. |
| `keyterm_keywords` | `list[str]` | `None` | Focus normalization on these terms. |
Post-processing:
| Parameter | Type | Default | Description |
|---|---|---|---|
| `enable_profanity_hashing` | `bool` | `False` | Mask profanity with ****. |
| `hash_keywords` | `list[str]` | `None` | Mask specific words/phrases with **** (regex). |
| `output_language` | `str` | `None` | Translate transcript to this language ("en", "hi", etc.). |
model + language_code — Basic transcription
# Auto-detect language (default)
config = TranscriptionConfig(model="zero-indic", language_code="auto")
result = await client.asr.transcribe("audio.wav", config=config)
print(result.text)
print(f"Detected: {result.detected_language}")
# Output:
# "Hello, how are you doing today?"
# Detected: English
# Specify language for better accuracy on short clips
config = TranscriptionConfig(model="zero-indic", language_code="hi")
result = await client.asr.transcribe("hindi_audio.wav", config=config)
print(result.text)
# Output: "नमस्ते, आप कैसे हैं?"
# Medical audio — auto-applies medical terminology correction
config = TranscriptionConfig(model="zero-med", language_code="en")
# Code-switched speech (Hinglish, Tanglish, etc.)
config = TranscriptionConfig(model="zero-codeswitch")
output_script — Transliterate output to a different script
Uses aksharamukha — no LLM involved.
Common values: "auto", "Latin", "Devanagari", "Bengali", "Telugu", "Tamil", "Kannada", "ITRANS"
# Latin/Roman script — romanised output
config = TranscriptionConfig(model="zero-indic", language_code="hi", output_script="Latin")
result = await client.asr.transcribe("hindi_audio.wav", config=config)
print(result.text)
# Output: "namaste mohammad ji ye ek zaruri call hai"
# Auto (default) — no transliteration
config = TranscriptionConfig(model="zero-indic", output_script="auto")
word_timestamps — Per-word timing and confidence
config = TranscriptionConfig(model="zero-indic", word_timestamps=True)
result = await client.asr.transcribe("audio.wav", config=config)
for seg in result.segments:
    if seg.words:
        for w in seg.words:
            print(f"  {w.word} [{w.start:.3f}s - {w.end:.3f}s] score={w.score}")
# Output:
# नमस्ते [0.532s - 0.932s] score=-4.237
# मोहम्मद [1.012s - 1.412s] score=-6.226
Note on score: log-probability from the alignment model — more negative means lower confidence. Values above -5 are generally reliable.
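Building on that, a small sketch that flags words below that rough threshold for review:
LOW_CONFIDENCE = -5.0  # heuristic threshold from the note above

for seg in result.segments:
    for w in seg.words or []:
        if w.score is not None and w.score < LOW_CONFIDENCE:
            print(f"Review: {w.word!r} at {w.start:.2f}s (score={w.score})")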
enable_diarization — Who spoke when
config = TranscriptionConfig(model="zero-indic", enable_diarization=True)
result = await client.asr.transcribe("call.wav", config=config)
print(result.speakers) # ["SPEAKER_00", "SPEAKER_01"]
for seg in result.segments:
    print(f"  [{seg.speaker}] [{seg.start:.1f}s - {seg.end:.1f}s] {seg.text}")
# Output:
# [SPEAKER_00] [0.5s - 3.2s] नमस्ते, आप कैसे हैं
# [SPEAKER_01] [4.1s - 6.8s] मैं ठीक हूँ धन्यवाद
enable_speaker_identification + project — Resolve to registered names
Requires enable_diarization=True and pre-registered voice profiles (see Speaker APIs).
config = TranscriptionConfig(
    model="zero-indic",
    enable_diarization=True,
    enable_speaker_identification=True,
    project="support_team",
)
result = await client.asr.transcribe("call.wav", config=config)
print(result.speakers) # ["Priya", "Rahul"]
for seg in result.segments:
    print(f"  [{seg.speaker}] {seg.text}")
# Output:
# [Priya] नमस्ते, आप कैसे हैं
# [Rahul] मैं ठीक हूँ
enable_emotion_diarization — Emotion per segment
config = TranscriptionConfig(
    model="zero-indic",
    enable_diarization=True,
    enable_emotion_diarization=True,
)
result = await client.asr.transcribe("call.wav", config=config)
for seg in result.segments:
    print(f"  [{seg.speaker}] ({seg.emotion}) {seg.text}")
# Output:
# [SPEAKER_00] (angry) I've been waiting for an hour!
# [SPEAKER_01] (neutral) I'm sorry about that, let me help.
enable_intent_detection + intent_choices — Classify intent
# Constrained to specific choices
config = TranscriptionConfig(
    model="zero-indic",
    enable_intent_detection=True,
    intent_choices=["complaint", "inquiry", "service_request", "compliment"],
)
result = await client.asr.transcribe("customer_call.wav", config=config)
print(result.nlp_analysis.intent)
# Output: {"label": "service_request", "confidence": 0.92, "reasoning": "Caller is requesting roadside assistance"}
# Open intent — no constraints
config = TranscriptionConfig(model="zero-indic", enable_intent_detection=True)
enable_summarization + summary_max_length — Summarize transcript
config = TranscriptionConfig(
    model="zero-indic",
    enable_summarization=True,
    summary_max_length=50,  # max ~50 words
)
result = await client.asr.transcribe("meeting.wav", config=config)
print(result.nlp_analysis.summary)
# Output: "Customer called about a vehicle breakdown. Agent confirmed the complaint and promised a technician within the hour."enable_sentiment_analysis — Detect sentiment
config = TranscriptionConfig(model="zero-indic", enable_sentiment_analysis=True)
result = await client.asr.transcribe("feedback.wav", config=config)
print(result.nlp_analysis.sentiment)
# Output: {"label": "negative", "score": -0.72, "explanation": "Customer expresses frustration about..."}enable_keyterm_normalization + keyterm_keywords — Fix domain terms
config = TranscriptionConfig(
    model="zero-indic",
    enable_keyterm_normalization=True,
    keyterm_keywords=["EMI", "NACH mandate", "bounce charge"],
)
result = await client.asr.transcribe("finance_call.wav", config=config)
print(result.nlp_analysis.normalized_text)
# Output: "आपकी EMI की तारीख पाँच अगस्त है" (normalized from "emi")enable_profanity_hashing + hash_keywords — Redact sensitive words
# Mask profanity
config = TranscriptionConfig(model="zero-indic", enable_profanity_hashing=True)
result = await client.asr.transcribe("audio.wav", config=config)
# Output: "अरे **** यह काम क्यों नहीं हो रहा"
# Mask specific keywords (regex, case-insensitive, no LLM)
config = TranscriptionConfig(
    model="zero-indic",
    hash_keywords=["account number", "card number", "OTP"],
)
result = await client.asr.transcribe("call.wav", config=config)
# Output: "आपका **** 4321 है और आपका **** कल भेजा गया था"output_language — Translate transcript
config = TranscriptionConfig(
    model="zero-indic",
    language_code="hi",
    output_language="en",
)
result = await client.asr.transcribe("hindi_audio.wav", config=config)
print(result.text)
# Output: "Hello, this is an urgent call regarding your vehicle service."
print(result.nlp_analysis.translation)
# Also available in nlp_analysis
Batch (HTTP)
The transcribe() method accepts multiple input types:
# From file path (string)
result = await client.asr.transcribe("audio.wav", config=TranscriptionConfig(model="zero-indic"))
result = await client.asr.transcribe("/absolute/path/to/recording.mp3", config=TranscriptionConfig(model="zero-indic"))
# From raw audio bytes (in-memory audio)
with open("audio.wav", "rb") as f:
    audio_bytes = f.read()
result = await client.asr.transcribe(audio_bytes, config=TranscriptionConfig(model="zero-indic"))
# From file object
with open("audio.wav", "rb") as f:
    result = await client.asr.transcribe_file(f, config=TranscriptionConfig(model="zero-indic"))
# From remote URL
result = await client.asr.transcribe_url("https://example.com/audio.wav", config=TranscriptionConfig(model="zero-indic"))
⚠ URL inputs must be publicly accessible at the time of the API request. Authenticated or signed URLs with short expiry times may fail.
Streaming (WebSocket)
conn = await client.asr.stream(config=StreamingConfig(language="en"))
Configuration for the WebSocket streaming session.
| Parameter | Type | Default | Description |
|---|---|---|---|
| `language` | `str` | `"auto"` | Language code or "auto". |
| `sample_rate` | `int` | `16000` | Audio sample rate in Hz. |
| `dtype` | `str` | `"int16"` | Audio data type ("int16", "float32"). |
| `chunk_size_sec` | `float` | `1.0` | Processing chunk size in seconds. |
| `silence_threshold_sec` | `float` | `0.5` | Silence duration to trigger segmentation. |
language — Set recognition language
# Auto-detect (default)
conn = await client.asr.stream(config=StreamingConfig(language="auto"))
# Specific language for better accuracy
conn = await client.asr.stream(config=StreamingConfig(language="en"))
conn = await client.asr.stream(config=StreamingConfig(language="hi"))
conn = await client.asr.stream(config=StreamingConfig(language="ta"))
sample_rate + dtype — Match your audio source
# Standard microphone input: 16kHz, 16-bit integer (default)
conn = await client.asr.stream(config=StreamingConfig(
    sample_rate=16000,
    dtype="int16",
))
# High-quality audio: 48kHz, 32-bit float
conn = await client.asr.stream(config=StreamingConfig(
    sample_rate=48000,
    dtype="float32",
))
chunk_size_sec — Processing window size
# Smaller chunks = lower latency, more partial results
conn = await client.asr.stream(config=StreamingConfig(chunk_size_sec=0.5))
# Larger chunks = more context, potentially better accuracy
conn = await client.asr.stream(config=StreamingConfig(chunk_size_sec=2.0))
silence_threshold_sec — Control segment boundaries
# Quick segmentation — short pauses trigger a new segment
conn = await client.asr.stream(config=StreamingConfig(silence_threshold_sec=0.3))
# Good for: fast-paced dialogue, command recognition
# Patient segmentation — only split on longer pauses
conn = await client.asr.stream(config=StreamingConfig(silence_threshold_sec=1.5))
# Good for: lectures, monologues, dictation
Register event handlers on the connection object:
conn = await client.asr.stream(config=StreamingConfig(language="en"))
@conn.on(StreamingMessageType.PARTIAL)
def on_partial(msg):
    print(f"Interim: {msg.text}")

@conn.on(StreamingMessageType.FINAL_SEGMENT)
def on_segment(msg):
    print(f"Segment: {msg.text}")

@conn.on(StreamingMessageType.FINAL)
def on_final(msg):
    print(f"Final: {msg.text} ({msg.audio_duration_sec}s)")

@conn.on(StreamingMessageType.DONE)
def on_done(msg):
    print(f"Done. {msg.total_segments} segments, {msg.total_audio_duration_sec}s")

@conn.on(StreamingMessageType.ERROR)
def on_error(msg):
    print(f"Error: {msg.message}")

| Event | Model | Key Attributes |
|---|---|---|
| `PARTIAL` | `StreamingPartial` | text, language, segment_id, latency_ms |
| `FINAL_SEGMENT` | `StreamingFinalSegment` | text, language, segment_id, silence_duration_ms |
| `FINAL` | `StreamingFinal` | text, language, audio_duration_sec, inference_time_ms |
| `DONE` | `StreamingDone` | total_segments, total_audio_duration_sec |
| `ERROR` | `StreamingError` | message, code |
await conn.send_audio(pcm_bytes) # Send raw PCM audio
await conn.end() # Signal end of audio stream
await conn.close() # Close WebSocket connection
conn.is_closed # Check connection status
conn.session_id # Server-assigned session ID
Returned by transcribe().
Response Schema
{
  "success": true,
  "request_id": "a1b2c3...",
  "text": "[Priya] नमस्ते मेरी गाड़ी खराब हो गई है [Rahul] ठीक है मैं आपकी मदद करता हूँ",
  "segments": [
    {
      "start": 0.5,
      "end": 4.2,
      "text": "नमस्ते मेरी गाड़ी खराब हो गई है",
      "speaker": "Priya",
      "emotion": "sad",
      "words": [
        { "word": "नमस्ते", "start": 0.53, "end": 0.93, "score": -4.2 }
      ]
    }
  ],
  "detected_language": "hindi",
  "speakers": ["Priya", "Rahul"],
  "audio_duration": 9.1,
  "inference_time_ms": 3241.5,
  "nlp_analysis": {
    "intent": { "label": "service_request", "confidence": 0.95, "reasoning": "..." },
    "summary": "Customer reported a vehicle breakdown. Agent offered assistance.",
    "sentiment": { "label": "negative", "score": -0.6, "explanation": "..." },
    "normalized_text": "...",
    "translation": "..."
  }
}
Accessing Fields
result = await client.asr.transcribe("audio.wav", config=TranscriptionConfig(model="zero-indic"))
# Full transcript
print(result.text)
# Detected language
print(result.detected_language)
# Speakers (when diarization is enabled)
print(result.speakers) # ["SPEAKER_00", "SPEAKER_01"] or ["Priya", "Rahul"]
# Iterate timestamped segments
for seg in result.segments:
    speaker = f"[{seg.speaker}] " if seg.speaker else ""
    emotion = f"({seg.emotion}) " if seg.emotion else ""
    print(f"{speaker}{emotion}[{seg.start:.2f}s → {seg.end:.2f}s] {seg.text}")
    # Per-word timestamps (when word_timestamps=True)
    if seg.words:
        for w in seg.words:
            print(f"  {w.word} [{w.start:.3f}s-{w.end:.3f}s] score={w.score}")

| Attribute | Type | Description |
|---|---|---|
| `success` | `bool` | Whether transcription succeeded. |
| `request_id` | `str` | Unique request identifier. |
| `text` | `str` | Full transcription text (prefixed with [SPEAKER_XX] when diarized). |
| `segments` | `list[SegmentResult]` | Time-aligned segments (see below). |
| `detected_language` | `str` | Detected language name (e.g. "hindi", "English"). |
| `speakers` | `list[str]` | Unique speaker labels (empty when diarization is off). |
| `audio_duration` | `float` | Audio duration in seconds. |
| `inference_time_ms` | `float` | Server inference time in ms. |
| `nlp_analysis` | `NLPAnalysis` | NLP results (if any enable_* flags were set). |
| Attribute | Type | Description |
|---|---|---|
| `start` | `float` | Segment start time in seconds. |
| `end` | `float` | Segment end time in seconds. |
| `text` | `str` | Transcribed text for this segment. |
| `speaker` | `str` or `None` | Speaker label (when diarization is enabled). |
| `emotion` | `str` or `None` | Emotion label (when emotion diarization is enabled). |
| `words` | `list[WordResult]` or `None` | Per-word timestamps (when word_timestamps=True). |
| Attribute | Type | Description |
|---|---|---|
| `word` | `str` | The word. |
| `start` | `float` | Word start time in seconds. |
| `end` | `float` | Word end time in seconds. |
| `score` | `float` or `None` | Log-probability confidence (more negative = less confident). |
| Attribute | Type | Description |
|---|---|---|
| `intent` | `dict` | Intent label, confidence, reasoning. |
| `summary` | `str` | Transcript summary. |
| `sentiment` | `dict` | Sentiment label, score, explanation. |
| `emotion` | `dict` | Emotion detection results. |
| `translation` | `str` | Translated text (when output_language is set). |
| `normalized_text` | `str` | Text with normalized key terms. |
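Because nlp_analysis is only populated for the enable_* features you requested, it helps to guard access (a small sketch using the attributes above; the dict keys follow the earlier example outputs):
if result.nlp_analysis:
    nlp = result.nlp_analysis
    if nlp.intent:
        print("Intent:", nlp.intent.get("label"), nlp.intent.get("confidence"))
    if nlp.summary:
        print("Summary:", nlp.summary)
    if nlp.sentiment:
        print("Sentiment:", nlp.sentiment.get("label"), nlp.sentiment.get("score"))
    if nlp.translation:
        print("Translation:", nlp.translation)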
All exceptions inherit from ShunyalabsError.
| Exception | Description |
|---|---|
| `AuthenticationError` | Invalid or missing API key (401). |
| `PermissionDeniedError` | Insufficient permissions (403). |
| `NotFoundError` | Resource not found (404). |
| `RateLimitError` | Rate limit exceeded (429). |
| `ServerError` | Server-side error (5xx). |
| `TimeoutError` | Request timed out. |
| `ConnectionError` | Network connectivity issue. |
| `ConfigurationError` | Invalid or missing configuration. |
| `AudioError` | Invalid, corrupted, or unsupported audio data. |
| `SessionError` | Invalid or expired session state. |
| `ConversationError` | Error returned during conversation flow. |
| `TranscriptionError` | ASR-specific transcription error. |
| `SynthesisError` | TTS-specific synthesis error. |
Exception handling example:
from shunyalabs import (
    ConfigurationError,
    AuthenticationError,
    ConnectionError,
    TranscriptionError,
    AudioError,
    SessionError,
    TimeoutError,
    ConversationError,
)
try:
    result = await client.asr.transcribe("audio.wav", config=TranscriptionConfig(model="zero-indic"))
    print(result.text)
except ConfigurationError as e:
    print(f"Invalid configuration: {e}")
except AuthenticationError as e:
    print(f"Authentication failed: {e}")
except ConnectionError as e:
    print(f"Unable to connect: {e}")
except TranscriptionError as e:
    print(f"Transcription processing failed: {e}")
except AudioError as e:
    print(f"Invalid/unsupported audio: {e}")
except SessionError as e:
    print(f"Invalid or expired session: {e}")
except TimeoutError as e:
    print(f"Request timed out: {e}")
except ConversationError as e:
    print(f"Conversation error: {e}")
except Exception as e:
    print(f"Unexpected error: {e}")

| Limit | Value |
|---|---|
| Maximum audio duration | 15 minutes |
| Maximum file size | 50 MB |
| Concurrent requests | 16 |
⚠ For files exceeding the duration limit, split the audio into smaller segments before submitting. Consider using silence-detection tools to identify natural break points.
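For example, ffmpeg can cut a long recording into fixed-length chunks that fit under the limit before transcribing each one (a sketch; silence-aware splitting would give cleaner boundaries):
import glob
import subprocess

# Split into ~14-minute chunks (under the 15-minute limit), copying the codec
subprocess.run(
    ["ffmpeg", "-i", "long_recording.wav", "-f", "segment", "-segment_time", "840", "-c", "copy", "chunk_%03d.wav"],
    check=True,
)

config = TranscriptionConfig(model="zero-indic")
for path in sorted(glob.glob("chunk_*.wav")):
    result = await client.asr.transcribe(path, config=config)
    print(f"--- {path} ---")
    print(result.text)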
ℹ Support for pre-signed URLs for large audio files will be introduced in an upcoming beta release, allowing clients to provide a secure URL instead of uploading directly.
A complete integration walkthrough from installation to parsing the response.
pip install shunyalabs[all]

import os
from shunyalabs import AsyncShunyaClient
client = AsyncShunyaClient(api_key=os.environ.get("SHUNYALABS_API_KEY"))
⚠ Store your API key in an environment variable rather than hardcoding it.
from shunyalabs.asr import TranscriptionConfig
async with AsyncShunyaClient(api_key=os.environ.get("SHUNYALABS_API_KEY")) as client:
    result = await client.asr.transcribe(
        "recording.wav",
        config=TranscriptionConfig(model="zero-indic"),
    )
# Full transcript text
print("Transcript:", result.text)
# Detected language
print("Language:", result.detected_language)
# Timestamped segments
for seg in result.segments:
    print(f"  [{seg.start:.2f}s – {seg.end:.2f}s] {seg.text}")

import asyncio, os
from shunyalabs import AsyncShunyaClient, AuthenticationError
from shunyalabs.asr import TranscriptionConfig
async def main():
    async with AsyncShunyaClient(api_key=os.environ.get("SHUNYALABS_API_KEY")) as client:
        config = TranscriptionConfig(
            model="zero-indic",
            enable_diarization=True,
            word_timestamps=True,
            enable_sentiment_analysis=True,
        )
        try:
            result = await client.asr.transcribe("customer_call.wav", config=config)
            print("=== Transcript ===")
            print(result.text)
            print(f"Detected language: {result.detected_language}")
            print(f"Speakers: {result.speakers}")
            print()
            print("=== Segments ===")
            for seg in result.segments:
                speaker = f"[{seg.speaker}] " if seg.speaker else ""
                print(f"{speaker}[{seg.start:.2f}s → {seg.end:.2f}s] {seg.text}")
            if result.nlp_analysis and result.nlp_analysis.sentiment:
                print(f"\nSentiment: {result.nlp_analysis.sentiment}")
        except AuthenticationError as e:
            print(f"Authentication failed: {e}")
        except Exception as e:
            print(f"Error: {e}")

asyncio.run(main())

git clone https://github.com/Shunyalabsai/shunyalabs-python-sdk.git
cd shunyalabs-python-sdk
python -m venv .venv
source .venv/bin/activate
pip install -e ".[dev]"
# Run tests
pytest
# Lint
ruff check src/
black --check src/
mypy src/