# Source: polytalk/.env.example (PolyTalkIO/polytalk, main branch)

# PolyTalk Environment Variables
# Copy this file to .env and update values as needed
# All ${VAR} references in config/config.yaml will use these values

# ============================================================================
# APPLICATION LOGGING
# ============================================================================
# Logging level. One of: DEBUG, INFO, WARNING, ERROR, CRITICAL
LOG_LEVEL=INFO

# ============================================================================
# STT SERVICE (Local Speech-to-Text with faster-whisper)
# ============================================================================
# STT model to use: small, small-v3, medium, large-v3
# NOTE(review): "small-v3" is not one of the standard faster-whisper model size
# names; confirm the STT service maps it to a real model before using it.
STT_MODEL=small

# Device to run STT on: cpu or cuda
STT_DEVICE=cpu

# Compute type: int8 (CPU) or float16 (CUDA). Change together with STT_DEVICE.
STT_COMPUTE_TYPE=int8

# Number of STT web workers. Each worker loads its own Whisper model.
STT_WORKERS=1

# Load the Whisper model during STT service startup instead of on first stream.
STT_PRELOAD_MODEL=true

# Max file upload size in MB
STT_MAX_UPLOAD_MB=200

# Streaming audio window in seconds. Lower values reduce latency but can reduce
# transcript stability. 3.0 gives Whisper more context, while the pause flush
# (STT_PAUSE_FLUSH_SECONDS) handles utterance endings.
STT_STREAM_CHUNK_SECONDS=3.0

# Audio overlap between consecutive STT windows, in seconds. Helps avoid
# missing words at chunk boundaries.
# Keep this modest; too much overlap can increase repeated/hallucinated text.
STT_CHUNK_OVERLAP_SECONDS=0.25

# Parallel STT queue workers. Use 2 when STT inference is slower than the
# incoming audio windows and the GPU has spare compute.
# NOTE(review): this example already sets 2 while STT_DEVICE above is cpu;
# confirm whether 1 is the intended value for CPU-only setups.
STT_TRANSCRIBE_WORKERS=2
# Presumably the maximum number of audio windows queued for transcription
# -- TODO confirm against the STT service.
STT_TRANSCRIBE_QUEUE_SIZE=8
# Presumably the worker count given to the Whisper model itself (as opposed to
# the queue workers above) -- TODO confirm against the STT service.
STT_MODEL_WORKERS=2

# Transcript emit batching. STT may run inference more often than it emits
# text to PolyTalk.
# Increase these values if live transcript/translation/TTS chunks are too small.
STT_EMIT_MIN_CHARS=120
STT_EMIT_INTERVAL_SECONDS=4.5
# Flush the current speech window after this much trailing silence, even if the
# normal stream window or emit thresholds have not been reached. Set 0 to disable.
STT_PAUSE_FLUSH_SECONDS=1.2
# Keep a small amount of audio before the first detected speech, but do not let
# leading tab-share silence fill the first STT window.
STT_LEADING_SILENCE_PREROLL_SECONDS=0.2

# Silence/hallucination guards for streaming STT. These balanced defaults work
# well for typical microphone input: raise RMS/no-speech strictness if Whisper
# hallucinates during silence; lower them if quiet speech is missed.
STT_SILENCE_RMS_THRESHOLD=0.003
STT_NO_SPEECH_PROB_THRESHOLD=0.50
# NOTE(review): the two settings below are not described in this file; confirm
# their exact meaning against the STT service before tuning them.
STT_LOG_PROB_THRESHOLD=-1.0
STT_MAX_CROSS_DELTA_WORD_REPEATS=6

# faster-whisper decoding/VAD knobs. Keep previous-text conditioning disabled
# by default for streaming because it can repeat or invent text during silence.
STT_VAD_FILTER=true
STT_VAD_MIN_SILENCE_MS=500
STT_VAD_SPEECH_PAD_MS=200
STT_WORD_TIMESTAMPS=true
STT_CONDITION_ON_PREVIOUS_TEXT=false
STT_TEMPERATURE=0.0
# Optional domain/context prompt for Whisper, for example names or product terms.
# Uncomment and set a value to enable it.
# STT_INITIAL_PROMPT=

# ============================================================================
# WHISPER SERVICE CONFIGURATION (Points to local STT or external)
# ============================================================================
# Base URL for the Whisper API. This example points at the local STT service
# in Docker.
# For an external service, use for example: https://whisper.your-domain.com
WHISPER_BASE_URL=http://stt:8000

# WebSocket endpoint path for streaming transcription (used by PolyTalk)
WHISPER_WS_ENDPOINT=/v1/stream/transcriptions

# Optional: API key for an external Whisper API (e.g., OpenAI, custom deployment).
# Uncomment and set this if your Whisper service requires authentication.
# WHISPER_API_KEY=your-api-key-here

# ============================================================================
# TRANSLATION SERVICE (AI Translation)
# ============================================================================
# Translation API format. One of: openai_chat, openai_responses,
# anthropic_messages, gemini_generate_content.
TRANSLATION_API_FORMAT=openai_chat

# Base URL and endpoint path for the Translation API. Use your self-hosted AI
# server URL here, or an OpenAI-compatible provider URL.
# The endpoint path should match TRANSLATION_API_FORMAT; /v1/chat/completions
# is the OpenAI Chat Completions path used with openai_chat.
TRANSLATION_BASE_URL=https://ai.example.com
TRANSLATION_ENDPOINT=/v1/chat/completions

# API key for the Translation service. The value below is a placeholder;
# replace it in your local .env and never commit a real key.
TRANSLATION_API_KEY=your_translation_api_key_here

# AI model to use for translation. For self-hosted translation, use models such
# as qwen3-8b, TranslateGemma, or other open-source/open-weight models supported
# by your model server.
TRANSLATION_MODEL=qwen3-8b

# Maximum translation output tokens. Keep this bounded for live streaming, but
# allow enough room for Indic-script targets and longer sentence buffers.
TRANSLATION_MAX_TOKENS=240

# ============================================================================
# VISUAL CONTEXT SERVICE (Shared Tab/Page Screenshot)
# ============================================================================
# Enable one-time summarization of a shared tab/page screenshot when tab audio
# sharing starts. The summary is used as translation context only.
VISUAL_CONTEXT_ENABLED=false

# Keep mock mode enabled for local setup. For real visual context, set this to
# false and configure a vision-capable provider below.
VISUAL_CONTEXT_MOCK_MODE=true

# Visual context can use a separate vision-capable provider/model, or mirror the
# translation provider if it supports image inputs.
# Presumably these provider values are not used while mock mode is enabled
# -- TODO confirm. The API key below is a placeholder; never commit a real key.
VISUAL_CONTEXT_BASE_URL=https://ai.example.com
VISUAL_CONTEXT_ENDPOINT=/v1/chat/completions
VISUAL_CONTEXT_API_FORMAT=openai_chat
VISUAL_CONTEXT_API_KEY=your_visual_context_api_key_here
VISUAL_CONTEXT_MODEL=gpt-4o-mini

# Maximum output tokens for the compact screenshot summary.
VISUAL_CONTEXT_MAX_TOKENS=300

# ============================================================================
# TTS SERVICE (Local Text-to-Speech with Piper)
# ============================================================================
# Piper model to use (name of a voice model in the tts/voices directory)
TTS_MODEL=en_GB-jenny_dioco-medium

# Base URL for the TTS API (local Piper service in Docker)
# For an external service, use for example: https://tts.your-domain.com
TTS_BASE_URL=http://tts:5000

# Supertonic TTS is used for Japanese and Korean. First startup downloads
# model assets into the supertonic_data Docker volume.
# Presumably SUPERTONIC_TTS_VOICE is the fallback voice and the _JA_/_KO_
# variants override it per language -- TODO confirm.
# NOTE(review): the range and units of STEPS and SPEED are not documented
# here; confirm against the Supertonic service before tuning.
SUPERTONIC_TTS_BASE_URL=http://supertonic-tts:7788
SUPERTONIC_TTS_VOICE=F1
SUPERTONIC_TTS_JA_VOICE=F1
SUPERTONIC_TTS_KO_VOICE=F1
SUPERTONIC_TTS_STEPS=8
SUPERTONIC_TTS_SPEED=1.00

# ============================================================================
# APPLICATION SETTINGS
# ============================================================================
# Host to bind the application to (0.0.0.0 for all interfaces)
APP_HOST=0.0.0.0

# Port to run the application on
APP_PORT=9000

# Enable debug mode. This example enables it for local development;
# set it to false for production.
APP_DEBUG=true

# Comma-separated browser origins allowed to call the app (no spaces between
# entries). The local defaults below use the same port as APP_PORT.
# Use the exact HTTPS origin in production, for example:
# ALLOWED_ORIGINS=https://polytalk.example.com
ALLOWED_ORIGINS=http://localhost:9000,http://127.0.0.1:9000

# Translate partial speech after this many buffered characters
# (TRANSLATION_FLUSH_CHARS) or seconds (TRANSLATION_FLUSH_SECONDS).
# Lower values reduce latency; higher values improve context and quality.
TRANSLATION_FLUSH_CHARS=300
TRANSLATION_FLUSH_SECONDS=5.0
# Presumably the minimum number of buffered characters required before a
# time-based flush is sent for translation -- TODO confirm against the app.
TRANSLATION_FLUSH_MIN_CHARS=120