# Voice-to-Voice Real-Time Audio Services
# Usage: docker-compose -f docker-compose.yml -f docker-compose.voice.yml --profile dev up
#
# Architecture:
#   LiveKit SFU   — WebRTC spatial audio mixer for user-to-user + agent voice
#   Turbo Whisper — faster-whisper with streaming WebSocket endpoint (replaces polling)
#   Kokoro TTS    — OpenAI-compatible TTS with per-agent voice presets
#
# Audio format: Opus throughout (48kHz, mono, 64kbps)
services:
  # LiveKit SFU — WebRTC Selective Forwarding Unit for spatial voice chat
  # All user-to-user voice and agent spatial audio routes through here
  livekit:
    image: livekit/livekit-server:v1.7
    container_name: visionflow-livekit
    hostname: livekit
    command: --config /etc/livekit.yaml
    environment:
      - LIVEKIT_API_KEY=${LIVEKIT_API_KEY:-visionflow}
      # NOTE(review): "changeme" is a weak fallback — set LIVEKIT_API_SECRET in the
      # environment/.env for any non-local deployment.
      - LIVEKIT_API_SECRET=${LIVEKIT_API_SECRET:-changeme}
    ports:
      - "7880:7880"                    # HTTP API + WebSocket signaling
      - "7881:7881"                    # RTC over TCP
      - "7882:7882/udp"                # RTC over UDP (primary)
      - "50000-50200:50000-50200/udp"  # WebRTC media ports
    volumes:
      - ./config/livekit.yaml:/etc/livekit.yaml:ro
    networks:
      docker_ragflow:
        aliases:
          - livekit
    healthcheck:
      # NOTE(review): assumes the livekit-server image ships wget — verify, or
      # switch to an HTTP check the image supports.
      test: ["CMD", "wget", "--spider", "-q", "http://localhost:7880"]
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 5s
    restart: unless-stopped
    profiles:
      - development
      - dev
      - production
      - prod
# Turbo Whisper — faster-whisper with streaming WebSocket STT
# Replaces the polling-based whisper-webui-backend with direct streaming
turbo-whisper:
image: fedirz/faster-whisper-server:latest-cuda
container_name: visionflow-turbo-whisper
hostname: turbo-whisper
environment:
- WHISPER__MODEL=Systran/faster-whisper-large-v3
- WHISPER__DEVICE=cuda
- WHISPER__COMPUTE_TYPE=float16
- WHISPER__LANGUAGE=en
- WHISPER__VAD_FILTER=true
# Streaming mode: returns partial results as they arrive
- WHISPER__BEAM_SIZE=1
- WHISPER__BEST_OF=1
ports:
- "8100:8000" # OpenAI-compatible REST + WebSocket
networks:
docker_ragflow:
aliases:
- turbo-whisper
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
runtime: nvidia
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
interval: 15s
timeout: 5s
retries: 3
start_period: 30s
restart: unless-stopped
profiles:
- development
- dev
- production
- prod
# Kokoro TTS — Text-to-speech with distinct per-agent voice presets
# OpenAI-compatible /v1/audio/speech endpoint, Opus output
kokoro-tts:
image: ghcr.io/remsky/kokoro-fastapi-cpu:latest
container_name: visionflow-kokoro-tts
hostname: kokoro-tts
environment:
- KOKORO_DEFAULT_VOICE=af_heart
- KOKORO_DEFAULT_FORMAT=opus
- KOKORO_SAMPLE_RATE=48000
ports:
- "8880:8880"
networks:
docker_ragflow:
aliases:
- kokoro-tts
- kokoro-tts-container
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8880/health"]
interval: 15s
timeout: 5s
retries: 3
start_period: 20s
restart: unless-stopped
profiles:
- development
- dev
- production
- prod
# Shared external network — must already exist (created by the base stack)
networks:
  docker_ragflow:
    external: true

volumes:
  # Named volume for caching Whisper model downloads; mount it into the STT
  # service so multi-GB weights survive container recreation.
  whisper-models:
    name: visionflow-whisper-models
    driver: local