From 49b1ff1482d5e0f9d8f4b650f9868c431919017d Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Tue, 3 Mar 2026 10:39:25 +0000 Subject: [PATCH 1/2] feat(realtime): WebRTC support Signed-off-by: Richard Palethorpe --- .github/workflows/test.yml | 4 +- .github/workflows/tests-e2e.yml | 2 +- .gitignore | 4 + Dockerfile | 6 +- Makefile | 86 +- core/backend/transcript.go | 46 +- core/backend/tts.go | 58 +- core/http/endpoints/openai/opus.go | 100 ++ core/http/endpoints/openai/opus_test.go | 1267 +++++++++++++++++ core/http/endpoints/openai/realtime.go | 947 +++++++----- .../endpoints/openai/realtime_transport.go | 23 + .../openai/realtime_transport_webrtc.go | 250 ++++ .../endpoints/openai/realtime_transport_ws.go | 47 + core/http/endpoints/openai/realtime_webrtc.go | 250 ++++ core/http/endpoints/openai/types/types.go | 43 +- core/http/middleware/trace.go | 2 +- core/http/react-ui/src/pages/Settings.jsx | 14 +- core/http/react-ui/src/pages/Talk.jsx | 798 +++++++++-- core/http/react-ui/src/pages/Traces.jsx | 347 ++++- core/http/react-ui/src/utils/api.js | 6 + core/http/react-ui/src/utils/config.js | 4 + core/http/routes/openai.go | 1 + core/http/routes/ui.go | 38 + core/http/static/talk.js | 689 +++++++-- core/http/views/talk.html | 251 +++- core/http/views/traces.html | 53 +- core/trace/audio_snippet.go | 102 ++ core/trace/backend_trace.go | 2 +- docs/content/advanced/advanced-usage.md | 2 + go.mod | 30 +- go.sum | 64 +- pkg/audio/audio.go | 43 + pkg/audio/audio_test.go | 155 ++ pkg/opus/opus.go | 261 ++++ pkg/opus/shim/Makefile | 10 + pkg/opus/shim/libopusshim.so | Bin 0 -> 15240 bytes pkg/opus/shim/opus_shim.c | 9 + pkg/sound/int16.go | 12 + pkg/sound/int16_test.go | 162 +++ pkg/sound/testutil_test.go | 72 + tests/e2e/e2e_suite_test.go | 95 +- tests/e2e/mock-backend/main.go | 93 +- tests/e2e/realtime_webrtc_test.go | 459 ++++++ tests/e2e/realtime_ws_test.go | 269 ++++ 44 files changed, 6293 insertions(+), 883 deletions(-) create mode 100644 
core/http/endpoints/openai/opus.go create mode 100644 core/http/endpoints/openai/opus_test.go create mode 100644 core/http/endpoints/openai/realtime_transport.go create mode 100644 core/http/endpoints/openai/realtime_transport_webrtc.go create mode 100644 core/http/endpoints/openai/realtime_transport_ws.go create mode 100644 core/http/endpoints/openai/realtime_webrtc.go create mode 100644 core/trace/audio_snippet.go create mode 100644 pkg/audio/audio_test.go create mode 100644 pkg/opus/opus.go create mode 100644 pkg/opus/shim/Makefile create mode 100755 pkg/opus/shim/libopusshim.so create mode 100644 pkg/opus/shim/opus_shim.c create mode 100644 pkg/sound/int16_test.go create mode 100644 pkg/sound/testutil_test.go create mode 100644 tests/e2e/realtime_webrtc_test.go create mode 100644 tests/e2e/realtime_ws_test.go diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index fd6eece4c7af..b9b71f5e8efd 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -93,7 +93,7 @@ jobs: - name: Dependencies run: | sudo apt-get update - sudo apt-get install curl ffmpeg + sudo apt-get install curl ffmpeg libopus-dev - name: Setup Node.js uses: actions/setup-node@v4 with: @@ -195,7 +195,7 @@ jobs: run: go version - name: Dependencies run: | - brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm + brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm opus pip install --user --no-cache-dir grpcio-tools grpcio - name: Setup Node.js uses: actions/setup-node@v4 diff --git a/.github/workflows/tests-e2e.yml b/.github/workflows/tests-e2e.yml index 490eb296ab43..147ea44dab23 100644 --- a/.github/workflows/tests-e2e.yml +++ b/.github/workflows/tests-e2e.yml @@ -43,7 +43,7 @@ jobs: - name: Dependencies run: | sudo apt-get update - sudo apt-get install -y build-essential + sudo apt-get install -y build-essential libopus-dev - name: Setup Node.js uses: actions/setup-node@v4 with: diff --git a/.gitignore 
b/.gitignore index 3d7e27f7a96d..3dcb309ca40d 100644 --- a/.gitignore +++ b/.gitignore @@ -38,6 +38,7 @@ test-models/ test-dir/ tests/e2e-aio/backends tests/e2e-aio/models +mock-backend release/ @@ -69,3 +70,6 @@ docs/static/gallery.html # React UI build artifacts (keep placeholder dist/index.html) core/http/react-ui/node_modules/ core/http/react-ui/dist + +# Extracted backend binaries for container-based testing +local-backends/ diff --git a/Dockerfile b/Dockerfile index 666f19ab1895..f319ce5b74c1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,7 +10,7 @@ ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update && \ apt-get install -y --no-install-recommends \ ca-certificates curl wget espeak-ng libgomp1 \ - ffmpeg libopenblas0 libopenblas-dev sox && \ + ffmpeg libopenblas0 libopenblas-dev libopus0 sox && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* @@ -190,6 +190,7 @@ RUN apt-get update && \ curl libssl-dev \ git \ git-lfs \ + libopus-dev pkg-config \ unzip upx-ucl python3 python-is-python3 && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* @@ -378,6 +379,9 @@ COPY ./entrypoint.sh . 
# Copy the binary COPY --from=builder /build/local-ai ./ +# Copy the opus shim if it was built +RUN --mount=from=builder,src=/build/,dst=/mnt/build \ + if [ -f /mnt/build/libopusshim.so ]; then cp /mnt/build/libopusshim.so ./; fi # Make sure the models directory exists RUN mkdir -p /models /backends diff --git a/Makefile b/Makefile index 54c21088afa0..1e0175357a1e 100644 --- a/Makefile +++ b/Makefile @@ -106,7 +106,17 @@ react-ui-docker: core/http/react-ui/dist: react-ui ## Build: -build: protogen-go install-go-tools core/http/react-ui/dist ## Build the project + +# Build the opus shim if libopus is available +build-opus-shim: + @if command -v pkg-config >/dev/null 2>&1 && pkg-config --exists opus; then \ + echo "$(GREEN)I Building opus shim (libopus found)$(RESET)"; \ + $(MAKE) -C pkg/opus/shim; \ + else \ + echo "$(YELLOW)W libopus-dev not found, skipping opus shim build (WebRTC audio will not work)$(RESET)"; \ + fi + +build: protogen-go install-go-tools build-opus-shim core/http/react-ui/dist ## Build the project $(info ${GREEN}I local-ai build info:${RESET}) $(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET}) $(info ${GREEN}I GO_TAGS: ${YELLOW}$(GO_TAGS)${RESET}) @@ -114,6 +124,7 @@ build: protogen-go install-go-tools core/http/react-ui/dist ## Build the project $(info ${GREEN}I UPX: ${YELLOW}$(UPX)${RESET}) rm -rf $(BINARY_NAME) || true CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./cmd/local-ai + @if [ -f pkg/opus/shim/libopusshim.so ]; then cp pkg/opus/shim/libopusshim.so .; fi build-launcher: ## Build the launcher application $(info ${GREEN}I local-ai launcher build info:${RESET}) @@ -151,7 +162,7 @@ test-models/testmodel.ggml: wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav cp tests/models_fixtures/* test-models -prepare-test: protogen-go +prepare-test: protogen-go build-opus-shim cp tests/models_fixtures/* test-models 
######################################################## @@ -163,6 +174,7 @@ test: test-models/testmodel.ggml protogen-go @echo 'Running tests' export GO_TAGS="debug" $(MAKE) prepare-test + OPUS_SHIM_LIBRARY=$(abspath ./pkg/opus/shim/libopusshim.so) \ HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models BACKENDS_PATH=$(abspath ./)/backends \ $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS) $(MAKE) test-llama-gguf @@ -218,9 +230,10 @@ prepare-e2e: run-e2e-image: docker run -p 5390:8080 -e MODELS_PATH=/models -e THREADS=1 -e DEBUG=true -d --rm -v $(TEST_DIR):/models --name e2e-tests-$(RANDOM) localai-tests -test-e2e: build-mock-backend prepare-e2e run-e2e-image +test-e2e: build-mock-backend build-opus-shim prepare-e2e run-e2e-image @echo 'Running e2e tests' BUILD_TYPE=$(BUILD_TYPE) \ + OPUS_SHIM_LIBRARY=$(abspath ./pkg/opus/shim/libopusshim.so) \ LOCALAI_API=http://$(E2E_BRIDGE_IP):5390 \ $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e $(MAKE) clean-mock-backend @@ -250,6 +263,73 @@ test-stablediffusion: prepare-test test-stores: $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stores" --flake-attempts $(TEST_FLAKES) -v -r tests/integration +test-realtime: build-mock-backend + @echo 'Running realtime e2e tests (mock backend)' + $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="Realtime && !real-models" --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e + +# Real-model realtime tests. Set REALTIME_TEST_MODEL to use your own pipeline, +# or leave unset to auto-build one from the component env vars below. 
+REALTIME_VAD?=silero-vad-ggml +REALTIME_STT?=whisper-1 +REALTIME_LLM?=qwen3-0.6b +REALTIME_TTS?=tts-1 +REALTIME_BACKENDS_PATH?=$(abspath ./)/backends + +test-realtime-models: build-mock-backend + @echo 'Running realtime e2e tests (real models)' + REALTIME_TEST_MODEL=$${REALTIME_TEST_MODEL:-realtime-test-pipeline} \ + REALTIME_VAD=$(REALTIME_VAD) \ + REALTIME_STT=$(REALTIME_STT) \ + REALTIME_LLM=$(REALTIME_LLM) \ + REALTIME_TTS=$(REALTIME_TTS) \ + REALTIME_BACKENDS_PATH=$(REALTIME_BACKENDS_PATH) \ + $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="Realtime" --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e + +# --- Container-based real-model testing --- + +REALTIME_BACKEND_NAMES ?= silero-vad whisper llama-cpp kokoro +REALTIME_MODELS_DIR ?= $(abspath ./models) +REALTIME_BACKENDS_DIR ?= $(abspath ./local-backends) +REALTIME_DOCKER_FLAGS ?= --gpus all + +local-backends: + mkdir -p local-backends + +extract-backend-%: docker-build-% local-backends + @echo "Extracting backend $*..." + @CID=$$(docker create local-ai-backend:$*) && \ + rm -rf local-backends/$* && mkdir -p local-backends/$* && \ + docker cp $$CID:/ - | tar -xf - -C local-backends/$* && \ + docker rm $$CID > /dev/null + +extract-realtime-backends: $(addprefix extract-backend-,$(REALTIME_BACKEND_NAMES)) + +test-realtime-models-docker: build-mock-backend + docker build --target build-requirements \ + --build-arg BUILD_TYPE=$(or $(BUILD_TYPE),cublas) \ + --build-arg CUDA_MAJOR_VERSION=$(or $(CUDA_MAJOR_VERSION),13) \ + --build-arg CUDA_MINOR_VERSION=$(or $(CUDA_MINOR_VERSION),0) \ + -t localai-test-runner . 
+ docker run --rm \ + $(REALTIME_DOCKER_FLAGS) \ + -v $(abspath ./):/build \ + -v $(REALTIME_MODELS_DIR):/models:ro \ + -v $(REALTIME_BACKENDS_DIR):/backends \ + -v localai-go-cache:/root/go/pkg/mod \ + -v localai-go-build-cache:/root/.cache/go-build \ + -e REALTIME_TEST_MODEL=$${REALTIME_TEST_MODEL:-realtime-test-pipeline} \ + -e REALTIME_VAD=$(REALTIME_VAD) \ + -e REALTIME_STT=$(REALTIME_STT) \ + -e REALTIME_LLM=$(REALTIME_LLM) \ + -e REALTIME_TTS=$(REALTIME_TTS) \ + -e REALTIME_BACKENDS_PATH=/backends \ + -e REALTIME_MODELS_PATH=/models \ + -w /build \ + localai-test-runner \ + bash -c 'git config --global --add safe.directory /build && \ + make protogen-go && make build-mock-backend && \ + go run github.com/onsi/ginkgo/v2/ginkgo --label-filter="Realtime" --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e' + test-container: docker build --target requirements -t local-ai-test-container . docker run -ti --rm --entrypoint /bin/bash -ti -v $(abspath ./):/build local-ai-test-container diff --git a/core/backend/transcript.go b/core/backend/transcript.go index dbbf718a3a48..7568e4e40706 100644 --- a/core/backend/transcript.go +++ b/core/backend/transcript.go @@ -3,11 +3,12 @@ package backend import ( "context" "fmt" + "maps" "time" "github.com/mudler/LocalAI/core/config" - "github.com/mudler/LocalAI/core/trace" "github.com/mudler/LocalAI/core/schema" + "github.com/mudler/LocalAI/core/trace" "github.com/mudler/LocalAI/pkg/grpc/proto" "github.com/mudler/LocalAI/pkg/model" @@ -30,9 +31,12 @@ func ModelTranscription(audio, language string, translate, diarize bool, prompt } var startTime time.Time + var audioSnippet map[string]any if appConfig.EnableTracing { trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems) startTime = time.Now() + // Capture audio before the backend call — the backend may delete the file. 
+ audioSnippet = trace.AudioSnippet(audio) } r, err := transcriptionModel.AudioTranscription(context.Background(), &proto.TranscriptRequest{ @@ -45,6 +49,16 @@ func ModelTranscription(audio, language string, translate, diarize bool, prompt }) if err != nil { if appConfig.EnableTracing { + errData := map[string]any{ + "audio_file": audio, + "language": language, + "translate": translate, + "diarize": diarize, + "prompt": prompt, + } + if audioSnippet != nil { + maps.Copy(errData, audioSnippet) + } trace.RecordBackendTrace(trace.BackendTrace{ Timestamp: startTime, Duration: time.Since(startTime), @@ -53,13 +67,7 @@ func ModelTranscription(audio, language string, translate, diarize bool, prompt Backend: modelConfig.Backend, Summary: trace.TruncateString(audio, 200), Error: err.Error(), - Data: map[string]any{ - "audio_file": audio, - "language": language, - "translate": translate, - "diarize": diarize, - "prompt": prompt, - }, + Data: errData, }) } return nil, err @@ -84,6 +92,18 @@ func ModelTranscription(audio, language string, translate, diarize bool, prompt } if appConfig.EnableTracing { + data := map[string]any{ + "audio_file": audio, + "language": language, + "translate": translate, + "diarize": diarize, + "prompt": prompt, + "result_text": tr.Text, + "segments_count": len(tr.Segments), + } + if audioSnippet != nil { + maps.Copy(data, audioSnippet) + } trace.RecordBackendTrace(trace.BackendTrace{ Timestamp: startTime, Duration: time.Since(startTime), @@ -91,15 +111,7 @@ func ModelTranscription(audio, language string, translate, diarize bool, prompt ModelName: modelConfig.Name, Backend: modelConfig.Backend, Summary: trace.TruncateString(audio+" -> "+tr.Text, 200), - Data: map[string]any{ - "audio_file": audio, - "language": language, - "translate": translate, - "diarize": diarize, - "prompt": prompt, - "result_text": tr.Text, - "segments_count": len(tr.Segments), - }, + Data: data, }) } diff --git a/core/backend/tts.go b/core/backend/tts.go index 
7859cd67cb71..69193db12a5d 100644 --- a/core/backend/tts.go +++ b/core/backend/tts.go @@ -6,6 +6,7 @@ import ( "encoding/binary" "encoding/json" "fmt" + "maps" "os" "path/filepath" "time" @@ -84,6 +85,16 @@ func ModelTTS( errStr = fmt.Sprintf("TTS error: %s", res.Message) } + data := map[string]any{ + "text": text, + "voice": voice, + "language": language, + } + if err == nil && res.Success { + if snippet := trace.AudioSnippet(filePath); snippet != nil { + maps.Copy(data, snippet) + } + } trace.RecordBackendTrace(trace.BackendTrace{ Timestamp: startTime, Duration: time.Since(startTime), @@ -92,11 +103,7 @@ func ModelTTS( Backend: modelConfig.Backend, Summary: trace.TruncateString(text, 200), Error: errStr, - Data: map[string]any{ - "text": text, - "voice": voice, - "language": language, - }, + Data: data, }) } @@ -158,6 +165,11 @@ func ModelTTSStream( headerSent := false var callbackErr error + // Collect up to 30s of audio for tracing + var snippetPCM []byte + var totalPCMBytes int + snippetCapped := false + err = ttsModel.TTSStream(context.Background(), &proto.TTSRequest{ Text: text, Model: modelPath, @@ -166,7 +178,7 @@ func ModelTTSStream( }, func(reply *proto.Reply) { // First message contains sample rate info if !headerSent && len(reply.Message) > 0 { - var info map[string]interface{} + var info map[string]any if json.Unmarshal(reply.Message, &info) == nil { if sr, ok := info["sample_rate"].(float64); ok { sampleRate = uint32(sr) @@ -207,6 +219,22 @@ func ModelTTSStream( if writeErr := audioCallback(reply.Audio); writeErr != nil { callbackErr = writeErr } + // Accumulate PCM for tracing snippet + totalPCMBytes += len(reply.Audio) + if appConfig.EnableTracing && !snippetCapped { + maxBytes := int(sampleRate) * 2 * trace.MaxSnippetSeconds // 16-bit mono + if len(snippetPCM)+len(reply.Audio) <= maxBytes { + snippetPCM = append(snippetPCM, reply.Audio...) 
+ } else { + remaining := maxBytes - len(snippetPCM) + if remaining > 0 { + // Align to sample boundary (2 bytes per sample) + remaining = remaining &^ 1 + snippetPCM = append(snippetPCM, reply.Audio[:remaining]...) + } + snippetCapped = true + } + } } }) @@ -221,6 +249,17 @@ func ModelTTSStream( errStr = resultErr.Error() } + data := map[string]any{ + "text": text, + "voice": voice, + "language": language, + "streaming": true, + } + if resultErr == nil && len(snippetPCM) > 0 { + if snippet := trace.AudioSnippetFromPCM(snippetPCM, int(sampleRate), totalPCMBytes); snippet != nil { + maps.Copy(data, snippet) + } + } trace.RecordBackendTrace(trace.BackendTrace{ Timestamp: startTime, Duration: time.Since(startTime), @@ -229,12 +268,7 @@ func ModelTTSStream( Backend: modelConfig.Backend, Summary: trace.TruncateString(text, 200), Error: errStr, - Data: map[string]any{ - "text": text, - "voice": voice, - "language": language, - "streaming": true, - }, + Data: data, }) } diff --git a/core/http/endpoints/openai/opus.go b/core/http/endpoints/openai/opus.go new file mode 100644 index 000000000000..86ef7b5236d6 --- /dev/null +++ b/core/http/endpoints/openai/opus.go @@ -0,0 +1,100 @@ +package openai + +import ( + "fmt" + + "github.com/mudler/LocalAI/pkg/opus" + "github.com/mudler/LocalAI/pkg/sound" +) + +const ( + opusSampleRate = 48000 + opusChannels = 1 + // 20ms frame at 48kHz mono = 960 samples + opusFrameSize = 960 + // Maximum Opus packet size + opusMaxPacketSize = 4000 + // Maximum decoded frame size (120ms at 48kHz) + opusMaxFrameSize = 5760 +) + +// OpusEncoder wraps libopus (via purego shim) for encoding PCM int16 LE to Opus frames. 
+type OpusEncoder struct { + enc *opus.Encoder +} + +func NewOpusEncoder() (*OpusEncoder, error) { + enc, err := opus.NewEncoder(opusSampleRate, opusChannels, opus.ApplicationAudio) + if err != nil { + return nil, fmt.Errorf("opus encoder: %w", err) + } + if err := enc.SetBitrate(64000); err != nil { + enc.Close() + return nil, fmt.Errorf("opus set bitrate: %w", err) + } + if err := enc.SetComplexity(10); err != nil { + enc.Close() + return nil, fmt.Errorf("opus set complexity: %w", err) + } + return &OpusEncoder{enc: enc}, nil +} + +// Encode takes PCM int16 LE bytes at the given sampleRate and returns Opus frames. +// It resamples to 48kHz if needed, then encodes in 20ms frames. +func (e *OpusEncoder) Encode(pcmInt16LE []byte, sampleRate int) ([][]byte, error) { + samples := sound.BytesToInt16sLE(pcmInt16LE) + if len(samples) == 0 { + return nil, nil + } + + if sampleRate != opusSampleRate { + samples = sound.ResampleInt16(samples, sampleRate, opusSampleRate) + } + + var frames [][]byte + packet := make([]byte, opusMaxPacketSize) + + for offset := 0; offset+opusFrameSize <= len(samples); offset += opusFrameSize { + frame := samples[offset : offset+opusFrameSize] + n, err := e.enc.Encode(frame, opusFrameSize, packet) + if err != nil { + return frames, fmt.Errorf("opus encode: %w", err) + } + out := make([]byte, n) + copy(out, packet[:n]) + frames = append(frames, out) + } + + return frames, nil +} + +func (e *OpusEncoder) Close() { + e.enc.Close() +} + +// OpusDecoder wraps libopus (via purego shim) for decoding Opus frames to PCM int16 LE. +type OpusDecoder struct { + dec *opus.Decoder +} + +func NewOpusDecoder() (*OpusDecoder, error) { + dec, err := opus.NewDecoder(opusSampleRate, opusChannels) + if err != nil { + return nil, fmt.Errorf("opus decoder: %w", err) + } + return &OpusDecoder{dec: dec}, nil +} + +// Decode takes a single Opus frame and returns PCM int16 samples at 48kHz.
+func (d *OpusDecoder) Decode(opusFrame []byte) ([]int16, error) { + pcm := make([]int16, opusMaxFrameSize) + n, err := d.dec.Decode(opusFrame, pcm, opusMaxFrameSize, false) + if err != nil { + return nil, fmt.Errorf("opus decode: %w", err) + } + return pcm[:n], nil +} + +func (d *OpusDecoder) Close() { + d.dec.Close() +} diff --git a/core/http/endpoints/openai/opus_test.go b/core/http/endpoints/openai/opus_test.go new file mode 100644 index 000000000000..77314c9ab5ab --- /dev/null +++ b/core/http/endpoints/openai/opus_test.go @@ -0,0 +1,1267 @@ +package openai + +import ( + "encoding/binary" + "fmt" + "io" + "math" + "math/rand/v2" + "os" + "os/exec" + "path/filepath" + "sync" + "testing" + "time" + + "github.com/mudler/LocalAI/pkg/opus" + "github.com/mudler/LocalAI/pkg/sound" + "github.com/pion/rtp" + "github.com/pion/webrtc/v4" +) + +// --- helpers (mirror pkg/sound/testutil_test.go but in this package) --- + +func generateSineWave(freq float64, sampleRate, numSamples int) []int16 { + out := make([]int16, numSamples) + for i := range out { + t := float64(i) / float64(sampleRate) + out[i] = int16(math.MaxInt16 / 2 * math.Sin(2*math.Pi*freq*t)) + } + return out +} + +func computeRMS(samples []int16) float64 { + if len(samples) == 0 { + return 0 + } + var sum float64 + for _, s := range samples { + v := float64(s) + sum += v * v + } + return math.Sqrt(sum / float64(len(samples))) +} + +// estimateFrequency uses zero-crossing count to estimate the dominant frequency. 
+func estimateFrequency(samples []int16, sampleRate int) float64 { + if len(samples) < 2 { + return 0 + } + crossings := 0 + for i := 1; i < len(samples); i++ { + if (samples[i-1] >= 0 && samples[i] < 0) || (samples[i-1] < 0 && samples[i] >= 0) { + crossings++ + } + } + duration := float64(len(samples)) / float64(sampleRate) + return float64(crossings) / (2 * duration) +} + +// encodeDecodeRoundtrip encodes PCM at the given sample rate and decodes +// all resulting frames, returning the concatenated decoded samples. +func encodeDecodeRoundtrip(t *testing.T, pcmBytes []byte, sampleRate int) []int16 { + t.Helper() + enc, err := NewOpusEncoder() + if err != nil { + t.Fatalf("NewOpusEncoder: %v", err) + } + defer enc.Close() + + dec, err := NewOpusDecoder() + if err != nil { + t.Fatalf("NewOpusDecoder: %v", err) + } + defer dec.Close() + + frames, err := enc.Encode(pcmBytes, sampleRate) + if err != nil { + t.Fatalf("Encode: %v", err) + } + + var all []int16 + for _, frame := range frames { + d, err := dec.Decode(frame) + if err != nil { + t.Fatalf("Decode: %v", err) + } + all = append(all, d...) + } + return all +} + +// --- Opus encoder tests --- + +// TestOpus_ChromeLikeVoIPDecode tests decoding Opus frames encoded with +// VoIP mode at 32kbps (similar to Chrome's WebRTC encoder settings). +// Chrome uses SILK mode for voice, which exercises different code paths +// in the decoder compared to ApplicationAudio (CELT-preferring). 
+func TestOpus_ChromeLikeVoIPDecode(t *testing.T) { + // Chrome typically encodes voice at 32kbps in VoIP mode + enc, err := opus.NewEncoder(48000, 1, opus.ApplicationVoIP) + if err != nil { + t.Fatalf("NewEncoder(VoIP): %v", err) + } + defer enc.Close() + if err := enc.SetBitrate(32000); err != nil { + t.Fatalf("SetBitrate: %v", err) + } + if err := enc.SetComplexity(5); err != nil { + t.Fatalf("SetComplexity: %v", err) + } + + dec, err := NewOpusDecoder() + if err != nil { + t.Fatalf("NewOpusDecoder: %v", err) + } + defer dec.Close() + + // Encode 1 second of 440Hz sine at 48kHz + sine := generateSineWave(440, 48000, 48000) + packet := make([]byte, 4000) + + var allDecoded []int16 + for offset := 0; offset+opusFrameSize <= len(sine); offset += opusFrameSize { + frame := sine[offset : offset+opusFrameSize] + n, err := enc.Encode(frame, opusFrameSize, packet) + if err != nil { + t.Fatalf("VoIP encode: %v", err) + } + + decoded, err := dec.Decode(packet[:n]) + if err != nil { + t.Fatalf("Decode VoIP frame: %v (packet size=%d)", err, n) + } + allDecoded = append(allDecoded, decoded...) + } + + if len(allDecoded) == 0 { + t.Fatal("no decoded samples from VoIP encoder") + } + + // Skip warmup + skip := min(len(allDecoded)/4, 48000*100/1000) + tail := allDecoded[skip:] + rms := computeRMS(tail) + + t.Logf("VoIP/SILK roundtrip: %d decoded samples, RMS=%.1f", len(allDecoded), rms) + if rms < 50 { + t.Errorf("VoIP decoded RMS=%.1f is too low; SILK decoder may be broken", rms) + } +} + +// TestOpus_StereoEncoderMonoDecoder tests decoding stereo-encoded Opus +// with a mono decoder. Chrome signals opus/48000/2 in SDP and may send +// stereo Opus. The mono decoder should downmix correctly. 
+func TestOpus_StereoEncoderMonoDecoder(t *testing.T) { + // Encode as stereo (2 channels) — similar to what Chrome might send + enc, err := opus.NewEncoder(48000, 2, opus.ApplicationVoIP) + if err != nil { + t.Fatalf("NewEncoder(stereo): %v", err) + } + defer enc.Close() + if err := enc.SetBitrate(32000); err != nil { + t.Fatalf("SetBitrate: %v", err) + } + + // Decode with our standard mono decoder + dec, err := NewOpusDecoder() + if err != nil { + t.Fatalf("NewOpusDecoder: %v", err) + } + defer dec.Close() + + // Create stereo signal: same sine in both channels (interleaved L,R,L,R...) + mono := generateSineWave(440, 48000, 48000) + stereo := make([]int16, len(mono)*2) + for i, s := range mono { + stereo[i*2] = s // L + stereo[i*2+1] = s // R + } + + packet := make([]byte, 4000) + var allDecoded []int16 + for offset := 0; offset+opusFrameSize*2 <= len(stereo); offset += opusFrameSize * 2 { + frame := stereo[offset : offset+opusFrameSize*2] + n, err := enc.Encode(frame, opusFrameSize, packet) + if err != nil { + t.Fatalf("Stereo encode: %v", err) + } + + decoded, err := dec.Decode(packet[:n]) + if err != nil { + t.Fatalf("Decode stereo->mono: %v (packet size=%d)", err, n) + } + allDecoded = append(allDecoded, decoded...) + } + + if len(allDecoded) == 0 { + t.Fatal("no decoded samples from stereo encoder") + } + + skip := min(len(allDecoded)/4, 48000*100/1000) + tail := allDecoded[skip:] + rms := computeRMS(tail) + + t.Logf("Stereo->Mono: %d decoded samples, RMS=%.1f", len(allDecoded), rms) + if rms < 50 { + t.Errorf("Stereo->Mono decoded RMS=%.1f is too low; cross-channel decoding may be broken", rms) + } +} + +// TestOpus_DecodeLibopusEncoded uses ffmpeg (real libopus) to encode audio, +// then decodes with our opus-go decoder. This simulates Chrome sending Opus +// frames to the server. Skipped if ffmpeg is not available. 
+func TestOpus_DecodeLibopusEncoded(t *testing.T) { + ffmpegPath, err := exec.LookPath("ffmpeg") + if err != nil { + t.Skip("ffmpeg not found") + } + + tmpDir := t.TempDir() + + // Generate 1 second of 440Hz tone as raw PCM (16-bit LE mono 48kHz) + sine := generateSineWave(440, 48000, 48000) + pcmPath := filepath.Join(tmpDir, "input.raw") + pcmBytes := sound.Int16toBytesLE(sine) + if err := os.WriteFile(pcmPath, pcmBytes, 0644); err != nil { + t.Fatalf("write PCM: %v", err) + } + + for _, tc := range []struct { + name string + bitrate string + app string + }{ + {"voip_32k", "32000", "voip"}, + {"voip_64k", "64000", "voip"}, + {"audio_64k", "64000", "audio"}, + {"audio_128k", "128000", "audio"}, + } { + t.Run(tc.name, func(t *testing.T) { + testDecodeLibopus(t, ffmpegPath, tmpDir, pcmPath, sine, tc.bitrate, tc.app) + }) + } +} + +func testDecodeLibopus(t *testing.T, ffmpegPath, tmpDir, pcmPath string, _ []int16, bitrate, app string) { + t.Helper() + + oggPath := filepath.Join(tmpDir, fmt.Sprintf("libopus_%s_%s.ogg", app, bitrate)) + cmd := exec.Command(ffmpegPath, + "-y", + "-f", "s16le", "-ar", "48000", "-ac", "1", "-i", pcmPath, + "-c:a", "libopus", + "-b:a", bitrate, + "-application", app, + "-frame_duration", "20", + "-vbr", "on", + oggPath, + ) + out, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("ffmpeg encode: %v\n%s", err, out) + } + + // Read the Ogg/Opus file and extract raw Opus frames + oggData, err := os.ReadFile(oggPath) + if err != nil { + t.Fatalf("read ogg: %v", err) + } + + frames := extractOpusFramesFromOgg(t, oggData) + if len(frames) == 0 { + t.Fatal("no Opus frames extracted from Ogg container") + } + t.Logf("Extracted %d Opus frames from libopus encoder (first frame %d bytes)", len(frames), len(frames[0])) + + // Decode with our opus-go decoder + dec, err := NewOpusDecoder() + if err != nil { + t.Fatalf("NewOpusDecoder: %v", err) + } + defer dec.Close() + + var allDecoded []int16 + decodeErrors := 0 + for i, frame := range frames { 
+ decoded, err := dec.Decode(frame) + if err != nil { + decodeErrors++ + if decodeErrors <= 5 { + t.Logf("frame %d: decode error: %v (size=%d)", i, err, len(frame)) + } + continue + } + if i < 5 { + t.Logf("frame %d: payload=%d bytes, decoded=%d samples (%.1fms @ 48kHz)", + i, len(frame), len(decoded), float64(len(decoded))/48.0) + } + allDecoded = append(allDecoded, decoded...) + } + + if decodeErrors > 0 { + t.Logf("Total decode errors: %d/%d frames", decodeErrors, len(frames)) + } + + if len(allDecoded) == 0 { + t.Fatal("no decoded samples from libopus-encoded Opus") + } + + // Skip warmup and check quality + skip := min(len(allDecoded)/4, 48000*100/1000) + tail := allDecoded[skip:] + rms := computeRMS(tail) + freq := estimateFrequency(tail, 48000) + + t.Logf("libopus->opus-go: %d decoded samples, RMS=%.1f, freq≈%.0f Hz", len(allDecoded), rms, freq) + + if rms < 50 { + t.Errorf("RMS=%.1f is too low — opus-go cannot decode libopus output", rms) + } + if math.Abs(freq-440) > 30 { + t.Errorf("frequency %.0f Hz deviates from expected 440 Hz (ratio=%.3f)", freq, freq/440.0) + } +} + +// extractOpusFramesFromOgg parses an Ogg container and extracts raw Opus audio frames. 
+func extractOpusFramesFromOgg(t *testing.T, data []byte) [][]byte { + t.Helper() + var frames [][]byte + pos := 0 + pageNum := 0 + + for pos+27 <= len(data) { + // Check for OggS sync + if string(data[pos:pos+4]) != "OggS" { + t.Fatalf("invalid Ogg page at offset %d", pos) + } + + nSegments := int(data[pos+26]) + if pos+27+nSegments > len(data) { + break + } + + segTable := data[pos+27 : pos+27+nSegments] + dataStart := pos + 27 + nSegments + + // Calculate total page data size + var totalDataSize int + for _, s := range segTable { + totalDataSize += int(s) + } + + if dataStart+totalDataSize > len(data) { + break + } + + // Skip first two pages (OpusHead + OpusTags) + if pageNum >= 2 { + // Extract packets from segment table + pageData := data[dataStart : dataStart+totalDataSize] + offset := 0 + var packet []byte + for _, segSize := range segTable { + packet = append(packet, pageData[offset:offset+int(segSize)]...) + offset += int(segSize) + if segSize < 255 { + // End of packet + if len(packet) > 0 { + frameCopy := make([]byte, len(packet)) + copy(frameCopy, packet) + frames = append(frames, frameCopy) + } + packet = nil + } + } + // If last segment was 255, packet continues on next page + if len(packet) > 0 { + frameCopy := make([]byte, len(packet)) + copy(frameCopy, packet) + frames = append(frames, frameCopy) + } + } + + pos = dataStart + totalDataSize + pageNum++ + } + + return frames +} + +func TestOpusEncodeDecode_Roundtrip_48kHz(t *testing.T) { + // Use a longer signal (1 second) so the codec can stabilise past its + // lookahead period and produce meaningful output. + sine := generateSineWave(440, 48000, 48000) + pcmBytes := sound.Int16toBytesLE(sine) + + decoded := encodeDecodeRoundtrip(t, pcmBytes, 48000) + if len(decoded) == 0 { + t.Fatal("no decoded samples") + } + + // Skip initial codec warmup (first 50ms) for frequency estimation. + skip := 48000 * 50 / 1000 // 2400 samples at 48kHz + // The decoder may return fewer samples per frame (e.g. 
480 instead of 960), + // so the total decoded length may differ. Adjust skip proportionally. + decodedSR := 48000 // decoder is initialised at 48kHz + skipDecoded := decodedSR * 50 / 1000 + if skipDecoded > len(decoded)/2 { + skipDecoded = len(decoded) / 4 + } + tail := decoded[skipDecoded:] + + rms := computeRMS(tail) + t.Logf("48kHz roundtrip: %d decoded samples, RMS=%.1f (skip=%d, analysed=%d)", + len(decoded), rms, skip, len(tail)) + + if rms < 50 { + t.Errorf("decoded audio RMS=%.1f is too low; signal appears silent", rms) + } +} + +func TestOpusEncodeDecode_Roundtrip_16kHz(t *testing.T) { + // 1 second of 440Hz at 16kHz. Encoder resamples 16k->48k internally. + sine16k := generateSineWave(440, 16000, 16000) + pcmBytes := sound.Int16toBytesLE(sine16k) + + decoded := encodeDecodeRoundtrip(t, pcmBytes, 16000) + if len(decoded) == 0 { + t.Fatal("no decoded samples") + } + + // Resample back to 16kHz + decoded16k := sound.ResampleInt16(decoded, 48000, 16000) + + // Skip warmup + skip := min(len(decoded16k)/4, 16000*50/1000) + tail := decoded16k[skip:] + + rms := computeRMS(tail) + t.Logf("16kHz roundtrip: %d decoded@48k -> %d resampled@16k, RMS=%.1f", + len(decoded), len(decoded16k), rms) + + if rms < 50 { + t.Errorf("decoded audio RMS=%.1f is too low; signal appears silent", rms) + } +} + +func TestOpusEncode_EmptyInput(t *testing.T) { + enc, err := NewOpusEncoder() + if err != nil { + t.Fatalf("NewOpusEncoder: %v", err) + } + defer enc.Close() + + frames, err := enc.Encode([]byte{}, 48000) + if err != nil { + t.Fatalf("Encode empty: %v", err) + } + if frames != nil { + t.Errorf("expected nil frames for empty input, got %d frames", len(frames)) + } +} + +func TestOpusEncode_SubFrameInput_SilentDrop(t *testing.T) { + // Less than 960 samples at 48kHz = not enough for a single frame. + // The encoder silently drops these trailing samples. 
+ enc, err := NewOpusEncoder() + if err != nil { + t.Fatalf("NewOpusEncoder: %v", err) + } + defer enc.Close() + + sine := generateSineWave(440, 48000, 500) // < 960 + pcmBytes := sound.Int16toBytesLE(sine) + + frames, err := enc.Encode(pcmBytes, 48000) + if err != nil { + t.Fatalf("Encode: %v", err) + } + if len(frames) != 0 { + t.Errorf("expected 0 frames for %d samples (< 960), got %d", len(sine), len(frames)) + } +} + +func TestOpusEncode_MultiFrame(t *testing.T) { + enc, err := NewOpusEncoder() + if err != nil { + t.Fatalf("NewOpusEncoder: %v", err) + } + defer enc.Close() + + // 2880 samples at 48kHz = exactly 3 frames of 960 + sine := generateSineWave(440, 48000, 2880) + pcmBytes := sound.Int16toBytesLE(sine) + + frames, err := enc.Encode(pcmBytes, 48000) + if err != nil { + t.Fatalf("Encode: %v", err) + } + if len(frames) != 3 { + t.Errorf("expected 3 frames for 2880 samples, got %d", len(frames)) + } +} + +func TestOpusDecode_FrameSize(t *testing.T) { + // Document the actual decoded frame size from the pure Go opus-go library. + enc, err := NewOpusEncoder() + if err != nil { + t.Fatalf("NewOpusEncoder: %v", err) + } + defer enc.Close() + + dec, err := NewOpusDecoder() + if err != nil { + t.Fatalf("NewOpusDecoder: %v", err) + } + defer dec.Close() + + sine := generateSineWave(440, 48000, 960) + pcmBytes := sound.Int16toBytesLE(sine) + + frames, err := enc.Encode(pcmBytes, 48000) + if err != nil { + t.Fatalf("Encode: %v", err) + } + if len(frames) != 1 { + t.Fatalf("expected 1 frame, got %d", len(frames)) + } + + decoded, err := dec.Decode(frames[0]) + if err != nil { + t.Fatalf("Decode: %v", err) + } + + t.Logf("Encoder input: 960 samples (20ms @ 48kHz)") + t.Logf("Decoder output: %d samples (%.1fms @ 48kHz)", + len(decoded), float64(len(decoded))/48.0) + + // The decoder may return a different frame size due to internal + // bandwidth decisions in VoIP mode. Document the actual value. 
+ if len(decoded) != 960 && len(decoded) != 480 { + t.Errorf("unexpected decoded frame size %d (expected 960 or 480)", len(decoded)) + } +} + +func TestOpus_FullWebRTCOutputPath(t *testing.T) { + // Simulates the TTS -> SendAudio path: + // PCM at 16kHz -> Encode(pcm, 16000) -> Opus frames -> Decode -> 48kHz samples + // Use 1 second of audio to let codec stabilise. + sine16k := generateSineWave(440, 16000, 16000) + pcmBytes := sound.Int16toBytesLE(sine16k) + + decoded := encodeDecodeRoundtrip(t, pcmBytes, 16000) + if len(decoded) == 0 { + t.Fatal("no frames produced") + } + + rms := computeRMS(decoded) + t.Logf("WebRTC output path: %d decoded samples at 48kHz, RMS=%.1f", len(decoded), rms) + + if rms < 50 { + t.Errorf("decoded audio RMS=%.1f is too low; expected recognisable signal", rms) + } +} + +func TestOpus_FullWebRTCInputPath(t *testing.T) { + // Simulates the client -> server path: + // PCM@48k -> Encode -> Decode -> Resample 48k->24k->16k + // Verify that the pipeline produces non-silent audio. + sine48k := generateSineWave(440, 48000, 48000) // 1 second + pcmBytes := sound.Int16toBytesLE(sine48k) + + decoded48k := encodeDecodeRoundtrip(t, pcmBytes, 48000) + if len(decoded48k) == 0 { + t.Fatal("no decoded samples") + } + + // WebRTC path: 48k -> 24k -> (VAD) -> 16k + step24k := sound.ResampleInt16(decoded48k, 48000, 24000) + webrtcPath := sound.ResampleInt16(step24k, 24000, 16000) + + rms := computeRMS(webrtcPath) + t.Logf("WebRTC input path: %d decoded@48k -> %d@24k -> %d@16k, RMS=%.1f", + len(decoded48k), len(step24k), len(webrtcPath), rms) + + if rms < 50 { + t.Errorf("WebRTC input path RMS=%.1f is too low; signal lost in pipeline", rms) + } +} + +// --- Bug documentation tests --- + +func TestOpusBug_TrailingSampleLoss(t *testing.T) { + // Encode 1000 samples at 48kHz -> only 1 frame (960 samples) returned. + // 40 trailing samples are silently lost. 
+ enc, err := NewOpusEncoder() + if err != nil { + t.Fatalf("NewOpusEncoder: %v", err) + } + defer enc.Close() + + sine := generateSineWave(440, 48000, 1000) + pcmBytes := sound.Int16toBytesLE(sine) + + frames, err := enc.Encode(pcmBytes, 48000) + if err != nil { + t.Fatalf("Encode: %v", err) + } + if len(frames) != 1 { + t.Fatalf("expected 1 frame, got %d", len(frames)) + } + + dec, err := NewOpusDecoder() + if err != nil { + t.Fatalf("NewOpusDecoder: %v", err) + } + defer dec.Close() + + decoded, err := dec.Decode(frames[0]) + if err != nil { + t.Fatalf("Decode: %v", err) + } + + // The encoder only encoded 960 of 1000 input samples. + // Decoded frame size may be 960 or 480 depending on codec mode. + // Either way, 40 input samples are permanently lost. + t.Logf("Input: 1000 samples, Encoded: 1 frame, Decoded: %d samples (40 samples lost)", len(decoded)) + if len(decoded) > 960 { + t.Errorf("decoded more samples (%d) than the encoder consumed (960)", len(decoded)) + } +} + +func TestOpusBug_TTSSampleRateMismatch(t *testing.T) { + // If TTS produces 24kHz audio but the pipeline assumes 16kHz, + // the Opus encoder resamples from 16kHz to 48kHz (3x) instead of + // 24kHz to 48kHz (2x). The result is pitched up by 50%. + // + // This test uses a longer signal and compares the two paths to + // demonstrate the frequency distortion. 
+ + // Generate 440Hz at 24kHz (what TTS actually produces) + sine24k := generateSineWave(440, 24000, 24000) // 1 second + pcmBytes := sound.Int16toBytesLE(sine24k) + + // BUG path: Pipeline passes sampleRate=16000 (assumed) instead of 24000 (actual) + decodedBug := encodeDecodeRoundtrip(t, pcmBytes, 16000) + // CORRECT path: Pipeline should pass sampleRate=24000 + decodedCorrect := encodeDecodeRoundtrip(t, pcmBytes, 24000) + + // Skip warmup for frequency estimation + skipBug := min(len(decodedBug)/4, 48000*100/1000) + skipCorrect := min(len(decodedCorrect)/4, 48000*100/1000) + + bugTail := decodedBug[skipBug:] + correctTail := decodedCorrect[skipCorrect:] + + bugFreq := estimateFrequency(bugTail, 48000) + correctFreq := estimateFrequency(correctTail, 48000) + + t.Logf("Bug path: %d decoded samples, freq≈%.0f Hz (expected ~660 Hz = 440*1.5)", len(decodedBug), bugFreq) + t.Logf("Correct path: %d decoded samples, freq≈%.0f Hz (expected ~440 Hz)", len(decodedCorrect), correctFreq) + + // The bug path produces significantly more decoded samples because + // the encoder thinks the input is 16kHz and upsamples by 3x instead of 2x. + // This also means the perceived playback speed and pitch are wrong. + if len(decodedBug) > 0 && len(decodedCorrect) > 0 { + ratio := float64(len(decodedBug)) / float64(len(decodedCorrect)) + t.Logf("Sample count ratio (bug/correct): %.2f (expected ~1.5)", ratio) + if ratio < 1.1 { + t.Error("expected bug path to produce significantly more samples due to wrong resample ratio") + } + } +} + +// TestOpus_CrossLibraryCompat encodes a sine wave with opus-go, wraps the +// output in a minimal Ogg/Opus container, and decodes it with ffmpeg. This +// catches issues where the pure-Go encoder produces Opus frames that only +// its own decoder can parse (but not a browser or standard decoder). +// Skipped if ffmpeg is not available. 
+func TestOpus_CrossLibraryCompat(t *testing.T) { + ffmpegPath, err := exec.LookPath("ffmpeg") + if err != nil { + t.Skip("ffmpeg not found, skipping cross-library compatibility test") + } + + // Encode 1 second of 440Hz sine at 48kHz with opus-go + sine := generateSineWave(440, 48000, 48000) + pcmBytes := sound.Int16toBytesLE(sine) + + enc, err := NewOpusEncoder() + if err != nil { + t.Fatalf("NewOpusEncoder: %v", err) + } + defer enc.Close() + + frames, err := enc.Encode(pcmBytes, 48000) + if err != nil { + t.Fatalf("Encode: %v", err) + } + if len(frames) == 0 { + t.Fatal("no frames produced") + } + t.Logf("opus-go produced %d frames (first frame %d bytes)", len(frames), len(frames[0])) + + // Wrap the Opus frames in an Ogg/Opus container so ffmpeg can decode them. + tmpDir := t.TempDir() + oggPath := filepath.Join(tmpDir, "opus_go_output.ogg") + if err := writeOggOpus(oggPath, frames, 48000, 1); err != nil { + t.Fatalf("writeOggOpus: %v", err) + } + + // Decode with ffmpeg + decodedWavPath := filepath.Join(tmpDir, "ffmpeg_decoded.wav") + cmd := exec.Command(ffmpegPath, "-y", "-i", oggPath, "-ar", "48000", "-ac", "1", "-c:a", "pcm_s16le", decodedWavPath) + out, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("ffmpeg failed to decode opus-go output: %v\n%s", err, out) + } + + // Read the decoded WAV and check audio quality + decodedData, err := os.ReadFile(decodedWavPath) + if err != nil { + t.Fatalf("read decoded WAV: %v", err) + } + + // Use our robust ParseWAV to handle ffmpeg's WAV output + decodedPCM, sr := parseTestWAV(decodedData) + if sr == 0 { + t.Fatal("ffmpeg output has no WAV header") + } + decodedSamples := sound.BytesToInt16sLE(decodedPCM) + + // Skip codec warmup (first 100ms), check RMS of the rest + skip := min(len(decodedSamples)/4, sr*100/1000) + if skip >= len(decodedSamples) { + skip = 0 + } + tail := decodedSamples[skip:] + rms := computeRMS(tail) + + t.Logf("ffmpeg decoded opus-go output: %d samples at %dHz, RMS=%.1f", 
len(decodedSamples), sr, rms) + + if rms < 50 { + t.Errorf("ffmpeg decoded RMS=%.1f is too low — opus-go frames are likely incompatible with standard decoders", rms) + } else { + t.Logf("PASS: opus-go Opus frames are decodable by ffmpeg (libopus) with good signal quality") + } +} + +// parseTestWAV is a simple WAV parser for test output (ffmpeg always writes standard headers). +func parseTestWAV(data []byte) (pcm []byte, sampleRate int) { + if len(data) < 44 || string(data[0:4]) != "RIFF" { + return data, 0 + } + // Walk chunks to find "data" + pos := 12 + sr := int(binary.LittleEndian.Uint32(data[24:28])) + for pos+8 <= len(data) { + id := string(data[pos : pos+4]) + sz := int(binary.LittleEndian.Uint32(data[pos+4 : pos+8])) + if id == "data" { + end := pos + 8 + sz + if end > len(data) { + end = len(data) + } + return data[pos+8 : end], sr + } + pos += 8 + sz + if sz%2 != 0 { + pos++ + } + } + return data[44:], sr +} + +// writeOggOpus writes Opus frames into a minimal Ogg/Opus container file. 
+func writeOggOpus(path string, frames [][]byte, sampleRate, channels int) error { + f, err := os.Create(path) + if err != nil { + return err + } + defer f.Close() + + serial := uint32(0x4C6F6341) // "LocA" + var pageSeq uint32 + const preSkip = 312 // standard Opus pre-skip for 48kHz + + // Page 1: OpusHead (BOS page) + opusHead := make([]byte, 19) + copy(opusHead[0:8], "OpusHead") + opusHead[8] = 1 // version + opusHead[9] = byte(channels) // channel count + binary.LittleEndian.PutUint16(opusHead[10:12], uint16(preSkip)) // pre-skip + binary.LittleEndian.PutUint32(opusHead[12:16], uint32(sampleRate)) // input sample rate + binary.LittleEndian.PutUint16(opusHead[16:18], 0) // output gain + opusHead[18] = 0 // channel mapping family + if err := writeOggPage(f, serial, pageSeq, 0, 0x02, [][]byte{opusHead}); err != nil { + return err + } + pageSeq++ + + // Page 2: OpusTags + opusTags := make([]byte, 16) + copy(opusTags[0:8], "OpusTags") + binary.LittleEndian.PutUint32(opusTags[8:12], 0) // vendor string length + binary.LittleEndian.PutUint32(opusTags[12:16], 0) // comment list length + if err := writeOggPage(f, serial, pageSeq, 0, 0x00, [][]byte{opusTags}); err != nil { + return err + } + pageSeq++ + + // Audio pages: one Opus frame per page for simplicity + var granulePos uint64 + for i, frame := range frames { + granulePos += 960 // 20ms at 48kHz + headerType := byte(0x00) + if i == len(frames)-1 { + headerType = 0x04 // EOS + } + if err := writeOggPage(f, serial, pageSeq, granulePos, headerType, [][]byte{frame}); err != nil { + return err + } + pageSeq++ + } + + return nil +} + +// writeOggPage writes a single Ogg page containing the given packets. 
+func writeOggPage(w io.Writer, serial, pageSeq uint32, granulePos uint64, headerType byte, packets [][]byte) error { + // Build segment table + var segments []byte + var pageData []byte + for _, pkt := range packets { + remaining := len(pkt) + for remaining >= 255 { + segments = append(segments, 255) + remaining -= 255 + } + segments = append(segments, byte(remaining)) + pageData = append(pageData, pkt...) + } + + // Build page header (27 bytes + segment table) + hdr := make([]byte, 27+len(segments)) + copy(hdr[0:4], "OggS") + hdr[4] = 0 // version + hdr[5] = headerType + binary.LittleEndian.PutUint64(hdr[6:14], granulePos) + binary.LittleEndian.PutUint32(hdr[14:18], serial) + binary.LittleEndian.PutUint32(hdr[18:22], pageSeq) + // CRC at [22:26] — filled after computing + hdr[26] = byte(len(segments)) + copy(hdr[27:], segments) + + // Compute CRC-32 over header + page data + crc := oggCRC32(hdr, pageData) + binary.LittleEndian.PutUint32(hdr[22:26], crc) + + if _, err := w.Write(hdr); err != nil { + return err + } + _, err := w.Write(pageData) + return err +} + +// oggCRC32 computes the Ogg CRC-32 checksum (polynomial 0x04C11DB7). +func oggCRC32(header, data []byte) uint32 { + var crc uint32 + for _, b := range header { + crc = (crc << 8) ^ oggCRCTable[byte(crc>>24)^b] + } + for _, b := range data { + crc = (crc << 8) ^ oggCRCTable[byte(crc>>24)^b] + } + return crc +} + +var oggCRCTable = func() [256]uint32 { + var t [256]uint32 + for i := range 256 { + r := uint32(i) << 24 + for range 8 { + if r&0x80000000 != 0 { + r = (r << 1) ^ 0x04C11DB7 + } else { + r <<= 1 + } + } + t[i] = r + } + return t +}() + +// goertzel computes the power at a specific frequency using the Goertzel algorithm. +// Returns power in linear scale (not dB). 
+func goertzel(samples []int16, targetFreq float64, sampleRate int) float64 { + N := len(samples) + if N == 0 { + return 0 + } + k := 0.5 + float64(N)*targetFreq/float64(sampleRate) + w := 2 * math.Pi * k / float64(N) + coeff := 2 * math.Cos(w) + var s1, s2 float64 + for _, sample := range samples { + s0 := float64(sample) + coeff*s1 - s2 + s2 = s1 + s1 = s0 + } + return s1*s1 + s2*s2 - coeff*s1*s2 +} + +// computeTHD computes Total Harmonic Distortion for a signal with known fundamental. +// THD = sqrt(sum of harmonic powers) / fundamental power, returned as percentage. +func computeTHD(samples []int16, fundamentalHz float64, sampleRate, numHarmonics int) float64 { + fundPower := goertzel(samples, fundamentalHz, sampleRate) + if fundPower <= 0 { + return 0 + } + var harmonicSum float64 + for h := 2; h <= numHarmonics; h++ { + harmonicSum += goertzel(samples, fundamentalHz*float64(h), sampleRate) + } + return math.Sqrt(harmonicSum/fundPower) * 100 +} + +// TestWebRTCPipeline_TestToneQuality exercises the full audio pipeline: +// +// PCM (24kHz) → resample to 48kHz → Opus encode → RTP packetize → +// WebRTC transport (local loopback) → RTP depacketize → Opus decode → PCM (48kHz) +// +// Two local PeerConnections are connected via SDP exchange (no network). +// The sender uses the same RTP construction as WebRTCTransport.SendAudio. +// Quality metrics are computed on the received/decoded audio and logged. 
+// +// This test catches regressions in: +// - Opus encoder output quality +// - RTP packetization (sequence numbers, timestamps, marker bit) +// - Sample rate handling in the encode path +// - Packet delivery through pion's internal transport +func TestWebRTCPipeline_TestToneQuality(t *testing.T) { + const ( + toneFreq = 440.0 + toneSampleRate = 24000 // matches sendTestTone + toneDuration = 1 // seconds + toneAmplitude = 16000 + toneNumSamples = toneSampleRate * toneDuration + ) + + // Generate test tone (same as sendTestTone in realtime.go) + pcm := make([]byte, toneNumSamples*2) + for i := 0; i < toneNumSamples; i++ { + sample := int16(toneAmplitude * math.Sin(2*math.Pi*toneFreq*float64(i)/float64(toneSampleRate))) + binary.LittleEndian.PutUint16(pcm[i*2:], uint16(sample)) + } + + // Encode to Opus frames (same path as SendAudio) + enc, err := NewOpusEncoder() + if err != nil { + t.Fatalf("NewOpusEncoder: %v", err) + } + defer enc.Close() + + opusFrames, err := enc.Encode(pcm, toneSampleRate) + if err != nil { + t.Fatalf("Encode: %v", err) + } + if len(opusFrames) == 0 { + t.Fatal("no Opus frames produced") + } + t.Logf("Encoded %d Opus frames from %d PCM samples at %dHz", len(opusFrames), toneNumSamples, toneSampleRate) + + // --- Create sender PeerConnection --- + senderME := &webrtc.MediaEngine{} + if err := senderME.RegisterDefaultCodecs(); err != nil { + t.Fatalf("sender RegisterDefaultCodecs: %v", err) + } + senderAPI := webrtc.NewAPI(webrtc.WithMediaEngine(senderME)) + senderPC, err := senderAPI.NewPeerConnection(webrtc.Configuration{}) + if err != nil { + t.Fatalf("sender NewPeerConnection: %v", err) + } + defer senderPC.Close() + + audioTrack, err := webrtc.NewTrackLocalStaticRTP( + webrtc.RTPCodecCapability{ + MimeType: webrtc.MimeTypeOpus, + ClockRate: 48000, + Channels: 2, + }, + "audio", "test", + ) + if err != nil { + t.Fatalf("NewTrackLocalStaticRTP: %v", err) + } + + rtpSender, err := senderPC.AddTrack(audioTrack) + if err != nil { + 
t.Fatalf("AddTrack: %v", err) + } + // Drain RTCP + go func() { + buf := make([]byte, 1500) + for { + if _, _, err := rtpSender.Read(buf); err != nil { + return + } + } + }() + + // --- Create receiver PeerConnection --- + receiverME := &webrtc.MediaEngine{} + if err := receiverME.RegisterDefaultCodecs(); err != nil { + t.Fatalf("receiver RegisterDefaultCodecs: %v", err) + } + receiverAPI := webrtc.NewAPI(webrtc.WithMediaEngine(receiverME)) + receiverPC, err := receiverAPI.NewPeerConnection(webrtc.Configuration{}) + if err != nil { + t.Fatalf("receiver NewPeerConnection: %v", err) + } + defer receiverPC.Close() + + // Collect received RTP payloads (Opus frames) + type receivedPacket struct { + seqNum uint16 + timestamp uint32 + marker bool + payload []byte + } + var ( + receivedMu sync.Mutex + receivedPackets []receivedPacket + trackDone = make(chan struct{}) + ) + + receiverPC.OnTrack(func(track *webrtc.TrackRemote, receiver *webrtc.RTPReceiver) { + defer close(trackDone) + for { + pkt, _, err := track.ReadRTP() + if err != nil { + return + } + payload := make([]byte, len(pkt.Payload)) + copy(payload, pkt.Payload) + receivedMu.Lock() + receivedPackets = append(receivedPackets, receivedPacket{ + seqNum: pkt.Header.SequenceNumber, + timestamp: pkt.Header.Timestamp, + marker: pkt.Header.Marker, + payload: payload, + }) + receivedMu.Unlock() + } + }) + + // --- Exchange SDP --- + offer, err := senderPC.CreateOffer(nil) + if err != nil { + t.Fatalf("CreateOffer: %v", err) + } + if err := senderPC.SetLocalDescription(offer); err != nil { + t.Fatalf("sender SetLocalDescription: %v", err) + } + senderGatherDone := webrtc.GatheringCompletePromise(senderPC) + select { + case <-senderGatherDone: + case <-time.After(5 * time.Second): + t.Fatal("sender ICE gathering timeout") + } + + if err := receiverPC.SetRemoteDescription(*senderPC.LocalDescription()); err != nil { + t.Fatalf("receiver SetRemoteDescription: %v", err) + } + answer, err := receiverPC.CreateAnswer(nil) + if 
err != nil { + t.Fatalf("CreateAnswer: %v", err) + } + if err := receiverPC.SetLocalDescription(answer); err != nil { + t.Fatalf("receiver SetLocalDescription: %v", err) + } + receiverGatherDone := webrtc.GatheringCompletePromise(receiverPC) + select { + case <-receiverGatherDone: + case <-time.After(5 * time.Second): + t.Fatal("receiver ICE gathering timeout") + } + + if err := senderPC.SetRemoteDescription(*receiverPC.LocalDescription()); err != nil { + t.Fatalf("sender SetRemoteDescription: %v", err) + } + + // Wait for connection + connected := make(chan struct{}) + senderPC.OnConnectionStateChange(func(s webrtc.PeerConnectionState) { + if s == webrtc.PeerConnectionStateConnected { + select { + case <-connected: + default: + close(connected) + } + } + }) + select { + case <-connected: + case <-time.After(5 * time.Second): + t.Fatal("timeout waiting for WebRTC connection") + } + + // --- Send test tone via RTP (same logic as SendAudio) --- + const samplesPerFrame = 960 + seqNum := uint16(rand.UintN(65536)) + timestamp := rand.Uint32() + marker := true + + ticker := time.NewTicker(20 * time.Millisecond) + defer ticker.Stop() + + for i, frame := range opusFrames { + pkt := &rtp.Packet{ + Header: rtp.Header{ + Version: 2, + Marker: marker, + SequenceNumber: seqNum, + Timestamp: timestamp, + }, + Payload: frame, + } + seqNum++ + timestamp += samplesPerFrame + marker = false + + if err := audioTrack.WriteRTP(pkt); err != nil { + t.Fatalf("WriteRTP frame %d: %v", i, err) + } + if i < len(opusFrames)-1 { + <-ticker.C + } + } + + // Wait for packets to arrive (give extra time for jitter buffer) + time.Sleep(500 * time.Millisecond) + + // Close sender to trigger track end on receiver + senderPC.Close() + + // Wait for track reader to finish (with timeout) + select { + case <-trackDone: + case <-time.After(2 * time.Second): + // Track reader may not exit cleanly on all platforms + } + + // --- Decode received Opus frames --- + receivedMu.Lock() + pkts := 
make([]receivedPacket, len(receivedPackets)) + copy(pkts, receivedPackets) + receivedMu.Unlock() + + if len(pkts) == 0 { + t.Fatal("no RTP packets received") + } + + dec, err := NewOpusDecoder() + if err != nil { + t.Fatalf("NewOpusDecoder: %v", err) + } + defer dec.Close() + + var allDecoded []int16 + decodeErrors := 0 + for _, pkt := range pkts { + samples, err := dec.Decode(pkt.payload) + if err != nil { + decodeErrors++ + continue + } + allDecoded = append(allDecoded, samples...) + } + + if len(allDecoded) == 0 { + t.Fatal("no decoded samples") + } + + // --- Analyse RTP packet delivery --- + frameLoss := len(opusFrames) - len(pkts) + seqGaps := 0 + for i := 1; i < len(pkts); i++ { + expected := pkts[i-1].seqNum + 1 + if pkts[i].seqNum != expected { + seqGaps++ + } + } + markerCount := 0 + for _, pkt := range pkts { + if pkt.marker { + markerCount++ + } + } + + t.Log("── RTP Delivery ──") + t.Logf(" Frames sent: %d", len(opusFrames)) + t.Logf(" Packets recv: %d", len(pkts)) + t.Logf(" Frame loss: %d", frameLoss) + t.Logf(" Sequence gaps: %d", seqGaps) + t.Logf(" Marker packets: %d (expect 1)", markerCount) + t.Logf(" Decode errors: %d", decodeErrors) + + // --- Audio quality metrics --- + // Skip codec warmup (first 100ms at 48kHz = 4800 samples) + skip := 48000 * 100 / 1000 + if skip > len(allDecoded)/2 { + skip = len(allDecoded) / 4 + } + tail := allDecoded[skip:] + + rms := computeRMS(tail) + freq := estimateFrequency(tail, 48000) + thd := computeTHD(tail, toneFreq, 48000, 10) + + t.Log("── Audio Quality ──") + t.Logf(" Decoded samples: %d (%.1f ms at 48kHz)", len(allDecoded), float64(len(allDecoded))/48.0) + t.Logf(" RMS level: %.1f", rms) + t.Logf(" Peak frequency: %.0f Hz (expected %.0f Hz)", freq, toneFreq) + t.Logf(" THD (h2-h10): %.1f%%", thd) + + // --- Assertions --- + if frameLoss > 0 { + t.Errorf("lost %d frames in localhost transport", frameLoss) + } + if seqGaps > 0 { + t.Errorf("detected %d sequence number gaps", seqGaps) + } + if markerCount != 
1 { + t.Errorf("expected exactly 1 marker packet (first packet), got %d", markerCount) + } + if rms < 50 { + t.Errorf("RMS=%.1f is too low; signal appears silent or severely attenuated", rms) + } + freqDelta := math.Abs(freq - toneFreq) + if freqDelta > 20 { + t.Errorf("peak frequency %.0f Hz deviates from expected %.0f Hz by %.0f Hz", freq, toneFreq, freqDelta) + } + if thd > 50 { + t.Errorf("THD=%.1f%% is too high; signal is severely distorted", thd) + } + + // Log a summary line for quick scanning + result := "PASS" + issues := []string{} + if frameLoss > 0 { + issues = append(issues, fmt.Sprintf("%d frames lost", frameLoss)) + } + if freqDelta > 20 { + issues = append(issues, fmt.Sprintf("freq off by %.0f Hz", freqDelta)) + } + if thd > 50 { + issues = append(issues, fmt.Sprintf("THD %.1f%%", thd)) + } + if rms < 50 { + issues = append(issues, "silent") + } + if len(issues) > 0 { + result = "FAIL: " + fmt.Sprintf("%v", issues) + } + t.Logf("── Summary: %s ──", result) +} diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go index 415e75b18f62..b51c57181d48 100644 --- a/core/http/endpoints/openai/realtime.go +++ b/core/http/endpoints/openai/realtime.go @@ -3,8 +3,10 @@ package openai import ( "context" "encoding/base64" + "encoding/binary" "encoding/json" "fmt" + "math" "os" "sync" "time" @@ -40,23 +42,17 @@ const ( maxAudioBufferSize = 100 * 1024 * 1024 // Maximum WebSocket message size in bytes (10MB) to prevent DoS attacks maxWebSocketMessageSize = 10 * 1024 * 1024 + + defaultInstructions = "You are a helpful voice assistant. " + + "Your responses will be spoken aloud using text-to-speech, so keep them concise and conversational. " + + "Do not use markdown formatting, bullet points, numbered lists, code blocks, or special characters. " + + "Speak naturally as you would in a phone conversation. " + + "Avoid parenthetical asides, URLs, and anything that cannot be clearly vocalized." 
) // A model can be "emulated" that is: transcribe audio to text -> feed text to the LLM -> generate audio as result // If the model support instead audio-to-audio, we will use the specific gRPC calls instead -// LockedWebsocket wraps a websocket connection with a mutex for safe concurrent writes -type LockedWebsocket struct { - *websocket.Conn - sync.Mutex -} - -func (l *LockedWebsocket) WriteMessage(messageType int, data []byte) error { - l.Lock() - defer l.Unlock() - return l.Conn.WriteMessage(messageType, data) -} - // Session represents a single WebSocket connection and its state type Session struct { ID string @@ -77,8 +73,48 @@ type Session struct { ModelInterface Model // The pipeline model config or the config for an any-to-any model ModelConfig *config.ModelConfig - InputSampleRate int - MaxOutputTokens types.IntOrInf + InputSampleRate int + OutputSampleRate int + MaxOutputTokens types.IntOrInf + + // Response cancellation: protects activeResponseCancel/activeResponseDone + responseMu sync.Mutex + activeResponseCancel context.CancelFunc + activeResponseDone chan struct{} +} + +// cancelActiveResponse cancels any in-flight response and waits for its +// goroutine to exit. This ensures we never have overlapping responses and +// that interrupted responses are fully cleaned up before starting a new one. +func (s *Session) cancelActiveResponse() { + s.responseMu.Lock() + cancel := s.activeResponseCancel + done := s.activeResponseDone + s.responseMu.Unlock() + + if cancel != nil { + cancel() + } + if done != nil { + <-done + } +} + +// startResponse cancels any active response and returns a new context for +// the replacement response. The caller MUST close the returned done channel +// when the response goroutine exits. 
+func (s *Session) startResponse(parent context.Context) (context.Context, chan struct{}) { + s.cancelActiveResponse() + + ctx, cancel := context.WithCancel(parent) + done := make(chan struct{}) + + s.responseMu.Lock() + s.activeResponseCancel = cancel + s.activeResponseDone = done + s.responseMu.Unlock() + + return ctx, done } func (s *Session) FromClient(session *types.SessionUnion) { @@ -187,378 +223,414 @@ func Realtime(application *application.Application) echo.HandlerFunc { func registerRealtime(application *application.Application, model string) func(c *websocket.Conn) { return func(conn *websocket.Conn) { - c := &LockedWebsocket{Conn: conn} - + t := NewWebSocketTransport(conn) evaluator := application.TemplatesEvaluator() - xlog.Debug("Realtime WebSocket connection established", "address", c.RemoteAddr().String(), "model", model) + xlog.Debug("Realtime WebSocket connection established", "address", conn.RemoteAddr().String(), "model", model) + runRealtimeSession(application, t, model, evaluator) + } +} - // TODO: Allow any-to-any model to be specified - cl := application.ModelConfigLoader() - cfg, err := cl.LoadModelConfigFileByNameDefaultOptions(model, application.ApplicationConfig()) - if err != nil { - xlog.Error("failed to load model config", "error", err) - sendError(c, "model_load_error", "Failed to load model config", "", "") - return - } +// runRealtimeSession runs the main event loop for a realtime session. +// It is transport-agnostic and works with both WebSocket and WebRTC. 
+func runRealtimeSession(application *application.Application, t Transport, model string, evaluator *templates.Evaluator) { + // TODO: Allow any-to-any model to be specified + cl := application.ModelConfigLoader() + cfg, err := cl.LoadModelConfigFileByNameDefaultOptions(model, application.ApplicationConfig()) + if err != nil { + xlog.Error("failed to load model config", "error", err) + sendError(t, "model_load_error", "Failed to load model config", "", "") + return + } - if cfg == nil || (cfg.Pipeline.VAD == "" && cfg.Pipeline.Transcription == "" && cfg.Pipeline.TTS == "" && cfg.Pipeline.LLM == "") { - xlog.Error("model is not a pipeline", "model", model) - sendError(c, "invalid_model", "Model is not a pipeline model", "", "") - return - } + if cfg == nil || (cfg.Pipeline.VAD == "" && cfg.Pipeline.Transcription == "" && cfg.Pipeline.TTS == "" && cfg.Pipeline.LLM == "") { + xlog.Error("model is not a pipeline", "model", model) + sendError(t, "invalid_model", "Model is not a pipeline model", "", "") + return + } - sttModel := cfg.Pipeline.Transcription - - sessionID := generateSessionID() - session := &Session{ - ID: sessionID, - TranscriptionOnly: false, - Model: model, - Voice: cfg.TTSConfig.Voice, - ModelConfig: cfg, - TurnDetection: &types.TurnDetectionUnion{ - ServerVad: &types.ServerVad{ - Threshold: 0.5, - PrefixPaddingMs: 300, - SilenceDurationMs: 500, - CreateResponse: true, - }, + sttModel := cfg.Pipeline.Transcription + + sessionID := generateSessionID() + session := &Session{ + ID: sessionID, + TranscriptionOnly: false, + Model: model, + Voice: cfg.TTSConfig.Voice, + Instructions: defaultInstructions, + ModelConfig: cfg, + TurnDetection: &types.TurnDetectionUnion{ + ServerVad: &types.ServerVad{ + Threshold: 0.5, + PrefixPaddingMs: 300, + SilenceDurationMs: 500, + CreateResponse: true, }, - InputAudioTranscription: &types.AudioTranscription{ - Model: sttModel, - }, - Conversations: make(map[string]*Conversation), - InputSampleRate: defaultRemoteSampleRate, 
- } + }, + InputAudioTranscription: &types.AudioTranscription{ + Model: sttModel, + }, + Conversations: make(map[string]*Conversation), + InputSampleRate: defaultRemoteSampleRate, + OutputSampleRate: defaultRemoteSampleRate, + } + + // Create a default conversation + conversationID := generateConversationID() + conversation := &Conversation{ + ID: conversationID, + // TODO: We need to truncate the conversation items when a new item is added and we have run out of space. There are multiple places where items + // can be added so we could use a datastructure here that enforces truncation upon addition + Items: []*types.MessageItemUnion{}, + } + session.Conversations[conversationID] = conversation + session.DefaultConversationID = conversationID + + m, err := newModel( + &cfg.Pipeline, + application.ModelConfigLoader(), + application.ModelLoader(), + application.ApplicationConfig(), + evaluator, + ) + if err != nil { + xlog.Error("failed to load model", "error", err) + sendError(t, "model_load_error", "Failed to load model", "", "") + return + } + session.ModelInterface = m + + // Store the session and notify the transport (for WebRTC audio track handling) + sessionLock.Lock() + sessions[sessionID] = session + sessionLock.Unlock() + + // For WebRTC, inbound audio arrives as Opus (48kHz) and is decoded+resampled + // to localSampleRate in handleIncomingAudioTrack. Set InputSampleRate to + // match so handleVAD doesn't needlessly double-resample. + if _, ok := t.(*WebRTCTransport); ok { + session.InputSampleRate = localSampleRate + } - // Create a default conversation - conversationID := generateConversationID() - conversation := &Conversation{ - ID: conversationID, - // TODO: We need to truncate the conversation items when a new item is added and we have run out of space. 
There are multiple places where items - // can be added so we could use a datastructure here that enforces truncation upon addition - Items: []*types.MessageItemUnion{}, + if sn, ok := t.(interface{ SetSession(*Session) }); ok { + sn.SetSession(session) + } + + sendEvent(t, types.SessionCreatedEvent{ + ServerEventBase: types.ServerEventBase{ + EventID: "event_TODO", + }, + Session: session.ToServer(), + }) + + var ( + msg []byte + wg sync.WaitGroup + done = make(chan struct{}) + ) + + vadServerStarted := false + toggleVAD := func() { + if session.TurnDetection != nil && session.TurnDetection.ServerVad != nil && !vadServerStarted { + xlog.Debug("Starting VAD goroutine...") + done = make(chan struct{}) + wg.Add(1) + go func() { + defer wg.Done() + conversation := session.Conversations[session.DefaultConversationID] + handleVAD(session, conversation, t, done) + }() + vadServerStarted = true + } else if (session.TurnDetection == nil || session.TurnDetection.ServerVad == nil) && vadServerStarted { + xlog.Debug("Stopping VAD goroutine...") + close(done) + vadServerStarted = false } - session.Conversations[conversationID] = conversation - session.DefaultConversationID = conversationID - - m, err := newModel( - &cfg.Pipeline, - application.ModelConfigLoader(), - application.ModelLoader(), - application.ApplicationConfig(), - evaluator, - ) + } + + toggleVAD() + + for { + msg, err = t.ReadEvent() if err != nil { - xlog.Error("failed to load model", "error", err) - sendError(c, "model_load_error", "Failed to load model", "", "") - return + xlog.Error("read error", "error", err) + break } - session.ModelInterface = m - // Store the session - sessionLock.Lock() - sessions[sessionID] = session - sessionLock.Unlock() + // Handle diagnostic events that aren't part of the OpenAI protocol + var rawType struct { + Type string `json:"type"` + } + if json.Unmarshal(msg, &rawType) == nil && rawType.Type == "test_tone" { + xlog.Debug("Generating test tone") + go sendTestTone(t) + 
continue + } - sendEvent(c, types.SessionCreatedEvent{ - ServerEventBase: types.ServerEventBase{ - EventID: "event_TODO", - }, - Session: session.ToServer(), - }) + // Parse the incoming message + event, err := types.UnmarshalClientEvent(msg) + if err != nil { + xlog.Error("invalid json", "error", err) + sendError(t, "invalid_json", "Invalid JSON format", "", "") + continue + } - var ( - msg []byte - wg sync.WaitGroup - done = make(chan struct{}) - ) + switch e := event.(type) { + case types.SessionUpdateEvent: + xlog.Debug("recv", "message", string(msg)) + + // Handle transcription session update + if e.Session.Transcription != nil { + if err := updateTransSession( + session, + &e.Session, + application.ModelConfigLoader(), + application.ModelLoader(), + application.ApplicationConfig(), + ); err != nil { + xlog.Error("failed to update session", "error", err) + sendError(t, "session_update_error", "Failed to update session", "", "") + continue + } - vadServerStarted := false - toggleVAD := func() { - if session.TurnDetection.ServerVad != nil && !vadServerStarted { - xlog.Debug("Starting VAD goroutine...") - wg.Add(1) - go func() { - defer wg.Done() - conversation := session.Conversations[session.DefaultConversationID] - handleVAD(session, conversation, c, done) - }() - vadServerStarted = true - } else if session.TurnDetection.ServerVad == nil && vadServerStarted { - xlog.Debug("Stopping VAD goroutine...") + toggleVAD() - go func() { - done <- struct{}{} - }() - vadServerStarted = false + sendEvent(t, types.SessionUpdatedEvent{ + ServerEventBase: types.ServerEventBase{ + EventID: "event_TODO", + }, + Session: session.ToServer(), + }) } - } - toggleVAD() + // Handle realtime session update + if e.Session.Realtime != nil { + if err := updateSession( + session, + &e.Session, + application.ModelConfigLoader(), + application.ModelLoader(), + application.ApplicationConfig(), + evaluator, + ); err != nil { + xlog.Error("failed to update session", "error", err) + 
sendError(t, "session_update_error", "Failed to update session", "", "") + continue + } + + toggleVAD() - for { - if _, msg, err = c.ReadMessage(); err != nil { - xlog.Error("read error", "error", err) - break + sendEvent(t, types.SessionUpdatedEvent{ + ServerEventBase: types.ServerEventBase{ + EventID: "event_TODO", + }, + Session: session.ToServer(), + }) } - // Parse the incoming message - event, err := types.UnmarshalClientEvent(msg) - if err != nil { - xlog.Error("invalid json", "error", err) - sendError(c, "invalid_json", "Invalid JSON format", "", "") + case types.InputAudioBufferAppendEvent: + // Handle 'input_audio_buffer.append' + if e.Audio == "" { + xlog.Error("Audio data is missing in 'input_audio_buffer.append'") + sendError(t, "missing_audio_data", "Audio data is missing", "", "") continue } - switch e := event.(type) { - case types.SessionUpdateEvent: - xlog.Debug("recv", "message", string(msg)) - - // Handle transcription session update - if e.Session.Transcription != nil { - if err := updateTransSession( - session, - &e.Session, - application.ModelConfigLoader(), - application.ModelLoader(), - application.ApplicationConfig(), - ); err != nil { - xlog.Error("failed to update session", "error", err) - sendError(c, "session_update_error", "Failed to update session", "", "") - continue - } - - toggleVAD() + // Decode base64 audio data + decodedAudio, err := base64.StdEncoding.DecodeString(e.Audio) + if err != nil { + xlog.Error("failed to decode audio data", "error", err) + sendError(t, "invalid_audio_data", "Failed to decode audio data", "", "") + continue + } - sendEvent(c, types.SessionUpdatedEvent{ - ServerEventBase: types.ServerEventBase{ - EventID: "event_TODO", - }, - Session: session.ToServer(), - }) - } + // Check buffer size limits before appending + session.AudioBufferLock.Lock() + newSize := len(session.InputAudioBuffer) + len(decodedAudio) + if newSize > maxAudioBufferSize { + session.AudioBufferLock.Unlock() + xlog.Error("audio buffer 
size limit exceeded", "current_size", len(session.InputAudioBuffer), "incoming_size", len(decodedAudio), "limit", maxAudioBufferSize) + sendError(t, "buffer_size_exceeded", fmt.Sprintf("Audio buffer size limit exceeded (max %d bytes)", maxAudioBufferSize), "", "") + continue + } - // Handle realtime session update - if e.Session.Realtime != nil { - if err := updateSession( - session, - &e.Session, - application.ModelConfigLoader(), - application.ModelLoader(), - application.ApplicationConfig(), - evaluator, - ); err != nil { - xlog.Error("failed to update session", "error", err) - sendError(c, "session_update_error", "Failed to update session", "", "") - continue - } + // Append to InputAudioBuffer + session.InputAudioBuffer = append(session.InputAudioBuffer, decodedAudio...) + session.AudioBufferLock.Unlock() - toggleVAD() + case types.InputAudioBufferCommitEvent: + xlog.Debug("recv", "message", string(msg)) - sendEvent(c, types.SessionUpdatedEvent{ - ServerEventBase: types.ServerEventBase{ - EventID: "event_TODO", - }, - Session: session.ToServer(), - }) - } + sessionLock.Lock() + isServerVAD := session.TurnDetection != nil && session.TurnDetection.ServerVad != nil + sessionLock.Unlock() - case types.InputAudioBufferAppendEvent: - // Handle 'input_audio_buffer.append' - if e.Audio == "" { - xlog.Error("Audio data is missing in 'input_audio_buffer.append'") - sendError(c, "missing_audio_data", "Audio data is missing", "", "") - continue - } + // TODO: At the least need to check locking and timer state in the VAD Go routine before allowing this + if isServerVAD { + sendNotImplemented(t, "input_audio_buffer.commit in conjunction with VAD") + continue + } - // Decode base64 audio data - decodedAudio, err := base64.StdEncoding.DecodeString(e.Audio) - if err != nil { - xlog.Error("failed to decode audio data", "error", err) - sendError(c, "invalid_audio_data", "Failed to decode audio data", "", "") - continue - } + session.AudioBufferLock.Lock() + allAudio := 
make([]byte, len(session.InputAudioBuffer)) + copy(allAudio, session.InputAudioBuffer) + session.InputAudioBuffer = nil + session.AudioBufferLock.Unlock() - // Check buffer size limits before appending - session.AudioBufferLock.Lock() - newSize := len(session.InputAudioBuffer) + len(decodedAudio) - if newSize > maxAudioBufferSize { - session.AudioBufferLock.Unlock() - xlog.Error("audio buffer size limit exceeded", "current_size", len(session.InputAudioBuffer), "incoming_size", len(decodedAudio), "limit", maxAudioBufferSize) - sendError(c, "buffer_size_exceeded", fmt.Sprintf("Audio buffer size limit exceeded (max %d bytes)", maxAudioBufferSize), "", "") - continue - } + sendEvent(t, types.InputAudioBufferCommittedEvent{ + ServerEventBase: types.ServerEventBase{}, + ItemID: generateItemID(), + }) - // Append to InputAudioBuffer - session.InputAudioBuffer = append(session.InputAudioBuffer, decodedAudio...) - session.AudioBufferLock.Unlock() + respCtx, respDone := session.startResponse(context.Background()) + go func() { + defer close(respDone) + commitUtterance(respCtx, allAudio, session, conversation, t) + }() + + case types.ConversationItemCreateEvent: + xlog.Debug("recv", "message", string(msg)) + // Add the item to the conversation + item := e.Item + // Ensure IDs are present + if item.User != nil && item.User.ID == "" { + item.User.ID = generateItemID() + } + if item.Assistant != nil && item.Assistant.ID == "" { + item.Assistant.ID = generateItemID() + } + if item.System != nil && item.System.ID == "" { + item.System.ID = generateItemID() + } + if item.FunctionCall != nil && item.FunctionCall.ID == "" { + item.FunctionCall.ID = generateItemID() + } + if item.FunctionCallOutput != nil && item.FunctionCallOutput.ID == "" { + item.FunctionCallOutput.ID = generateItemID() + } - case types.InputAudioBufferCommitEvent: - xlog.Debug("recv", "message", string(msg)) + conversation.Lock.Lock() + conversation.Items = append(conversation.Items, &item) + 
conversation.Lock.Unlock() - sessionLock.Lock() - isServerVAD := session.TurnDetection.ServerVad != nil - sessionLock.Unlock() + sendEvent(t, types.ConversationItemAddedEvent{ + ServerEventBase: types.ServerEventBase{ + EventID: e.EventID, + }, + PreviousItemID: e.PreviousItemID, + Item: item, + }) - // TODO: At the least need to check locking and timer state in the VAD Go routine before allowing this - if isServerVAD { - sendNotImplemented(c, "input_audio_buffer.commit in conjunction with VAD") - continue - } + case types.ConversationItemDeleteEvent: + sendError(t, "not_implemented", "Deleting items not implemented", "", "event_TODO") - session.AudioBufferLock.Lock() - allAudio := make([]byte, len(session.InputAudioBuffer)) - copy(allAudio, session.InputAudioBuffer) - session.InputAudioBuffer = nil - session.AudioBufferLock.Unlock() + case types.ConversationItemRetrieveEvent: + xlog.Debug("recv", "message", string(msg)) - go commitUtterance(context.TODO(), allAudio, session, conversation, c) + if e.ItemID == "" { + sendError(t, "invalid_item_id", "Need item_id, but none specified", "", "event_TODO") + continue + } - case types.ConversationItemCreateEvent: - xlog.Debug("recv", "message", string(msg)) - // Add the item to the conversation - item := e.Item - // Ensure IDs are present - if item.User != nil && item.User.ID == "" { - item.User.ID = generateItemID() - } - if item.Assistant != nil && item.Assistant.ID == "" { - item.Assistant.ID = generateItemID() - } - if item.System != nil && item.System.ID == "" { - item.System.ID = generateItemID() - } - if item.FunctionCall != nil && item.FunctionCall.ID == "" { - item.FunctionCall.ID = generateItemID() + conversation.Lock.Lock() + var retrievedItem types.MessageItemUnion + for _, item := range conversation.Items { + // We need to check ID in the union + var id string + if item.System != nil { + id = item.System.ID + } else if item.User != nil { + id = item.User.ID + } else if item.Assistant != nil { + id = 
item.Assistant.ID + } else if item.FunctionCall != nil { + id = item.FunctionCall.ID + } else if item.FunctionCallOutput != nil { + id = item.FunctionCallOutput.ID } - if item.FunctionCallOutput != nil && item.FunctionCallOutput.ID == "" { - item.FunctionCallOutput.ID = generateItemID() - } - - conversation.Lock.Lock() - conversation.Items = append(conversation.Items, &item) - conversation.Lock.Unlock() - - sendEvent(c, types.ConversationItemAddedEvent{ - ServerEventBase: types.ServerEventBase{ - EventID: e.EventID, - }, - PreviousItemID: e.PreviousItemID, - Item: item, - }) - case types.ConversationItemDeleteEvent: - sendError(c, "not_implemented", "Deleting items not implemented", "", "event_TODO") + if id == e.ItemID { + retrievedItem = *item + break + } + } + conversation.Lock.Unlock() - case types.ConversationItemRetrieveEvent: - xlog.Debug("recv", "message", string(msg)) + sendEvent(t, types.ConversationItemRetrievedEvent{ + ServerEventBase: types.ServerEventBase{ + EventID: "event_TODO", + }, + Item: retrievedItem, + }) - if e.ItemID == "" { - sendError(c, "invalid_item_id", "Need item_id, but none specified", "", "event_TODO") - continue - } + case types.ResponseCreateEvent: + xlog.Debug("recv", "message", string(msg)) + // Handle optional items to add to context + if len(e.Response.Input) > 0 { conversation.Lock.Lock() - var retrievedItem types.MessageItemUnion - for _, item := range conversation.Items { - // We need to check ID in the union - var id string - if item.System != nil { - id = item.System.ID - } else if item.User != nil { - id = item.User.ID - } else if item.Assistant != nil { - id = item.Assistant.ID - } else if item.FunctionCall != nil { - id = item.FunctionCall.ID - } else if item.FunctionCallOutput != nil { - id = item.FunctionCallOutput.ID + for _, item := range e.Response.Input { + // Ensure IDs are present + if item.User != nil && item.User.ID == "" { + item.User.ID = generateItemID() } - - if id == e.ItemID { - retrievedItem = *item - 
break + if item.Assistant != nil && item.Assistant.ID == "" { + item.Assistant.ID = generateItemID() } - } - conversation.Lock.Unlock() - - sendEvent(c, types.ConversationItemRetrievedEvent{ - ServerEventBase: types.ServerEventBase{ - EventID: "event_TODO", - }, - Item: retrievedItem, - }) - - case types.ResponseCreateEvent: - xlog.Debug("recv", "message", string(msg)) - - // Handle optional items to add to context - if len(e.Response.Input) > 0 { - conversation.Lock.Lock() - for _, item := range e.Response.Input { - // Ensure IDs are present - if item.User != nil && item.User.ID == "" { - item.User.ID = generateItemID() - } - if item.Assistant != nil && item.Assistant.ID == "" { - item.Assistant.ID = generateItemID() - } - if item.System != nil && item.System.ID == "" { - item.System.ID = generateItemID() - } - if item.FunctionCall != nil && item.FunctionCall.ID == "" { - item.FunctionCall.ID = generateItemID() - } - if item.FunctionCallOutput != nil && item.FunctionCallOutput.ID == "" { - item.FunctionCallOutput.ID = generateItemID() - } - - conversation.Items = append(conversation.Items, &item) + if item.System != nil && item.System.ID == "" { + item.System.ID = generateItemID() + } + if item.FunctionCall != nil && item.FunctionCall.ID == "" { + item.FunctionCall.ID = generateItemID() + } + if item.FunctionCallOutput != nil && item.FunctionCallOutput.ID == "" { + item.FunctionCallOutput.ID = generateItemID() } - conversation.Lock.Unlock() - } - go triggerResponse(session, conversation, c, &e.Response) + conversation.Items = append(conversation.Items, &item) + } + conversation.Lock.Unlock() + } - case types.ResponseCancelEvent: - xlog.Debug("recv", "message", string(msg)) + respCtx, respDone := session.startResponse(context.Background()) + go func() { + defer close(respDone) + triggerResponse(respCtx, session, conversation, t, &e.Response) + }() - // Handle cancellation of ongoing responses - // Implement cancellation logic as needed - sendNotImplemented(c, 
"response.cancel") + case types.ResponseCancelEvent: + xlog.Debug("recv", "message", string(msg)) + session.cancelActiveResponse() - default: - xlog.Error("unknown message type") - // sendError(c, "unknown_message_type", fmt.Sprintf("Unknown message type: %s", incomingMsg.Type), "", "") - } + default: + xlog.Error("unknown message type") + // sendError(t, "unknown_message_type", fmt.Sprintf("Unknown message type: %s", incomingMsg.Type), "", "") } + } - // Close the done channel to signal goroutines to exit - close(done) - wg.Wait() + // Cancel any in-flight response before tearing down + session.cancelActiveResponse() - // Remove the session from the sessions map - sessionLock.Lock() - delete(sessions, sessionID) - sessionLock.Unlock() + // Signal any running VAD goroutine to exit. + if vadServerStarted { + close(done) } + wg.Wait() + + // Remove the session from the sessions map + sessionLock.Lock() + delete(sessions, sessionID) + sessionLock.Unlock() } -// Helper function to send events to the client -func sendEvent(c *LockedWebsocket, event types.ServerEvent) { - eventBytes, err := json.Marshal(event) - if err != nil { - xlog.Error("failed to marshal event", "error", err) - return - } - if err = c.WriteMessage(websocket.TextMessage, eventBytes); err != nil { +// sendEvent sends a server event via the transport, logging any errors. +func sendEvent(t Transport, event types.ServerEvent) { + if err := t.SendEvent(event); err != nil { xlog.Error("write error", "error", err) } } -// Helper function to send errors to the client -func sendError(c *LockedWebsocket, code, message, param, eventID string) { +// sendError sends an error event to the client. 
+func sendError(t Transport, code, message, param, eventID string) { errorEvent := types.ErrorEvent{ ServerEventBase: types.ServerEventBase{ EventID: eventID, @@ -572,11 +644,35 @@ func sendError(c *LockedWebsocket, code, message, param, eventID string) { }, } - sendEvent(c, errorEvent) + sendEvent(t, errorEvent) } -func sendNotImplemented(c *LockedWebsocket, message string) { - sendError(c, "not_implemented", message, "", "event_TODO") +func sendNotImplemented(t Transport, message string) { + sendError(t, "not_implemented", message, "", "event_TODO") +} + +// sendTestTone generates a 1-second 440 Hz sine wave and sends it through +// the transport's audio path. This exercises the full Opus encode → RTP → +// browser decode pipeline without involving TTS. +func sendTestTone(t Transport) { + const ( + freq = 440.0 + sampleRate = 24000 + duration = 1 // seconds + amplitude = 16000 + numSamples = sampleRate * duration + ) + + pcm := make([]byte, numSamples*2) // 16-bit samples = 2 bytes each + for i := 0; i < numSamples; i++ { + sample := int16(amplitude * math.Sin(2*math.Pi*freq*float64(i)/sampleRate)) + binary.LittleEndian.PutUint16(pcm[i*2:], uint16(sample)) + } + + xlog.Debug("Sending test tone", "samples", numSamples, "sample_rate", sampleRate, "freq", freq) + if err := t.SendAudio(context.Background(), pcm, sampleRate); err != nil { + xlog.Error("test tone send failed", "error", err) + } } func updateTransSession(session *Session, update *types.SessionUnion, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) error { @@ -616,7 +712,7 @@ func updateTransSession(session *Session, update *types.SessionUnion, cl *config trCur.Prompt = trUpd.Prompt } - if update.Transcription.Audio.Input.TurnDetection != nil { + if update.Transcription.Audio.Input.TurnDetectionSet { session.TurnDetection = update.Transcription.Audio.Input.TurnDetection } @@ -675,7 +771,7 @@ func updateSession(session *Session, update *types.SessionUnion, cl 
*config.Mode session.ModelInterface = m } - if rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.TurnDetection != nil { + if rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.TurnDetectionSet { session.TurnDetection = rt.Audio.Input.TurnDetection } @@ -685,6 +781,12 @@ func updateSession(session *Session, update *types.SessionUnion, cl *config.Mode } } + if rt.Audio != nil && rt.Audio.Output != nil && rt.Audio.Output.Format != nil && rt.Audio.Output.Format.PCM != nil { + if rt.Audio.Output.Format.PCM.Rate > 0 { + session.OutputSampleRate = rt.Audio.Output.Format.PCM.Rate + } + } + if rt.Instructions != "" { session.Instructions = rt.Instructions } @@ -705,7 +807,7 @@ func updateSession(session *Session, update *types.SessionUnion, cl *config.Mode // handleVAD is a goroutine that listens for audio data from the client, // runs VAD on the audio data, and commits utterances to the conversation -func handleVAD(session *Session, conv *Conversation, c *LockedWebsocket, done chan struct{}) { +func handleVAD(session *Session, conv *Conversation, t Transport, done chan struct{}) { vadContext, cancel := context.WithCancel(context.Background()) go func() { <-done @@ -713,7 +815,7 @@ func handleVAD(session *Session, conv *Conversation, c *LockedWebsocket, done ch }() silenceThreshold := 0.5 // Default 500ms - if session.TurnDetection.ServerVad != nil { + if session.TurnDetection != nil && session.TurnDetection.ServerVad != nil { silenceThreshold = float64(session.TurnDetection.ServerVad.SilenceDurationMs) / 1000 } @@ -734,7 +836,7 @@ func handleVAD(session *Session, conv *Conversation, c *LockedWebsocket, done ch session.AudioBufferLock.Unlock() aints := sound.BytesToInt16sLE(allAudio) - if len(aints) == 0 || len(aints) < int(silenceThreshold)*session.InputSampleRate { + if len(aints) == 0 || len(aints) < int(silenceThreshold*float64(session.InputSampleRate)) { continue } @@ -748,7 +850,7 @@ func handleVAD(session *Session, conv *Conversation, c 
*LockedWebsocket, done ch continue } xlog.Error("failed to process audio", "error", err) - sendError(c, "processing_error", "Failed to process audio: "+err.Error(), "", "") + sendError(t, "processing_error", "Failed to process audio: "+err.Error(), "", "") continue } @@ -760,21 +862,17 @@ func handleVAD(session *Session, conv *Conversation, c *LockedWebsocket, done ch session.InputAudioBuffer = nil session.AudioBufferLock.Unlock() - // NOTE: OpenAI doesn't send this message unless the client requests it - // xlog.Debug("Detected silence for a while, clearing audio buffer") - // sendEvent(c, types.InputAudioBufferClearedEvent{ - // ServerEventBase: types.ServerEventBase{ - // EventID: "event_TODO", - // }, - // }) - continue } else if len(segments) == 0 { continue } if !speechStarted { - sendEvent(c, types.InputAudioBufferSpeechStartedEvent{ + // Barge-in: cancel any in-flight response so we stop + // sending audio and don't keep the interrupted reply in history. + session.cancelActiveResponse() + + sendEvent(t, types.InputAudioBufferSpeechStartedEvent{ ServerEventBase: types.ServerEventBase{ EventID: "event_TODO", }, @@ -795,7 +893,7 @@ func handleVAD(session *Session, conv *Conversation, c *LockedWebsocket, done ch session.InputAudioBuffer = nil session.AudioBufferLock.Unlock() - sendEvent(c, types.InputAudioBufferSpeechStoppedEvent{ + sendEvent(t, types.InputAudioBufferSpeechStoppedEvent{ ServerEventBase: types.ServerEventBase{ EventID: "event_TODO", }, @@ -803,7 +901,7 @@ func handleVAD(session *Session, conv *Conversation, c *LockedWebsocket, done ch }) speechStarted = false - sendEvent(c, types.InputAudioBufferCommittedEvent{ + sendEvent(t, types.InputAudioBufferCommittedEvent{ ServerEventBase: types.ServerEventBase{ EventID: "event_TODO", }, @@ -813,13 +911,17 @@ func handleVAD(session *Session, conv *Conversation, c *LockedWebsocket, done ch abytes := sound.Int16toBytesLE(aints) // TODO: Remove prefix silence that is is over 
TurnDetectionParams.PrefixPaddingMs - go commitUtterance(vadContext, abytes, session, conv, c) + respCtx, respDone := session.startResponse(vadContext) + go func() { + defer close(respDone) + commitUtterance(respCtx, abytes, session, conv, t) + }() } } } } -func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Conversation, c *LockedWebsocket) { +func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Conversation, t Transport) { if len(utt) == 0 { return } @@ -851,15 +953,15 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co if session.InputAudioTranscription != nil { tr, err := session.ModelInterface.Transcribe(ctx, f.Name(), session.InputAudioTranscription.Language, false, false, session.InputAudioTranscription.Prompt) if err != nil { - sendError(c, "transcription_failed", err.Error(), "", "event_TODO") + sendError(t, "transcription_failed", err.Error(), "", "event_TODO") return } else if tr == nil { - sendError(c, "transcription_failed", "trancribe result is nil", "", "event_TODO") + sendError(t, "transcription_failed", "trancribe result is nil", "", "event_TODO") return } transcript = tr.Text - sendEvent(c, types.ConversationItemInputAudioTranscriptionCompletedEvent{ + sendEvent(t, types.ConversationItemInputAudioTranscriptionCompletedEvent{ ServerEventBase: types.ServerEventBase{ EventID: "event_TODO", }, @@ -871,12 +973,12 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co Transcript: transcript, }) } else { - sendNotImplemented(c, "any-to-any models") + sendNotImplemented(t, "any-to-any models") return } if !session.TranscriptionOnly { - generateResponse(session, utt, transcript, conv, c, websocket.TextMessage) + generateResponse(ctx, session, utt, transcript, conv, t) } } @@ -901,7 +1003,7 @@ func runVAD(ctx context.Context, session *Session, adata []int16) ([]schema.VADS } // Function to generate a response based on the conversation -func 
generateResponse(session *Session, utt []byte, transcript string, conv *Conversation, c *LockedWebsocket, mt int) { +func generateResponse(ctx context.Context, session *Session, utt []byte, transcript string, conv *Conversation, t Transport) { xlog.Debug("Generating realtime response...") // Create user message item @@ -922,14 +1024,14 @@ func generateResponse(session *Session, utt []byte, transcript string, conv *Con conv.Items = append(conv.Items, &item) conv.Lock.Unlock() - sendEvent(c, types.ConversationItemAddedEvent{ + sendEvent(t, types.ConversationItemAddedEvent{ Item: item, }) - triggerResponse(session, conv, c, nil) + triggerResponse(ctx, session, conv, t, nil) } -func triggerResponse(session *Session, conv *Conversation, c *LockedWebsocket, overrides *types.ResponseCreateParams) { +func triggerResponse(ctx context.Context, session *Session, conv *Conversation, t Transport, overrides *types.ResponseCreateParams) { config := session.ModelInterface.PredictConfig() // Default values @@ -1077,7 +1179,7 @@ func triggerResponse(session *Session, conv *Conversation, c *LockedWebsocket, o } responseID := generateUniqueID() - sendEvent(c, types.ResponseCreatedEvent{ + sendEvent(t, types.ResponseCreatedEvent{ ServerEventBase: types.ServerEventBase{}, Response: types.Response{ ID: responseID, @@ -1086,15 +1188,29 @@ func triggerResponse(session *Session, conv *Conversation, c *LockedWebsocket, o }, }) - predFunc, err := session.ModelInterface.Predict(context.TODO(), conversationHistory, images, nil, nil, nil, tools, toolChoice, nil, nil, nil) + predFunc, err := session.ModelInterface.Predict(ctx, conversationHistory, images, nil, nil, nil, tools, toolChoice, nil, nil, nil) if err != nil { - sendError(c, "inference_failed", fmt.Sprintf("backend error: %v", err), "", "") // item.Assistant.ID is unknown here + sendError(t, "inference_failed", fmt.Sprintf("backend error: %v", err), "", "") // item.Assistant.ID is unknown here return } pred, err := predFunc() if err != 
nil { - sendError(c, "prediction_failed", fmt.Sprintf("backend error: %v", err), "", "") + sendError(t, "prediction_failed", fmt.Sprintf("backend error: %v", err), "", "") + return + } + + // Check for cancellation after LLM inference (barge-in may have fired) + if ctx.Err() != nil { + xlog.Debug("Response cancelled after LLM inference (barge-in)") + sendEvent(t, types.ResponseDoneEvent{ + ServerEventBase: types.ServerEventBase{}, + Response: types.Response{ + ID: responseID, + Object: "realtime.response", + Status: types.ResponseStatusCancelled, + }, + }) return } @@ -1194,14 +1310,14 @@ func triggerResponse(session *Session, conv *Conversation, c *LockedWebsocket, o conv.Items = append(conv.Items, &item) conv.Lock.Unlock() - sendEvent(c, types.ResponseOutputItemAddedEvent{ + sendEvent(t, types.ResponseOutputItemAddedEvent{ ServerEventBase: types.ServerEventBase{}, ResponseID: responseID, OutputIndex: 0, Item: item, }) - sendEvent(c, types.ResponseContentPartAddedEvent{ + sendEvent(t, types.ResponseContentPartAddedEvent{ ServerEventBase: types.ServerEventBase{}, ResponseID: responseID, ItemID: item.Assistant.ID, @@ -1210,15 +1326,54 @@ func triggerResponse(session *Session, conv *Conversation, c *LockedWebsocket, o Part: item.Assistant.Content[0], }) - audioFilePath, res, err := session.ModelInterface.TTS(context.TODO(), finalSpeech, session.Voice, session.InputAudioTranscription.Language) + // removeItemFromConv removes the last occurrence of an item with + // the given assistant ID from conversation history. + removeItemFromConv := func(assistantID string) { + conv.Lock.Lock() + for i := len(conv.Items) - 1; i >= 0; i-- { + if conv.Items[i].Assistant != nil && conv.Items[i].Assistant.ID == assistantID { + conv.Items = append(conv.Items[:i], conv.Items[i+1:]...) + break + } + } + conv.Lock.Unlock() + } + + // sendCancelledResponse emits the cancelled status and cleans up the + // assistant item so the interrupted reply is not in chat history. 
+ sendCancelledResponse := func() { + removeItemFromConv(item.Assistant.ID) + sendEvent(t, types.ResponseDoneEvent{ + ServerEventBase: types.ServerEventBase{}, + Response: types.Response{ + ID: responseID, + Object: "realtime.response", + Status: types.ResponseStatusCancelled, + }, + }) + } + + // Check for cancellation before TTS + if ctx.Err() != nil { + xlog.Debug("Response cancelled before TTS (barge-in)") + sendCancelledResponse() + return + } + + audioFilePath, res, err := session.ModelInterface.TTS(ctx, finalSpeech, session.Voice, session.InputAudioTranscription.Language) if err != nil { + if ctx.Err() != nil { + xlog.Debug("TTS cancelled (barge-in)") + sendCancelledResponse() + return + } xlog.Error("TTS failed", "error", err) - sendError(c, "tts_error", fmt.Sprintf("TTS generation failed: %v", err), "", item.Assistant.ID) + sendError(t, "tts_error", fmt.Sprintf("TTS generation failed: %v", err), "", item.Assistant.ID) return } if !res.Success { xlog.Error("TTS failed", "message", res.Message) - sendError(c, "tts_error", fmt.Sprintf("TTS generation failed: %s", res.Message), "", item.Assistant.ID) + sendError(t, "tts_error", fmt.Sprintf("TTS generation failed: %s", res.Message), "", item.Assistant.ID) return } defer os.Remove(audioFilePath) @@ -1226,21 +1381,41 @@ func triggerResponse(session *Session, conv *Conversation, c *LockedWebsocket, o audioBytes, err := os.ReadFile(audioFilePath) if err != nil { xlog.Error("failed to read TTS file", "error", err) - sendError(c, "tts_error", fmt.Sprintf("Failed to read TTS audio: %v", err), "", item.Assistant.ID) + sendError(t, "tts_error", fmt.Sprintf("Failed to read TTS audio: %v", err), "", item.Assistant.ID) return } - // Strip WAV header (44 bytes) to get raw PCM data - // The OpenAI Realtime API expects raw PCM, not WAV files - const wavHeaderSize = 44 - pcmData := audioBytes - if len(audioBytes) > wavHeaderSize { - pcmData = audioBytes[wavHeaderSize:] + // Parse WAV header to get raw PCM and the actual sample 
rate from the TTS backend. + pcmData, ttsSampleRate := laudio.ParseWAV(audioBytes) + if ttsSampleRate == 0 { + ttsSampleRate = localSampleRate + } + xlog.Debug("TTS audio parsed", "raw_bytes", len(audioBytes), "pcm_bytes", len(pcmData), "sample_rate", ttsSampleRate) + + // SendAudio (WebRTC) passes PCM at the TTS sample rate directly to the + // Opus encoder, which resamples to 48kHz internally. This avoids a + // lossy intermediate resample through 16kHz. + if err := t.SendAudio(ctx, pcmData, ttsSampleRate); err != nil { + if ctx.Err() != nil { + xlog.Debug("Audio playback cancelled (barge-in)") + sendCancelledResponse() + return + } + xlog.Error("failed to send audio via transport", "error", err) } - audioString := base64.StdEncoding.EncodeToString(pcmData) + // The base64 event (used by WebSocket clients) should be at the + // session's output sample rate. This is separate from InputSampleRate + // which tracks inbound audio (e.g. 16kHz for WebRTC). + wsPCM := pcmData + if ttsSampleRate != session.OutputSampleRate { + samples := sound.BytesToInt16sLE(pcmData) + resampled := sound.ResampleInt16(samples, ttsSampleRate, session.OutputSampleRate) + wsPCM = sound.Int16toBytesLE(resampled) + } + audioString := base64.StdEncoding.EncodeToString(wsPCM) - sendEvent(c, types.ResponseOutputAudioTranscriptDeltaEvent{ + sendEvent(t, types.ResponseOutputAudioTranscriptDeltaEvent{ ServerEventBase: types.ServerEventBase{}, ResponseID: responseID, ItemID: item.Assistant.ID, @@ -1248,7 +1423,7 @@ func triggerResponse(session *Session, conv *Conversation, c *LockedWebsocket, o ContentIndex: 0, Delta: finalSpeech, }) - sendEvent(c, types.ResponseOutputAudioTranscriptDoneEvent{ + sendEvent(t, types.ResponseOutputAudioTranscriptDoneEvent{ ServerEventBase: types.ServerEventBase{}, ResponseID: responseID, ItemID: item.Assistant.ID, @@ -1257,7 +1432,7 @@ func triggerResponse(session *Session, conv *Conversation, c *LockedWebsocket, o Transcript: finalSpeech, }) - sendEvent(c, 
types.ResponseOutputAudioDeltaEvent{ + sendEvent(t, types.ResponseOutputAudioDeltaEvent{ ServerEventBase: types.ServerEventBase{}, ResponseID: responseID, ItemID: item.Assistant.ID, @@ -1265,7 +1440,7 @@ func triggerResponse(session *Session, conv *Conversation, c *LockedWebsocket, o ContentIndex: 0, Delta: audioString, }) - sendEvent(c, types.ResponseOutputAudioDoneEvent{ + sendEvent(t, types.ResponseOutputAudioDoneEvent{ ServerEventBase: types.ServerEventBase{}, ResponseID: responseID, ItemID: item.Assistant.ID, @@ -1273,7 +1448,7 @@ func triggerResponse(session *Session, conv *Conversation, c *LockedWebsocket, o ContentIndex: 0, }) - sendEvent(c, types.ResponseContentPartDoneEvent{ + sendEvent(t, types.ResponseContentPartDoneEvent{ ServerEventBase: types.ServerEventBase{}, ResponseID: responseID, ItemID: item.Assistant.ID, @@ -1287,7 +1462,7 @@ func triggerResponse(session *Session, conv *Conversation, c *LockedWebsocket, o item.Assistant.Content[0].Audio = audioString conv.Lock.Unlock() - sendEvent(c, types.ResponseOutputItemDoneEvent{ + sendEvent(t, types.ResponseOutputItemDoneEvent{ ServerEventBase: types.ServerEventBase{}, ResponseID: responseID, OutputIndex: 0, @@ -1321,14 +1496,14 @@ func triggerResponse(session *Session, conv *Conversation, c *LockedWebsocket, o outputIndex++ } - sendEvent(c, types.ResponseOutputItemAddedEvent{ + sendEvent(t, types.ResponseOutputItemAddedEvent{ ServerEventBase: types.ServerEventBase{}, ResponseID: responseID, OutputIndex: outputIndex, Item: fcItem, }) - sendEvent(c, types.ResponseFunctionCallArgumentsDeltaEvent{ + sendEvent(t, types.ResponseFunctionCallArgumentsDeltaEvent{ ServerEventBase: types.ServerEventBase{}, ResponseID: responseID, ItemID: toolCallID, @@ -1337,7 +1512,7 @@ func triggerResponse(session *Session, conv *Conversation, c *LockedWebsocket, o Delta: tc.Arguments, }) - sendEvent(c, types.ResponseFunctionCallArgumentsDoneEvent{ + sendEvent(t, types.ResponseFunctionCallArgumentsDoneEvent{ ServerEventBase: 
types.ServerEventBase{}, ResponseID: responseID, ItemID: toolCallID, @@ -1347,7 +1522,7 @@ func triggerResponse(session *Session, conv *Conversation, c *LockedWebsocket, o Name: tc.Name, }) - sendEvent(c, types.ResponseOutputItemDoneEvent{ + sendEvent(t, types.ResponseOutputItemDoneEvent{ ServerEventBase: types.ServerEventBase{}, ResponseID: responseID, OutputIndex: outputIndex, @@ -1355,7 +1530,7 @@ func triggerResponse(session *Session, conv *Conversation, c *LockedWebsocket, o }) } - sendEvent(c, types.ResponseDoneEvent{ + sendEvent(t, types.ResponseDoneEvent{ ServerEventBase: types.ServerEventBase{}, Response: types.Response{ ID: responseID, diff --git a/core/http/endpoints/openai/realtime_transport.go b/core/http/endpoints/openai/realtime_transport.go new file mode 100644 index 000000000000..5ffcb0ba917e --- /dev/null +++ b/core/http/endpoints/openai/realtime_transport.go @@ -0,0 +1,23 @@ +package openai + +import ( + "context" + + "github.com/mudler/LocalAI/core/http/endpoints/openai/types" +) + +// Transport abstracts event and audio I/O so the same session logic +// can serve both WebSocket and WebRTC connections. +type Transport interface { + // SendEvent marshals and sends a server event to the client. + SendEvent(event types.ServerEvent) error + // ReadEvent reads the next raw client event (JSON bytes). + ReadEvent() ([]byte, error) + // SendAudio sends raw PCM audio to the client at the given sample rate. + // For WebSocket this is a no-op (audio is sent via JSON events). + // For WebRTC this encodes to Opus and writes to the media track. + // The context allows cancellation for barge-in support. + SendAudio(ctx context.Context, pcmData []byte, sampleRate int) error + // Close tears down the underlying connection. 
+ Close() error +} diff --git a/core/http/endpoints/openai/realtime_transport_webrtc.go b/core/http/endpoints/openai/realtime_transport_webrtc.go new file mode 100644 index 000000000000..f25d573db163 --- /dev/null +++ b/core/http/endpoints/openai/realtime_transport_webrtc.go @@ -0,0 +1,250 @@ +package openai + +import ( + "context" + "encoding/json" + "fmt" + "math/rand/v2" + "sync" + "time" + + "github.com/mudler/LocalAI/core/http/endpoints/openai/types" + "github.com/mudler/xlog" + "github.com/pion/rtp" + "github.com/pion/webrtc/v4" +) + +// WebRTCTransport implements Transport over a pion/webrtc PeerConnection. +// Events travel via the "oai-events" DataChannel; audio goes over an RTP track. +type WebRTCTransport struct { + pc *webrtc.PeerConnection + dc *webrtc.DataChannel + audioTrack *webrtc.TrackLocalStaticRTP + encoder *OpusEncoder + inEvents chan []byte + outEvents chan []byte // buffered outbound event queue + closed chan struct{} + closeOnce sync.Once + flushed chan struct{} // closed when sender goroutine has drained outEvents + dcReady chan struct{} // closed when data channel is open + dcReadyOnce sync.Once + sessionCh chan *Session // delivers session from runRealtimeSession to handleIncomingAudioTrack + + // RTP state for outbound audio — protected by rtpMu + rtpMu sync.Mutex + rtpSeqNum uint16 + rtpTimestamp uint32 + rtpMarker bool // true → next packet gets marker bit set +} + +func NewWebRTCTransport(pc *webrtc.PeerConnection, audioTrack *webrtc.TrackLocalStaticRTP) (*WebRTCTransport, error) { + enc, err := NewOpusEncoder() + if err != nil { + return nil, fmt.Errorf("webrtc transport: %w", err) + } + + t := &WebRTCTransport{ + pc: pc, + audioTrack: audioTrack, + encoder: enc, + inEvents: make(chan []byte, 256), + outEvents: make(chan []byte, 256), + closed: make(chan struct{}), + flushed: make(chan struct{}), + dcReady: make(chan struct{}), + sessionCh: make(chan *Session, 1), + rtpSeqNum: uint16(rand.UintN(65536)), + rtpTimestamp: rand.Uint32(), 
+ rtpMarker: true, // first packet of the stream gets marker + } + + // The client creates the "oai-events" data channel (so m=application is + // included in the SDP offer). We receive it here via OnDataChannel. + pc.OnDataChannel(func(dc *webrtc.DataChannel) { + if dc.Label() != "oai-events" { + return + } + t.dc = dc + dc.OnOpen(func() { + t.dcReadyOnce.Do(func() { close(t.dcReady) }) + }) + dc.OnMessage(func(msg webrtc.DataChannelMessage) { + select { + case t.inEvents <- msg.Data: + case <-t.closed: + } + }) + // The channel may already be open by the time OnDataChannel fires + if dc.ReadyState() == webrtc.DataChannelStateOpen { + t.dcReadyOnce.Do(func() { close(t.dcReady) }) + } + }) + + pc.OnConnectionStateChange(func(state webrtc.PeerConnectionState) { + xlog.Debug("WebRTC connection state", "state", state.String()) + if state == webrtc.PeerConnectionStateFailed || + state == webrtc.PeerConnectionStateClosed || + state == webrtc.PeerConnectionStateDisconnected { + t.closeOnce.Do(func() { close(t.closed) }) + } + }) + + go t.sendLoop() + + return t, nil +} + +// sendLoop is a dedicated goroutine that drains outEvents and sends them +// over the data channel. It waits for the data channel to open before +// sending, and drains any remaining events when closed is signalled. 
+func (t *WebRTCTransport) sendLoop() { + defer close(t.flushed) + + // Wait for data channel to be ready + select { + case <-t.dcReady: + case <-t.closed: + return + } + + for { + select { + case data, ok := <-t.outEvents: + if !ok { + return + } + if err := t.dc.SendText(string(data)); err != nil { + xlog.Error("data channel send failed", "error", err) + return + } + case <-t.closed: + // Drain any remaining queued events before exiting + for { + select { + case data := <-t.outEvents: + if err := t.dc.SendText(string(data)); err != nil { + return + } + default: + return + } + } + } + } +} + +func (t *WebRTCTransport) SendEvent(event types.ServerEvent) error { + data, err := json.Marshal(event) + if err != nil { + return fmt.Errorf("marshal event: %w", err) + } + + select { + case t.outEvents <- data: + return nil + case <-t.closed: + return fmt.Errorf("transport closed") + } +} + +func (t *WebRTCTransport) ReadEvent() ([]byte, error) { + select { + case msg := <-t.inEvents: + return msg, nil + case <-t.closed: + return nil, fmt.Errorf("transport closed") + } +} + +// SendAudio encodes raw PCM int16 LE to Opus and writes RTP packets to the +// audio track. The encoder resamples from the given sampleRate to 48kHz +// internally. Frames are paced at real-time intervals (20ms per frame) to +// avoid overwhelming the browser's jitter buffer with a burst of packets. +// +// The context allows callers to cancel mid-stream for barge-in support. +// When cancelled, the marker bit is set so the next audio segment starts +// cleanly in the browser's jitter buffer. +// +// RTP packets are constructed manually (rather than via WriteSample) so we +// can control the marker bit. pion's WriteSample sets the marker bit on +// every Opus packet, which causes Chrome's NetEq jitter buffer to reset +// its timing estimation for each frame, producing severe audio distortion. 
+func (t *WebRTCTransport) SendAudio(ctx context.Context, pcmData []byte, sampleRate int) error { + frames, err := t.encoder.Encode(pcmData, sampleRate) + if err != nil { + return err + } + + const frameDuration = 20 * time.Millisecond + const samplesPerFrame = 960 // 20ms at 48kHz + + ticker := time.NewTicker(frameDuration) + defer ticker.Stop() + + for i, frame := range frames { + t.rtpMu.Lock() + pkt := &rtp.Packet{ + Header: rtp.Header{ + Version: 2, + Marker: t.rtpMarker, + SequenceNumber: t.rtpSeqNum, + Timestamp: t.rtpTimestamp, + // SSRC and PayloadType are overridden by pion's writeRTP + }, + Payload: frame, + } + t.rtpSeqNum++ + t.rtpTimestamp += samplesPerFrame + t.rtpMarker = false // only the first packet gets marker + t.rtpMu.Unlock() + + if err := t.audioTrack.WriteRTP(pkt); err != nil { + return fmt.Errorf("write rtp: %w", err) + } + + // Pace output at ~real-time so the browser's jitter buffer + // receives packets at the expected rate. Skip wait after last frame. + if i < len(frames)-1 { + select { + case <-ticker.C: + case <-ctx.Done(): + // Barge-in: mark the next packet so the browser knows + // a new audio segment is starting after the interruption. + t.rtpMu.Lock() + t.rtpMarker = true + t.rtpMu.Unlock() + return ctx.Err() + case <-t.closed: + return fmt.Errorf("transport closed during audio send") + } + } + } + return nil +} + +// SetSession delivers the session to any goroutine waiting in WaitForSession. +func (t *WebRTCTransport) SetSession(s *Session) { + select { + case t.sessionCh <- s: + case <-t.closed: + } +} + +// WaitForSession blocks until the session is available or the transport closes. 
+func (t *WebRTCTransport) WaitForSession() *Session { + select { + case s := <-t.sessionCh: + return s + case <-t.closed: + return nil + } +} + +func (t *WebRTCTransport) Close() error { + // Signal no more events and unblock the sender if it's waiting + t.closeOnce.Do(func() { close(t.closed) }) + // Wait for the sender to drain any remaining queued events + <-t.flushed + t.encoder.Close() + return t.pc.Close() +} diff --git a/core/http/endpoints/openai/realtime_transport_ws.go b/core/http/endpoints/openai/realtime_transport_ws.go new file mode 100644 index 000000000000..6621f2ca6b82 --- /dev/null +++ b/core/http/endpoints/openai/realtime_transport_ws.go @@ -0,0 +1,47 @@ +package openai + +import ( + "context" + "encoding/json" + "sync" + + "github.com/gorilla/websocket" + "github.com/mudler/LocalAI/core/http/endpoints/openai/types" + "github.com/mudler/xlog" +) + +// WebSocketTransport implements Transport over a gorilla/websocket connection. +type WebSocketTransport struct { + conn *websocket.Conn + mu sync.Mutex +} + +func NewWebSocketTransport(conn *websocket.Conn) *WebSocketTransport { + return &WebSocketTransport{conn: conn} +} + +func (t *WebSocketTransport) SendEvent(event types.ServerEvent) error { + eventBytes, err := json.Marshal(event) + if err != nil { + xlog.Error("failed to marshal event", "error", err) + return err + } + t.mu.Lock() + defer t.mu.Unlock() + return t.conn.WriteMessage(websocket.TextMessage, eventBytes) +} + +func (t *WebSocketTransport) ReadEvent() ([]byte, error) { + _, msg, err := t.conn.ReadMessage() + return msg, err +} + +// SendAudio is a no-op for WebSocket — audio is delivered via JSON events +// (base64-encoded in response.audio.delta). 
+func (t *WebSocketTransport) SendAudio(_ context.Context, _ []byte, _ int) error { + return nil +} + +func (t *WebSocketTransport) Close() error { + return t.conn.Close() +} diff --git a/core/http/endpoints/openai/realtime_webrtc.go b/core/http/endpoints/openai/realtime_webrtc.go new file mode 100644 index 000000000000..6d3ead99b820 --- /dev/null +++ b/core/http/endpoints/openai/realtime_webrtc.go @@ -0,0 +1,250 @@ +package openai + +import ( + "math" + "net/http" + "time" + + "github.com/labstack/echo/v4" + "github.com/mudler/LocalAI/core/application" + "github.com/mudler/LocalAI/pkg/sound" + "github.com/mudler/xlog" + "github.com/pion/webrtc/v4" +) + +// RealtimeCallRequest is the JSON body for POST /v1/realtime/calls. +type RealtimeCallRequest struct { + SDP string `json:"sdp"` + Model string `json:"model"` +} + +// RealtimeCallResponse is the JSON response for POST /v1/realtime/calls. +type RealtimeCallResponse struct { + SDP string `json:"sdp"` + SessionID string `json:"session_id"` +} + +// RealtimeCalls handles POST /v1/realtime/calls for WebRTC signaling. 
+func RealtimeCalls(application *application.Application) echo.HandlerFunc { + return func(c echo.Context) error { + var req RealtimeCallRequest + if err := c.Bind(&req); err != nil { + return c.JSON(http.StatusBadRequest, map[string]string{"error": "invalid request body"}) + } + if req.SDP == "" { + return c.JSON(http.StatusBadRequest, map[string]string{"error": "sdp is required"}) + } + if req.Model == "" { + return c.JSON(http.StatusBadRequest, map[string]string{"error": "model is required"}) + } + + // Create a MediaEngine with Opus support + m := &webrtc.MediaEngine{} + if err := m.RegisterDefaultCodecs(); err != nil { + xlog.Error("failed to register codecs", "error", err) + return c.JSON(http.StatusInternalServerError, map[string]string{"error": "codec registration failed"}) + } + + api := webrtc.NewAPI(webrtc.WithMediaEngine(m)) + + pc, err := api.NewPeerConnection(webrtc.Configuration{}) + if err != nil { + xlog.Error("failed to create peer connection", "error", err) + return c.JSON(http.StatusInternalServerError, map[string]string{"error": "failed to create peer connection"}) + } + + // Create outbound audio track (Opus, 48kHz). + // We use TrackLocalStaticRTP (not TrackLocalStaticSample) so that + // SendAudio can construct RTP packets directly and control the marker + // bit. pion's WriteSample sets the marker bit on every Opus packet, + // which causes Chrome's NetEq jitter buffer to reset for each frame. 
+ audioTrack, err := webrtc.NewTrackLocalStaticRTP( + webrtc.RTPCodecCapability{ + MimeType: webrtc.MimeTypeOpus, + ClockRate: 48000, + Channels: 2, // Opus in WebRTC is always signaled as 2 channels per RFC 7587 + }, + "audio", + "localai", + ) + if err != nil { + pc.Close() + xlog.Error("failed to create audio track", "error", err) + return c.JSON(http.StatusInternalServerError, map[string]string{"error": "failed to create audio track"}) + } + + rtpSender, err := pc.AddTrack(audioTrack) + if err != nil { + pc.Close() + xlog.Error("failed to add audio track", "error", err) + return c.JSON(http.StatusInternalServerError, map[string]string{"error": "failed to add audio track"}) + } + + // Drain RTCP (control protocol) packets we don't have anything useful to do with + go func() { + buf := make([]byte, 1500) + for { + if _, _, err := rtpSender.Read(buf); err != nil { + return + } + } + }() + + // Create the transport (the data channel is created by the client and + // received via pc.OnDataChannel inside NewWebRTCTransport) + transport, err := NewWebRTCTransport(pc, audioTrack) + if err != nil { + pc.Close() + xlog.Error("failed to create webrtc transport", "error", err) + return c.JSON(http.StatusInternalServerError, map[string]string{"error": "failed to create transport"}) + } + + // Handle incoming audio track from the client + pc.OnTrack(func(track *webrtc.TrackRemote, receiver *webrtc.RTPReceiver) { + codec := track.Codec() + if codec.MimeType != webrtc.MimeTypeOpus { + xlog.Warn("unexpected track codec, ignoring", "mime", codec.MimeType) + return + } + xlog.Debug("Received audio track from client", + "codec", codec.MimeType, + "clock_rate", codec.ClockRate, + "channels", codec.Channels, + "sdp_fmtp", codec.SDPFmtpLine, + "payload_type", codec.PayloadType, + ) + + decoder, err := NewOpusDecoder() + if err != nil { + xlog.Error("failed to create opus decoder", "error", err) + return + } + defer decoder.Close() + + handleIncomingAudioTrack(track, decoder, 
transport) + }) + + // Set the remote SDP (client's offer) + if err := pc.SetRemoteDescription(webrtc.SessionDescription{ + Type: webrtc.SDPTypeOffer, + SDP: req.SDP, + }); err != nil { + transport.Close() + xlog.Error("failed to set remote description", "error", err) + return c.JSON(http.StatusBadRequest, map[string]string{"error": "invalid SDP offer"}) + } + + // Create answer + answer, err := pc.CreateAnswer(nil) + if err != nil { + transport.Close() + xlog.Error("failed to create answer", "error", err) + return c.JSON(http.StatusInternalServerError, map[string]string{"error": "failed to create answer"}) + } + + if err := pc.SetLocalDescription(answer); err != nil { + transport.Close() + xlog.Error("failed to set local description", "error", err) + return c.JSON(http.StatusInternalServerError, map[string]string{"error": "failed to set local description"}) + } + + // Wait for ICE gathering to complete (with timeout) + gatherDone := webrtc.GatheringCompletePromise(pc) + select { + case <-gatherDone: + case <-time.After(10 * time.Second): + xlog.Warn("ICE gathering timed out, using partial candidates") + } + + localDesc := pc.LocalDescription() + if localDesc == nil { + transport.Close() + return c.JSON(http.StatusInternalServerError, map[string]string{"error": "no local description"}) + } + + sessionID := generateSessionID() + + // Start the realtime session in a goroutine + evaluator := application.TemplatesEvaluator() + go func() { + defer transport.Close() + runRealtimeSession(application, transport, req.Model, evaluator) + }() + + return c.JSON(http.StatusCreated, RealtimeCallResponse{ + SDP: localDesc.SDP, + SessionID: sessionID, + }) + } +} + +// handleIncomingAudioTrack reads Opus frames from a remote WebRTC track, +// decodes them to PCM, resamples to the session's input sample rate, +// and appends to the session's InputAudioBuffer. 
+func handleIncomingAudioTrack(track *webrtc.TrackRemote, decoder *OpusDecoder, transport *WebRTCTransport) { + session := transport.WaitForSession() + if session == nil { + xlog.Error("could not find session for incoming audio track (transport closed)") + sendError(transport, "session_error", "Session failed to start — check server logs", "", "") + return + } + + var frameCount int + var decodeErrors int + for { + pkt, _, err := track.ReadRTP() + if err != nil { + xlog.Debug("audio track read ended", "error", err) + return + } + + samples, err := decoder.Decode(pkt.Payload) + if err != nil { + decodeErrors++ + xlog.Warn("opus decode error", "error", err, "payload_bytes", len(pkt.Payload), "errors_so_far", decodeErrors) + continue + } + + // Log decode diagnostics for the first 50 frames to help debug audio issues + frameCount++ + if frameCount <= 50 { + var sumSq float64 + var peak int16 + for _, s := range samples { + sumSq += float64(s) * float64(s) + if s > peak { + peak = s + } else if -s > peak { + peak = -s + } + } + rms := math.Sqrt(sumSq / float64(len(samples))) + xlog.Debug("opus decode frame", + "frame", frameCount, + "payload_bytes", len(pkt.Payload), + "decoded_samples", len(samples), + "rms", int(rms), + "peak", peak, + "marker", pkt.Marker, + ) + } + + // Resample from 48kHz to the session's input sample rate (16kHz for + // WebRTC, set in runRealtimeSession). This single resample feeds both + // the audio buffer and VAD without a lossy intermediate step. + if session.InputSampleRate != opusSampleRate { + samples = sound.ResampleInt16(samples, opusSampleRate, session.InputSampleRate) + } + + pcmBytes := sound.Int16toBytesLE(samples) + + session.AudioBufferLock.Lock() + newSize := len(session.InputAudioBuffer) + len(pcmBytes) + if newSize <= maxAudioBufferSize { + session.InputAudioBuffer = append(session.InputAudioBuffer, pcmBytes...) 
+ } else { + xlog.Warn("audio buffer full, dropping incoming audio") + } + session.AudioBufferLock.Unlock() + } +} diff --git a/core/http/endpoints/openai/types/types.go b/core/http/endpoints/openai/types/types.go index 751e79b6fbd5..2f75486adcc3 100644 --- a/core/http/endpoints/openai/types/types.go +++ b/core/http/endpoints/openai/types/types.go @@ -712,17 +712,39 @@ type SessionAudioInput struct { // Configuration for input audio noise reduction. This can be set to null to turn off. Noise reduction filters audio added to the input audio buffer before it is sent to VAD and the model. Filtering the audio can improve VAD and turn detection accuracy (reducing false positives) and model performance by improving perception of the input audio. NoiseReduction *AudioNoiseReduction `json:"noise_reduction,omitempty"` - // Configuration for input audio transcription, defaults to off and can be set to null to turn off once on. Input audio transcription is not native to the model, since the model consumes audio directly. Transcription runs asynchronously through the /audio/transcriptions endpoint and should be treated as guidance of input audio content rather than precisely what the model heard. The client can optionally set the language and prompt for transcription, these offer additional guidance to the transcription service. + // Configuration for turn detection: Server VAD or Semantic VAD. Set to null + // to turn off, in which case the client must manually trigger model response. TurnDetection *TurnDetectionUnion `json:"turn_detection,omitempty"` - // Configuration for turn detection, ether Server VAD or Semantic VAD. This can be set to null to turn off, in which case the client must manually trigger model response. - // - // Server VAD means that the model will detect the start and end of speech based on audio volume and respond at the end of user speech. 
- // - // Semantic VAD is more advanced and uses a turn detection model (in conjunction with VAD) to semantically estimate whether the user has finished speaking, then dynamically sets a timeout based on this probability. For example, if user audio trails off with "uhhm", the model will score a low probability of turn end and wait longer for the user to continue speaking. This can be useful for more natural conversations, but may have a higher latency. + // True when the JSON payload explicitly included "turn_detection" (even as null). + // Standard Go JSON can't distinguish absent from null for pointer fields. + TurnDetectionSet bool `json:"-"` + + // Configuration for input audio transcription, defaults to off and can be + // set to null to turn off once on. Transcription *AudioTranscription `json:"transcription,omitempty"` } +func (s *SessionAudioInput) UnmarshalJSON(data []byte) error { + // Check whether turn_detection key exists in the raw JSON. + var raw map[string]json.RawMessage + if err := json.Unmarshal(data, &raw); err != nil { + return err + } + + type alias SessionAudioInput + var a alias + if err := json.Unmarshal(data, &a); err != nil { + return err + } + *s = SessionAudioInput(a) + + if _, ok := raw["turn_detection"]; ok { + s.TurnDetectionSet = true + } + return nil +} + type SessionAudioOutput struct { Format *AudioFormatUnion `json:"format,omitempty"` Speed float32 `json:"speed,omitempty"` @@ -1012,10 +1034,13 @@ func (r *SessionUnion) UnmarshalJSON(data []byte) error { return err } switch SessionType(t.Type) { - case SessionTypeRealtime: - return json.Unmarshal(data, &r.Realtime) + case SessionTypeRealtime, "": + // Default to realtime when no type field is present (e.g. session.update events). 
+ r.Realtime = &RealtimeSession{} + return json.Unmarshal(data, r.Realtime) case SessionTypeTranscription: - return json.Unmarshal(data, &r.Transcription) + r.Transcription = &TranscriptionSession{} + return json.Unmarshal(data, r.Transcription) default: return fmt.Errorf("unknown session type: %s", t.Type) } diff --git a/core/http/middleware/trace.go b/core/http/middleware/trace.go index 15bc970a965e..2d7bcf16719d 100644 --- a/core/http/middleware/trace.go +++ b/core/http/middleware/trace.go @@ -158,7 +158,7 @@ func GetTraces() []APIExchange { mu.Unlock() sort.Slice(traces, func(i, j int) bool { - return traces[i].Timestamp.Before(traces[j].Timestamp) + return traces[i].Timestamp.After(traces[j].Timestamp) }) return traces diff --git a/core/http/react-ui/src/pages/Settings.jsx b/core/http/react-ui/src/pages/Settings.jsx index b112c91215ad..9ac3c00a9cf0 100644 --- a/core/http/react-ui/src/pages/Settings.jsx +++ b/core/http/react-ui/src/pages/Settings.jsx @@ -55,6 +55,7 @@ const SECTIONS = [ { id: 'memory', icon: 'fa-memory', color: 'var(--color-accent)', label: 'Memory' }, { id: 'backends', icon: 'fa-cogs', color: 'var(--color-accent)', label: 'Backends' }, { id: 'performance', icon: 'fa-gauge-high', color: 'var(--color-success)', label: 'Performance' }, + { id: 'tracing', icon: 'fa-bug', color: 'var(--color-warning)', label: 'Tracing' }, { id: 'api', icon: 'fa-globe', color: 'var(--color-warning)', label: 'API & CORS' }, { id: 'p2p', icon: 'fa-network-wired', color: 'var(--color-accent)', label: 'P2P' }, { id: 'galleries', icon: 'fa-images', color: 'var(--color-accent)', label: 'Galleries' }, @@ -327,10 +328,19 @@ export default function Settings() { update('debug', v)} /> - + + + + {/* Tracing */} +
sectionRefs.current.tracing = el} style={{ marginBottom: 'var(--spacing-xl)' }}> +

+ Tracing +

+
+ update('enable_tracing', v)} /> - + update('tracing_max_items', parseInt(e.target.value) || 0)} placeholder="100" disabled={!settings.enable_tracing} />
diff --git a/core/http/react-ui/src/pages/Talk.jsx b/core/http/react-ui/src/pages/Talk.jsx index 590b89bda32d..fa9a784fad09 100644 --- a/core/http/react-ui/src/pages/Talk.jsx +++ b/core/http/react-ui/src/pages/Talk.jsx @@ -1,196 +1,688 @@ -import { useState, useRef, useCallback } from 'react' +import { useState, useRef, useEffect, useCallback } from 'react' import { useOutletContext } from 'react-router-dom' -import ModelSelector from '../components/ModelSelector' -import LoadingSpinner from '../components/LoadingSpinner' -import { chatApi, ttsApi, audioApi } from '../utils/api' +import { realtimeApi } from '../utils/api' + +const STATUS_STYLES = { + disconnected: { icon: 'fa-solid fa-circle', color: 'var(--color-text-secondary)', bg: 'transparent' }, + connecting: { icon: 'fa-solid fa-spinner fa-spin', color: 'var(--color-primary)', bg: 'var(--color-primary-light)' }, + connected: { icon: 'fa-solid fa-circle', color: 'var(--color-success)', bg: 'rgba(34,197,94,0.1)' }, + listening: { icon: 'fa-solid fa-microphone', color: 'var(--color-success)', bg: 'rgba(34,197,94,0.1)' }, + thinking: { icon: 'fa-solid fa-brain fa-beat', color: 'var(--color-primary)', bg: 'var(--color-primary-light)' }, + speaking: { icon: 'fa-solid fa-volume-high fa-beat-fade', color: 'var(--color-accent)', bg: 'rgba(168,85,247,0.1)' }, + error: { icon: 'fa-solid fa-circle', color: 'var(--color-error)', bg: 'var(--color-error-light)' }, +} export default function Talk() { const { addToast } = useOutletContext() - const [llmModel, setLlmModel] = useState('') - const [whisperModel, setWhisperModel] = useState('') - const [ttsModel, setTtsModel] = useState('') - const [isRecording, setIsRecording] = useState(false) - const [loading, setLoading] = useState(false) - const [status, setStatus] = useState('Press the record button to start talking.') - const [audioUrl, setAudioUrl] = useState(null) - const [conversationHistory, setConversationHistory] = useState([]) - const mediaRecorderRef = 
useRef(null) - const chunksRef = useRef([]) + + // Pipeline models + const [pipelineModels, setPipelineModels] = useState([]) + const [selectedModel, setSelectedModel] = useState('') + const [modelsLoading, setModelsLoading] = useState(true) + + // Connection state + const [status, setStatus] = useState('disconnected') + const [statusText, setStatusText] = useState('Disconnected') + const [isConnected, setIsConnected] = useState(false) + + // Transcript + const [transcript, setTranscript] = useState([]) + const streamingRef = useRef(null) // tracks the index of the in-progress assistant message + + // Session settings + const [instructions, setInstructions] = useState( + 'You are a helpful voice assistant. Your responses will be spoken aloud using text-to-speech, so keep them concise and conversational. Do not use markdown formatting, bullet points, numbered lists, code blocks, or special characters. Speak naturally as you would in a phone conversation.' + ) + const [voice, setVoice] = useState('') + const [voiceEdited, setVoiceEdited] = useState(false) + const [language, setLanguage] = useState('') + + // Diagnostics + const [diagVisible, setDiagVisible] = useState(false) + + // Refs for WebRTC / audio + const pcRef = useRef(null) + const dcRef = useRef(null) + const localStreamRef = useRef(null) const audioRef = useRef(null) + const hasErrorRef = useRef(false) + + // Diagnostics refs + const audioCtxRef = useRef(null) + const analyserRef = useRef(null) + const diagFrameRef = useRef(null) + const statsIntervalRef = useRef(null) + const waveCanvasRef = useRef(null) + const specCanvasRef = useRef(null) + const transcriptEndRef = useRef(null) + + // Diagnostics stats (not worth re-rendering for every frame) + const [diagStats, setDiagStats] = useState({ + peakFreq: '--', thd: '--', rms: '--', sampleRate: '--', + packetsRecv: '--', packetsLost: '--', jitter: '--', concealed: '--', raw: '', + }) + + // Fetch pipeline models on mount + useEffect(() => { + 
realtimeApi.pipelineModels() + .then(models => { + setPipelineModels(models || []) + if (models?.length > 0) { + setSelectedModel(models[0].name) + if (!voiceEdited) setVoice(models[0].voice || '') + } + }) + .catch(err => addToast(`Failed to load pipeline models: ${err.message}`, 'error')) + .finally(() => setModelsLoading(false)) + }, []) + + // Auto-scroll transcript + useEffect(() => { + transcriptEndRef.current?.scrollIntoView({ behavior: 'smooth' }) + }, [transcript]) + + const selectedModelInfo = pipelineModels.find(m => m.name === selectedModel) + + // ── Status helper ── + const updateStatus = useCallback((state, text) => { + setStatus(state) + setStatusText(text || state) + }, []) + + // ── Session update ── + const sendSessionUpdate = useCallback(() => { + const dc = dcRef.current + if (!dc || dc.readyState !== 'open') return + if (!instructions.trim() && !voice.trim() && !language.trim()) return + + const session = {} + if (instructions.trim()) session.instructions = instructions.trim() + if (voice.trim() || language.trim()) { + session.audio = {} + if (voice.trim()) session.audio.output = { voice: voice.trim() } + if (language.trim()) session.audio.input = { transcription: { language: language.trim() } } + } + + dc.send(JSON.stringify({ type: 'session.update', session })) + }, [instructions, voice, language]) + + // ── Server event handler ── + const handleServerEvent = useCallback((event) => { + switch (event.type) { + case 'session.created': + sendSessionUpdate() + updateStatus('listening', 'Listening...') + break + case 'session.updated': + break + case 'input_audio_buffer.speech_started': + updateStatus('listening', 'Hearing you speak...') + break + case 'input_audio_buffer.speech_stopped': + updateStatus('thinking', 'Processing...') + break + case 'conversation.item.input_audio_transcription.completed': + if (event.transcript) { + streamingRef.current = null + setTranscript(prev => [...prev, { role: 'user', text: event.transcript }]) + } + 
updateStatus('thinking', 'Generating response...') + break + case 'response.output_audio_transcript.delta': + if (event.delta) { + setTranscript(prev => { + if (streamingRef.current !== null) { + const updated = [...prev] + updated[streamingRef.current] = { + ...updated[streamingRef.current], + text: updated[streamingRef.current].text + event.delta, + } + return updated + } + streamingRef.current = prev.length + return [...prev, { role: 'assistant', text: event.delta }] + }) + } + break + case 'response.output_audio_transcript.done': + if (event.transcript) { + setTranscript(prev => { + if (streamingRef.current !== null) { + const updated = [...prev] + updated[streamingRef.current] = { ...updated[streamingRef.current], text: event.transcript } + return updated + } + return [...prev, { role: 'assistant', text: event.transcript }] + }) + } + streamingRef.current = null + break + case 'response.output_audio.delta': + updateStatus('speaking', 'Speaking...') + break + case 'response.done': + updateStatus('listening', 'Listening...') + break + case 'error': + hasErrorRef.current = true + updateStatus('error', 'Error: ' + (event.error?.message || 'Unknown error')) + break + } + }, [sendSessionUpdate, updateStatus]) - const startRecording = async () => { - if (!navigator.mediaDevices) { - addToast('MediaDevices API not supported', 'error') + // ── Connect ── + const connect = useCallback(async () => { + if (!selectedModel) { + addToast('Please select a pipeline model first.', 'warning') return } + if (!navigator.mediaDevices?.getUserMedia) { + updateStatus('error', 'Microphone access requires HTTPS or localhost.') + return + } + + updateStatus('connecting', 'Connecting...') + setIsConnected(true) + try { - const stream = await navigator.mediaDevices.getUserMedia({ audio: true }) - const recorder = new MediaRecorder(stream) - chunksRef.current = [] - recorder.ondataavailable = (e) => chunksRef.current.push(e.data) - recorder.start() - mediaRecorderRef.current = recorder - 
setIsRecording(true) - setStatus('Recording... Click to stop.') + const localStream = await navigator.mediaDevices.getUserMedia({ audio: true }) + localStreamRef.current = localStream + + const pc = new RTCPeerConnection({}) + pcRef.current = pc + + for (const track of localStream.getAudioTracks()) { + pc.addTrack(track, localStream) + } + + pc.ontrack = (event) => { + if (audioRef.current) audioRef.current.srcObject = event.streams[0] + if (diagVisible) startDiagnostics() + } + + const dc = pc.createDataChannel('oai-events') + dcRef.current = dc + dc.onmessage = (msg) => { + try { + const text = typeof msg.data === 'string' ? msg.data : new TextDecoder().decode(msg.data) + handleServerEvent(JSON.parse(text)) + } catch (e) { + console.error('Failed to parse server event:', e) + } + } + dc.onclose = () => console.log('Data channel closed') + + pc.onconnectionstatechange = () => { + if (pc.connectionState === 'connected') { + updateStatus('connected', 'Connected, waiting for session...') + } else if (pc.connectionState === 'failed' || pc.connectionState === 'closed') { + disconnect() + } + } + + const offer = await pc.createOffer() + await pc.setLocalDescription(offer) + + await new Promise((resolve) => { + if (pc.iceGatheringState === 'complete') return resolve() + pc.onicegatheringstatechange = () => { + if (pc.iceGatheringState === 'complete') resolve() + } + setTimeout(resolve, 5000) + }) + + const data = await realtimeApi.call({ + sdp: pc.localDescription.sdp, + model: selectedModel, + }) + + await pc.setRemoteDescription({ type: 'answer', sdp: data.sdp }) } catch (err) { - addToast(`Microphone error: ${err.message}`, 'error') + hasErrorRef.current = true + updateStatus('error', 'Connection failed: ' + err.message) + disconnect() + } + }, [selectedModel, diagVisible, handleServerEvent, updateStatus, addToast]) + + // ── Disconnect ── + const disconnect = useCallback(() => { + stopDiagnostics() + if (dcRef.current) { dcRef.current.close(); dcRef.current = null } 
+ if (pcRef.current) { pcRef.current.close(); pcRef.current = null } + if (localStreamRef.current) { + localStreamRef.current.getTracks().forEach(t => t.stop()) + localStreamRef.current = null } + if (audioRef.current) audioRef.current.srcObject = null + + if (!hasErrorRef.current) updateStatus('disconnected', 'Disconnected') + hasErrorRef.current = false + setIsConnected(false) + }, [updateStatus]) + + // Cleanup on unmount + useEffect(() => { + return () => { + stopDiagnostics() + if (dcRef.current) dcRef.current.close() + if (pcRef.current) pcRef.current.close() + if (localStreamRef.current) localStreamRef.current.getTracks().forEach(t => t.stop()) + } + }, []) + + // ── Test tone ── + const sendTestTone = useCallback(() => { + const dc = dcRef.current + if (!dc || dc.readyState !== 'open') return + dc.send(JSON.stringify({ type: 'test_tone' })) + setTranscript(prev => [...prev, { role: 'assistant', text: '(Test tone requested)' }]) + }, []) + + // ── Diagnostics ── + function startDiagnostics() { + const audioEl = audioRef.current + if (!audioEl?.srcObject) return + + if (!audioCtxRef.current) { + const ctx = new AudioContext() + const source = ctx.createMediaStreamSource(audioEl.srcObject) + const analyser = ctx.createAnalyser() + analyser.fftSize = 8192 + analyser.smoothingTimeConstant = 0.3 + source.connect(analyser) + audioCtxRef.current = ctx + analyserRef.current = analyser + setDiagStats(prev => ({ ...prev, sampleRate: ctx.sampleRate + ' Hz' })) + } + + if (!diagFrameRef.current) drawDiagnostics() + if (!statsIntervalRef.current) { + pollWebRTCStats() + statsIntervalRef.current = setInterval(pollWebRTCStats, 1000) + } + } + + function stopDiagnostics() { + if (diagFrameRef.current) { cancelAnimationFrame(diagFrameRef.current); diagFrameRef.current = null } + if (statsIntervalRef.current) { clearInterval(statsIntervalRef.current); statsIntervalRef.current = null } + if (audioCtxRef.current) { audioCtxRef.current.close(); audioCtxRef.current = null; 
analyserRef.current = null } } - const stopRecording = useCallback(() => { - if (!mediaRecorderRef.current) return - - mediaRecorderRef.current.onstop = async () => { - setIsRecording(false) - setLoading(true) - - const audioBlob = new Blob(chunksRef.current, { type: 'audio/webm' }) - - try { - // 1. Transcribe - setStatus('Transcribing audio...') - const formData = new FormData() - formData.append('file', audioBlob) - formData.append('model', whisperModel) - const transcription = await audioApi.transcribe(formData) - const userText = transcription.text - - setStatus(`You said: "${userText}". Generating response...`) - - // 2. Chat completion - const newHistory = [...conversationHistory, { role: 'user', content: userText }] - const chatResponse = await chatApi.complete({ - model: llmModel, - messages: newHistory, - }) - const assistantText = chatResponse?.choices?.[0]?.message?.content || '' - const updatedHistory = [...newHistory, { role: 'assistant', content: assistantText }] - setConversationHistory(updatedHistory) - - setStatus(`Response: "${assistantText}". Generating speech...`) - - // 3. TTS - const ttsBlob = await ttsApi.generateV1({ input: assistantText, model: ttsModel }) - const url = URL.createObjectURL(ttsBlob) - setAudioUrl(url) - setStatus('Press the record button to continue.') - - // Auto-play - setTimeout(() => audioRef.current?.play(), 100) - } catch (err) { - addToast(`Error: ${err.message}`, 'error') - setStatus('Error occurred. 
Try again.') - } finally { - setLoading(false) + function drawDiagnostics() { + const analyser = analyserRef.current + if (!analyser) { diagFrameRef.current = null; return } + + diagFrameRef.current = requestAnimationFrame(drawDiagnostics) + + // Waveform + const waveCanvas = waveCanvasRef.current + if (waveCanvas) { + const wCtx = waveCanvas.getContext('2d') + const timeData = new Float32Array(analyser.fftSize) + analyser.getFloatTimeDomainData(timeData) + const w = waveCanvas.width, h = waveCanvas.height + wCtx.fillStyle = '#000'; wCtx.fillRect(0, 0, w, h) + wCtx.strokeStyle = '#0f0'; wCtx.lineWidth = 1; wCtx.beginPath() + const sliceWidth = w / timeData.length + let x = 0 + for (let i = 0; i < timeData.length; i++) { + const y = (1 - timeData[i]) * h / 2 + i === 0 ? wCtx.moveTo(x, y) : wCtx.lineTo(x, y) + x += sliceWidth } + wCtx.stroke() + + let sumSq = 0 + for (let i = 0; i < timeData.length; i++) sumSq += timeData[i] * timeData[i] + const rms = Math.sqrt(sumSq / timeData.length) + const rmsDb = rms > 0 ? 
(20 * Math.log10(rms)).toFixed(1) : '-Inf' + setDiagStats(prev => ({ ...prev, rms: rmsDb + ' dBFS' })) } - mediaRecorderRef.current.stop() - mediaRecorderRef.current.stream?.getTracks().forEach(t => t.stop()) - }, [whisperModel, llmModel, ttsModel, conversationHistory]) + // Spectrum + const specCanvas = specCanvasRef.current + if (specCanvas && audioCtxRef.current) { + const sCtx = specCanvas.getContext('2d') + const freqData = new Float32Array(analyser.frequencyBinCount) + analyser.getFloatFrequencyData(freqData) + const sw = specCanvas.width, sh = specCanvas.height + sCtx.fillStyle = '#000'; sCtx.fillRect(0, 0, sw, sh) + + const sampleRate = audioCtxRef.current.sampleRate + const binHz = sampleRate / analyser.fftSize + const maxFreqDisplay = 4000 + const maxBin = Math.min(Math.ceil(maxFreqDisplay / binHz), freqData.length) + const barWidth = sw / maxBin + + sCtx.fillStyle = '#0cf' + let peakBin = 0, peakVal = -Infinity + for (let i = 0; i < maxBin; i++) { + const db = freqData[i] + if (db > peakVal) { peakVal = db; peakBin = i } + const barH = Math.max(0, ((db + 100) / 100) * sh) + sCtx.fillRect(i * barWidth, sh - barH, Math.max(1, barWidth - 0.5), barH) + } + + // Frequency labels + sCtx.fillStyle = '#888'; sCtx.font = '10px monospace' + for (let f = 500; f <= maxFreqDisplay; f += 500) { + sCtx.fillText(f + '', (f / binHz) * barWidth - 10, sh - 2) + } + + // 440 Hz marker + const bin440 = Math.round(440 / binHz) + const x440 = bin440 * barWidth + sCtx.strokeStyle = '#f00'; sCtx.lineWidth = 1 + sCtx.beginPath(); sCtx.moveTo(x440, 0); sCtx.lineTo(x440, sh); sCtx.stroke() + sCtx.fillStyle = '#f00'; sCtx.fillText('440', x440 + 2, 10) - const resetConversation = () => { - setConversationHistory([]) - setAudioUrl(null) - setStatus('Conversation reset. 
Press record to start.') - addToast('Conversation reset', 'info') + const peakFreq = peakBin * binHz + const fundamentalBin = Math.round(440 / binHz) + const fundamentalPower = Math.pow(10, freqData[fundamentalBin] / 10) + let harmonicPower = 0 + for (let h = 2; h <= 10; h++) { + const hBin = Math.round(440 * h / binHz) + if (hBin < freqData.length) harmonicPower += Math.pow(10, freqData[hBin] / 10) + } + const thd = fundamentalPower > 0 + ? (Math.sqrt(harmonicPower / fundamentalPower) * 100).toFixed(1) + '%' + : '--%' + + setDiagStats(prev => ({ + ...prev, + peakFreq: peakFreq.toFixed(0) + ' Hz (' + peakVal.toFixed(1) + ' dB)', + thd, + })) + } } - const allModelsSet = llmModel && whisperModel && ttsModel + async function pollWebRTCStats() { + const pc = pcRef.current + if (!pc) return + try { + const stats = await pc.getStats() + const raw = [] + stats.forEach((report) => { + if (report.type === 'inbound-rtp' && report.kind === 'audio') { + setDiagStats(prev => ({ + ...prev, + packetsRecv: report.packetsReceived ?? '--', + packetsLost: report.packetsLost ?? '--', + jitter: report.jitter !== undefined ? (report.jitter * 1000).toFixed(1) + ' ms' : '--', + concealed: report.concealedSamples ?? '--', + })) + raw.push('-- inbound-rtp (audio) --') + raw.push(' packetsReceived: ' + report.packetsReceived) + raw.push(' packetsLost: ' + report.packetsLost) + raw.push(' jitter: ' + (report.jitter !== undefined ? 
(report.jitter * 1000).toFixed(2) + ' ms' : 'N/A')) + raw.push(' bytesReceived: ' + report.bytesReceived) + raw.push(' concealedSamples: ' + report.concealedSamples) + raw.push(' totalSamplesReceived: ' + report.totalSamplesReceived) + } + }) + setDiagStats(prev => ({ ...prev, raw: raw.join('\n') })) + } catch (_e) { /* stats polling error */ } + } + + const toggleDiagnostics = useCallback(() => { + setDiagVisible(prev => { + const next = !prev + if (next) { + setTimeout(startDiagnostics, 0) + } else { + stopDiagnostics() + } + return next + }) + }, []) + + const statusStyle = STATUS_STYLES[status] || STATUS_STYLES.disconnected + // ── Render ── return (
-
+

Talk

-

Voice conversation with AI

+

Real-time voice conversation via WebRTC

- {/* Main interaction area */} -
- {/* Big record button */} - - - {/* Status */} -

- {loading ? : null} - {' '}{status} -

- - {/* Recording indicator */} - {isRecording && ( +
+ {/* Connection status */} +
+ + {statusText} +
+ + {/* Info note */} +
+ +

+ Note: Select a pipeline model and click Connect. + Your microphone streams continuously; the server detects speech and responds automatically. +

+
+ + {/* Pipeline model selector */} +
+ + +
+ + {/* Pipeline details */} + {selectedModelInfo && (
- - Recording... + {[ + { label: 'VAD', value: selectedModelInfo.vad }, + { label: 'Transcription', value: selectedModelInfo.transcription }, + { label: 'LLM', value: selectedModelInfo.llm }, + { label: 'TTS', value: selectedModelInfo.tts }, + ].map(item => ( +
+
{item.label}
+
{item.value}
+
+ ))}
)} - {/* Audio playback */} - {audioUrl && ( -
-
- - - + + + + + +
- + {{template "views/partials/footer" .}}
diff --git a/core/http/views/traces.html b/core/http/views/traces.html index 3e66c82b41f8..6287cc47782f 100644 --- a/core/http/views/traces.html +++ b/core/http/views/traces.html @@ -254,12 +254,54 @@

Response

+ + +