From 03e115d24622668c45aa29da8da927884c5782b3 Mon Sep 17 00:00:00 2001
From: JarbasAi <jarbasai@mailfence.com>
Date: Mon, 15 Jun 2026 21:14:24 +0100
Subject: [PATCH] feat: TTS end-to-end intelligibility harness

Add ovoscope/tts_intelligibility.py: synthesise speech with a TTS plugin
under test, transcribe the rendered audio back with a reference STT
(faster-whisper tiny), and score the round-trip with WER/CER.

- score_tts_intelligibility() + TTSIntelligibilityHarness context manager
  returning an IntelligibilityReport (per-utterance UtteranceScore, mean
  WER/CER, to_dict/to_markdown_row).
- mode="playback" drives the full ovos-audio stack and captures the rendered
  WAV via a play_audio side_effect; mode="direct" calls tts.get_tts directly.
- Extend PlaybackServiceHarness with a tts= arg (default MockTTS, backward
  compatible) and a captured_wavs list.
- Add [tts] optional extra; graceful optional import in __init__.
- Unit tests (MockTTS + MockSTT, no model download) cover WER/CER math,
  report aggregation, playback wav capture, and graceful import.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 ovoscope/__init__.py                       |  22 ++
 ovoscope/audio.py                          |  31 +-
 ovoscope/tts_intelligibility.py            | 410 +++++++++++++++++++++
 pyproject.toml                             |  12 +
 test/unittests/test_tts_intelligibility.py | 192 ++++++++++
 5 files changed, 662 insertions(+), 5 deletions(-)
 create mode 100644 ovoscope/tts_intelligibility.py
 create mode 100644 test/unittests/test_tts_intelligibility.py

diff --git a/ovoscope/__init__.py b/ovoscope/__init__.py
index 8816468..56eccda 100644
--- a/ovoscope/__init__.py
+++ b/ovoscope/__init__.py
@@ -1095,6 +1095,28 @@ def assert_spoke(self, text: str, lang: str = "en-US", timeout: int = 30) -> Non
     else:
         raise
 
+try:
+    from ovoscope.tts_intelligibility import (  # noqa: F401
+        TTSIntelligibilityHarness,
+        IntelligibilityReport,
+        UtteranceScore,
+        score_tts_intelligibility,
+    )
+except ImportError as e:
+    # Optional [tts] extra. Silence only when the missing module is one of the
+    # optional TTS-scoring deps; a logic error in a present lib must re-raise.
+    _TTS_OPTIONAL_MODULES = (
+        "jiwer",
+        "ovos_audio", "ovos_audio.audio",
+        "ovos_utterance_normalizer",
+        "ovos_stt_plugin_fasterwhisper",
+        "faster_whisper",
+    )
+    if isinstance(e, ModuleNotFoundError) and e.name in _TTS_OPTIONAL_MODULES:
+        pass
+    else:
+        raise
+
 try:
     from ovoscope.listener import (  # noqa: F401
         MiniListener,
diff --git a/ovoscope/audio.py b/ovoscope/audio.py
index d7e7e81..fd82dba 100644
--- a/ovoscope/audio.py
+++ b/ovoscope/audio.py
@@ -518,21 +518,32 @@ class PlaybackServiceHarness:
     Args:
         validate_source: Enable session-source validation in the service.
         disable_ocp: Disable legacy OCP in the encapsulated AudioService.
+        tts: TTS instance to drive the PlaybackService with. Defaults to a
+            fresh ``MockTTS()`` (backward compatible). Pass a real TTS plugin
+            to synthesise actual audio — the rendered WAV path of each
+            utterance is captured in :attr:`captured_wavs`.
     """
 
     def __init__(self, validate_source: bool = False,
-                 disable_ocp: bool = True) -> None:
+                 disable_ocp: bool = True,
+                 tts: Optional[TTS] = None) -> None:
         """Initialise harness parameters.
 
         Args:
             validate_source: Enable session-source validation.
             disable_ocp: Disable OCP audio plugin.
+            tts: TTS instance to inject. Defaults to ``MockTTS()`` when None.
         """
         self.validate_source: bool = validate_source
         self.disable_ocp: bool = disable_ocp
         self.bus: Optional[FakeBus] = None
         self.svc = None  # PlaybackService instance
-        self.mock_tts: Optional[MockTTS] = None
+        # ``mock_tts`` keeps its historic name for backward compatibility but
+        # holds whatever TTS was injected (real plugin or MockTTS).
+        self.tts: Optional[TTS] = tts
+        self.mock_tts: Optional[TTS] = None
+        # Paths captured from the ``play_audio`` side_effect, in playback order.
+        self.captured_wavs: List[str] = []
         self._play_audio_patcher = None
         self._audio_enabled_patcher = None
         self._audio_output_start = threading.Event()
@@ -558,15 +569,25 @@ def __enter__(self) -> "PlaybackServiceHarness":
         TTS.queue = Queue()
 
         self.bus = FakeBus()
-        self.mock_tts = MockTTS()
+        # Inject the provided TTS (real plugin) or fall back to MockTTS.
+        self.mock_tts = self.tts if self.tts is not None else MockTTS()
 
-        # Patch play_audio so no real audio device is accessed
+        # Patch play_audio so no real audio device is accessed. The side_effect
+        # records the first positional arg — the rendered WAV path
+        # (ovos_audio/playback.py: ``self.p = play_audio(data)``) — so callers
+        # can round-trip the synthesised audio through a reference STT.
         mock_proc = MagicMock()
         mock_proc.communicate.return_value = (b"", b"")
         mock_proc.wait.return_value = 0
 
+        self.captured_wavs = []
+
+        def _capture_play_audio(data, *args, **kwargs):
+            self.captured_wavs.append(data)
+            return mock_proc
+
         self._play_audio_patcher = patch(
-            "ovos_audio.playback.play_audio", return_value=mock_proc
+            "ovos_audio.playback.play_audio", side_effect=_capture_play_audio
         )
         self._play_audio_patcher.start()
 
diff --git a/ovoscope/tts_intelligibility.py b/ovoscope/tts_intelligibility.py
new file mode 100644
index 0000000..6489b67
--- /dev/null
+++ b/ovoscope/tts_intelligibility.py
@@ -0,0 +1,410 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""End-to-end TTS intelligibility scoring for ovoscope.
+
+Synthesises speech with a TTS plugin under test, transcribes the rendered
+audio back with a reference STT, and scores the round-trip with word- and
+character-error-rate (WER/CER). This catches regressions that file-existence
+unit tests miss — garbled audio, wrong sample rate, broken transforms, silent
+output — and gives every TTS plugin a comparable intelligibility number.
+
+Two synthesis modes:
+
+* ``"playback"`` (default) drives the full ovos-audio stack
+  (``speak`` -> PlaybackService -> tts.execute -> get_tts -> tts_transform ->
+  play_audio) via :class:`ovoscope.audio.PlaybackServiceHarness`, with the real
+  plugin injected. The rendered WAV is captured from the patched ``play_audio``.
+* ``"direct"`` calls ``tts.get_tts(utterance, wav_path, ...)`` directly with no
+  bus — a fallback for engines that hang under the playback thread or when
+  ``ovos_audio`` is unavailable.
+
+Public API:
+    score_tts_intelligibility -- convenience function returning an IntelligibilityReport
+    TTSIntelligibilityHarness  -- context manager form of the above
+    IntelligibilityReport      -- aggregate report with mean WER/CER + serialisation
+    UtteranceScore             -- per-utterance result
+"""
+
+import dataclasses
+import os
+import re
+import shutil
+import string
+import tempfile
+import threading
+from typing import Any, List, Optional
+
+import jiwer
+
+from ovos_plugin_manager.utils.audio import AudioFile
+
+# Module-level singleton for the reference STT — model load is expensive.
+_REFERENCE_STT: Optional[Any] = None
+_REFERENCE_STT_LOCK = threading.Lock()
+
+# Module-level singleton for the utterance normaliser (cheap, but shared).
+_NORMALIZER: Optional[Any] = None
+
+
+def get_reference_stt() -> Any:
+    """Return a lazily-instantiated faster-whisper ``tiny`` reference STT.
+
+    The model is loaded once per process and reused. ``beam_size=1`` and
+    ``compute_type="int8"`` keep it deterministic and light enough for CI.
+
+    Returns:
+        A ready-to-use ``FasterWhisperSTT`` instance.
+    """
+    global _REFERENCE_STT
+    if _REFERENCE_STT is None:
+        with _REFERENCE_STT_LOCK:
+            if _REFERENCE_STT is None:
+                from ovos_stt_plugin_fasterwhisper import FasterWhisperSTT
+                _REFERENCE_STT = FasterWhisperSTT({
+                    "model": "tiny",
+                    "compute_type": "int8",
+                    "beam_size": 1,
+                })
+    return _REFERENCE_STT
+
+
+def _normalize(text: str, lang: str) -> str:
+    """Normalise a transcript for fair WER/CER scoring.
+
+    Passes the text through ``ovos_utterance_normalizer`` so cosmetic
+    differences between reference and hypothesis don't inflate the error
+    rate. The normaliser yields several variants per input; the *first* one is
+    used (rationale in the body comment), then every ``string.punctuation``
+    character becomes a space, the text is lowercased and whitespace collapsed.
+
+    Args:
+        text: The raw text to normalise.
+        lang: BCP-47 language tag (e.g. ``"en-US"``).
+
+    Returns:
+        A single normalised string (lowercased); ``""`` when ``text`` is empty.
+    """
+    global _NORMALIZER
+    if _NORMALIZER is None:
+        from ovos_utterance_normalizer import UtteranceNormalizerPlugin
+        _NORMALIZER = UtteranceNormalizerPlugin()
+    if not text:
+        return ""
+    variants, _ = _NORMALIZER.transform([text], {"lang": lang})
+    # transform() emits [contraction-expanded, original, number-normalized]
+    # per utterance, deduplicated and order-preserving. The first variant
+    # (contractions expanded, punctuation stripped, words kept as words) is the
+    # stable choice — the number-collapsed variant ("two" -> "2") would create
+    # spurious mismatches against a word-emitting STT. Lowercasing/whitespace
+    # collapse are applied here so both reference and hypothesis align.
+    normalized = variants[0] if variants else text
+    # The normaliser only strips leading/trailing punctuation of the whole
+    # string; interior punctuation (e.g. a tokenised comma) survives and would
+    # inflate WER. Strip all punctuation characters here, then collapse space.
+    normalized = re.sub(rf"[{re.escape(string.punctuation)}]", " ", normalized)
+    return " ".join(normalized.lower().split())
+
+
+def _score_pair(reference: str, hypothesis: str) -> "tuple[float, float]":
+    """Compute (WER, CER) for a reference/hypothesis pair.
+
+    jiwer raises on an empty reference, so empty references are handled
+    explicitly: a non-empty hypothesis against an empty reference scores 1.0
+    (fully wrong), two empties score 0.0 (trivially correct).
+
+    Args:
+        reference: Normalised ground-truth string.
+        hypothesis: Normalised transcript from the reference STT.
+
+    Returns:
+        Tuple of ``(wer, cer)`` as floats.
+    """
+    if not reference:
+        return (0.0, 0.0) if not hypothesis else (1.0, 1.0)
+    wer = float(jiwer.wer(reference, hypothesis))
+    cer = float(jiwer.cer(reference, hypothesis))
+    return wer, cer
+
+
+@dataclasses.dataclass
+class UtteranceScore:
+    """Per-utterance intelligibility result.
+
+    Attributes:
+        utterance: The text that was synthesised (the ground truth).
+        transcript: What the reference STT heard back.
+        wer: Word error rate of transcript vs utterance (0.0 = perfect).
+        cer: Character error rate of transcript vs utterance.
+        wav_path: Path to the captured rendered WAV (may be None on failure).
+        lang: BCP-47 language tag used for synthesis and scoring.
+        voice: Voice identifier used, if any.
+    """
+
+    utterance: str
+    transcript: str
+    wer: float
+    cer: float
+    wav_path: Optional[str] = None
+    lang: str = "en-US"
+    voice: Optional[str] = None
+
+    def to_dict(self) -> dict:
+        """Return a JSON-serialisable dict of this score."""
+        return {
+            "utterance": self.utterance,
+            "transcript": self.transcript,
+            "wer": round(self.wer, 4),
+            "cer": round(self.cer, 4),
+            "wav_path": self.wav_path,
+            "lang": self.lang,
+            "voice": self.voice,
+        }
+
+
+@dataclasses.dataclass
+class IntelligibilityReport:
+    """Aggregate intelligibility report over a set of utterances.
+
+    Attributes:
+        scores: Per-utterance :class:`UtteranceScore` results.
+        lang: BCP-47 language tag the run used.
+        voice: Voice identifier the run used, if any.
+        mode: Synthesis mode used (``"playback"`` or ``"direct"``).
+    """
+
+    scores: List[UtteranceScore] = dataclasses.field(default_factory=list)
+    lang: str = "en-US"
+    voice: Optional[str] = None
+    mode: str = "playback"
+
+    @property
+    def mean_wer(self) -> float:
+        """Mean word error rate across all scored utterances (0.0 if empty)."""
+        if not self.scores:
+            return 0.0
+        return sum(s.wer for s in self.scores) / len(self.scores)
+
+    @property
+    def mean_cer(self) -> float:
+        """Mean character error rate across all scored utterances (0.0 if empty)."""
+        if not self.scores:
+            return 0.0
+        return sum(s.cer for s in self.scores) / len(self.scores)
+
+    def to_dict(self) -> dict:
+        """Return a JSON-serialisable dict of the full report."""
+        return {
+            "lang": self.lang,
+            "voice": self.voice,
+            "mode": self.mode,
+            "mean_wer": round(self.mean_wer, 4),
+            "mean_cer": round(self.mean_cer, 4),
+            "n_utterances": len(self.scores),
+            "scores": [s.to_dict() for s in self.scores],
+        }
+
+    def to_markdown_row(self) -> str:
+        """Return a single markdown table row: ``| voice | lang | mean_wer | mean_cer | n |``."""
+        voice = self.voice or "default"
+        return (
+            f"| {voice} | {self.lang} | "
+            f"{self.mean_wer:.3f} | {self.mean_cer:.3f} | {len(self.scores)} |"
+        )
+
+
+class TTSIntelligibilityHarness:
+    """Context manager that scores TTS intelligibility end-to-end.
+
+    Usage::
+
+        with TTSIntelligibilityHarness(tts, lang="en-US") as h:
+            report = h.score(["hello world", "what time is it"])
+        print(report.mean_wer)
+
+    In ``mode="playback"`` the harness owns a :class:`PlaybackServiceHarness`
+    for its lifetime; in ``mode="direct"`` no bus is started. A temp directory
+    holds copies of the rendered WAVs (the TTS cache may delete originals); it
+    is cleaned up on exit.
+
+    Args:
+        tts: The TTS plugin under test.
+        lang: BCP-47 language tag for synthesis and scoring.
+        voice: Optional voice identifier passed to ``get_tts``.
+        reference_stt: STT used to transcribe. Defaults to the lazy
+            faster-whisper ``tiny`` singleton.
+        mode: ``"playback"`` (full ovos-audio stack) or ``"direct"``
+            (``tts.get_tts`` only).
+        speak_timeout: Per-utterance timeout for playback mode.
+    """
+
+    def __init__(self, tts: Any, *, lang: str = "en-US",
+                 voice: Optional[str] = None,
+                 reference_stt: Optional[Any] = None,
+                 mode: str = "playback",
+                 speak_timeout: float = 30.0) -> None:
+        if mode not in ("playback", "direct"):
+            raise ValueError(f"mode must be 'playback' or 'direct', got {mode!r}")
+        self.tts = tts
+        self.lang = lang
+        self.voice = voice
+        self._reference_stt = reference_stt
+        self.mode = mode
+        self.speak_timeout = speak_timeout
+        self._tmpdir: Optional[str] = None
+        self._playback = None  # PlaybackServiceHarness in playback mode
+
+    @property
+    def reference_stt(self) -> Any:
+        """The reference STT, lazily resolved to the faster-whisper singleton."""
+        if self._reference_stt is None:
+            self._reference_stt = get_reference_stt()
+        return self._reference_stt
+
+    def __enter__(self) -> "TTSIntelligibilityHarness":
+        self._tmpdir = tempfile.mkdtemp(prefix="ovoscope-tts-")
+        if self.mode == "playback":
+            from ovoscope.audio import PlaybackServiceHarness
+            self._playback = PlaybackServiceHarness(tts=self.tts)
+            self._playback.__enter__()
+        return self
+
+    def __exit__(self, *args) -> None:
+        if self._playback is not None:
+            try:
+                self._playback.__exit__(*args)
+            except Exception:
+                pass
+            self._playback = None
+        if self._tmpdir and os.path.isdir(self._tmpdir):
+            shutil.rmtree(self._tmpdir, ignore_errors=True)
+            self._tmpdir = None
+
+    # ------------------------------------------------------------------
+    # Synthesis
+    # ------------------------------------------------------------------
+
+    def _render_playback(self, utterance: str) -> Optional[str]:
+        """Speak via the full ovos-audio stack; return the last captured WAV, copied out."""
+        before = len(self._playback.captured_wavs)
+        self._playback.speak(utterance, timeout=self.speak_timeout)
+        captured = self._playback.captured_wavs[before:]  # entries appended since `before`
+        if not captured:
+            return None
+        return self._copy_out(captured[-1])  # NOTE(review): earlier captures, if any, go unscored
+
+    def _render_direct(self, utterance: str) -> Optional[str]:
+        """Synthesise via ``tts.get_tts`` directly; return the WAV path."""
+        wav_path = os.path.join(
+            self._tmpdir, f"direct_{abs(hash(utterance)) & 0xffffffff}.wav"
+        )
+        self.tts.get_tts(utterance, wav_path, lang=self.lang, voice=self.voice)
+        return wav_path if os.path.isfile(wav_path) else None
+
+    def _copy_out(self, wav_path: str) -> Optional[str]:
+        """Copy a rendered WAV into the harness temp dir before the cache prunes it.
+
+        Returns the new path, the original path if the copy fails, or None if missing.
+        """
+        if not wav_path or not os.path.isfile(wav_path):
+            return None
+        name = f"play_{len(os.listdir(self._tmpdir))}_{os.path.basename(wav_path)}"
+        try:
+            return shutil.copyfile(wav_path, os.path.join(self._tmpdir, name))
+        except OSError:
+            return wav_path
+
+    # ------------------------------------------------------------------
+    # Scoring
+    # ------------------------------------------------------------------
+
+    def _transcribe(self, wav_path: str) -> str:
+        """Round-trip a WAV through the reference STT and return the transcript."""
+        with AudioFile(wav_path) as source:
+            audio = source.read()
+        return self.reference_stt.execute(audio, language=self.lang) or ""
+
+    def score_one(self, utterance: str) -> UtteranceScore:
+        """Synthesise, transcribe, and score a single utterance.
+
+        Args:
+            utterance: The text to synthesise and score.
+
+        Returns:
+            An :class:`UtteranceScore`. On synthesis/transcription failure the
+            transcript is empty and WER/CER reflect a total miss.
+        """
+        if self.mode == "playback":
+            wav_path = self._render_playback(utterance)
+        else:
+            wav_path = self._render_direct(utterance)
+
+        transcript = ""
+        if wav_path and os.path.isfile(wav_path):
+            try:
+                transcript = self._transcribe(wav_path)
+            except Exception:
+                transcript = ""
+
+        ref = _normalize(utterance, self.lang)
+        hyp = _normalize(transcript, self.lang)
+        wer, cer = _score_pair(ref, hyp)
+        return UtteranceScore(
+            utterance=utterance,
+            transcript=transcript,
+            wer=wer,
+            cer=cer,
+            wav_path=wav_path,
+            lang=self.lang,
+            voice=self.voice,
+        )
+
+    def score(self, utterances: List[str]) -> IntelligibilityReport:
+        """Score a list of utterances and return the aggregate report.
+
+        Args:
+            utterances: Phrases to synthesise and score.
+
+        Returns:
+            An :class:`IntelligibilityReport`.
+        """
+        report = IntelligibilityReport(lang=self.lang, voice=self.voice, mode=self.mode)
+        for utt in utterances:
+            report.scores.append(self.score_one(utt))
+        return report
+
+
+def score_tts_intelligibility(tts: Any, utterances: List[str], *,
+                              lang: str = "en-US",
+                              voice: Optional[str] = None,
+                              reference_stt: Optional[Any] = None,
+                              mode: str = "playback",
+                              speak_timeout: float = 30.0) -> IntelligibilityReport:
+    """Synthesise, transcribe, and score a set of utterances in one call.
+
+    Args:
+        tts: The TTS plugin under test.
+        utterances: Phrases to synthesise and score.
+        lang: BCP-47 language tag for synthesis and scoring.
+        voice: Optional voice identifier passed to ``get_tts``.
+        reference_stt: STT used to transcribe. Defaults to faster-whisper tiny.
+        mode: ``"playback"`` (full ovos-audio stack) or ``"direct"``.
+        speak_timeout: Per-utterance timeout for playback mode.
+
+    Returns:
+        An :class:`IntelligibilityReport` with per-utterance and mean scores.
+    """
+    with TTSIntelligibilityHarness(
+        tts, lang=lang, voice=voice, reference_stt=reference_stt,
+        mode=mode, speak_timeout=speak_timeout,
+    ) as harness:
+        return harness.score(utterances)
diff --git a/pyproject.toml b/pyproject.toml
index 6ec33d0..a729214 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,9 +32,21 @@ classifiers = [
 [project.optional-dependencies]
 pydantic = ["ovos-pydantic-models>=0.1.0"]
 audio = ["ovos-audio>=1.2.0"]
+# End-to-end TTS intelligibility scoring (WER/CER round-trip via reference STT).
+# faster-whisper itself is pulled by the plugin — don't list it here to avoid
+# version skew.
+tts = [
+    "ovos-audio>=1.2.0",
+    "jiwer",
+    "ovos-utterance-normalizer",
+    "ovos-stt-plugin-fasterwhisper",
+]
 dev = [
     "ovos-audio>=1.2.0",
     "ovos-pydantic-models>=0.1.0",
+    "jiwer",
+    "ovos-utterance-normalizer",
+    "ovos-stt-plugin-fasterwhisper",
     "pytest-cov",
 ]
 
diff --git a/test/unittests/test_tts_intelligibility.py b/test/unittests/test_tts_intelligibility.py
new file mode 100644
index 0000000..283f40e
--- /dev/null
+++ b/test/unittests/test_tts_intelligibility.py
@@ -0,0 +1,192 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for ovoscope.tts_intelligibility.
+
+These tests use a MockTTS (silent WAV) and a MockSTT that echoes a fixed
+transcript — no model download, no real audio. They cover WER/CER math, report
+aggregation, serialisation, and that playback interception captures a wav path.
+"""
+
+import importlib.util
+import subprocess
+import sys
+import unittest
+
+TTS_AVAILABLE = (
+    importlib.util.find_spec("jiwer") is not None
+    and importlib.util.find_spec("ovos_audio") is not None
+    and importlib.util.find_spec("ovos_utterance_normalizer") is not None
+)
+
+
+@unittest.skipUnless(TTS_AVAILABLE, "tts extra (jiwer/ovos-audio/normalizer) not installed")
+class TestScoring(unittest.TestCase):
+    """WER/CER math, normalisation and report aggregation — no synthesis."""
+
+    def setUp(self):
+        from ovoscope import tts_intelligibility as ti
+        self.ti = ti
+
+    def test_perfect_match_is_zero(self):
+        wer, cer = self.ti._score_pair("hello world", "hello world")
+        self.assertEqual(wer, 0.0)
+        self.assertEqual(cer, 0.0)
+
+    def test_one_wrong_word_half_wer(self):
+        wer, _ = self.ti._score_pair("hello world", "hello there")
+        self.assertAlmostEqual(wer, 0.5, places=3)
+
+    def test_empty_reference_handling(self):
+        self.assertEqual(self.ti._score_pair("", ""), (0.0, 0.0))
+        self.assertEqual(self.ti._score_pair("", "noise"), (1.0, 1.0))
+
+    def test_normalize_strips_case_and_punctuation(self):
+        # "Hello, World!" should normalise to "hello world" so it scores 0
+        # against the lowercased ground truth.
+        wer, _ = self.ti._score_pair(
+            self.ti._normalize("Hello, World!", "en-US"),
+            self.ti._normalize("hello world", "en-US"),
+        )
+        self.assertEqual(wer, 0.0)
+
+    def test_report_aggregation(self):
+        UtteranceScore = self.ti.UtteranceScore
+        IntelligibilityReport = self.ti.IntelligibilityReport
+        report = IntelligibilityReport(lang="en-US", voice="alan")
+        report.scores.append(UtteranceScore("a", "a", 0.0, 0.0, lang="en-US"))
+        report.scores.append(UtteranceScore("b", "x", 1.0, 1.0, lang="en-US"))
+        self.assertAlmostEqual(report.mean_wer, 0.5)
+        self.assertAlmostEqual(report.mean_cer, 0.5)
+
+    def test_empty_report_means_zero(self):
+        report = self.ti.IntelligibilityReport()
+        self.assertEqual(report.mean_wer, 0.0)
+        self.assertEqual(report.mean_cer, 0.0)
+
+    def test_to_dict_and_markdown_row(self):
+        UtteranceScore = self.ti.UtteranceScore
+        report = self.ti.IntelligibilityReport(lang="en-US", voice="alan")
+        report.scores.append(UtteranceScore("a", "a", 0.0, 0.0, lang="en-US"))
+        d = report.to_dict()
+        self.assertEqual(d["lang"], "en-US")
+        self.assertEqual(d["voice"], "alan")
+        self.assertEqual(d["n_utterances"], 1)
+        self.assertEqual(d["mean_wer"], 0.0)
+        self.assertIn("scores", d)
+        row = report.to_markdown_row()
+        self.assertIn("alan", row)
+        self.assertIn("en-US", row)
+        self.assertTrue(row.startswith("|") and row.endswith("|"))
+
+
+class MockSTT:
+    """Reference STT stub that echoes a fixed transcript regardless of audio."""
+
+    def __init__(self, transcript="hello world"):
+        self.transcript = transcript
+        self.calls = 0
+
+    def execute(self, audio, language=None):
+        self.calls += 1
+        return self.transcript
+
+
+@unittest.skipUnless(TTS_AVAILABLE, "tts extra (jiwer/ovos-audio/normalizer) not installed")
+class TestHarnessDirectMode(unittest.TestCase):
+    """Direct mode: tts.get_tts only, MockSTT echo — no bus, no model."""
+
+    def test_direct_mode_perfect_score(self):
+        from ovoscope.audio import MockTTS
+        from ovoscope.tts_intelligibility import score_tts_intelligibility
+
+        stt = MockSTT("hello world")
+        report = score_tts_intelligibility(
+            MockTTS(), ["hello world"],
+            reference_stt=stt, mode="direct",
+        )
+        self.assertEqual(len(report.scores), 1)
+        self.assertEqual(report.mean_wer, 0.0)
+        self.assertEqual(stt.calls, 1)
+        self.assertIsNotNone(report.scores[0].wav_path)
+
+    def test_direct_mode_mismatch_scores_high(self):
+        from ovoscope.audio import MockTTS
+        from ovoscope.tts_intelligibility import score_tts_intelligibility
+
+        report = score_tts_intelligibility(
+            MockTTS(), ["completely different text here"],
+            reference_stt=MockSTT("hello world"), mode="direct",
+        )
+        self.assertGreater(report.mean_wer, 0.0)
+
+
+@unittest.skipUnless(TTS_AVAILABLE, "tts extra (jiwer/ovos-audio/normalizer) not installed")
+class TestHarnessPlaybackMode(unittest.TestCase):
+    """Playback mode: full ovos-audio stack drives MockTTS; wav is captured."""
+
+    def test_playback_captures_wav_and_scores(self):
+        from ovoscope.audio import MockTTS
+        from ovoscope.tts_intelligibility import TTSIntelligibilityHarness
+
+        tts = MockTTS()
+        stt = MockSTT("hello world")
+        with TTSIntelligibilityHarness(
+            tts, reference_stt=stt, mode="playback", speak_timeout=15.0,
+        ) as h:
+            report = h.score(["hello world"])
+
+        self.assertEqual(len(report.scores), 1)
+        score = report.scores[0]
+        # Playback interception must have captured a rendered wav path.
+        self.assertIsNotNone(score.wav_path, "no wav captured from playback")
+        self.assertIn("hello world", tts.spoken_utterances)
+        self.assertEqual(report.mean_wer, 0.0)
+
+
+class TestGracefulImport(unittest.TestCase):
+    """Core ``import ovoscope`` must succeed even without the [tts] extra."""
+
+    def test_import_without_tts_extra(self):
+        # Run in a subprocess with the optional tts deps blocked at import time,
+        # simulating an environment that never installed ovoscope[tts].
+        code = (
+            "import sys, importlib.abc, importlib.machinery\n"
+            "BLOCKED = {'jiwer', 'ovos_utterance_normalizer', "
+            "'ovos_stt_plugin_fasterwhisper', 'faster_whisper'}\n"
+            "class _Block(importlib.abc.MetaPathFinder):\n"
+            "    def find_spec(self, name, path, target=None):\n"
+            "        if name.split('.')[0] in BLOCKED:\n"
+            "            raise ModuleNotFoundError(name=name.split('.')[0])\n"
+            "        return None\n"
+            "sys.meta_path.insert(0, _Block())\n"
+            "for m in list(sys.modules):\n"
+            "    if m.split('.')[0] in BLOCKED:\n"
+            "        del sys.modules[m]\n"
+            "import ovoscope\n"
+            "assert not hasattr(ovoscope, 'TTSIntelligibilityHarness'), "
+            "'harness should be absent without the tts extra'\n"
+            "print('OK')\n"
+        )
+        result = subprocess.run(
+            [sys.executable, "-c", code],
+            capture_output=True, text=True,
+        )
+        self.assertEqual(
+            result.returncode, 0,
+            f"core import failed without tts extra:\nstdout={result.stdout}\nstderr={result.stderr}",
+        )
+        self.assertIn("OK", result.stdout)
+
+
+if __name__ == "__main__":
+    unittest.main()