From 03e115d24622668c45aa29da8da927884c5782b3 Mon Sep 17 00:00:00 2001 From: JarbasAi Date: Mon, 15 Jun 2026 21:14:24 +0100 Subject: [PATCH] feat: TTS end-to-end intelligibility harness Add ovoscope/tts_intelligibility.py: synthesise speech with a TTS plugin under test, transcribe the rendered audio back with a reference STT (faster-whisper tiny), and score the round-trip with WER/CER. - score_tts_intelligibility() + TTSIntelligibilityHarness context manager returning an IntelligibilityReport (per-utterance UtteranceScore, mean WER/CER, to_dict/to_markdown_row). - mode="playback" drives the full ovos-audio stack and captures the rendered WAV via a play_audio side_effect; mode="direct" calls tts.get_tts directly. - Extend PlaybackServiceHarness with a tts= arg (default MockTTS, backward compatible) and a captured_wavs list. - Add [tts] optional extra; graceful optional import in __init__. - Unit tests (MockTTS + MockSTT, no model download) cover WER/CER math, report aggregation, playback wav capture, and graceful import. Co-Authored-By: Claude Opus 4.8 --- ovoscope/__init__.py | 22 ++ ovoscope/audio.py | 31 +- ovoscope/tts_intelligibility.py | 410 +++++++++++++++++++++ pyproject.toml | 12 + test/unittests/test_tts_intelligibility.py | 192 ++++++++++ 5 files changed, 662 insertions(+), 5 deletions(-) create mode 100644 ovoscope/tts_intelligibility.py create mode 100644 test/unittests/test_tts_intelligibility.py diff --git a/ovoscope/__init__.py b/ovoscope/__init__.py index 8816468..56eccda 100644 --- a/ovoscope/__init__.py +++ b/ovoscope/__init__.py @@ -1095,6 +1095,28 @@ def assert_spoke(self, text: str, lang: str = "en-US", timeout: int = 30) -> Non else: raise +try: + from ovoscope.tts_intelligibility import ( # noqa: F401 + TTSIntelligibilityHarness, + IntelligibilityReport, + UtteranceScore, + score_tts_intelligibility, + ) +except ImportError as e: + # Optional [tts] extra. Silence only when the missing module is one of the + # optional TTS-scoring deps; a logic error in a present lib must re-raise. + _TTS_OPTIONAL_MODULES = ( + "jiwer", + "ovos_audio", "ovos_audio.audio", + "ovos_utterance_normalizer", + "ovos_stt_plugin_fasterwhisper", + "faster_whisper", + ) + if isinstance(e, ModuleNotFoundError) and e.name in _TTS_OPTIONAL_MODULES: + pass + else: + raise + try: from ovoscope.listener import ( # noqa: F401 MiniListener, diff --git a/ovoscope/audio.py b/ovoscope/audio.py index d7e7e81..fd82dba 100644 --- a/ovoscope/audio.py +++ b/ovoscope/audio.py @@ -518,21 +518,32 @@ class PlaybackServiceHarness: Args: validate_source: Enable session-source validation in the service. disable_ocp: Disable legacy OCP in the encapsulated AudioService. + tts: TTS instance to drive the PlaybackService with. Defaults to a + fresh ``MockTTS()`` (backward compatible). Pass a real TTS plugin + to synthesise actual audio — the rendered WAV path of each + utterance is captured in :attr:`captured_wavs`. """ def __init__(self, validate_source: bool = False, - disable_ocp: bool = True) -> None: + disable_ocp: bool = True, + tts: Optional[TTS] = None) -> None: """Initialise harness parameters. Args: validate_source: Enable session-source validation. disable_ocp: Disable OCP audio plugin. + tts: TTS instance to inject. Defaults to ``MockTTS()`` when None. """ self.validate_source: bool = validate_source self.disable_ocp: bool = disable_ocp self.bus: Optional[FakeBus] = None self.svc = None # PlaybackService instance - self.mock_tts: Optional[MockTTS] = None + # ``mock_tts`` keeps its historic name for backward compatibility but + # holds whatever TTS was injected (real plugin or MockTTS). + self.tts: Optional[TTS] = tts + self.mock_tts: Optional[TTS] = None + # Paths captured from the ``play_audio`` side_effect, in playback order. + self.captured_wavs: List[str] = [] self._play_audio_patcher = None self._audio_enabled_patcher = None self._audio_output_start = threading.Event() @@ -558,15 +569,25 @@ def __enter__(self) -> "PlaybackServiceHarness": TTS.queue = Queue() self.bus = FakeBus() - self.mock_tts = MockTTS() + # Inject the provided TTS (real plugin) or fall back to MockTTS. + self.mock_tts = self.tts if self.tts is not None else MockTTS() - # Patch play_audio so no real audio device is accessed + # Patch play_audio so no real audio device is accessed. The side_effect + # records the first positional arg — the rendered WAV path + # (ovos_audio/playback.py: ``self.p = play_audio(data)``) — so callers + # can round-trip the synthesised audio through a reference STT. mock_proc = MagicMock() mock_proc.communicate.return_value = (b"", b"") mock_proc.wait.return_value = 0 + self.captured_wavs = [] + + def _capture_play_audio(data, *args, **kwargs): + self.captured_wavs.append(data) + return mock_proc + self._play_audio_patcher = patch( - "ovos_audio.playback.play_audio", return_value=mock_proc + "ovos_audio.playback.play_audio", side_effect=_capture_play_audio ) self._play_audio_patcher.start() diff --git a/ovoscope/tts_intelligibility.py b/ovoscope/tts_intelligibility.py new file mode 100644 index 0000000..6489b67 --- /dev/null +++ b/ovoscope/tts_intelligibility.py @@ -0,0 +1,410 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""End-to-end TTS intelligibility scoring for ovoscope. + +Synthesises speech with a TTS plugin under test, transcribes the rendered +audio back with a reference STT, and scores the round-trip with word- and +character-error-rate (WER/CER). This catches regressions that file-existence +unit tests miss — garbled audio, wrong sample rate, broken transforms, silent +output — and gives every TTS plugin a comparable intelligibility number. + +Two synthesis modes: + +* ``"playback"`` (default) drives the full ovos-audio stack + (``speak`` -> PlaybackService -> tts.execute -> get_tts -> tts_transform -> + play_audio) via :class:`ovoscope.audio.PlaybackServiceHarness`, with the real + plugin injected. The rendered WAV is captured from the patched ``play_audio``. +* ``"direct"`` calls ``tts.get_tts(utterance, wav_path, ...)`` directly with no + bus — a fallback for engines that hang under the playback thread or when + ``ovos_audio`` is unavailable. + +Public API: + score_tts_intelligibility -- convenience function returning an IntelligibilityReport + TTSIntelligibilityHarness -- context manager form of the above + IntelligibilityReport -- aggregate report with mean WER/CER + serialisation + UtteranceScore -- per-utterance result +""" + +import dataclasses +import os +import re +import shutil +import string +import tempfile +import threading +from typing import Any, List, Optional + +import jiwer + +from ovos_plugin_manager.utils.audio import AudioFile + +# Module-level singleton for the reference STT — model load is expensive. +_REFERENCE_STT: Optional[Any] = None +_REFERENCE_STT_LOCK = threading.Lock() + +# Module-level singleton for the utterance normaliser (cheap, but shared). +_NORMALIZER: Optional[Any] = None + + +def get_reference_stt() -> Any: + """Return a lazily-instantiated faster-whisper ``tiny`` reference STT. + + The model is loaded once per process and reused. ``beam_size=1`` and + ``compute_type="int8"`` keep it deterministic and light enough for CI. + + Returns: + A ready-to-use ``FasterWhisperSTT`` instance. + """ + global _REFERENCE_STT + if _REFERENCE_STT is None: + with _REFERENCE_STT_LOCK: + if _REFERENCE_STT is None: + from ovos_stt_plugin_fasterwhisper import FasterWhisperSTT + _REFERENCE_STT = FasterWhisperSTT({ + "model": "tiny", + "compute_type": "int8", + "beam_size": 1, + }) + return _REFERENCE_STT + + +def _normalize(text: str, lang: str) -> str: + """Normalise a transcript for fair WER/CER scoring. + + Uses ``ovos_utterance_normalizer`` (lowercase, number expansion, + contraction expansion, punctuation strip) so cosmetic differences between + reference and hypothesis don't inflate the error rate. The normaliser + yields several variants per input; the last one is the fully-normalised + form, which is what we score against. + + Args: + text: The raw text to normalise. + lang: BCP-47 language tag (e.g. ``"en-US"``). + + Returns: + A single normalised string (whitespace-collapsed, lowercased). + """ + global _NORMALIZER + if _NORMALIZER is None: + from ovos_utterance_normalizer import UtteranceNormalizerPlugin + _NORMALIZER = UtteranceNormalizerPlugin() + if not text: + return "" + variants, _ = _NORMALIZER.transform([text], {"lang": lang}) + # transform() emits [contraction-expanded, original, number-normalized] + # per utterance, deduplicated and order-preserving. The first variant + # (contractions expanded, punctuation stripped, words kept as words) is the + # stable choice — the number-collapsed variant ("two" -> "2") would create + # spurious mismatches against a word-emitting STT. Lowercasing/whitespace + # collapse are applied here so both reference and hypothesis align. + normalized = variants[0] if variants else text + # The normaliser only strips leading/trailing punctuation of the whole + # string; interior punctuation (e.g. a tokenised comma) survives and would + # inflate WER. Strip all punctuation characters here, then collapse space. + normalized = re.sub(rf"[{re.escape(string.punctuation)}]", " ", normalized) + return " ".join(normalized.lower().split()) + + +def _score_pair(reference: str, hypothesis: str) -> "tuple[float, float]": + """Compute (WER, CER) for a reference/hypothesis pair. + + jiwer raises on an empty reference, so empty references are handled + explicitly: a non-empty hypothesis against an empty reference scores 1.0 + (fully wrong), two empties score 0.0 (trivially correct). + + Args: + reference: Normalised ground-truth string. + hypothesis: Normalised transcript from the reference STT. + + Returns: + Tuple of ``(wer, cer)`` as floats. + """ + if not reference: + return (0.0, 0.0) if not hypothesis else (1.0, 1.0) + wer = float(jiwer.wer(reference, hypothesis)) + cer = float(jiwer.cer(reference, hypothesis)) + return wer, cer + + +@dataclasses.dataclass +class UtteranceScore: + """Per-utterance intelligibility result. + + Attributes: + utterance: The text that was synthesised (the ground truth). + transcript: What the reference STT heard back. + wer: Word error rate of transcript vs utterance (0.0 = perfect). + cer: Character error rate of transcript vs utterance. + wav_path: Path to the captured rendered WAV (may be None on failure). + lang: BCP-47 language tag used for synthesis and scoring. + voice: Voice identifier used, if any. + """ + + utterance: str + transcript: str + wer: float + cer: float + wav_path: Optional[str] = None + lang: str = "en-US" + voice: Optional[str] = None + + def to_dict(self) -> dict: + """Return a JSON-serialisable dict of this score.""" + return { + "utterance": self.utterance, + "transcript": self.transcript, + "wer": round(self.wer, 4), + "cer": round(self.cer, 4), + "wav_path": self.wav_path, + "lang": self.lang, + "voice": self.voice, + } + + +@dataclasses.dataclass +class IntelligibilityReport: + """Aggregate intelligibility report over a set of utterances. + + Attributes: + scores: Per-utterance :class:`UtteranceScore` results. + lang: BCP-47 language tag the run used. + voice: Voice identifier the run used, if any. + mode: Synthesis mode used (``"playback"`` or ``"direct"``). + """ + + scores: List[UtteranceScore] = dataclasses.field(default_factory=list) + lang: str = "en-US" + voice: Optional[str] = None + mode: str = "playback" + + @property + def mean_wer(self) -> float: + """Mean word error rate across all scored utterances (0.0 if empty).""" + if not self.scores: + return 0.0 + return sum(s.wer for s in self.scores) / len(self.scores) + + @property + def mean_cer(self) -> float: + """Mean character error rate across all scored utterances (0.0 if empty).""" + if not self.scores: + return 0.0 + return sum(s.cer for s in self.scores) / len(self.scores) + + def to_dict(self) -> dict: + """Return a JSON-serialisable dict of the full report.""" + return { + "lang": self.lang, + "voice": self.voice, + "mode": self.mode, + "mean_wer": round(self.mean_wer, 4), + "mean_cer": round(self.mean_cer, 4), + "n_utterances": len(self.scores), + "scores": [s.to_dict() for s in self.scores], + } + + def to_markdown_row(self) -> str: + """Return a single markdown table row: ``| voice | lang | mean_wer | mean_cer | n |``.""" + voice = self.voice or "default" + return ( + f"| {voice} | {self.lang} | " + f"{self.mean_wer:.3f} | {self.mean_cer:.3f} | {len(self.scores)} |" + ) + + +class TTSIntelligibilityHarness: + """Context manager that scores TTS intelligibility end-to-end. + + Usage:: + + with TTSIntelligibilityHarness(tts, lang="en-US") as h: + report = h.score(["hello world", "what time is it"]) + print(report.mean_wer) + + In ``mode="playback"`` the harness owns a :class:`PlaybackServiceHarness` + for its lifetime; in ``mode="direct"`` no bus is started. A temp directory + holds copies of the rendered WAVs (the TTS cache may delete originals); it + is cleaned up on exit. + + Args: + tts: The TTS plugin under test. + lang: BCP-47 language tag for synthesis and scoring. + voice: Optional voice identifier passed to ``get_tts``. + reference_stt: STT used to transcribe. Defaults to the lazy + faster-whisper ``tiny`` singleton. + mode: ``"playback"`` (full ovos-audio stack) or ``"direct"`` + (``tts.get_tts`` only). + speak_timeout: Per-utterance timeout for playback mode. + """ + + def __init__(self, tts: Any, *, lang: str = "en-US", + voice: Optional[str] = None, + reference_stt: Optional[Any] = None, + mode: str = "playback", + speak_timeout: float = 30.0) -> None: + if mode not in ("playback", "direct"): + raise ValueError(f"mode must be 'playback' or 'direct', got {mode!r}") + self.tts = tts + self.lang = lang + self.voice = voice + self._reference_stt = reference_stt + self.mode = mode + self.speak_timeout = speak_timeout + self._tmpdir: Optional[str] = None + self._playback = None # PlaybackServiceHarness in playback mode + + @property + def reference_stt(self) -> Any: + """The reference STT, lazily resolved to the faster-whisper singleton.""" + if self._reference_stt is None: + self._reference_stt = get_reference_stt() + return self._reference_stt + + def __enter__(self) -> "TTSIntelligibilityHarness": + self._tmpdir = tempfile.mkdtemp(prefix="ovoscope-tts-") + if self.mode == "playback": + from ovoscope.audio import PlaybackServiceHarness + self._playback = PlaybackServiceHarness(tts=self.tts) + self._playback.__enter__() + return self + + def __exit__(self, *args) -> None: + if self._playback is not None: + try: + self._playback.__exit__(*args) + except Exception: + pass + self._playback = None + if self._tmpdir and os.path.isdir(self._tmpdir): + shutil.rmtree(self._tmpdir, ignore_errors=True) + self._tmpdir = None + + # ------------------------------------------------------------------ + # Synthesis + # ------------------------------------------------------------------ + + def _render_playback(self, utterance: str) -> Optional[str]: + """Synthesise via the full ovos-audio stack; return a copied WAV path.""" + before = len(self._playback.captured_wavs) + self._playback.speak(utterance, timeout=self.speak_timeout) + captured = self._playback.captured_wavs[before:] + if not captured: + return None + return self._copy_out(captured[-1]) + + def _render_direct(self, utterance: str) -> Optional[str]: + """Synthesise via ``tts.get_tts`` directly; return the WAV path.""" + wav_path = os.path.join( + self._tmpdir, f"direct_{abs(hash(utterance)) & 0xffffffff}.wav" + ) + self.tts.get_tts(utterance, wav_path, lang=self.lang, voice=self.voice) + return wav_path if os.path.isfile(wav_path) else None + + def _copy_out(self, wav_path: str) -> Optional[str]: + """Copy a rendered WAV into the harness temp dir before the cache prunes it.""" + if not wav_path or not os.path.isfile(wav_path): + return wav_path if wav_path and os.path.isfile(wav_path) else None + dst = os.path.join( + self._tmpdir, f"play_{len(os.listdir(self._tmpdir))}_{os.path.basename(wav_path)}" + ) + try: + shutil.copyfile(wav_path, dst) + return dst + except OSError: + return wav_path + + # ------------------------------------------------------------------ + # Scoring + # ------------------------------------------------------------------ + + def _transcribe(self, wav_path: str) -> str: + """Round-trip a WAV through the reference STT and return the transcript.""" + with AudioFile(wav_path) as source: + audio = source.read() + return self.reference_stt.execute(audio, language=self.lang) or "" + + def score_one(self, utterance: str) -> UtteranceScore: + """Synthesise, transcribe, and score a single utterance. + + Args: + utterance: The text to synthesise and score. + + Returns: + An :class:`UtteranceScore`. On synthesis/transcription failure the + transcript is empty and WER/CER reflect a total miss. + """ + if self.mode == "playback": + wav_path = self._render_playback(utterance) + else: + wav_path = self._render_direct(utterance) + + transcript = "" + if wav_path and os.path.isfile(wav_path): + try: + transcript = self._transcribe(wav_path) + except Exception: + transcript = "" + + ref = _normalize(utterance, self.lang) + hyp = _normalize(transcript, self.lang) + wer, cer = _score_pair(ref, hyp) + return UtteranceScore( + utterance=utterance, + transcript=transcript, + wer=wer, + cer=cer, + wav_path=wav_path, + lang=self.lang, + voice=self.voice, + ) + + def score(self, utterances: List[str]) -> IntelligibilityReport: + """Score a list of utterances and return the aggregate report. + + Args: + utterances: Phrases to synthesise and score. + + Returns: + An :class:`IntelligibilityReport`. + """ + report = IntelligibilityReport(lang=self.lang, voice=self.voice, mode=self.mode) + for utt in utterances: + report.scores.append(self.score_one(utt)) + return report + + +def score_tts_intelligibility(tts: Any, utterances: List[str], *, + lang: str = "en-US", + voice: Optional[str] = None, + reference_stt: Optional[Any] = None, + mode: str = "playback", + speak_timeout: float = 30.0) -> IntelligibilityReport: + """Synthesise, transcribe, and score a set of utterances in one call. + + Args: + tts: The TTS plugin under test. + utterances: Phrases to synthesise and score. + lang: BCP-47 language tag for synthesis and scoring. + voice: Optional voice identifier passed to ``get_tts``. + reference_stt: STT used to transcribe. Defaults to faster-whisper tiny. + mode: ``"playback"`` (full ovos-audio stack) or ``"direct"``. + speak_timeout: Per-utterance timeout for playback mode. + + Returns: + An :class:`IntelligibilityReport` with per-utterance and mean scores. + """ + with TTSIntelligibilityHarness( + tts, lang=lang, voice=voice, reference_stt=reference_stt, + mode=mode, speak_timeout=speak_timeout, + ) as harness: + return harness.score(utterances) diff --git a/pyproject.toml b/pyproject.toml index 6ec33d0..a729214 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,9 +32,21 @@ classifiers = [ [project.optional-dependencies] pydantic = ["ovos-pydantic-models>=0.1.0"] audio = ["ovos-audio>=1.2.0"] +# End-to-end TTS intelligibility scoring (WER/CER round-trip via reference STT). +# faster-whisper itself is pulled by the plugin — don't list it here to avoid +# version skew. +tts = [ + "ovos-audio>=1.2.0", + "jiwer", + "ovos-utterance-normalizer", + "ovos-stt-plugin-fasterwhisper", +] dev = [ "ovos-audio>=1.2.0", "ovos-pydantic-models>=0.1.0", + "jiwer", + "ovos-utterance-normalizer", + "ovos-stt-plugin-fasterwhisper", "pytest-cov", ] diff --git a/test/unittests/test_tts_intelligibility.py b/test/unittests/test_tts_intelligibility.py new file mode 100644 index 0000000..283f40e --- /dev/null +++ b/test/unittests/test_tts_intelligibility.py @@ -0,0 +1,192 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for ovoscope.tts_intelligibility. + +These tests use a MockTTS (silent WAV) and a MockSTT that echoes a fixed +transcript — no model download, no real audio. They cover WER/CER math, report +aggregation, serialisation, and that playback interception captures a wav path. +""" + +import importlib.util +import subprocess +import sys +import unittest + +TTS_AVAILABLE = ( + importlib.util.find_spec("jiwer") is not None + and importlib.util.find_spec("ovos_audio") is not None + and importlib.util.find_spec("ovos_utterance_normalizer") is not None +) + + +@unittest.skipUnless(TTS_AVAILABLE, "tts extra (jiwer/ovos-audio/normalizer) not installed") +class TestScoring(unittest.TestCase): + """WER/CER math, normalisation and report aggregation — no synthesis.""" + + def setUp(self): + from ovoscope import tts_intelligibility as ti + self.ti = ti + + def test_perfect_match_is_zero(self): + wer, cer = self.ti._score_pair("hello world", "hello world") + self.assertEqual(wer, 0.0) + self.assertEqual(cer, 0.0) + + def test_one_wrong_word_half_wer(self): + wer, _ = self.ti._score_pair("hello world", "hello there") + self.assertAlmostEqual(wer, 0.5, places=3) + + def test_empty_reference_handling(self): + self.assertEqual(self.ti._score_pair("", ""), (0.0, 0.0)) + self.assertEqual(self.ti._score_pair("", "noise"), (1.0, 1.0)) + + def test_normalize_strips_case_and_punctuation(self): + # "Hello, World!" should normalise to "hello world" so it scores 0 + # against the lowercased ground truth. + wer, _ = self.ti._score_pair( + self.ti._normalize("Hello, World!", "en-US"), + self.ti._normalize("hello world", "en-US"), + ) + self.assertEqual(wer, 0.0) + + def test_report_aggregation(self): + UtteranceScore = self.ti.UtteranceScore + IntelligibilityReport = self.ti.IntelligibilityReport + report = IntelligibilityReport(lang="en-US", voice="alan") + report.scores.append(UtteranceScore("a", "a", 0.0, 0.0, lang="en-US")) + report.scores.append(UtteranceScore("b", "x", 1.0, 1.0, lang="en-US")) + self.assertAlmostEqual(report.mean_wer, 0.5) + self.assertAlmostEqual(report.mean_cer, 0.5) + + def test_empty_report_means_zero(self): + report = self.ti.IntelligibilityReport() + self.assertEqual(report.mean_wer, 0.0) + self.assertEqual(report.mean_cer, 0.0) + + def test_to_dict_and_markdown_row(self): + UtteranceScore = self.ti.UtteranceScore + report = self.ti.IntelligibilityReport(lang="en-US", voice="alan") + report.scores.append(UtteranceScore("a", "a", 0.0, 0.0, lang="en-US")) + d = report.to_dict() + self.assertEqual(d["lang"], "en-US") + self.assertEqual(d["voice"], "alan") + self.assertEqual(d["n_utterances"], 1) + self.assertEqual(d["mean_wer"], 0.0) + self.assertIn("scores", d) + row = report.to_markdown_row() + self.assertIn("alan", row) + self.assertIn("en-US", row) + self.assertTrue(row.startswith("|") and row.endswith("|")) + + +class MockSTT: + """Reference STT stub that echoes a fixed transcript regardless of audio.""" + + def __init__(self, transcript="hello world"): + self.transcript = transcript + self.calls = 0 + + def execute(self, audio, language=None): + self.calls += 1 + return self.transcript + + +@unittest.skipUnless(TTS_AVAILABLE, "tts extra (jiwer/ovos-audio/normalizer) not installed") +class TestHarnessDirectMode(unittest.TestCase): + """Direct mode: tts.get_tts only, MockSTT echo — no bus, no model.""" + + def test_direct_mode_perfect_score(self): + from ovoscope.audio import MockTTS + from ovoscope.tts_intelligibility import score_tts_intelligibility + + stt = MockSTT("hello world") + report = score_tts_intelligibility( + MockTTS(), ["hello world"], + reference_stt=stt, mode="direct", + ) + self.assertEqual(len(report.scores), 1) + self.assertEqual(report.mean_wer, 0.0) + self.assertEqual(stt.calls, 1) + self.assertIsNotNone(report.scores[0].wav_path) + + def test_direct_mode_mismatch_scores_high(self): + from ovoscope.audio import MockTTS + from ovoscope.tts_intelligibility import score_tts_intelligibility + + report = score_tts_intelligibility( + MockTTS(), ["completely different text here"], + reference_stt=MockSTT("hello world"), mode="direct", + ) + self.assertGreater(report.mean_wer, 0.0) + + +@unittest.skipUnless(TTS_AVAILABLE, "tts extra (jiwer/ovos-audio/normalizer) not installed") +class TestHarnessPlaybackMode(unittest.TestCase): + """Playback mode: full ovos-audio stack drives MockTTS; wav is captured.""" + + def test_playback_captures_wav_and_scores(self): + from ovoscope.audio import MockTTS + from ovoscope.tts_intelligibility import TTSIntelligibilityHarness + + tts = MockTTS() + stt = MockSTT("hello world") + with TTSIntelligibilityHarness( + tts, reference_stt=stt, mode="playback", speak_timeout=15.0, + ) as h: + report = h.score(["hello world"]) + + self.assertEqual(len(report.scores), 1) + score = report.scores[0] + # Playback interception must have captured a rendered wav path. + self.assertIsNotNone(score.wav_path, "no wav captured from playback") + self.assertIn("hello world", tts.spoken_utterances) + self.assertEqual(report.mean_wer, 0.0) + + +class TestGracefulImport(unittest.TestCase): + """Core ``import ovoscope`` must succeed even without the [tts] extra.""" + + def test_import_without_tts_extra(self): + # Run in a subprocess with the optional tts deps blocked at import time, + # simulating an environment that never installed ovoscope[tts]. + code = ( + "import sys, importlib.abc, importlib.machinery\n" + "BLOCKED = {'jiwer', 'ovos_utterance_normalizer', " + "'ovos_stt_plugin_fasterwhisper', 'faster_whisper'}\n" + "class _Block(importlib.abc.MetaPathFinder):\n" + " def find_spec(self, name, path, target=None):\n" + " if name.split('.')[0] in BLOCKED:\n" + " raise ModuleNotFoundError(name=name.split('.')[0])\n" + " return None\n" + "sys.meta_path.insert(0, _Block())\n" + "for m in list(sys.modules):\n" + " if m.split('.')[0] in BLOCKED:\n" + " del sys.modules[m]\n" + "import ovoscope\n" + "assert not hasattr(ovoscope, 'TTSIntelligibilityHarness'), " + "'harness should be absent without the tts extra'\n" + "print('OK')\n" + ) + result = subprocess.run( + [sys.executable, "-c", code], + capture_output=True, text=True, + ) + self.assertEqual( + result.returncode, 0, + f"core import failed without tts extra:\nstdout={result.stdout}\nstderr={result.stderr}", + ) + self.assertIn("OK", result.stdout) + + +if __name__ == "__main__": + unittest.main()