From 04fc0a552530155b9eb03bc435627d6398f2299c Mon Sep 17 00:00:00 2001
From: JarbasAi <jarbasai@mailfence.com>
Date: Fri, 26 Jun 2026 02:12:53 +0100
Subject: [PATCH] feat: per-clip WakeWordProbe for benchmark/test harnesses
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a self-contained probe that drives a real HotWordEngine over a single clip
the way the listening loop does: a few seconds of leading silence warm the
engine's rolling feature window, then the clip is streamed frame by frame. The
lead matters because openWakeWord et al. only emit once that window is full
(~2.5 s); with too little lead, short positives are silently dropped and the
false-reject rate is inflated. The probe returns a detection decision plus
latency and frames-to-detection. Unlike MiniVoiceLoop it needs no bus or
[listener] extra — just the [bench] extra (numpy). It tolerates both the
HotWordEngine(lang) signature change and the vestigial found_wake_word(frame)
argument.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 ovoscope/wakeword_probe.py            | 162 ++++++++++++++++++++++++++
 pyproject.toml                        |   4 +
 test/unittests/test_wakeword_probe.py |  93 +++++++++++++++
 3 files changed, 259 insertions(+)
 create mode 100644 ovoscope/wakeword_probe.py
 create mode 100644 test/unittests/test_wakeword_probe.py

diff --git a/ovoscope/wakeword_probe.py b/ovoscope/wakeword_probe.py
new file mode 100644
index 0000000..27bf6cc
--- /dev/null
+++ b/ovoscope/wakeword_probe.py
@@ -0,0 +1,162 @@
+"""Lightweight per-clip wake-word detection probe.
+
+Drives a **real** OVOS :class:`HotWordEngine` over a single audio clip the way
+the live listening loop does — a few seconds of leading silence to warm the
+engine's streaming feature buffers, then the clip streamed frame by frame —
+and returns a per-clip detection decision plus latency.
+
+Unlike :class:`ovoscope.voice_loop.MiniVoiceLoop` (which runs the full
+``DinkumVoiceLoop`` state machine and needs the ``[listener]`` extra), this is
+self-contained: no bus, no VAD, no state machine — just ``engine.update()`` /
+``engine.found_wake_word()`` over primed audio. Ideal for plugin test suites
+and benchmarks that score detection on labelled fixtures.
+
+Why the long lead matters
+-------------------------
+Streaming detectors (openWakeWord, microWakeWord, …) only emit a prediction
+once their rolling mel/embedding window is full (~2.5 s of frames). A clip fed
+with too little leading silence never fills that window: the activation is
+missed (a false reject), and on the shortest clips the half-full buffer raises
+a shape mismatch that drops the sample entirely. Priming with a few seconds of
+leading silence fills the window *before* the keyword arrives, exactly as a
+live microphone keeps the loop warm. :data:`PRIME_SECONDS` defaults to 3 s.
+
+Audio contract: mono ``float32`` in ``[-1, 1]`` at the engine's sample rate
+(16 kHz for every OVOS hotword engine). Resample upstream if your source
+differs. Needs the ``[bench]`` extra (numpy).
+"""
+from __future__ import annotations
+
+import inspect
+import time
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+
+SAMPLE_RATE = 16000    # Hz — default rate the probe assumes for input clips
+FRAME_SAMPLES = 1280   # 80 ms @ 16 kHz — the OVOS listener chunk size
+PRIME_SECONDS = 3.0    # leading silence to warm the feature window (see module docstring)
+TAIL_SECONDS = 0.5     # trailing silence so a late activation can settle
+
+
+@dataclass
+class WakeWordDetection:
+    """Outcome of running one clip through a hotword engine."""
+
+    detected: bool  # True if found_wake_word() was truthy on some frame
+    latency_ms: float  # wall-clock ms spent streaming, priming frames included
+    frames_to_detection: Optional[int]  # 1-based, priming included; None on miss
+
+
+def apply_hotword_compat() -> None:
+    """Patch ``HotWordEngine.__init__`` to tolerate a ``lang`` argument.
+
+    Recent wake-word plugins call ``super().__init__(key_phrase, config, lang)``;
+    older ``HotWordEngine`` bases accept only ``(key_phrase, config)``. The
+    replacement forwards just those two and discards ``lang`` and any extras.
+    Patches the class in place; a no-op when the base already accepts ``lang``.
+    """
+    from ovos_plugin_manager.templates import hotwords as hw
+
+    base = hw.HotWordEngine
+    if "lang" in inspect.signature(base.__init__).parameters:
+        return  # base already accepts lang (or was patched by an earlier call)
+    _orig = base.__init__
+
+    def _compat(self, key_phrase="hey_mycroft", config=None, lang=None,
+                *args, **kwargs):
+        _orig(self, key_phrase, config)  # lang / *args / **kwargs are dropped
+
+    base.__init__ = _compat
+
+
+def load_hotword_engine(plugin_id: str, key_phrase: str = "hey_mycroft",
+                        config: Optional[Dict[str, Any]] = None,
+                        lang: str = "en-us"):
+    """Load and instantiate a real OVOS hotword engine by plugin id.
+
+    Applies :func:`apply_hotword_compat` first; if a dashed ``plugin_id`` is not
+    found, retries with underscores. Raises ``ValueError`` when neither resolves.
+    """
+    from ovos_plugin_manager.wakewords import load_wake_word_plugin
+
+    apply_hotword_compat()
+    clazz = load_wake_word_plugin(plugin_id)
+    if clazz is None and "-" in plugin_id:
+        clazz = load_wake_word_plugin(plugin_id.replace("-", "_"))
+    if clazz is None:
+        raise ValueError(f"no wake-word plugin {plugin_id!r}")
+    return clazz(key_phrase, dict(config or {}), lang)  # engine gets a shallow copy
+
+
+class WakeWordProbe:
+    """Drive a real ``HotWordEngine`` over single clips with listener-style priming."""
+
+    def __init__(self, engine, *, sample_rate: int = SAMPLE_RATE,
+                 frame_samples: int = FRAME_SAMPLES,
+                 prime_seconds: float = PRIME_SECONDS,
+                 tail_seconds: float = TAIL_SECONDS):
+        self.engine = engine  # must expose found_wake_word(); update()/reset() are optional
+        self.sample_rate = sample_rate  # Hz, used to size the silence padding
+        self.frame_samples = frame_samples  # samples per streamed chunk
+        self.prime_seconds = prime_seconds  # leading silence, in seconds
+        self.tail_seconds = tail_seconds  # trailing silence, in seconds
+
+    @classmethod
+    def from_plugin(cls, plugin_id: str, key_phrase: str = "hey_mycroft",
+                    config: Optional[Dict[str, Any]] = None,
+                    lang: str = "en-us", **kwargs) -> "WakeWordProbe":
+        """Build a probe from a plugin id; extra ``**kwargs`` go to the constructor."""
+        engine = load_hotword_engine(plugin_id, key_phrase, config, lang)
+        return cls(engine, **kwargs)
+
+    def prime_pad(self, array):
+        """Wrap a clip in leading + trailing silence, padded to whole frames."""
+        import numpy as np
+
+        arr = np.asarray(array, dtype="float32")
+        lead = np.zeros(int(self.sample_rate * self.prime_seconds), dtype="float32")
+        tail = np.zeros(int(self.sample_rate * self.tail_seconds), dtype="float32")
+        out = np.concatenate([lead, arr, tail])
+        rem = len(out) % self.frame_samples  # samples past the last whole frame
+        if rem:  # zero-fill up to the next frame boundary
+            out = np.concatenate(
+                [out, np.zeros(self.frame_samples - rem, dtype="float32")])
+        return out
+
+    @staticmethod
+    def to_pcm16(array) -> bytes:
+        """Float32 ``[-1, 1]`` mono array → 16-bit little-endian PCM bytes."""
+        import numpy as np
+
+        arr = np.clip(np.asarray(array, dtype="float32"), -1.0, 1.0)  # clamp out-of-range samples
+        return (arr * 32767.0).astype("<i2").tobytes()
+
+    def detect(self, array) -> WakeWordDetection:
+        """Stream one clip through the engine; return the detection decision.
+
+        Resets the engine (best effort), feeds the primed clip chunk by chunk via
+        ``update(chunk_bytes)`` when present, and polls ``found_wake_word()`` after
+        each chunk, stopping at the first hit. Some plugins keep a vestigial
+        ``found_wake_word(frame_data)`` argument — the chunk is passed if accepted.
+        """
+        primed = self.prime_pad(array)
+        if hasattr(self.engine, "reset"):
+            try:
+                self.engine.reset()
+            except Exception:
+                pass  # best effort: a failed reset() is ignored
+        fww = self.engine.found_wake_word
+        fww_takes_arg = len(inspect.signature(fww).parameters) >= 1
+        has_update = hasattr(self.engine, "update")
+        pcm = self.to_pcm16(primed)
+        step = self.frame_samples * 2  # 2 bytes / sample (int16)
+        start = time.perf_counter()  # timer starts before the priming frames are fed
+        for i, off in enumerate(range(0, len(pcm), step), 1):  # i: 1-based frame count
+            chunk = pcm[off:off + step]
+            if has_update:
+                self.engine.update(chunk)
+            if (fww(chunk) if fww_takes_arg else fww()):
+                latency = (time.perf_counter() - start) * 1000
+                return WakeWordDetection(True, round(latency, 3), i)
+        latency = (time.perf_counter() - start) * 1000
+        return WakeWordDetection(False, round(latency, 3), None)
diff --git a/pyproject.toml b/pyproject.toml
index 6a12195..5a55bcb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,6 +47,9 @@ media = ["ovos-media>=0.0.2a3"]
 # AudioTransformersService. >=0.7.2a1 is the first release that allows
 # ovos-bus-client 2.x (older pins cap it <2.0.0 and conflict with ovos-core).
 listener = ["ovos-dinkum-listener>=0.7.2a1"]
+# Per-clip WakeWordProbe (ovoscope.wakeword_probe): streams audio arrays through
+# a real HotWordEngine. Only needs numpy; the engine plugin is installed separately.
+bench = ["numpy"]
 # End-to-end TTS intelligibility scoring (WER/CER round-trip via reference STT).
 # faster-whisper itself is pulled by the plugin — don't list it here to avoid
 # version skew.
@@ -61,6 +64,7 @@ dev = [
     "ovos-media>=0.0.2a3",
     "ovos-dinkum-listener>=0.7.2a1",
     "ovos-pydantic-models>=0.1.0",
+    "numpy",
     "jiwer",
     "ovos-utterance-normalizer",
     "ovos-stt-plugin-fasterwhisper",
diff --git a/test/unittests/test_wakeword_probe.py b/test/unittests/test_wakeword_probe.py
new file mode 100644
index 0000000..3c35ee8
--- /dev/null
+++ b/test/unittests/test_wakeword_probe.py
@@ -0,0 +1,93 @@
+"""Tests for the per-clip WakeWordProbe (no real engine / no heavy deps)."""
+import inspect
+import unittest
+
+import numpy as np
+
+from ovoscope.wakeword_probe import (
+    FRAME_SAMPLES,
+    PRIME_SECONDS,
+    SAMPLE_RATE,
+    WakeWordDetection,
+    WakeWordProbe,
+)
+
+
+class _FakeEngine:
+    """Test double: latches after ``trigger_after`` update() calls; a read clears it."""
+
+    def __init__(self, trigger_after=None, takes_arg=False):
+        self.trigger_after = trigger_after  # None means the engine never fires
+        self._calls = 0  # update() calls seen since the last reset()
+        self._fired = False  # latch read (and cleared) by found_wake_word
+        self.reset_count = 0  # lets tests assert that the probe called reset()
+        # Build found_wake_word with or without the vestigial frame arg, so the
+        # probe's signature sniffing is exercised both ways.
+        if takes_arg:
+            def found_wake_word(frame_data=b""):
+                return self._read()
+        else:
+            def found_wake_word():
+                return self._read()
+        self.found_wake_word = found_wake_word
+
+    def update(self, chunk: bytes):
+        self._calls += 1
+        if self.trigger_after is not None and self._calls >= self.trigger_after:
+            self._fired = True  # re-latches on every later update() as well
+
+    def _read(self):
+        fired, self._fired = self._fired, False  # read-and-clear
+        return fired
+
+    def reset(self):
+        self.reset_count += 1
+        self._calls = 0
+        self._fired = False
+
+
+class TestPrimePad(unittest.TestCase):
+    def test_leads_with_silence_and_pads_to_frames(self):
+        probe = WakeWordProbe(_FakeEngine())
+        clip = np.ones(1000, dtype="float32")
+        out = probe.prime_pad(clip)
+        # whole number of frames
+        self.assertEqual(len(out) % FRAME_SAMPLES, 0)
+        # at least PRIME_SECONDS of leading silence before any signal
+        lead = int(SAMPLE_RATE * PRIME_SECONDS)
+        self.assertTrue(np.all(out[:lead] == 0.0))
+        # the buffer is long enough to hold the lead plus the whole clip
+        self.assertGreaterEqual(len(out), lead + len(clip))
+
+    def test_default_prime_is_a_few_seconds(self):
+        self.assertGreaterEqual(PRIME_SECONDS, 2.5)  # must cover the ~2.5 s feature window
+
+
+class TestDetect(unittest.TestCase):
+    def test_detects_and_reports_frames_and_latency(self):
+        engine = _FakeEngine(trigger_after=5)
+        probe = WakeWordProbe(engine)
+        result = probe.detect(np.zeros(SAMPLE_RATE, dtype="float32"))
+        self.assertIsInstance(result, WakeWordDetection)
+        self.assertTrue(result.detected)
+        self.assertEqual(result.frames_to_detection, 5)  # 1-based: 5th update() latches
+        self.assertGreaterEqual(result.latency_ms, 0.0)
+        self.assertEqual(engine.reset_count, 1)  # reset before streaming
+
+    def test_no_detection_returns_none_frames(self):
+        probe = WakeWordProbe(_FakeEngine(trigger_after=None))  # engine never fires
+        result = probe.detect(np.zeros(SAMPLE_RATE, dtype="float32"))
+        self.assertFalse(result.detected)
+        self.assertIsNone(result.frames_to_detection)
+
+    def test_handles_found_wake_word_with_frame_arg(self):
+        engine = _FakeEngine(trigger_after=3, takes_arg=True)
+        self.assertGreaterEqual(  # sanity: the fake really exposes the frame arg
+            len(inspect.signature(engine.found_wake_word).parameters), 1)
+        result = WakeWordProbe(engine).detect(
+            np.zeros(SAMPLE_RATE, dtype="float32"))
+        self.assertTrue(result.detected)
+
+
+if __name__ == "__main__":
+    unittest.main()