From f7ec2655e2e1f605db50d9eac11cc3437e08a57a Mon Sep 17 00:00:00 2001
From: iancharest <charest.ian@gmail.com>
Date: Thu, 11 Jun 2026 20:08:50 -0400
Subject: [PATCH] Add hardware latency stats

---
 CHANGELOG.md                      |   3 +
 README.md                         |   3 +
 docs/architecture.md              |   8 +-
 src/tachyaudio/_native.c          | 166 +++++++++++++++++++++++++++++-
 src/tachyaudio/_native_backend.py |   2 +
 src/tachyaudio/_stream.py         |   3 +
 tests/test_public_api.py          |  10 +-
 7 files changed, 188 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f2eaa3c..19bf16f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,9 @@ public APIs may still change while the backend design stabilizes.
 
 - Refactored native ring-buffer and stream-stat helpers without changing public
   behavior.
+- Added `hardware_latency` to stream statistics; `estimated_latency` now includes
+  it. macOS reports Core Audio device latency plus safety offset; Linux reports
+  `None` until a reliable miniaudio latency value is available.
 
 ## [0.2.0a2] - 2026-06-10
 
diff --git a/README.md b/README.md
index a20f45f..1c11b7a 100644
--- a/README.md
+++ b/README.md
@@ -57,8 +57,11 @@ Lifecycle semantics:
 
 - `frames_processed`: frames consumed by the backend
 - `underruns` / `overruns`: buffer starvation or rejected writes
+- `hardware_latency`: backend-reported device latency in seconds, or `None` when the backend does not report one
 - `queued_frames`: frames currently waiting in the native ring
 - `queued_latency`: queued ring duration in seconds
+- `estimated_latency`: `queued_latency + hardware_latency` when hardware latency
+  is available, otherwise queued latency
 - `buffer_size`: native callback buffer size in frames
 
 ## Development
diff --git a/docs/architecture.md b/docs/architecture.md
index 7a9f105..453ffbc 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -35,9 +35,11 @@ captured float32 frames into a native ring buffer, and Python reads available
 frames without invoking Python from the audio callback.
 
 Stream statistics distinguish queue state from hardware behavior. `queued_frames`
-and `queued_latency` describe the native ring buffer. `buffer_size` describes
-the callback buffer size in frames. `estimated_latency` currently aliases queued
-latency; future backends should include hardware output latency when available.
+and `queued_latency` describe the native ring buffer. `hardware_latency`
+describes backend-reported device latency when available. `buffer_size`
+describes the callback buffer size in frames. `estimated_latency` reports
+`queued_latency + hardware_latency` when hardware latency is available, otherwise
+queued latency.
 
 ## Real-time constraints
 
diff --git a/src/tachyaudio/_native.c b/src/tachyaudio/_native.c
index 2b5a227..668af0b 100644
--- a/src/tachyaudio/_native.c
+++ b/src/tachyaudio/_native.c
@@ -50,6 +50,44 @@ static size_t tachy_ring_copy_out_raw(
 }
 
 static PyObject *tachy_build_stream_stats(
+    unsigned long long frames_processed,
+    unsigned int underruns,
+    unsigned int overruns,
+    unsigned long long queued_frames,
+    double queued_latency,
+    double hardware_latency,  /* only read when has_hardware_latency is nonzero */
+    int has_hardware_latency,  /* zero => "hardware_latency" is reported as None */
+    unsigned int buffer_size)
+{  /* Build the stats dict; returns a new reference, or NULL if an allocation fails. */
+    double estimated_latency = queued_latency;  /* fallback when no hardware latency is known */
+    PyObject *hardware_latency_object = Py_None;  /* not yet owned; each branch below takes a reference */
+    if (has_hardware_latency) {
+        estimated_latency += hardware_latency;  /* estimated = queued + hardware */
+        hardware_latency_object = PyFloat_FromDouble(hardware_latency);  /* new reference */
+        if (hardware_latency_object == NULL) {
+            return NULL;  /* nothing owned yet, so nothing to release */
+        }
+    } else {
+        Py_INCREF(Py_None);  /* own a reference so the shared Py_DECREF below is balanced */
+    }
+
+    PyObject *stats = NULL;
+    stats = Py_BuildValue(
+        "{s:K,s:I,s:I,s:d,s:O,s:K,s:d,s:I}",  /* "O" takes its own reference to the object */
+        "frames_processed", frames_processed,
+        "underruns", underruns,
+        "overruns", overruns,
+        "estimated_latency", estimated_latency,
+        "hardware_latency", hardware_latency_object,
+        "queued_frames", queued_frames,
+        "queued_latency", queued_latency,
+        "buffer_size", buffer_size
+    );
+    Py_DECREF(hardware_latency_object);  /* drop our reference whether or not the dict was built */
+    return stats;  /* NULL here propagates the Py_BuildValue failure */
+}
+
+static PyObject *tachy_build_stream_stats_without_hardware_latency(
     unsigned long long frames_processed,
     unsigned int underruns,
     unsigned int overruns,
@@ -58,11 +96,12 @@ static PyObject *tachy_build_stream_stats(
     unsigned int buffer_size)
 {
     return Py_BuildValue(
-        "{s:K,s:I,s:I,s:d,s:K,s:d,s:I}",
+        "{s:K,s:I,s:I,s:d,s:O,s:K,s:d,s:I}",
         "frames_processed", frames_processed,
         "underruns", underruns,
         "overruns", overruns,
         "estimated_latency", queued_latency,
+        "hardware_latency", Py_None,
         "queued_frames", queued_frames,
         "queued_latency", queued_latency,
         "buffer_size", buffer_size
@@ -95,6 +134,7 @@ typedef struct {
     UInt32 pending_buffers;
     UInt32 underruns;
     UInt32 overruns;
+    double hardware_latency;
     uint8_t *ring;
     size_t ring_capacity;
     size_t ring_read;
@@ -119,6 +159,7 @@ typedef struct {
     UInt64 frames_processed;
     UInt32 underruns;
     UInt32 overruns;
+    double hardware_latency;
     uint8_t *ring;
     size_t ring_capacity;
     size_t ring_read;
@@ -132,6 +173,13 @@ typedef struct {
 
 static PyTypeObject TachyInputStreamType;
 
+static AudioObjectID tachy_get_default_device(AudioObjectPropertySelector selector);
+static AudioObjectID tachy_find_device_by_uid(const char *device_uid);
+static double tachy_get_coreaudio_hardware_latency(
+    AudioObjectID device_id,
+    AudioObjectPropertyScope scope,
+    double sample_rate);
+
 static void tachy_ring_copy_in(TachyOutputStream *stream, const uint8_t *source, size_t byte_count)
 {
     tachy_ring_copy_in_raw(
@@ -332,6 +380,7 @@ static PyObject *tachy_output_new(PyTypeObject *type, PyObject *args, PyObject *
     self->pending_buffers = 0;
     self->underruns = 0;
     self->overruns = 0;
+    self->hardware_latency = 0.0;
     self->ring = NULL;
     self->ring_capacity = (size_t)sample_rate * TACHY_RING_SECONDS * self->bytes_per_frame;
     if (self->ring_capacity < (size_t)self->buffer_byte_size * TACHY_OUTPUT_BUFFER_COUNT * 2) {
@@ -395,6 +444,14 @@ static PyObject *tachy_output_new(PyTypeObject *type, PyObject *args, PyObject *
         return NULL;
     }
 
+    AudioObjectID output_device = device_id == NULL || device_id[0] == '\0'
+        ? tachy_get_default_device(kAudioHardwarePropertyDefaultOutputDevice)
+        : tachy_find_device_by_uid(device_id);
+    self->hardware_latency = tachy_get_coreaudio_hardware_latency(
+        output_device,
+        kAudioDevicePropertyScopeOutput,
+        (double)sample_rate);
+
     for (UInt32 index = 0; index < TACHY_OUTPUT_BUFFER_COUNT; index++) {
         status = AudioQueueAllocateBuffer(self->queue, self->buffer_byte_size, &self->buffers[index]);
         if (status != noErr || self->buffers[index] == NULL) {
@@ -635,6 +692,8 @@ static PyObject *tachy_output_stats(TachyOutputStream *self, PyObject *Py_UNUSED
         overruns,
         queued_frames,
         queued_latency,
+        self->hardware_latency,
+        1,
         buffer_size);
 }
 
@@ -757,6 +816,7 @@ static PyObject *tachy_input_new(PyTypeObject *type, PyObject *args, PyObject *k
     self->frames_processed = 0;
     self->underruns = 0;
     self->overruns = 0;
+    self->hardware_latency = 0.0;
     self->ring = NULL;
     self->ring_capacity = (size_t)sample_rate * TACHY_RING_SECONDS * self->bytes_per_frame;
     if (self->ring_capacity < (size_t)self->buffer_byte_size * TACHY_INPUT_BUFFER_COUNT * 2) {
@@ -819,6 +879,14 @@ static PyObject *tachy_input_new(PyTypeObject *type, PyObject *args, PyObject *k
         return NULL;
     }
 
+    AudioObjectID input_device = device_id == NULL || device_id[0] == '\0'
+        ? tachy_get_default_device(kAudioHardwarePropertyDefaultInputDevice)
+        : tachy_find_device_by_uid(device_id);
+    self->hardware_latency = tachy_get_coreaudio_hardware_latency(
+        input_device,
+        kAudioDevicePropertyScopeInput,
+        (double)sample_rate);
+
     for (UInt32 index = 0; index < TACHY_INPUT_BUFFER_COUNT; index++) {
         status = AudioQueueAllocateBuffer(self->queue, self->buffer_byte_size, &self->buffers[index]);
         if (status != noErr || self->buffers[index] == NULL) {
@@ -1000,6 +1068,8 @@ static PyObject *tachy_input_stats(TachyInputStream *self, PyObject *Py_UNUSED(i
         overruns,
         queued_frames,
         queued_latency,
+        self->hardware_latency,
+        1,
         buffer_size);
 }
 
@@ -1115,6 +1185,96 @@ static AudioObjectID tachy_get_default_device(AudioObjectPropertySelector select
     return device_id;
 }
 
+static AudioObjectID tachy_find_device_by_uid(const char *device_uid)
+{  /* Return the ID of the device whose UID string equals device_uid, else kAudioObjectUnknown. */
+    if (device_uid == NULL || device_uid[0] == '\0') {  /* no UID given: nothing to look up */
+        return kAudioObjectUnknown;
+    }
+
+    AudioObjectPropertyAddress address = {
+        kAudioHardwarePropertyDevices,
+        kAudioObjectPropertyScopeGlobal,
+        kAudioObjectPropertyElementMain
+    };
+    UInt32 size = 0;  /* byte size of the device-ID array, filled in by the query below */
+    OSStatus status = AudioObjectGetPropertyDataSize(kAudioObjectSystemObject, &address, 0, NULL, &size);
+
+    if (status != noErr || size == 0) {  /* query failed or no devices listed */
+        return kAudioObjectUnknown;
+    }
+
+    AudioObjectID *device_ids = (AudioObjectID *)PyMem_RawMalloc(size);
+    if (device_ids == NULL) {  /* allocation failure is reported as "not found", no exception is set */
+        return kAudioObjectUnknown;
+    }
+
+    status = AudioObjectGetPropertyData(kAudioObjectSystemObject, &address, 0, NULL, &size, device_ids);  /* size is in/out */
+    if (status != noErr) {
+        PyMem_RawFree(device_ids);
+        return kAudioObjectUnknown;
+    }
+
+    AudioObjectID found_device = kAudioObjectUnknown;
+    UInt32 device_count = size / sizeof(AudioObjectID);  /* uses the size as updated by the read above */
+    for (UInt32 index = 0; index < device_count; index++) {
+        char uid[256] = {0};  /* zero-filled scratch buffer for one device's UID */
+        if (tachy_get_cf_string(device_ids[index], kAudioDevicePropertyDeviceUID, uid, sizeof(uid)) &&  /* presumably nonzero on success; confirm */
+                strcmp(uid, device_uid) == 0) {
+            found_device = device_ids[index];  /* first exact match wins */
+            break;
+        }
+    }
+
+    PyMem_RawFree(device_ids);
+    return found_device;  /* still kAudioObjectUnknown when no UID matched */
+}
+
+static UInt32 tachy_get_coreaudio_uint32_property(
+    AudioObjectID device_id,
+    AudioObjectPropertySelector selector,
+    AudioObjectPropertyScope scope)
+{  /* Read one UInt32 property of device_id; returns 0 if the device is unknown or the read fails. */
+    if (device_id == kAudioObjectUnknown) {
+        return 0;  /* 0 is also a possible real value, so callers cannot detect this case */
+    }
+
+    AudioObjectPropertyAddress address = {
+        selector,
+        scope,
+        kAudioObjectPropertyElementMain
+    };
+    UInt32 value = 0;
+    UInt32 size = sizeof(value);  /* in/out byte count for the read below */
+    OSStatus status = AudioObjectGetPropertyData(device_id, &address, 0, NULL, &size, &value);
+
+    if (status != noErr) {
+        return 0;  /* failed read is reported as 0 rather than as an error */
+    }
+
+    return value;
+}
+
+static double tachy_get_coreaudio_hardware_latency(
+    AudioObjectID device_id,
+    AudioObjectPropertyScope scope,
+    double sample_rate)
+{  /* Return (device latency + safety offset) for scope, converted from frames to seconds. */
+    if (sample_rate <= 0.0) {  /* guard the division below against a zero or negative rate */
+        return 0.0;
+    }
+
+    UInt32 latency_frames = tachy_get_coreaudio_uint32_property(  /* 0 when the property cannot be read */
+        device_id,
+        kAudioDevicePropertyLatency,
+        scope);
+    UInt32 safety_offset_frames = tachy_get_coreaudio_uint32_property(  /* 0 when the property cannot be read */
+        device_id,
+        kAudioDevicePropertySafetyOffset,
+        scope);
+
+    return ((double)latency_frames + (double)safety_offset_frames) / sample_rate;  /* add as double so the UInt32 sum cannot wrap */
+}
+
 static int tachy_append_device(PyObject *devices, AudioObjectID device_id, const char *kind, UInt32 channels, int is_default)
 {
     char uid[256] = {0};
@@ -1688,7 +1848,7 @@ static PyObject *tachy_output_stats(TachyOutputStream *self, PyObject *Py_UNUSED
     ma_uint32 buffer_size = self->buffer_frames;
     pthread_mutex_unlock(&self->lock);
 
-    return tachy_build_stream_stats(
+    return tachy_build_stream_stats_without_hardware_latency(
         frames_processed,
         underruns,
         overruns,
@@ -2085,7 +2245,7 @@ static PyObject *tachy_input_stats(TachyInputStream *self, PyObject *Py_UNUSED(i
     ma_uint32 buffer_size = self->buffer_frames;
     pthread_mutex_unlock(&self->lock);
 
-    return tachy_build_stream_stats(
+    return tachy_build_stream_stats_without_hardware_latency(
         frames_processed,
         underruns,
         overruns,
diff --git a/src/tachyaudio/_native_backend.py b/src/tachyaudio/_native_backend.py
index 944c191..6e80b91 100644
--- a/src/tachyaudio/_native_backend.py
+++ b/src/tachyaudio/_native_backend.py
@@ -45,6 +45,7 @@ def stats(self) -> object:
             underruns=item["underruns"],
             overruns=item["overruns"],
             estimated_latency=item["estimated_latency"],
+            hardware_latency=item["hardware_latency"],
             queued_frames=item["queued_frames"],
             queued_latency=item["queued_latency"],
             buffer_size=item["buffer_size"],
@@ -87,6 +88,7 @@ def stats(self) -> object:
             underruns=item["underruns"],
             overruns=item["overruns"],
             estimated_latency=item["estimated_latency"],
+            hardware_latency=item["hardware_latency"],
             queued_frames=item["queued_frames"],
             queued_latency=item["queued_latency"],
             buffer_size=item["buffer_size"],
diff --git a/src/tachyaudio/_stream.py b/src/tachyaudio/_stream.py
index 2c60e38..2dcf876 100644
--- a/src/tachyaudio/_stream.py
+++ b/src/tachyaudio/_stream.py
@@ -41,6 +41,7 @@ class StreamStats:
     underruns: int = 0
     overruns: int = 0
     estimated_latency: float | None = None
+    hardware_latency: float | None = None
     queued_frames: int = 0
     queued_latency: float = 0.0
     buffer_size: int | None = None
@@ -54,6 +55,8 @@ def __post_init__(self) -> None:
             raise ValueError("overruns cannot be negative")
         if self.estimated_latency is not None and self.estimated_latency < 0:
             raise ValueError("estimated_latency cannot be negative")
+        if self.hardware_latency is not None and self.hardware_latency < 0:
+            raise ValueError("hardware_latency cannot be negative")
         if self.queued_frames < 0:
             raise ValueError("queued_frames cannot be negative")
         if self.queued_latency < 0:
diff --git a/tests/test_public_api.py b/tests/test_public_api.py
index b27a3df..dec46f4 100644
--- a/tests/test_public_api.py
+++ b/tests/test_public_api.py
@@ -252,11 +252,19 @@ def test_stream_stats_preserves_zero_latency(self) -> None:
         self.assertEqual(stats.estimated_latency, 0.0)
 
     def test_stream_stats_validates_richer_fields(self) -> None:
-        stats = ta.StreamStats(queued_frames=12, queued_latency=0.25, buffer_size=256)
+        stats = ta.StreamStats(
+            hardware_latency=0.01,
+            queued_frames=12,
+            queued_latency=0.25,
+            buffer_size=256,
+        )
+        self.assertEqual(stats.hardware_latency, 0.01)
         self.assertEqual(stats.queued_frames, 12)
         self.assertEqual(stats.queued_latency, 0.25)
         self.assertEqual(stats.buffer_size, 256)
 
+        with self.assertRaises(ValueError):
+            ta.StreamStats(hardware_latency=-0.1)
         with self.assertRaises(ValueError):
             ta.StreamStats(queued_frames=-1)
         with self.assertRaises(ValueError):