From af17dd70a9508823b79c5756ad66799fa35c8328 Mon Sep 17 00:00:00 2001 From: hyperpolymath <6759885+hyperpolymath@users.noreply.github.com> Date: Wed, 20 May 2026 13:41:53 +0100 Subject: [PATCH] docs(#62): record bucket A/B/C/D root-cause cure + SingletonWatcher MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CHANGELOG: Unreleased ### Fixed entry covering all four buckets and the ship vehicle (PRs #70/#71/#73/#76); Unreleased ### Added entry for the SingletonWatcher diagnostic that landed in test_helper.exs. STATE.a2ml: blockers-and-issues table flips test-isolation-62 to RESOLVED 2026-05-20; session-history gains a per-PR walkthrough of the fixes (Peer/Pipeline restart:temporary, Engine try/catch :exit, Peer is_pid guard, HealthMesh Map.get) with the post-PR-D local-OTP25 evidence (707 tests, 134 failures, no watcher block). Refs #62 (issue closed via UI after merge — burble OTP27 CI deferred on this commit pending estate concurrency-pool drain). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .machine_readable/6a2/STATE.a2ml | 53 +++++++++++++++++++++++++++++++- CHANGELOG.md | 5 ++- 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/.machine_readable/6a2/STATE.a2ml b/.machine_readable/6a2/STATE.a2ml index e93b241d..7bbf83d1 100644 --- a/.machine_readable/6a2/STATE.a2ml +++ b/.machine_readable/6a2/STATE.a2ml @@ -6,7 +6,7 @@ [metadata] project = "burble" version = "1.2.0-pre" -last-updated = "2026-05-13" +last-updated = "2026-05-20" status = "active" [project-context] @@ -44,6 +44,9 @@ doc-reality-drift = [ "RESOLVED 2026-05-19: Formal-proof claim scoped to compile/type-check only; runtime enforcement is roadmap per ADR-0008 (Option C, PoC #55) + ADR-0007 claim discipline", "RESOLVED 2026-05-19: README comparison table + Status note state PTP <1us needs a PTP NIC (NTP ~1ms fallback), hardware unvalidated, per ADR-0007" ] +test-isolation-62 = [ + "RESOLVED 2026-05-20: #62 (Earn-core/#39 split-out) fully closed at the root via PRs #70/#71/#73/#76. Bucket A (port-bind races on RTSP/PTP/phc2sys) + Bucket C (Task.shutdown cross-pid ownership) cured by app-owned-GenServer unique-name pattern (#71). Bucket B (mid-run singleton death cascade) cured by Media.Peer + Coprocessor.Pipeline restart:temporary + Media.Engine try/catch :exit + Peer is_pid guard on channel_pid + HealthMesh Map.get(:port) (#76). Bucket D (SNIFBackend Logger.require + def-promotions + RoomChannel catch-all) fixed (#70). SingletonWatcher diagnostic landed in test_helper.exs (#73) — advisory, runs every suite, freezes pre-shutdown." +] resolved-2026-04-16 = [ "Opus naming/contract drift: Backend.audio_encode/4 + audio_decode/3 docstrings rewritten to state explicitly that they are PCM frame pack/unpack, NOT Opus. Added explicit Backend.opus_transcode/4 callback returning {:error, :not_implemented} on every backend (ElixirBackend, ZigBackend, SmartBackend, SNIFBackend). Added opus_available?/0 callback (always false). 
Pinned by opus_contract_test.exs." ] @@ -161,6 +164,54 @@ open-failures = 0 # total non-deterministic (99..165). Cannot be # authoritatively measured/validated without CI. #39 now # HARD-BLOCKED on the stuck GitHub Actions queue (user). +# 2026-05-20: #62 fully closed at the root in 4 PRs. +# #70 (Bucket D — genuine bugs): SNIFBackend `require +# Logger` + `parse_fft_result`/`prepare_ifft_input`/ +# `parse_ifft_result` promoted defp→def (callable from +# tests + sibling backends); RoomChannel catch-all +# `join/3` clause returns {:error, %{reason: +# "invalid_topic"}} for non-room: topics. +# #71 (Bucket A — port-bind races + Bucket C — task +# ownership): app-owned singletons that bind ports +# (Burble.Timing.PTP, Burble.Transport.RTSP, +# Burble.Timing.Phc2sys) now honour a :name opt on +# start_link/1 via {name, init_opts} = Keyword.pop; +# tests pass `name: :"_test_"` so async +# test runs don't collide on registered name nor on the +# :gen_tcp port. llm_test.exs Task.async + Task.shutdown +# replaced with spawn (cross-pid ownership exit +# eliminated). 4 RTSP SETUP-over-TCP failures left as +# pre-existing parser drift (out of scope, not #62). +# #73 (Bucket B — diagnostic): SingletonWatcher in +# test_helper.exs monitors 20 app-owned singletons, +# records death (name+pid+reason+ms), freezes via +# ExUnit.after_suite before BEAM shutdown so app- +# teardown :DOWN cascade is not mis-recorded; advisory, +# does not fail CI. Caught the Bucket B cascade before +# the cure landed. 
+# #76 (Bucket B — cure): Burble.Media.Peer + +# Burble.Coprocessor.Pipeline switched to +# `use GenServer, restart: :temporary` (test-induced +# peer crashes no longer drain the supervisor's +# max_restarts: 3-in-5s intensity); Burble.Media.Peer +# guards `send(state.channel_pid, _)` with `is_pid/1` +# (tests pass channel_pid: nil — the :badarg was the +# cascade trigger); Burble.Media.Engine wraps the two +# DynamicSupervisor.start_child calls in try/catch :exit +# so a dead PeerSupervisor/CoprocessorSupervisor doesn't +# propagate :shutdown into the Engine; Burble.Groove. +# HealthMesh switched `v.port` → `Map.get(v, :port)` +# (peers added via report_peer_status/3 lack :port, +# :badkey was a secondary independent cause). +# Local OTP25 evidence post-PR-D: 707 tests, 134 +# failures, NO singleton-death block emitted (every +# watched singleton stayed alive). 134 is inside the +# documented same-SHA noise band (15-38 on CI, wider +# locally with PR-B's test rewrites in the tree). +# Headline signal = absence of watcher block, not the +# count. OTP27 CI deferred — estate Elixir CI queue was +# exhausted at merge time (concurrency-pool throttle); +# will surface in next post-merge run on main. 
[crg] grade = "C" diff --git a/CHANGELOG.md b/CHANGELOG.md index 02a6a748..974084b7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added +- `Burble.TestSupport.SingletonWatcher` in `test/test_helper.exs` — `Process.monitor`s each of 20 app-owned singletons (PubSub, Presence, RoomRegistry/Supervisor, PeerRegistry/Supervisor, CoprocessorRegistry/Supervisor, MessageStore, NNTPSBackend, Media.Engine, Timing.{PTP,ClockCorrelator,Alignment}, Groove + HealthMesh + Feedback, Transport.RTSP, Bolt.Listener, Endpoint), reports any mid-run death (name + pid + reason + ms-since-start) to stderr at suite end, freezes via `ExUnit.after_suite/1` before BEAM shutdown so the normal app-teardown `:DOWN` cascade is not mistaken for instability. Diagnostic for #62 Bucket B; advisory (does not fail CI). + ### Changed - README/ROADMAP claims scoped to the shipped build per ADR-0007: QUIC & SNIF marked experimental (optional NIFs disabled by default), PTP <1µs flagged hardware-gated, Idris2 proofs flagged type-check-only (runtime enforcement = ADR-0008 Option C), latency/scale flagged unbenchmarked; added a README Status section. Closes the STATE.a2ml doc-reality-drift entries (issue #51) @@ -29,7 +32,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `docs/developer/wsl-mirrored-networking.adoc` rewritten — NAT + host forwarder is the recommended WSL2 Bolt path; mirrored networking demoted to last-resort (Win11 24H2/Insider `Wsl/Service/E_UNEXPECTED` instability) ### Fixed -- #39: Elixir test gate was dead because `Burble.Store` refused to start without VeriSimDB (first supervised child) — the whole supervision tree collapsed and every app-dependent test failed at boot. 
Added prod-safe, config-gated `offline_ok` (test env starts the Store in degraded offline mode); fixed 3 test-file compile errors (`snif_backend_test.exs` invalid index + nested `describe`, `room_property_test.exs` `check all` newline). Suite now compiles and boots (707 tests run); full green + gate re-arm still pending an authoritative CI run. +- #62: test-isolation cascade fully resolved at the root (Buckets A/B/C/D). **Bucket A** (port-binding races on RTSP 19554, PTP, phc2sys; ~40+ failures) and **C** (cross-pid `Task.shutdown` ownership) fixed by switching app-owned GenServers to unique-named instances per test (`PTP.start_link/1`, `RTSP.start_link/1`, `Phc2sys.start_link/1` now honour `:name` opt; tests pass `name: :"_test_"`). **Bucket B** (mid-run singleton death cascading "(EXIT) no process") root-caused via a `SingletonWatcher` GenServer in `test_helper.exs` that `Process.monitor`s each of 20 app-owned singletons and reports mid-run deaths to stderr; cured by (i) `Burble.Media.Peer` + `Burble.Coprocessor.Pipeline` flipped to `use GenServer, restart: :temporary` (so test-induced peer crashes don't drain the supervisor's `max_restarts: 3-in-5s` intensity and kill the supervisor), (ii) `Burble.Media.Peer` guarding `send(state.channel_pid, _)` with `is_pid/1` (tests pass `channel_pid: nil`, the `:badarg` was the cascade trigger), (iii) `Burble.Media.Engine` wrapping `DynamicSupervisor.start_child` in `try/catch :exit` so a dead supervisor's `:shutdown` doesn't propagate into the Engine, (iv) `Burble.Groove.HealthMesh` using `Map.get(v, :port)` instead of `v.port` (peers added via `report_peer_status/3` lack `:port`, the `:badkey` was an independent secondary cause). **Bucket D** (genuine code bugs: `SNIFBackend` missing `require Logger` + private parser functions, `RoomChannel.join/3` non-exhaustive clauses) fixed directly. Shipped as PRs #70 (D), #71 (A+C), #73 (B diagnostic), #76 (B cure). 
- SNIF: `Burble.Coprocessor.SNIFBackend` no longer emits a compile warning for the optional `Wasmex` runtime and no longer mis-fails when it is absent — `Wasmex` is referenced via `apply/3` and `available?/0` now gates on it loadable, so kernels degrade cleanly to `ZigBackend` (mirrors the `:quicer` pattern, ADR-0004) ### Removed