From 8ac655b5a291810403c484c2cb588358734ee5ea Mon Sep 17 00:00:00 2001
From: hyperpolymath <6759885+hyperpolymath@users.noreply.github.com>
Date: Wed, 20 May 2026 12:06:46 +0100
Subject: [PATCH 1/2] test(#62): boot-readiness probe + mid-run singleton-death
 watcher
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bucket B from #62 is mid-run singleton instability ("(EXIT) no process
... GenServer.call(Burble.Chat.MessageStore, ...)"), not boot-readiness.
The earlier version of this PR added only a boot probe, which was a
misdiagnosis (verified harmless but ineffective against the cascade).

This commit keeps the boot-readiness probe (deterministic precondition;
detects any genuine boot-time regression) and adds a `SingletonWatcher`
GenServer that:

1. Process.monitors every app-owned singleton named in the probe list
   (PubSub, Presence, MessageStore, all the timing/groove/transport
   children, Endpoint, ...);
2. Records each :DOWN with name, old pid, reason, and ms-since-watch-
   start;
3. Re-monitors after a death (the application supervisor restarts these
   :one_for_one), so a flapping singleton produces one record per flap;
4. Is frozen by `ExUnit.after_suite/1` *before* the BEAM begins
   application shutdown, so the normal teardown :DOWN cascade is not
   misreported as instability. A belt-and-braces reason-filter rejects
   :shutdown / :normal entries in case any slipped in.

At suite end, if any non-shutdown deaths were recorded, a clearly
delimited block is printed to stderr listing each death. The block is
advisory (does not fail the run) so the instrumentation can land
without changing CI exit semantics until the owner is ready to enforce.

If no deaths are recorded — as on a local OTP25 run (75 failures, no
deaths in stderr) — nothing is printed and the helper is invisible.

Replaces the boot-probe-only previous revision of this PR (the boot
probe is retained; the watcher is the new bucket B instrumentation).

Refs #62.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 server/test/test_helper.exs | 201 +++++++++++++++++++++++++++++++++++-
 1 file changed, 200 insertions(+), 1 deletion(-)

diff --git a/server/test/test_helper.exs b/server/test/test_helper.exs
index f0deb96..1271a0e 100644
--- a/server/test/test_helper.exs
+++ b/server/test/test_helper.exs
@@ -1,3 +1,202 @@
 # SPDX-License-Identifier: PMPL-1.0-or-later
 ExUnit.start()
-Application.ensure_all_started(:burble)
+
+{:ok, _apps} = Application.ensure_all_started(:burble)
+
+# Singletons whose mid-run liveness this helper watches. Order is irrelevant.
+required = [
+  Burble.PubSub,
+  Burble.Presence,
+  Burble.RoomRegistry,
+  Burble.RoomSupervisor,
+  Burble.PeerRegistry,
+  Burble.PeerSupervisor,
+  Burble.CoprocessorRegistry,
+  Burble.CoprocessorSupervisor,
+  Burble.Chat.MessageStore,
+  Burble.Text.NNTPSBackend,
+  Burble.Media.Engine,
+  Burble.Timing.PTP,
+  Burble.Timing.ClockCorrelator,
+  Burble.Timing.Alignment,
+  Burble.Groove,
+  Burble.Groove.HealthMesh,
+  Burble.Groove.Feedback,
+  Burble.Transport.RTSP,
+  Burble.Bolt.Listener,
+  BurbleWeb.Endpoint
+]
+
+# Boot-readiness probe: wait for each child to register its name and respond
+# to `:sys.get_state` so `init/1` is complete and the callback mailbox is
+# drained before any test runs. Deterministic precondition for the mid-run
+# watcher below.
+deadline = System.monotonic_time(:millisecond) + 5_000
+
+await_registered = fn name ->
+  Stream.repeatedly(fn ->
+    case Process.whereis(name) do
+      nil ->
+        if System.monotonic_time(:millisecond) < deadline do
+          Process.sleep(5)
+          :retry
+        else
+          :timeout
+        end
+
+      pid when is_pid(pid) ->
+        {:ok, pid}
+    end
+  end)
+  |> Enum.find(fn
+    :retry -> false
+    _ -> true
+  end)
+end
+
+Enum.each(required, fn name ->
+  case await_registered.(name) do
+    {:ok, pid} ->
+      try do
+        :sys.get_state(pid, 1_000)
+      catch
+        :exit, reason ->
+          raise "Burble test boot: #{inspect(name)} init not stable: #{inspect(reason)}"
+      end
+
+    :timeout ->
+      raise "Burble test boot: #{inspect(name)} did not register within 5s"
+  end
+end)
+
+# Mid-run singleton-death watcher (#62 bucket B instrumentation).
+#
+# The earlier diagnosis in #62 was that app-owned singletons are
+# intermittently dead mid-run, causing the "(EXIT) no process" cascade
+# despite the application booting cleanly. A static boot probe cannot
+# detect that — it only verifies the start instant. This watcher monitors
+# every singleton above and records each death + reason + timestamp. The
+# watcher re-monitors after each death (the application supervisor
+# normally restarts these as :one_for_one children), so a flapping
+# singleton produces one death record per flap.
+#
+# At suite end we emit a stderr block listing all deaths. The block is
+# advisory (does not fail the run) so it can be deployed without
+# changing CI exit semantics until the owner is ready to enforce.
+defmodule Burble.TestSupport.SingletonWatcher do
+  use GenServer
+
+  def start_link(names), do: GenServer.start_link(__MODULE__, names, name: __MODULE__)
+
+  def freeze, do: GenServer.call(__MODULE__, :freeze, 5_000)
+
+  @impl true
+  def init(names) do
+    start_ms = System.monotonic_time(:millisecond)
+
+    refs =
+      Enum.reduce(names, %{}, fn name, acc ->
+        case Process.whereis(name) do
+          pid when is_pid(pid) -> Map.put(acc, Process.monitor(pid), name)
+          nil -> raise "SingletonWatcher: #{inspect(name)} not running at watch start"
+        end
+      end)
+
+    {:ok, %{refs: refs, deaths: [], start_ms: start_ms, frozen?: false}}
+  end
+
+  # Snapshot the death list and stop accepting new deaths. Called from
+  # ExUnit.after_suite so that the subsequent application-shutdown :DOWN
+  # cascade is not recorded as mid-run instability.
+  @impl true
+  def handle_call(:freeze, _from, state) do
+    {:reply, Enum.reverse(state.deaths), %{state | frozen?: true}}
+  end
+
+  @impl true
+  def handle_info({:DOWN, _ref, :process, _pid, _reason}, %{frozen?: true} = state) do
+    {:noreply, state}
+  end
+
+  def handle_info({:DOWN, ref, :process, pid, reason}, state) do
+    case Map.fetch(state.refs, ref) do
+      {:ok, name} ->
+        death = %{
+          name: name,
+          pid: pid,
+          reason: reason,
+          at_ms: System.monotonic_time(:millisecond) - state.start_ms
+        }
+
+        new_refs =
+          case Process.whereis(name) do
+            new_pid when is_pid(new_pid) and new_pid != pid ->
+              state.refs
+              |> Map.delete(ref)
+              |> Map.put(Process.monitor(new_pid), name)
+
+            _ ->
+              Process.send_after(self(), {:rewatch, name}, 50)
+              Map.delete(state.refs, ref)
+          end
+
+        {:noreply, %{state | refs: new_refs, deaths: [death | state.deaths]}}
+
+      :error ->
+        {:noreply, state}
+    end
+  end
+
+  @impl true
+  def handle_info({:rewatch, name}, %{frozen?: true} = state), do: {:noreply, state}
+
+  def handle_info({:rewatch, name}, state) do
+    new_refs =
+      case Process.whereis(name) do
+        pid when is_pid(pid) -> Map.put(state.refs, Process.monitor(pid), name)
+        nil -> state.refs
+      end
+
+    {:noreply, %{state | refs: new_refs}}
+  end
+end
+
+{:ok, _watcher} = Burble.TestSupport.SingletonWatcher.start_link(required)
+
+ExUnit.after_suite(fn _result ->
+  # Freeze the watcher before the BEAM begins application shutdown so the
+  # subsequent normal-shutdown :DOWN cascade is not mistaken for instability.
+  deaths =
+    Burble.TestSupport.SingletonWatcher.freeze()
+    # Belt-and-braces: filter clean shutdowns even if any slipped in.
+    |> Enum.reject(fn d -> d.reason in [:shutdown, :normal, {:shutdown, :normal}] end)
+
+  unless deaths == [] do
+    IO.puts(
+      :stderr,
+      "\n" <>
+        "===========================================================================\n" <>
+        "  burble#62 — App-owned singleton deaths recorded during test run\n" <>
+        "===========================================================================\n"
+    )
+
+    Enum.with_index(deaths, 1)
+    |> Enum.each(fn {d, i} ->
+      IO.puts(
+        :stderr,
+        "  #{i}. #{inspect(d.name)} died at +#{d.at_ms}ms " <>
+          "pid=#{inspect(d.pid)} reason=#{inspect(d.reason)}"
+      )
+    end)
+
+    IO.puts(
+      :stderr,
+      "\n  #{length(deaths)} mid-run singleton death(s) observed. This is\n" <>
+        "  bucket B from #62. Correlate with --seed-fixed test order to\n" <>
+        "  identify the offending test interactions.\n" <>
+        "==========================================================================="
+    )
+  end
+
+  :ok
+end)

From f57d759de9ea891cdd8a8feedb6662d78bc48a85 Mon Sep 17 00:00:00 2001
From: hyperpolymath <6759885+hyperpolymath@users.noreply.github.com>
Date: Wed, 20 May 2026 13:09:49 +0100
Subject: [PATCH 2/2] =?UTF-8?q?fix(#62):=20bucket=20B=20mid-run=20singleto?=
 =?UTF-8?q?n=20death=20=E2=80=94=20Media.Engine=20+=20HealthMesh=20root=20?=
 =?UTF-8?q?causes?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The PR-C SingletonWatcher (#73) caught the first concrete bucket B
signal on OTP27 CI: `Burble.Media.Engine` died at +8.2s with reason
`{:shutdown, {GenServer, :call, [Burble.PeerSupervisor, {:start_child,
...}, :infinity]}}`. Local re-run after the first fix surfaced a
second cause in `Burble.Groove.HealthMesh`. Both fixed here:

## A. Burble.Media.Peer / Pipeline → restart: :temporary

Both were `use GenServer` (default `:permanent`). A peer's WebRTC
session is tied 1:1 to a Phoenix channel; once the channel is gone
or the underlying ExWebRTC state fails to initialise, restarting the
GenServer without the client context is pointless. The bug it caused:
under bursts of failing peer starts (tests that exercise add_peer
without a real channel), DynamicSupervisor max_restarts intensity
(default 3 in 5s) was exceeded → `Burble.PeerSupervisor` itself
exited `:shutdown` → `Burble.Media.Engine`'s in-flight
`GenServer.call(PeerSupervisor, ...)` propagated that exit → engine
died → every later test touching `Media.Engine` cascaded with "no
process".

`Burble.Coprocessor.Pipeline` gets the same treatment for the same
reason (it's also per-peer, per-session, and called from Engine's
add_peer handler via `DynamicSupervisor.start_child` on
`Burble.CoprocessorSupervisor`).

## B. Burble.Media.Engine.handle_call({:add_peer, ...}) — defensive

Defence in depth: wrap both `DynamicSupervisor.start_child` calls in
`try / catch :exit` so a sibling supervisor's death cannot topple the
Engine. With the `:temporary` fix above, max_restarts is no longer
hit in normal operation; this catch is for the rare case where the
supervisor itself goes down (e.g. heartbeat termination during
shutdown) so the Engine logs and continues rather than crashing.

## C. Burble.Media.Peer — guard `send(channel_pid, ...)` against nil

A second-order cause surfaced after fix A landed locally:
`Burble.Media.Peer.send_offer/1` (lib/burble/media/peer.ex:413) and
the ICE-candidate handle_info clause both did
`send(state.channel_pid, ...)` unconditionally. Tests pass
`channel_pid: nil` to exercise Media.Engine without a Phoenix
channel; `:erlang.send(nil, _)` raises `:badarg`. That raise
propagated as the Peer's exit reason; when Media.Engine then called
`GenServer.stop(peer, :normal)` to remove the peer, the stop exit
carried that `:badarg`, killing the Engine.

Both sites now guarded with `if is_pid(state.channel_pid)`.

## D. Burble.Groove.HealthMesh.do_probe/1 — `Map.get(v, :port)`

Third-order cause, also caught by the watcher: HealthMesh's probe
loop at line 172 did `Enum.find(state.peers, fn {_k, v} -> v.port
== port end)`. Peers added via `report_peer_status/3` carry no
`:port` key (they're identified by `service_id` only), so the find
raised `:badkey`. Switched to `Map.get(v, :port)` which returns nil
for the keyless peers and skips them — semantically correct, since
those peers aren't reachable via port lookup anyway.

## Verification

Local full suite (OTP25): the previous run reported 5 mid-run
singleton deaths via the watcher block (Media.Engine ×1,
HealthMesh ×2, secondary cascades ×2). After this fix the full
suite (`mix test`) prints **no singleton-death block at all** — every
named singleton stayed alive through the whole 707 tests. Local
failure count is in the documented 75-134 noise band and not a
reliable progress signal; the absence of mid-run deaths is.

OTP27 CI will be the authoritative measurement. Ships as a separate
PR from the test-helper instrumentation (#73) because these are
app-code fixes that should be reviewable independently.

Refs #62.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 server/lib/burble/coprocessor/pipeline.ex |  2 +-
 server/lib/burble/groove/health_mesh.ex   |  5 +++-
 server/lib/burble/media/engine.ex         | 36 ++++++++++++++++++-----
 server/lib/burble/media/peer.ex           | 17 +++++++----
 4 files changed, 45 insertions(+), 15 deletions(-)

diff --git a/server/lib/burble/coprocessor/pipeline.ex b/server/lib/burble/coprocessor/pipeline.ex
index 40e0145..109e0af 100644
--- a/server/lib/burble/coprocessor/pipeline.ex
+++ b/server/lib/burble/coprocessor/pipeline.ex
@@ -40,7 +40,7 @@ defmodule Burble.Coprocessor.Pipeline do
       {:ok, pcm_samples} = Pipeline.process_inbound(pid, opus_frame)
   """
 
-  use GenServer
+  use GenServer, restart: :temporary
   require Logger
 
   alias Burble.Coprocessor.SmartBackend, as: Backend
diff --git a/server/lib/burble/groove/health_mesh.ex b/server/lib/burble/groove/health_mesh.ex
index 547ab0e..46e5322 100644
--- a/server/lib/burble/groove/health_mesh.ex
+++ b/server/lib/burble/groove/health_mesh.ex
@@ -168,8 +168,11 @@ defmodule Burble.Groove.HealthMesh do
 
         {:ok, {port, {:error, _reason}}}, acc ->
           # Check if we had this peer before — mark as degraded, then down.
+          # Peers added via report_peer_status/3 don't carry a :port key
+          # (they're identified by service_id only); use Map.get/2 so the
+          # find returns nothing for those rather than raising :badkey.
           existing =
-            Enum.find(state.peers, fn {_k, v} -> v.port == port end)
+            Enum.find(state.peers, fn {_k, v} -> Map.get(v, :port) == port end)
 
           case existing do
             {key, prev} when prev.status == :up ->
diff --git a/server/lib/burble/media/engine.ex b/server/lib/burble/media/engine.ex
index b279792..d135b24 100644
--- a/server/lib/burble/media/engine.ex
+++ b/server/lib/burble/media/engine.ex
@@ -212,10 +212,23 @@ defmodule Burble.Media.Engine do
           ice_servers: ice_servers_for_mode(session.privacy_mode)
         ]
 
-        case DynamicSupervisor.start_child(
-               Burble.PeerSupervisor,
-               {Burble.Media.Peer, peer_opts}
-             ) do
+        # Defensive: a DynamicSupervisor that has hit max_restarts intensity
+        # exits :shutdown, which would propagate out of GenServer.call and
+        # bring Burble.Media.Engine down. Catch :exit here so a sibling
+        # supervisor's death cannot topple the engine. The session state is
+        # still updated below so callers see consistent {:ok, offer} return
+        # values whether or not the Peer process started.
+        peer_start_result =
+          try do
+            DynamicSupervisor.start_child(
+              Burble.PeerSupervisor,
+              {Burble.Media.Peer, peer_opts}
+            )
+          catch
+            :exit, reason -> {:error, {:peer_supervisor_unavailable, reason}}
+          end
+
+        case peer_start_result do
           {:ok, _pid} ->
             Logger.info("[Media] Peer process started for #{peer_id}")
 
@@ -243,10 +256,17 @@ defmodule Burble.Media.Engine do
           }
         ]
 
-        case DynamicSupervisor.start_child(
-               Burble.CoprocessorSupervisor,
-               {Burble.Coprocessor.Pipeline, pipeline_opts}
-             ) do
+        pipeline_start_result =
+          try do
+            DynamicSupervisor.start_child(
+              Burble.CoprocessorSupervisor,
+              {Burble.Coprocessor.Pipeline, pipeline_opts}
+            )
+          catch
+            :exit, reason -> {:error, {:coprocessor_supervisor_unavailable, reason}}
+          end
+
+        case pipeline_start_result do
           {:ok, _pid} ->
             Logger.info("[Media] Pipeline started for peer #{peer_id}")
 
diff --git a/server/lib/burble/media/peer.ex b/server/lib/burble/media/peer.ex
index d29755c..87e58eb 100644
--- a/server/lib/burble/media/peer.ex
+++ b/server/lib/burble/media/peer.ex
@@ -28,7 +28,7 @@ defmodule Burble.Media.Peer do
   RTP packets from each peer are forwarded to all other peers' sendonly tracks.
   """
 
-  use GenServer
+  use GenServer, restart: :temporary
   require Logger
 
   alias ExWebRTC.{PeerConnection, MediaStreamTrack, RTPCodecParameters, SessionDescription, ICECandidate}
@@ -149,9 +149,12 @@ defmodule Burble.Media.Peer do
 
   @impl true
   def handle_info({:ex_webrtc, pc, {:ice_candidate, candidate}}, %{pc: pc} = state) do
-    # Forward ICE candidate to client via channel.
+    # Forward ICE candidate to client via channel. channel_pid may be nil in
+    # tests; skip in that case (see send_offer/1 comment).
     json = candidate |> ICECandidate.to_json() |> Jason.encode!()
-    send(state.channel_pid, {:peer_ice_candidate, json})
+    if is_pid(state.channel_pid) do
+      send(state.channel_pid, {:peer_ice_candidate, json})
+    end
     {:noreply, state}
   end
 
@@ -409,8 +412,12 @@ defmodule Burble.Media.Peer do
     {:ok, offer} = PeerConnection.create_offer(pc)
     :ok = PeerConnection.set_local_description(pc, offer)
 
-    # Send offer to client via channel.
-    send(state.channel_pid, {:peer_sdp_offer, offer.sdp})
+    # Send offer to client via channel. channel_pid may be nil in tests that
+    # exercise Media.Engine without a real Phoenix channel; skip silently in
+    # that case rather than raising :badarg from :erlang.send(nil, _).
+    if is_pid(state.channel_pid) do
+      send(state.channel_pid, {:peer_sdp_offer, offer.sdp})
+    end
 
     %{state | negotiating: true}
   end