From 8ac655b5a291810403c484c2cb588358734ee5ea Mon Sep 17 00:00:00 2001 From: hyperpolymath <6759885+hyperpolymath@users.noreply.github.com> Date: Wed, 20 May 2026 12:06:46 +0100 Subject: [PATCH 1/2] test(#62): boot-readiness probe + mid-run singleton-death watcher MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bucket B from #62 is mid-run singleton instability ("(EXIT) no process ... GenServer.call(Burble.Chat.MessageStore, ...)"), not boot-readiness. The earlier version of this PR added only a boot probe, which was a misdiagnosis (verified harmless but ineffective against the cascade). This commit keeps the boot-readiness probe (deterministic precondition; detects any genuine boot-time regression) and adds a `SingletonWatcher` GenServer that: 1. Process.monitors every app-owned singleton named in the probe list (PubSub, Presence, MessageStore, all the timing/groove/transport children, Endpoint, ...); 2. Records each :DOWN with name, old pid, reason, and ms-since-watch- start; 3. Re-monitors after a death (the application supervisor restarts these :one_for_one), so a flapping singleton produces one record per flap; 4. Is frozen by `ExUnit.after_suite/1` *before* the BEAM begins application shutdown, so the normal teardown :DOWN cascade is not misreported as instability. A belt-and-braces reason-filter rejects :shutdown / :normal entries in case any slipped in. At suite end, if any non-shutdown deaths were recorded, a clearly delimited block is printed to stderr listing each death. The block is advisory (does not fail the run) so the instrumentation can land without changing CI exit semantics until the owner is ready to enforce. If no deaths are recorded — as on a local OTP25 run (75 failures, no deaths in stderr) — nothing is printed and the helper is invisible. Replaces the boot-probe-only previous revision of this PR (the boot probe is retained; the watcher is the new bucket B instrumentation). Refs #62. Co-Authored-By: Claude Opus 4.7 (1M context) --- server/test/test_helper.exs | 201 +++++++++++++++++++++++++++++++++++- 1 file changed, 200 insertions(+), 1 deletion(-) diff --git a/server/test/test_helper.exs b/server/test/test_helper.exs index f0deb96..1271a0e 100644 --- a/server/test/test_helper.exs +++ b/server/test/test_helper.exs @@ -1,3 +1,202 @@ # SPDX-License-Identifier: PMPL-1.0-or-later ExUnit.start() -Application.ensure_all_started(:burble) + +{:ok, _apps} = Application.ensure_all_started(:burble) + +# Singletons whose mid-run liveness this helper watches. Order is irrelevant. +required = [ + Burble.PubSub, + Burble.Presence, + Burble.RoomRegistry, + Burble.RoomSupervisor, + Burble.PeerRegistry, + Burble.PeerSupervisor, + Burble.CoprocessorRegistry, + Burble.CoprocessorSupervisor, + Burble.Chat.MessageStore, + Burble.Text.NNTPSBackend, + Burble.Media.Engine, + Burble.Timing.PTP, + Burble.Timing.ClockCorrelator, + Burble.Timing.Alignment, + Burble.Groove, + Burble.Groove.HealthMesh, + Burble.Groove.Feedback, + Burble.Transport.RTSP, + Burble.Bolt.Listener, + BurbleWeb.Endpoint +] + +# Boot-readiness probe: wait for each child to register its name and respond +# to `:sys.get_state` so `init/1` is complete and the callback mailbox is +# drained before any test runs. Deterministic precondition for the mid-run +# watcher below. +deadline = System.monotonic_time(:millisecond) + 5_000 + +await_registered = fn name -> + Stream.repeatedly(fn -> + case Process.whereis(name) do + nil -> + if System.monotonic_time(:millisecond) < deadline do + Process.sleep(5) + :retry + else + :timeout + end + + pid when is_pid(pid) -> + {:ok, pid} + end + end) + |> Enum.find(fn + :retry -> false + _ -> true + end) +end + +Enum.each(required, fn name -> + case await_registered.(name) do + {:ok, pid} -> + try do + :sys.get_state(pid, 1_000) + catch + :exit, reason -> + raise "Burble test boot: #{inspect(name)} init not stable: #{inspect(reason)}" + end + + :timeout -> + raise "Burble test boot: #{inspect(name)} did not register within 5s" + end +end) + +# Mid-run singleton-death watcher (#62 bucket B instrumentation). +# +# The earlier diagnosis in #62 was that app-owned singletons are +# intermittently dead mid-run, causing the "(EXIT) no process" cascade +# despite the application booting cleanly. A static boot probe cannot +# detect that — it only verifies the start instant. This watcher monitors +# every singleton above and records each death + reason + timestamp. The +# watcher re-monitors after each death (the application supervisor +# normally restarts these as :one_for_one children), so a flapping +# singleton produces one death record per flap. +# +# At suite end we emit a stderr block listing all deaths. The block is +# advisory (does not fail the run) so it can be deployed without +# changing CI exit semantics until the owner is ready to enforce. +defmodule Burble.TestSupport.SingletonWatcher do + use GenServer + + def start_link(names), do: GenServer.start_link(__MODULE__, names, name: __MODULE__) + + def freeze, do: GenServer.call(__MODULE__, :freeze, 5_000) + + @impl true + def init(names) do + start_ms = System.monotonic_time(:millisecond) + + refs = + Enum.reduce(names, %{}, fn name, acc -> + case Process.whereis(name) do + pid when is_pid(pid) -> Map.put(acc, Process.monitor(pid), name) + nil -> raise "SingletonWatcher: #{inspect(name)} not running at watch start" + end + end) + + {:ok, %{refs: refs, deaths: [], start_ms: start_ms, frozen?: false}} + end + + # Snapshot the death list and stop accepting new deaths. Called from + # ExUnit.after_suite so that the subsequent application-shutdown :DOWN + # cascade is not recorded as mid-run instability. + @impl true + def handle_call(:freeze, _from, state) do + {:reply, Enum.reverse(state.deaths), %{state | frozen?: true}} + end + + @impl true + def handle_info({:DOWN, _ref, :process, _pid, _reason}, %{frozen?: true} = state) do + {:noreply, state} + end + + def handle_info({:DOWN, ref, :process, pid, reason}, state) do + case Map.fetch(state.refs, ref) do + {:ok, name} -> + death = %{ + name: name, + pid: pid, + reason: reason, + at_ms: System.monotonic_time(:millisecond) - state.start_ms + } + + new_refs = + case Process.whereis(name) do + new_pid when is_pid(new_pid) and new_pid != pid -> + state.refs + |> Map.delete(ref) + |> Map.put(Process.monitor(new_pid), name) + + _ -> + Process.send_after(self(), {:rewatch, name}, 50) + Map.delete(state.refs, ref) + end + + {:noreply, %{state | refs: new_refs, deaths: [death | state.deaths]}} + + :error -> + {:noreply, state} + end + end + + @impl true + def handle_info({:rewatch, name}, %{frozen?: true} = state), do: {:noreply, state} + + def handle_info({:rewatch, name}, state) do + new_refs = + case Process.whereis(name) do + pid when is_pid(pid) -> Map.put(state.refs, Process.monitor(pid), name) + nil -> state.refs + end + + {:noreply, %{state | refs: new_refs}} + end +end + +{:ok, _watcher} = Burble.TestSupport.SingletonWatcher.start_link(required) + +ExUnit.after_suite(fn _result -> + # Freeze the watcher before the BEAM begins application shutdown so the + # subsequent normal-shutdown :DOWN cascade is not mistaken for instability. + deaths = + Burble.TestSupport.SingletonWatcher.freeze() + # Belt-and-braces: filter clean shutdowns even if any slipped in. + |> Enum.reject(fn d -> d.reason in [:shutdown, :normal, {:shutdown, :normal}] end) + + unless deaths == [] do + IO.puts( + :stderr, + "\n" <> + "===========================================================================\n" <> + " burble#62 — App-owned singleton deaths recorded during test run\n" <> + "===========================================================================\n" + ) + + Enum.with_index(deaths, 1) + |> Enum.each(fn {d, i} -> + IO.puts( + :stderr, + " #{i}. #{inspect(d.name)} died at +#{d.at_ms}ms " <> + "pid=#{inspect(d.pid)} reason=#{inspect(d.reason)}" + ) + end) + + IO.puts( + :stderr, + "\n #{length(deaths)} mid-run singleton death(s) observed. This is\n" <> + " bucket B from #62. Correlate with --seed-fixed test order to\n" <> + " identify the offending test interactions.\n" <> + "===========================================================================" + ) + end + + :ok +end) From f57d759de9ea891cdd8a8feedb6662d78bc48a85 Mon Sep 17 00:00:00 2001 From: hyperpolymath <6759885+hyperpolymath@users.noreply.github.com> Date: Wed, 20 May 2026 13:09:49 +0100 Subject: [PATCH 2/2] =?UTF-8?q?fix(#62):=20bucket=20B=20mid-run=20singleto?= =?UTF-8?q?n=20death=20=E2=80=94=20Media.Engine=20+=20HealthMesh=20root=20?= =?UTF-8?q?causes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PR-C SingletonWatcher (#73) caught the first concrete bucket B signal on OTP27 CI: `Burble.Media.Engine` died at +8.2s with reason `{:shutdown, {GenServer, :call, [Burble.PeerSupervisor, {:start_child, ...}, :infinity]}}`. Local re-run after the first fix surfaced a second cause in `Burble.Groove.HealthMesh`. Both fixed here: ## A. Burble.Media.Peer / Pipeline → restart: :temporary Both were `use GenServer` (default `:permanent`). A peer's WebRTC session is tied 1:1 to a Phoenix channel; once the channel is gone or the underlying ExWebRTC state fails to initialise, restarting the GenServer without the client context is pointless. The bug it caused: under bursts of failing peer starts (tests that exercise add_peer without a real channel), DynamicSupervisor max_restarts intensity (default 3 in 5s) was exceeded → `Burble.PeerSupervisor` itself exited `:shutdown` → `Burble.Media.Engine`'s in-flight `GenServer.call(PeerSupervisor, ...)` propagated that exit → engine died → every later test touching `Media.Engine` cascaded with "no process". `Burble.Coprocessor.Pipeline` gets the same treatment for the same reason (it's also per-peer, per-session, and called from Engine's add_peer handler via `DynamicSupervisor.start_child` on `Burble.CoprocessorSupervisor`). ## B. Burble.Media.Engine.handle_call({:add_peer, ...}) — defensive Defence in depth: wrap both `DynamicSupervisor.start_child` calls in `try / catch :exit` so a sibling supervisor's death cannot topple the Engine. With the `:temporary` fix above, max_restarts is no longer hit in normal operation; this catch is for the rare case where the supervisor itself goes down (e.g. heartbeat termination during shutdown) so the Engine logs and continues rather than crashing. ## C. Burble.Media.Peer — guard `send(channel_pid, ...)` against nil A second-order cause surfaced after fix A landed locally: `Burble.Media.Peer.send_offer/1` (lib/burble/media/peer.ex:413) and the ICE-candidate handle_info clause both did `send(state.channel_pid, ...)` unconditionally. Tests pass `channel_pid: nil` to exercise Media.Engine without a Phoenix channel; `:erlang.send(nil, _)` raises `:badarg`. That raise propagated as the Peer's exit reason; when Media.Engine then called `GenServer.stop(peer, :normal)` to remove the peer, the stop exit carried that `:badarg`, killing the Engine. Both sites now guarded with `if is_pid(state.channel_pid)`. ## D. Burble.Groove.HealthMesh.do_probe/1 — `Map.get(v, :port)` Third-order cause, also caught by the watcher: HealthMesh's probe loop at line 172 did `Enum.find(state.peers, fn {_k, v} -> v.port == port end)`. Peers added via `report_peer_status/3` carry no `:port` key (they're identified by `service_id` only), so the find raised `:badkey`. Switched to `Map.get(v, :port)` which returns nil for the keyless peers and skips them — semantically correct, since those peers aren't reachable via port lookup anyway. ## Verification Local full suite (OTP25): the previous run reported 5 mid-run singleton deaths via the watcher block (Media.Engine ×1, HealthMesh ×2, secondary cascades ×2). After this fix the full suite (`mix test`) prints **no singleton-death block at all** — every named singleton stayed alive through the whole 707 tests. Local failure count is in the documented 75-134 noise band and not a reliable progress signal; the absence of mid-run deaths is. OTP27 CI will be the authoritative measurement. Ships as a separate PR from the test-helper instrumentation (#73) because these are app-code fixes that should be reviewable independently. Refs #62. Co-Authored-By: Claude Opus 4.7 (1M context) --- server/lib/burble/coprocessor/pipeline.ex | 2 +- server/lib/burble/groove/health_mesh.ex | 5 +++- server/lib/burble/media/engine.ex | 36 ++++++++++++++++++----- server/lib/burble/media/peer.ex | 17 +++++++---- 4 files changed, 45 insertions(+), 15 deletions(-) diff --git a/server/lib/burble/coprocessor/pipeline.ex b/server/lib/burble/coprocessor/pipeline.ex index 40e0145..109e0af 100644 --- a/server/lib/burble/coprocessor/pipeline.ex +++ b/server/lib/burble/coprocessor/pipeline.ex @@ -40,7 +40,7 @@ defmodule Burble.Coprocessor.Pipeline do {:ok, pcm_samples} = Pipeline.process_inbound(pid, opus_frame) """ - use GenServer + use GenServer, restart: :temporary require Logger alias Burble.Coprocessor.SmartBackend, as: Backend diff --git a/server/lib/burble/groove/health_mesh.ex b/server/lib/burble/groove/health_mesh.ex index 547ab0e..46e5322 100644 --- a/server/lib/burble/groove/health_mesh.ex +++ b/server/lib/burble/groove/health_mesh.ex @@ -168,8 +168,11 @@ defmodule Burble.Groove.HealthMesh do {:ok, {port, {:error, _reason}}}, acc -> # Check if we had this peer before — mark as degraded, then down. + # Peers added via report_peer_status/3 don't carry a :port key + # (they're identified by service_id only); use Map.get/2 so the + # find returns nothing for those rather than raising :badkey. existing = - Enum.find(state.peers, fn {_k, v} -> v.port == port end) + Enum.find(state.peers, fn {_k, v} -> Map.get(v, :port) == port end) case existing do {key, prev} when prev.status == :up -> diff --git a/server/lib/burble/media/engine.ex b/server/lib/burble/media/engine.ex index b279792..d135b24 100644 --- a/server/lib/burble/media/engine.ex +++ b/server/lib/burble/media/engine.ex @@ -212,10 +212,23 @@ defmodule Burble.Media.Engine do ice_servers: ice_servers_for_mode(session.privacy_mode) ] - case DynamicSupervisor.start_child( - Burble.PeerSupervisor, - {Burble.Media.Peer, peer_opts} - ) do + # Defensive: a DynamicSupervisor that has hit max_restarts intensity + # exits :shutdown, which would propagate out of GenServer.call and + # bring Burble.Media.Engine down. Catch :exit here so a sibling + # supervisor's death cannot topple the engine. The session state is + # still updated below so callers see consistent {:ok, offer} return + # values whether or not the Peer process started. + peer_start_result = + try do + DynamicSupervisor.start_child( + Burble.PeerSupervisor, + {Burble.Media.Peer, peer_opts} + ) + catch + :exit, reason -> {:error, {:peer_supervisor_unavailable, reason}} + end + + case peer_start_result do {:ok, _pid} -> Logger.info("[Media] Peer process started for #{peer_id}") @@ -243,10 +256,17 @@ defmodule Burble.Media.Engine do } ] - case DynamicSupervisor.start_child( - Burble.CoprocessorSupervisor, - {Burble.Coprocessor.Pipeline, pipeline_opts} - ) do + pipeline_start_result = + try do + DynamicSupervisor.start_child( + Burble.CoprocessorSupervisor, + {Burble.Coprocessor.Pipeline, pipeline_opts} + ) + catch + :exit, reason -> {:error, {:coprocessor_supervisor_unavailable, reason}} + end + + case pipeline_start_result do {:ok, _pid} -> Logger.info("[Media] Pipeline started for peer #{peer_id}") diff --git a/server/lib/burble/media/peer.ex b/server/lib/burble/media/peer.ex index d29755c..87e58eb 100644 --- a/server/lib/burble/media/peer.ex +++ b/server/lib/burble/media/peer.ex @@ -28,7 +28,7 @@ defmodule Burble.Media.Peer do RTP packets from each peer are forwarded to all other peers' sendonly tracks. """ - use GenServer + use GenServer, restart: :temporary require Logger alias ExWebRTC.{PeerConnection, MediaStreamTrack, RTPCodecParameters, SessionDescription, ICECandidate} @@ -149,9 +149,12 @@ defmodule Burble.Media.Peer do @impl true def handle_info({:ex_webrtc, pc, {:ice_candidate, candidate}}, %{pc: pc} = state) do - # Forward ICE candidate to client via channel. + # Forward ICE candidate to client via channel. channel_pid may be nil in + # tests; skip in that case (see send_offer/1 comment). json = candidate |> ICECandidate.to_json() |> Jason.encode!() - send(state.channel_pid, {:peer_ice_candidate, json}) + if is_pid(state.channel_pid) do + send(state.channel_pid, {:peer_ice_candidate, json}) + end {:noreply, state} end @@ -409,8 +412,12 @@ defmodule Burble.Media.Peer do {:ok, offer} = PeerConnection.create_offer(pc) :ok = PeerConnection.set_local_description(pc, offer) - # Send offer to client via channel. - send(state.channel_pid, {:peer_sdp_offer, offer.sdp}) + # Send offer to client via channel. channel_pid may be nil in tests that + # exercise Media.Engine without a real Phoenix channel; skip silently in + # that case rather than raising :badarg from :erlang.send(nil, _). + if is_pid(state.channel_pid) do + send(state.channel_pid, {:peer_sdp_offer, offer.sdp}) + end %{state | negotiating: true} end