From a08c46a47f3f169cd1bd31fb0d7eb6e7bc2486b1 Mon Sep 17 00:00:00 2001 From: Lewis Date: Wed, 13 May 2026 11:24:11 -0400 Subject: [PATCH] [aas-repro] Add ProcessSupervisor and NamedPipeHealthCheck diagnostics for SLES-2826 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds [aas-repro] INFO-level log lines to make the pipe-collision failure mode observable without DD_TRACE_DEBUG=true: ProcessSupervisor: - Health state transitions logged at INFO on every change - Spawn logged with jvm_pid for cross-process correlation - Process start logged with child_pid and faults_reset_to_0 - Process exit logged with exit code and resulting health state - Terminal fault break logged with fault count ExternalAgentLauncher: - Supervisor creation logged with path, pipe name, and jvm_pid NamedPipeHealthCheck (previously silent): - First-check entry logged with pipe name - NEVER_CHECKED → READY_TO_START logged with retry count - NEVER_CHECKED → HEALTHY logged with warning (orphan risk) - Ongoing HEALTHY → READY_TO_START logged when pipe disappears These changes are repro-only and do not ship to customers. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- .../ddagent/ExternalAgentLauncher.java | 22 +++++++++++++ .../datadog/trace/util/ProcessSupervisor.java | 33 ++++++++++++++++--- 2 files changed, 50 insertions(+), 5 deletions(-) diff --git a/communication/src/main/java/datadog/communication/ddagent/ExternalAgentLauncher.java b/communication/src/main/java/datadog/communication/ddagent/ExternalAgentLauncher.java index 8eea13de224..972a13e28ef 100644 --- a/communication/src/main/java/datadog/communication/ddagent/ExternalAgentLauncher.java +++ b/communication/src/main/java/datadog/communication/ddagent/ExternalAgentLauncher.java @@ -25,6 +25,11 @@ public class ExternalAgentLauncher implements Closeable { public ExternalAgentLauncher(Config config) { if (config.isAzureAppServices()) { if (config.getTraceAgentPath() != null) { + log.info( + "[aas-repro] ExternalAgentLauncher: spawning trace-agent — path={} pipe={} jvm_pid={}", + config.getTraceAgentPath(), + config.getAgentNamedPipe(), + ProcessHandle.current().pid()); ProcessBuilder traceProcessBuilder = new ProcessBuilder(config.getTraceAgentPath()); traceProcessBuilder.redirectOutput(DISCARD); traceProcessBuilder.redirectError(DISCARD); @@ -38,6 +43,11 @@ public ExternalAgentLauncher(Config config) { } if (config.getDogStatsDPath() != null) { + log.info( + "[aas-repro] ExternalAgentLauncher: spawning dogstatsd — path={} pipe={} jvm_pid={}", + config.getDogStatsDPath(), + config.getDogStatsDNamedPipe(), + ProcessHandle.current().pid()); ProcessBuilder dogStatsDProcessBuilder = new ProcessBuilder(config.getDogStatsDPath()); dogStatsDProcessBuilder.redirectOutput(DISCARD); dogStatsDProcessBuilder.redirectError(DISCARD); @@ -71,6 +81,7 @@ private static ProcessSupervisor.HealthCheck healthCheck(String pipeName) { static final class NamedPipeHealthCheck implements ProcessSupervisor.HealthCheck { private static final String NAMED_PIPE_PREFIX = "\\\\.\\pipe\\"; + private static final Logger log = LoggerFactory.getLogger(NamedPipeHealthCheck.class); private final File pipe; @@ -88,10 +99,15 @@ public ProcessSupervisor.Health run(ProcessSupervisor.Health previousHealth) // first-time round do a more detailed check for existing bound named-pipe if (previousHealth == NEVER_CHECKED) { + log.info("[aas-repro] NamedPipeHealthCheck first-check pipe={}", pipe); double delayMillis = 50; for (int retries = 0; retries < 7; retries++) { if (!pipe.exists()) { + log.info( + "[aas-repro] NamedPipeHealthCheck pipe={} not found on retry {} → READY_TO_START", + pipe, + retries); return READY_TO_START; // no longer bound, start our own external process } @@ -100,6 +116,11 @@ public ProcessSupervisor.Health run(ProcessSupervisor.Health previousHealth) delayMillis = delayMillis * 1.75; } + // Pipe survived all retries — something already owns it + log.warn( + "[aas-repro] NamedPipeHealthCheck pipe={} still bound after all retries" + + " — assuming existing process is healthy (orphan risk!)", + pipe); return HEALTHY; // use existing external process } @@ -107,6 +128,7 @@ public ProcessSupervisor.Health run(ProcessSupervisor.Health previousHealth) if (pipe.exists()) { return HEALTHY; // keep using external process } else { + log.info("[aas-repro] NamedPipeHealthCheck pipe={} gone → READY_TO_START", pipe); return READY_TO_START; // start our own process } } diff --git a/internal-api/src/main/java/datadog/trace/util/ProcessSupervisor.java b/internal-api/src/main/java/datadog/trace/util/ProcessSupervisor.java index 686c35ce65b..8a2f0338371 100644 --- a/internal-api/src/main/java/datadog/trace/util/ProcessSupervisor.java +++ b/internal-api/src/main/java/datadog/trace/util/ProcessSupervisor.java @@ -70,7 +70,10 @@ private void mainLoop() { try { while (!stopping) { if (currentHealth == FAULTED && ++faults >= MAX_FAULTS) { - log.warn("Failed to start process [{}] after {} attempts", imageName, faults); + log.warn( + "[aas-repro] [{}] Terminal: failed to start after {} faults — supervisor exiting", + imageName, + faults); break; } try { @@ -78,14 +81,23 @@ private void mainLoop() { if (delayMillis > 0) { Thread.sleep(delayMillis); } + Health prevHealth = currentHealth; currentHealth = healthCheck.run(currentHealth); + if (currentHealth != prevHealth) { + log.info( + "[aas-repro] [{}] Health {} → {} (faults={})", + imageName, + prevHealth, + currentHealth, + faults); + } if (currentHealth == READY_TO_START) { startProcessAndWait(); } } catch (InterruptedException e) { currentHealth = INTERRUPTED; } catch (Throwable e) { - log.warn("Exception starting process: [{}]", imageName, e); + log.warn("[aas-repro] [{}] Exception in supervisor loop", imageName, e); currentHealth = FAULTED; } scheduleNextHealthCheck(); @@ -108,18 +120,29 @@ private void scheduleNextHealthCheck() { private void startProcessAndWait() throws Exception { if (currentProcess == null) { - log.debug("Starting process: [{}]", imageName); + log.info( + "[aas-repro] [{}] Spawning — jvm_pid={}", + imageName, + ProcessHandle.current().pid()); try (TraceScope ignored = AgentTracer.get().muteTracing()) { currentProcess = processBuilder.start(); } currentHealth = HEALTHY; faults = 0; + log.info( + "[aas-repro] [{}] Started — child_pid={} faults_reset_to_0", + imageName, + currentProcess.pid()); } // Block until the process exits int code = currentProcess.waitFor(); - log.debug("Process [{}] has exited with code {}", imageName, code); currentHealth = code == 0 ? INTERRUPTED : FAULTED; + log.info( + "[aas-repro] [{}] Exited — code={} → health={}", + imageName, + code, + currentHealth); // Process is dead, no longer needs to be tracked currentProcess = null; @@ -127,7 +150,7 @@ private void startProcessAndWait() throws Exception { private void stopProcess() { if (currentProcess != null) { - log.debug("Stopping process: [{}]", imageName); + log.info("[aas-repro] [{}] Stopping (supervisor closing)", imageName); currentProcess.destroy(); if (currentProcess.isAlive()) { currentProcess.destroyForcibly();