From 341156ae37af30f9e144c41305494bf89c7fccd4 Mon Sep 17 00:00:00 2001 From: Kyle-Neale Date: Fri, 8 May 2026 14:39:27 -0400 Subject: [PATCH 1/2] [ddev] Wait for Agent cmd-server before returning from start() Without this gate, callers race the Agent's startup: `docker run` returns before the in-container Agent finishes initializing the check loader, so an immediate `agent check ` can return "no valid check found" and exit 255. This has been showing up as the SNMP master.yml flake since agent-data-plane started failing fast on TLS init. Adds a no-op `wait_until_ready` default on `AgentInterface` and a real implementation on `DockerAgent` that polls `agent status --json` via `stamina.retry_context`. Called at the end of `start()`. Co-Authored-By: Claude Opus 4.7 (1M context) --- ddev/src/ddev/e2e/agent/docker.py | 17 ++++++++++++++ ddev/src/ddev/e2e/agent/interface.py | 8 +++++++ ddev/tests/e2e/agent/test_docker.py | 34 ++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+) diff --git a/ddev/src/ddev/e2e/agent/docker.py b/ddev/src/ddev/e2e/agent/docker.py index 5ba3b61969837..ebc4dd02190b6 100644 --- a/ddev/src/ddev/e2e/agent/docker.py +++ b/ddev/src/ddev/e2e/agent/docker.py @@ -294,6 +294,8 @@ def start(self, *, agent_build: str | None, local_packages: dict[Path, str], env if local_packages or start_commands or post_install_commands: self.restart() + self.wait_until_ready() + def _initialize(self, command, local_packages, start_commands, post_install_commands): process = self._captured_process(command) if process.returncode: @@ -373,6 +375,21 @@ def run_command(self, args: list[str]) -> None: def enter_shell(self) -> None: self._run_command(self._format_command(['cmd' if self._is_windows_container else 'bash']), check=True) + def wait_until_ready(self, *, timeout: float = 60.0) -> None: + """Poll ``agent status`` until the cmd-server is responding. + + Without this gate, callers race the Agent's startup: ``docker run`` + returns before the in-container Agent finishes initializing the + check loader, so an immediate ``agent check `` can return + "no valid check found" with exit 255. + """ + cmd = self._format_command(['agent', 'status', '--json']) + for attempt in stamina.retry_context(on=RuntimeError, timeout=timeout, wait_initial=0.5, wait_max=2.0): + with attempt: + proc = self._captured_process(cmd) + if proc.returncode != 0: + raise RuntimeError(f'Agent not ready (rc={proc.returncode})') + @stamina.retry(on=RuntimeError, attempts=3) def __pull_image(self, agent_build): process = self._run_command(['docker', 'pull', agent_build]) diff --git a/ddev/src/ddev/e2e/agent/interface.py b/ddev/src/ddev/e2e/agent/interface.py index fd05b50828413..218e7f4dfd50d 100644 --- a/ddev/src/ddev/e2e/agent/interface.py +++ b/ddev/src/ddev/e2e/agent/interface.py @@ -77,3 +77,11 @@ def invoke(self, args: list[str]) -> None: ... @abstractmethod def enter_shell(self) -> None: ... + + def wait_until_ready(self, *, timeout: float = 60.0) -> None: + """Block until the Agent's command server is responding. + + Concrete backends should override this with a meaningful readiness + check. The default is a no-op so backends can opt in incrementally. + """ + return diff --git a/ddev/tests/e2e/agent/test_docker.py b/ddev/tests/e2e/agent/test_docker.py index 809f025c909ea..aac3c03f252df 100644 --- a/ddev/tests/e2e/agent/test_docker.py +++ b/ddev/tests/e2e/agent/test_docker.py @@ -28,6 +28,12 @@ def free_port(mocker): class TestStart: + @pytest.fixture(autouse=True) + def _skip_readiness(self, mocker): + # Existing TestStart cases assert exact subprocess.run call lists; the + # readiness probe is exercised in TestWaitUntilReady. + mocker.patch.object(DockerAgent, 'wait_until_ready', return_value=None) + @pytest.mark.parametrize( 'agent_build, agent_image, use_jmx', [ @@ -1256,3 +1262,31 @@ def test_windows_container(self, app, get_integration, docker_path, mocker): check=True, ), ] + + +class TestWaitUntilReady: + def test_ready_first_try(self, app, get_integration, docker_path, mocker): + run = mocker.patch('subprocess.run', return_value=mocker.MagicMock(returncode=0, stdout=b'{}')) + + integration = 'postgres' + environment = 'py3.12' + + agent = DockerAgent(app, get_integration(integration), environment, {}, Path('config.yaml')) + agent.wait_until_ready() + + assert run.call_args_list == [ + mocker.call( + [docker_path, 'exec', f'dd_{integration}_{environment}', 'agent', 'status', '--json'], + shell=False, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ), + ] + + def test_timeout_raises(self, app, get_integration, mocker): + mocker.patch('subprocess.run', return_value=mocker.MagicMock(returncode=1, stdout=b'')) + + agent = DockerAgent(app, get_integration('postgres'), 'py3.12', {}, Path('config.yaml')) + + with pytest.raises(RuntimeError, match='Agent not ready'): + agent.wait_until_ready(timeout=0.05) From acdab00c49d29b26eb6562215c67bf5e09600f76 Mon Sep 17 00:00:00 2001 From: Kyle-Neale Date: Fri, 8 May 2026 17:41:26 -0400 Subject: [PATCH 2/2] [ddev] Atomic config swap to avoid autodiscovery deregister race The `--config-file` path of `ddev env agent` previously renamed the original config away before writing the override, leaving a window in which the mounted conf.d directory had no config for the integration. Agent autodiscovery rescans on file events; if it scanned during that window it deregistered the check, and the immediately-following `agent check ` returned "no valid check found". This is the actual SNMP master.yml flake fingerprint: agent runs cleanly for 10+ minutes (20-30 successful check cycles), then a single test using `dd_agent_check` (which goes through this code path) hits the race and fails. Two of the last three master.yml SNMP failures match it exactly. Switch to read-modify-restore in place. `EnvData.write_config` now writes via tmp + os.replace so the file is never transiently absent. Co-Authored-By: Claude Opus 4.7 (1M context) --- ddev/src/ddev/cli/env/agent.py | 11 ++++++++--- ddev/src/ddev/e2e/config.py | 10 +++++++++- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/ddev/src/ddev/cli/env/agent.py b/ddev/src/ddev/cli/env/agent.py index 0a24f221b9a4f..ba024c50ce3c0 100644 --- a/ddev/src/ddev/cli/env/agent.py +++ b/ddev/src/ddev/cli/env/agent.py @@ -71,10 +71,15 @@ def agent(app: Application, *, intg_name: str, environment: str, args: tuple[str finally: env_data.config_file.unlink() else: - temp_config_file = env_data.config_file.parent / f'{env_data.config_file.name}.bak.example' - env_data.config_file.replace(temp_config_file) + # Read-modify-restore in place. The previous implementation renamed + # the original config away before writing the override, which left a + # window where the conf.d directory contained no config for this + # integration; if Agent autodiscovery rescanned during that window it + # deregistered the check and the immediately-following `agent check` + # returned "no valid check found". + original_config = env_data.read_config() try: env_data.write_config(config) agent.invoke(full_args) finally: - temp_config_file.replace(env_data.config_file) + env_data.write_config(original_config) diff --git a/ddev/src/ddev/e2e/config.py b/ddev/src/ddev/e2e/config.py index a59317cd780a7..c2b5a0aeb4098 100644 --- a/ddev/src/ddev/e2e/config.py +++ b/ddev/src/ddev/e2e/config.py @@ -42,13 +42,21 @@ def write_config(self, config: dict[str, Any] | None) -> None: if config is None: return + import os + import yaml if 'instances' not in config: config = {'instances': [config]} self.config_file.parent.ensure_dir_exists() - self.config_file.write_text(yaml.safe_dump(config, default_flow_style=False)) + # Write via tmp + os.replace so the file is never transiently absent. + # Agent autodiscovery watches this directory; if it observes the path + # missing it deregisters the integration's check, causing later + # `agent check ` invocations to fail with "no valid check found". + tmp = self.config_file.parent / f'.{self.config_file.name}.swap' + tmp.write_text(yaml.safe_dump(config, default_flow_style=False)) + os.replace(tmp, self.config_file) def read_metadata(self) -> dict[str, Any]: import json