DataDog · Kyle-Neale · May 8, 2026 · May 8, 2026
@@ -71,10 +71,15 @@ def agent(app: Application, *, intg_name: str, environment: str, args: tuple[str
         finally:
             env_data.config_file.unlink()
     else:
-        temp_config_file = env_data.config_file.parent / f'{env_data.config_file.name}.bak.example'
-        env_data.config_file.replace(temp_config_file)
+        # Read-modify-restore in place. The previous implementation renamed
+        # the original config away before writing the override, which left a
+        # window where the conf.d directory contained no config for this
+        # integration; if Agent autodiscovery rescanned during that window it
+        # deregistered the check and the immediately-following `agent check`
+        # returned "no valid check found".
+        original_config = env_data.read_config()
         try:
             env_data.write_config(config)
             agent.invoke(full_args)
         finally:
-            temp_config_file.replace(env_data.config_file)
+            env_data.write_config(original_config)
@@ -294,6 +294,8 @@ def start(self, *, agent_build: str | None, local_packages: dict[Path, str], env
         if local_packages or start_commands or post_install_commands:
             self.restart()
 
+        self.wait_until_ready()
+
     def _initialize(self, command, local_packages, start_commands, post_install_commands):
         process = self._captured_process(command)
         if process.returncode:
@@ -373,6 +375,21 @@ def run_command(self, args: list[str]) -> None:
     def enter_shell(self) -> None:
         self._run_command(self._format_command(['cmd' if self._is_windows_container else 'bash']), check=True)
 
+    def wait_until_ready(self, *, timeout: float = 60.0) -> None:
+        """Poll ``agent status`` until the cmd-server is responding.
+
+        Without this gate, callers race the Agent's startup: ``docker run``
+        returns before the in-container Agent finishes initializing the
+        check loader, so an immediate ``agent check <name>`` can return
+        "no valid check found" with exit 255.
+        """
+        cmd = self._format_command(['agent', 'status', '--json'])
+        for attempt in stamina.retry_context(on=RuntimeError, timeout=timeout, wait_initial=0.5, wait_max=2.0):
+            with attempt:
+                proc = self._captured_process(cmd)
+                if proc.returncode != 0:
+                    raise RuntimeError(f'Agent not ready (rc={proc.returncode})')
+
     @stamina.retry(on=RuntimeError, attempts=3)
     def __pull_image(self, agent_build):
         process = self._run_command(['docker', 'pull', agent_build])

@@ -77,3 +77,11 @@ def invoke(self, args: list[str]) -> None: ...
 
     @abstractmethod
     def enter_shell(self) -> None: ...
+
+    def wait_until_ready(self, *, timeout: float = 60.0) -> None:
+        """Block until the Agent's command server is responding.
+
+        Concrete backends should override this with a meaningful readiness
+        check. The default is a no-op so backends can opt in incrementally.
+        """
+        return
@@ -42,13 +42,21 @@ def write_config(self, config: dict[str, Any] | None) -> None:
         if config is None:
             return
 
+        import os
+
         import yaml
 
         if 'instances' not in config:
             config = {'instances': [config]}
 
         self.config_file.parent.ensure_dir_exists()
-        self.config_file.write_text(yaml.safe_dump(config, default_flow_style=False))
+        # Write via tmp + os.replace so the file is never transiently absent.
+        # Agent autodiscovery watches this directory; if it observes the path
+        # missing it deregisters the integration's check, causing later
+        # `agent check <name>` invocations to fail with "no valid check found".
+        tmp = self.config_file.parent / f'.{self.config_file.name}.swap'
+        tmp.write_text(yaml.safe_dump(config, default_flow_style=False))
+        os.replace(tmp, self.config_file)
 
     def read_metadata(self) -> dict[str, Any]:
         import json

@@ -28,6 +28,12 @@ def free_port(mocker):
 
 
 class TestStart:
+    @pytest.fixture(autouse=True)
+    def _skip_readiness(self, mocker):
+        # Existing TestStart cases assert exact subprocess.run call lists; the
+        # readiness probe is exercised in TestWaitUntilReady.
+        mocker.patch.object(DockerAgent, 'wait_until_ready', return_value=None)
+
     @pytest.mark.parametrize(
         'agent_build, agent_image, use_jmx',
         [
@@ -1256,3 +1262,31 @@ def test_windows_container(self, app, get_integration, docker_path, mocker):
                 check=True,
             ),
         ]
+
+
+class TestWaitUntilReady:
+    def test_ready_first_try(self, app, get_integration, docker_path, mocker):
+        run = mocker.patch('subprocess.run', return_value=mocker.MagicMock(returncode=0, stdout=b'{}'))
+
+        integration = 'postgres'
+        environment = 'py3.12'
+
+        agent = DockerAgent(app, get_integration(integration), environment, {}, Path('config.yaml'))
+        agent.wait_until_ready()
+
+        assert run.call_args_list == [
+            mocker.call(
+                [docker_path, 'exec', f'dd_{integration}_{environment}', 'agent', 'status', '--json'],
+                shell=False,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,
+            ),
+        ]
+
+    def test_timeout_raises(self, app, get_integration, mocker):
+        mocker.patch('subprocess.run', return_value=mocker.MagicMock(returncode=1, stdout=b''))
+
+        agent = DockerAgent(app, get_integration('postgres'), 'py3.12', {}, Path('config.yaml'))
+
+        with pytest.raises(RuntimeError, match='Agent not ready'):
+            agent.wait_until_ready(timeout=0.05)