Skip to content

Commit 552b746

Browse files
committed
Save the registry when shutting down the agent, and add more debug logging
1 parent ab031c1 commit 552b746

1 file changed

Lines changed: 11 additions & 1 deletion

File tree

src/ai/backend/agent/agent.py

Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -743,6 +743,7 @@ async def shutdown(self, stop_signal: signal.Signals) -> None:
743743
if kernel_obj.runner is not None:
744744
await kernel_obj.runner.close()
745745
await kernel_obj.close()
746+
await self.save_last_registry(force=True)
746747
if stop_signal == signal.SIGTERM:
747748
await self.clean_all_kernels(blocking=True)
748749

@@ -1285,6 +1286,7 @@ def _get_session_id(container: Container) -> SessionId | None:
12851286
)
12861287
return None
12871288

1289+
log.debug("sync_container_lifecycles(): triggered")
12881290
try:
12891291
_containers = await self.enumerate_containers(ACTIVE_STATUS_SET | DEAD_STATUS_SET)
12901292
async with self.registry_lock:
@@ -1295,6 +1297,9 @@ def _get_session_id(container: Container) -> SessionId | None:
12951297
for kid, container in _containers
12961298
if container.status in DEAD_STATUS_SET
12971299
]
1300+
log.debug(
1301+
f"detected dead containers: {[container.id[:12] for _, container in dead_containers]}"
1302+
)
12981303
for kernel_id, container in dead_containers:
12991304
if kernel_id in self.restarting_kernels:
13001305
continue
@@ -1318,6 +1323,9 @@ def _get_session_id(container: Container) -> SessionId | None:
13181323
for kid, container in _containers
13191324
if container.status in ACTIVE_STATUS_SET
13201325
]
1326+
log.debug(
1327+
f"detected active containers: {[container.id[:12] for _, container in active_containers]}"
1328+
)
13211329
for kernel_id, container in active_containers:
13221330
alive_kernels[kernel_id] = container.id
13231331
session_id = _get_session_id(container)
@@ -1340,17 +1348,19 @@ def _get_session_id(container: Container) -> SessionId | None:
13401348
or kernel_id in self.terminating_kernels
13411349
):
13421350
continue
1351+
log.debug(f"kernel with no container (kid: {kernel_id})")
13431352
terminated_kernels[kernel_id] = ContainerLifecycleEvent(
13441353
kernel_id,
13451354
kernel_session_map[kernel_id],
13461355
known_kernels[kernel_id],
13471356
LifecycleEvent.CLEAN,
13481357
KernelLifecycleEventReason.SELF_TERMINATED,
13491358
)
1350-
# Check if: there are containers already deleted from my registry or not spawned by me.
1359+
# Check if: there are containers already deleted from my registry.
13511360
for kernel_id in alive_kernels.keys() - known_kernels.keys():
13521361
if kernel_id in self.restarting_kernels:
13531362
continue
1363+
log.debug(f"kernel not found in registry (kid:{kernel_id})")
13541364
terminated_kernels[kernel_id] = ContainerLifecycleEvent(
13551365
kernel_id,
13561366
kernel_session_map[kernel_id],

0 commit comments

Comments
 (0)