Skip to content

Commit 98e4e00

Browse files
committed
Handle multiple kernel statuses when syncing agent kernels
1 parent d4d397f commit 98e4e00

2 files changed

Lines changed: 62 additions & 16 deletions

File tree

src/ai/backend/agent/server.py

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -483,6 +483,8 @@ async def sync_kernel_registry(
483483
@collect_error
484484
async def sync_and_get_kernels(
485485
self,
486+
preparing_kernels: Collection[str],
487+
pulling_kernels: Collection[str],
486488
running_kernels: Collection[str],
487489
terminating_kernels: Collection[str],
488490
) -> dict[str, Any]:
@@ -547,10 +549,30 @@ async def sync_and_get_kernels(
547549
or KernelLifecycleEventReason.NOT_FOUND_IN_MANAGER,
548550
suppress_events=False,
549551
)
550-
elif kernel_id not in running_kernels:
551-
# The kernel status is not 'running' or 'terminating' in truth.
552-
# It should be terminated.
553-
if kernel_id not in self.agent.terminating_kernels:
552+
elif kernel_id in running_kernels:
553+
pass
554+
elif kernel_id in preparing_kernels:
555+
# kernel_registry may not have `preparing` state kernels.
556+
pass
557+
elif kernel_id in pulling_kernels:
558+
# kernel_registry does not have `pulling` state kernels.
559+
# Let's just skip it.
560+
pass
561+
else:
562+
# This kernel is not alive according to the truth data.
563+
# The kernel should be destroyed or cleaned up.
564+
if kernel_id in self.agent.terminating_kernels:
565+
await self.agent.inject_container_lifecycle_event(
566+
kernel_id,
567+
kernel_obj.session_id,
568+
LifecycleEvent.CLEAN,
569+
kernel_obj.termination_reason
570+
or KernelLifecycleEventReason.NOT_FOUND_IN_MANAGER,
571+
suppress_events=True,
572+
)
573+
elif kernel_id in self.agent.restarting_kernels:
574+
pass
575+
else:
554576
await self.agent.inject_container_lifecycle_event(
555577
kernel_id,
556578
kernel_obj.session_id,

src/ai/backend/manager/registry.py

Lines changed: 36 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3071,11 +3071,15 @@ async def sync_agent_kernel_registry(self, agent_id: AgentId) -> None:
30713071
async def _sync_agent_resource_and_get_kerenels(
30723072
self,
30733073
agent_id: AgentId,
3074+
preparing_kernels: Collection[KernelId],
3075+
pulling_kernels: Collection[KernelId],
30743076
running_kernels: Collection[KernelId],
30753077
terminating_kernels: Collection[KernelId],
30763078
) -> AgentKernelRegistryByStatus:
30773079
async with self.agent_cache.rpc_context(agent_id) as rpc:
30783080
resp: dict[str, Any] = await rpc.call.sync_and_get_kernels(
3081+
preparing_kernels,
3082+
pulling_kernels,
30793083
running_kernels,
30803084
terminating_kernels,
30813085
)
@@ -3094,31 +3098,49 @@ async def sync_agent_resource(
30943098
.options(
30953099
selectinload(
30963100
AgentRow.kernels.and_(
3097-
KernelRow.status.in_([KernelStatus.RUNNING, KernelStatus.TERMINATING])
3101+
KernelRow.status.in_([
3102+
KernelStatus.PREPARING,
3103+
KernelStatus.PULLING,
3104+
KernelStatus.RUNNING,
3105+
KernelStatus.TERMINATING,
3106+
])
30983107
),
30993108
).options(load_only(KernelRow.id, KernelRow.status))
31003109
)
31013110
)
31023111
async with SASession(bind=db_connection) as db_session:
31033112
for _agent_row in await db_session.scalars(stmt):
31043113
agent_row = cast(AgentRow, _agent_row)
3114+
preparing_kernels: list[KernelId] = []
3115+
pulling_kernels: list[KernelId] = []
3116+
running_kernels: list[KernelId] = []
3117+
terminating_kernels: list[KernelId] = []
3118+
for kernel in agent_row.kernels:
3119+
kernel_status = cast(KernelStatus, kernel.status)
3120+
match kernel_status:
3121+
case KernelStatus.PREPARING:
3122+
preparing_kernels.append(KernelId(kernel.id))
3123+
case KernelStatus.PULLING:
3124+
pulling_kernels.append(KernelId(kernel.id))
3125+
case KernelStatus.RUNNING:
3126+
running_kernels.append(KernelId(kernel.id))
3127+
case KernelStatus.TERMINATING:
3128+
terminating_kernels.append(KernelId(kernel.id))
3129+
case _:
3130+
continue
31053131
agent_kernel_by_status[AgentId(agent_row.id)] = {
3106-
"running_kernels": [
3107-
KernelId(kern.id)
3108-
for kern in agent_row.kernels
3109-
if kern.status == KernelStatus.RUNNING
3110-
],
3111-
"terminating_kernels": [
3112-
KernelId(kern.id)
3113-
for kern in agent_row.kernels
3114-
if kern.status == KernelStatus.TERMINATING
3115-
],
3132+
"preparing_kernels": preparing_kernels,
3133+
"pulling_kernels": pulling_kernels,
3134+
"running_kernels": running_kernels,
3135+
"terminating_kernels": terminating_kernels,
31163136
}
31173137
tasks = []
31183138
for agent_id in agent_ids:
31193139
tasks.append(
31203140
self._sync_agent_resource_and_get_kerenels(
31213141
agent_id,
3142+
agent_kernel_by_status[agent_id]["preparing_kernels"],
3143+
agent_kernel_by_status[agent_id]["pulling_kernels"],
31223144
agent_kernel_by_status[agent_id]["running_kernels"],
31233145
agent_kernel_by_status[agent_id]["terminating_kernels"],
31243146
)
@@ -3136,7 +3158,9 @@ async def sync_agent_resource(
31363158
agent_errors,
31373159
)
31383160
else:
3139-
assert isinstance(resp, AgentKernelRegistryByStatus)
3161+
assert isinstance(
3162+
resp, AgentKernelRegistryByStatus
3163+
), f"response should be `AgentKernelRegistryByStatus`, not {type(resp)}"
31403164
result[aid] = resp
31413165
return result
31423166

0 commit comments

Comments
 (0)