Skip to content

Commit fdca36a

Browse files
committed
feat: schedule functions return a list of kernel-agent bindings
1 parent ecb36ae commit fdca36a

1 file changed

Lines changed: 19 additions & 9 deletions

File tree

src/ai/backend/manager/scheduler/dispatcher.py

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -361,7 +361,7 @@ async def _schedule_in_sgroup(
361361
self,
362362
sched_ctx: SchedulingContext,
363363
sgroup_name: str,
364-
) -> None:
364+
) -> list[KernelAgentBinding]:
365365
async def _apply_cancellation(
366366
db_sess: SASession, session_ids: list[SessionId], reason="pending-timeout"
367367
):
@@ -432,7 +432,8 @@ async def _update():
432432
len(cancelled_sessions),
433433
)
434434
zero = ResourceSlot()
435-
num_scheduled = 0
435+
kernel_agent_bindings_in_sgroup: list[KernelAgentBinding] = []
436+
436437
while len(pending_sessions) > 0:
437438
async with self.db.begin_readonly_session() as db_sess:
438439
candidate_agents = await list_schedulable_agents_by_sgroup(db_sess, sgroup_name)
@@ -446,7 +447,7 @@ async def _update():
446447
if picked_session_id is None:
447448
# no session is picked.
448449
# continue to next sgroup.
449-
return
450+
return kernel_agent_bindings_in_sgroup
450451
for picked_idx, sess_ctx in enumerate(pending_sessions):
451452
if sess_ctx.id == picked_session_id:
452453
break
@@ -657,7 +658,7 @@ async def _update_session_status_data() -> None:
657658
try:
658659
match schedulable_sess.cluster_mode:
659660
case ClusterMode.SINGLE_NODE:
660-
await self._schedule_single_node_session(
661+
kernel_agent_bindings = await self._schedule_single_node_session(
661662
sched_ctx,
662663
scheduler,
663664
sgroup_name,
@@ -667,7 +668,7 @@ async def _update_session_status_data() -> None:
667668
check_results,
668669
)
669670
case ClusterMode.MULTI_NODE:
670-
await self._schedule_multi_node_session(
671+
kernel_agent_bindings = await self._schedule_multi_node_session(
671672
sched_ctx,
672673
scheduler,
673674
sgroup_name,
@@ -701,9 +702,11 @@ async def _update_session_status_data() -> None:
701702
# _schedule_{single,multi}_node_session() already handle general exceptions.
702703
# Proceed to the next pending session and come back later
703704
continue
704-
num_scheduled += 1
705-
if num_scheduled > 0:
705+
else:
706+
kernel_agent_bindings_in_sgroup.extend(kernel_agent_bindings)
707+
if kernel_agent_bindings_in_sgroup:
706708
await self.event_producer.produce_event(DoPrepareEvent())
709+
return kernel_agent_bindings_in_sgroup
707710

708711
async def _filter_agent_by_container_limit(
709712
self, candidate_agents: list[AgentRow]
@@ -736,12 +739,13 @@ async def _schedule_single_node_session(
736739
sess_ctx: SessionRow,
737740
agent_selection_resource_priority: list[str],
738741
check_results: List[Tuple[str, Union[Exception, PredicateResult]]],
739-
) -> None:
742+
) -> list[KernelAgentBinding]:
740743
"""
741744
Finds and assigns an agent having resources enough to host the entire session.
742745
"""
743746
log_fmt = _log_fmt.get("")
744747
log_args = _log_args.get(tuple())
748+
kernel_agent_bindings: list[KernelAgentBinding] = []
745749

746750
try:
747751
requested_architectures = set(k.architecture for k in sess_ctx.kernels)
@@ -892,6 +896,10 @@ async def _schedule_single_node_session(
892896
agent_id,
893897
sess_ctx.requested_slots,
894898
)
899+
for kernel_row in sess_ctx.kernels:
900+
kernel_agent_bindings.append(
901+
KernelAgentBinding(kernel_row, agent_alloc_ctx, set())
902+
)
895903
except InstanceNotAvailable as sched_failure:
896904
log.debug(log_fmt + "no-available-instances", *log_args)
897905

@@ -1001,6 +1009,7 @@ async def _finalize_scheduled() -> None:
10011009
await self.registry.event_producer.produce_event(
10021010
SessionScheduledEvent(sess_ctx.id, sess_ctx.creation_id),
10031011
)
1012+
return kernel_agent_bindings
10041013

10051014
async def _schedule_multi_node_session(
10061015
self,
@@ -1011,7 +1020,7 @@ async def _schedule_multi_node_session(
10111020
sess_ctx: SessionRow,
10121021
agent_selection_resource_priority: list[str],
10131022
check_results: List[Tuple[str, Union[Exception, PredicateResult]]],
1014-
) -> None:
1023+
) -> list[KernelAgentBinding]:
10151024
"""
10161025
Finds and assigns agents having resources enough to host each kernel in the session.
10171026
"""
@@ -1239,6 +1248,7 @@ async def _finalize_scheduled() -> None:
12391248
await self.registry.event_producer.produce_event(
12401249
SessionScheduledEvent(sess_ctx.id, sess_ctx.creation_id),
12411250
)
1251+
return kernel_agent_bindings
12421252

12431253
async def prepare(
12441254
self,

0 commit comments

Comments
 (0)