Skip to content

Commit 6830de1

Browse files
committed
fix: Move the try/except outside the RPC context block so that RPC exceptions are handled correctly when calling create kernel
1 parent dbe909d commit 6830de1

1 file changed

Lines changed: 41 additions & 39 deletions

File tree

src/ai/backend/manager/registry.py

Lines changed: 41 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1711,11 +1711,11 @@ async def _update_kernel() -> None:
17111711

17121712
await execute_with_retry(_update_kernel)
17131713

1714-
async with self.agent_cache.rpc_context(
1715-
agent_alloc_ctx.agent_id,
1716-
order_key=str(scheduled_session.id),
1717-
) as rpc:
1718-
try:
1714+
try:
1715+
async with self.agent_cache.rpc_context(
1716+
agent_alloc_ctx.agent_id,
1717+
order_key=str(scheduled_session.id),
1718+
) as rpc:
17191719
get_image_ref = lambda k: image_infos[str(k.image_ref)].image_ref
17201720
# Issue a batched RPC call to create kernels on this agent
17211721
# created_infos = await rpc.call.create_kernels(
@@ -1783,42 +1783,44 @@ async def _update_kernel() -> None:
17831783
[binding.kernel.id for binding in items],
17841784
agent_alloc_ctx.agent_id,
17851785
)
1786-
except (asyncio.TimeoutError, asyncio.CancelledError):
1787-
log.warning("_create_kernels_in_one_agent(s:{}) cancelled", scheduled_session.id)
1788-
except Exception as e:
1789-
# The agent has already cancelled or issued the destruction lifecycle event
1790-
# for this batch of kernels.
1791-
ex = e
1792-
for binding in items:
1793-
kernel_id = binding.kernel.id
1794-
1795-
async def _update_failure() -> None:
1796-
async with self.db.begin_session() as db_sess:
1797-
now = datetime.now(tzutc())
1798-
query = (
1799-
sa.update(KernelRow)
1800-
.where(KernelRow.id == kernel_id)
1801-
.values(
1802-
status=KernelStatus.ERROR,
1803-
status_info=f"other-error ({ex!r})",
1804-
status_changed=now,
1805-
terminated_at=now,
1806-
status_history=sql_json_merge(
1807-
KernelRow.status_history,
1808-
(),
1809-
{
1810-
KernelStatus.ERROR.name: (
1811-
now.isoformat()
1812-
), # ["PULLING", "PREPARING"]
1813-
},
1814-
),
1815-
status_data=convert_to_status_data(ex, self.debug),
1816-
)
1786+
except (asyncio.TimeoutError, asyncio.CancelledError):
1787+
log.warning("_create_kernels_in_one_agent(s:{}) cancelled", scheduled_session.id)
1788+
except Exception as e:
1789+
ex = e
1790+
err_info = convert_to_status_data(ex, self.debug)
1791+
1792+
# The agent has already cancelled or issued the destruction lifecycle event
1793+
# for this batch of kernels.
1794+
for binding in items:
1795+
kernel_id = binding.kernel.id
1796+
1797+
async def _update_failure() -> None:
1798+
async with self.db.begin_session() as db_sess:
1799+
now = datetime.now(tzutc())
1800+
query = (
1801+
sa.update(KernelRow)
1802+
.where(KernelRow.id == kernel_id)
1803+
.values(
1804+
status=KernelStatus.ERROR,
1805+
status_info=f"other-error ({ex!r})",
1806+
status_changed=now,
1807+
terminated_at=now,
1808+
status_history=sql_json_merge(
1809+
KernelRow.status_history,
1810+
(),
1811+
{
1812+
KernelStatus.ERROR.name: (
1813+
now.isoformat()
1814+
), # ["PULLING", "PREPARING"]
1815+
},
1816+
),
1817+
status_data=err_info,
18171818
)
1818-
await db_sess.execute(query)
1819+
)
1820+
await db_sess.execute(query)
18191821

1820-
await execute_with_retry(_update_failure)
1821-
raise
1822+
await execute_with_retry(_update_failure)
1823+
raise
18221824

18231825
async def create_cluster_ssh_keypair(self) -> ClusterSSHKeyPair:
18241826
key = rsa.generate_private_key(

0 commit comments

Comments
 (0)