Skip to content

Commit 37fe73f

Browse files
committed
Refactor the container creation code a little
1 parent dd8b835 commit 37fe73f

1 file changed

Lines changed: 43 additions & 33 deletions

File tree

src/ai/backend/agent/docker/agent.py

Lines changed: 43 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,30 @@ def container_from_docker_container(src: DockerContainer) -> Container:
122122
)
123123

124124

125+
async def _clean_scratch(
    loop: asyncio.AbstractEventLoop,
    scratch_type: str,
    scratch_root: Path,
    kernel_id: KernelId,
) -> None:
    """Best-effort removal of a kernel's scratch storage under *scratch_root*.

    Two per-kernel paths are derived from ``kernel_id``:
    ``scratch_root / str(kernel_id)`` and ``scratch_root / f"{kernel_id}_tmp"``.

    * Linux with ``scratch_type == "memory"``: both paths are passed to
      ``destroy_scratch_filesystem`` and then deleted with ``shutil.rmtree``.
    * Linux with ``scratch_type == "hostfile"``: cleanup is delegated to
      ``destroy_loop_filesystem(scratch_root, kernel_id)``.
    * Anything else: only the main scratch directory is deleted.

    ``shutil.rmtree`` is run in the loop's default executor so that the
    blocking filesystem work does not stall the event loop.

    ``CalledProcessError`` and ``FileNotFoundError`` are swallowed.  The first
    such error ends the cleanup; the remaining steps are not attempted.
    Any other exception propagates to the caller.
    """
    scratch_dir = scratch_root / str(kernel_id)
    tmp_dir = scratch_root / f"{kernel_id}_tmp"
    on_linux = sys.platform.startswith("linux")

    async def _remove_tree(target: Path) -> None:
        # Off-load the blocking recursive delete to the default executor.
        await loop.run_in_executor(None, shutil.rmtree, target)

    try:
        if on_linux and scratch_type == "hostfile":
            await destroy_loop_filesystem(scratch_root, kernel_id)
        elif on_linux and scratch_type == "memory":
            # Tear down both scratch filesystems first, then remove the
            # directories themselves -- same order as before.
            for mount_point in (scratch_dir, tmp_dir):
                await destroy_scratch_filesystem(mount_point)
            for leftover in (scratch_dir, tmp_dir):
                await _remove_tree(leftover)
        else:
            await _remove_tree(scratch_dir)
    except (CalledProcessError, FileNotFoundError):
        # Deliberately ignored: the scratch area may already be gone or the
        # teardown command may fail; cleanup here is best-effort only.
        pass
147+
148+
125149
def _DockerError_reduce(self):
126150
return (
127151
type(self),
@@ -853,6 +877,18 @@ async def start_container(
853877
if self.local_config["debug"]["log-kernel-config"]:
854878
log.debug("full container config: {!r}", pretty(container_config))
855879

880+
async def _rollback_container_creation() -> None:
    """Undo the side effects of a failed container creation.

    Cleans the kernel's scratch storage via ``_clean_scratch``, returns the
    reserved ``host_ports`` to ``self.port_pool`` and frees every device
    allocation recorded in ``resource_spec.allocations``.

    The port and device release runs in a ``finally`` clause: if the scratch
    cleanup raises an error that ``_clean_scratch`` does not swallow, the
    ports and allocations are still given back before the error propagates,
    instead of being leaked.
    """
    try:
        await _clean_scratch(
            loop,
            self.local_config["container"]["scratch-type"],
            self.local_config["container"]["scratch-root"],
            self.kernel_id,
        )
    finally:
        # Always hand the resources back, even when scratch cleanup failed.
        self.port_pool.update(host_ports)
        async with self.resource_lock:
            for dev_name, device_alloc in resource_spec.allocations.items():
                self.computers[dev_name].alloc_map.free(device_alloc)
891+
856892
# We are all set! Create and start the container.
857893
async with closing_async(Docker()) as docker:
858894
container: Optional[DockerContainer] = None
@@ -884,21 +920,7 @@ async def start_container(
884920
raise
885921
except Exception as e:
886922
# Oops, we have to restore the allocated resources!
887-
scratch_type = self.local_config["container"]["scratch-type"]
888-
scratch_root = self.local_config["container"]["scratch-root"]
889-
if sys.platform.startswith("linux") and scratch_type == "memory":
890-
await destroy_scratch_filesystem(self.scratch_dir)
891-
await destroy_scratch_filesystem(self.tmp_dir)
892-
await loop.run_in_executor(None, shutil.rmtree, self.scratch_dir)
893-
await loop.run_in_executor(None, shutil.rmtree, self.tmp_dir)
894-
elif sys.platform.startswith("linux") and scratch_type == "hostfile":
895-
await destroy_loop_filesystem(scratch_root, self.kernel_id)
896-
else:
897-
await loop.run_in_executor(None, shutil.rmtree, self.scratch_dir)
898-
self.port_pool.update(host_ports)
899-
async with self.resource_lock:
900-
for dev_name, device_alloc in resource_spec.allocations.items():
901-
self.computers[dev_name].alloc_map.free(device_alloc)
923+
await _rollback_container_creation()
902924
if container is not None:
903925
raise ContainerCreationError(
904926
container_id=container._id, message=f"unknown. {repr(e)}"
@@ -1513,24 +1535,12 @@ async def log_iter():
15131535
log.warning("container deletion timeout (k:{}, c:{})", kernel_id, container_id)
15141536

15151537
if not restarting:
1516-
scratch_type = self.local_config["container"]["scratch-type"]
1517-
scratch_root = self.local_config["container"]["scratch-root"]
1518-
scratch_dir = scratch_root / str(kernel_id)
1519-
tmp_dir = scratch_root / f"{kernel_id}_tmp"
1520-
try:
1521-
if sys.platform.startswith("linux") and scratch_type == "memory":
1522-
await destroy_scratch_filesystem(scratch_dir)
1523-
await destroy_scratch_filesystem(tmp_dir)
1524-
await loop.run_in_executor(None, shutil.rmtree, scratch_dir)
1525-
await loop.run_in_executor(None, shutil.rmtree, tmp_dir)
1526-
elif sys.platform.startswith("linux") and scratch_type == "hostfile":
1527-
await destroy_loop_filesystem(scratch_root, kernel_id)
1528-
else:
1529-
await loop.run_in_executor(None, shutil.rmtree, scratch_dir)
1530-
except CalledProcessError:
1531-
pass
1532-
except FileNotFoundError:
1533-
pass
1538+
await _clean_scratch(
1539+
loop,
1540+
self.local_config["container"]["scratch-type"],
1541+
self.local_config["container"]["scratch-root"],
1542+
kernel_id,
1543+
)
15341544

15351545
async def create_local_network(self, network_name: str) -> None:
15361546
async with closing_async(Docker()) as docker:

0 commit comments

Comments
 (0)