Skip to content

Commit b7f5a6d

Browse files
committed
refactor: device injection
Signed-off-by: thxCode <thxcode0824@gmail.com>
1 parent 2bd299d commit b7f5a6d

8 files changed

Lines changed: 148 additions & 115 deletions

File tree

gpustack_runtime/deployer/__types__.py

Lines changed: 95 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,11 @@
1515
from .. import envs
1616
from ..detector import (
1717
ManufacturerEnum,
18-
Topology,
1918
detect_devices,
20-
get_devices_topologies,
2119
group_devices_by_manufacturer,
2220
manufacturer_to_backend,
2321
)
22+
from ..detector.__utils__ import map_numa_node_to_cpu_affinity
2423
from .__utils__ import (
2524
adjust_image_with_envs,
2625
correct_runner_image,
@@ -1306,29 +1305,40 @@ class Deployer(ABC):
13061305
"""
13071306
Recorded visible devices values,
13081307
the key is the runtime visible devices env name,
1309-
the value is the list of device indexes or uuids.
1308+
the value is the list of device index strings or uuids.
13101309
For example:
13111310
{
1312-
"NVIDIA_VISIBLE_DEVICES": ["0"],
1311+
"NVIDIA_VISIBLE_DEVICES": ["GPU-11111111-2222-3333-4444-555555555555"],
13131312
"AMD_VISIBLE_DEVICES": ["0", "1"]
13141313
}.
13151314
"""
1316-
_visible_devices_topologies: dict[str, Topology] | None = None
1315+
_visible_devices_numa_affinities: dict[str, dict[str, str]] | None = None
13171316
"""
1318-
Recorded visible devices topologies,
1317+
Recorded visible devices NUMA affinities,
13191318
the key is the runtime visible devices env name,
1320-
the value is the corresponding topology.
1319+
the value is the mapping from device index string to NUMA node string.
13211320
For example:
13221321
{
1323-
"NVIDIA_VISIBLE_DEVICES": Topology(...),
1324-
"AMD_VISIBLE_DEVICES": Topology(...)
1322+
"NVIDIA_VISIBLE_DEVICES": {"0": "0-1"},
1323+
"AMD_VISIBLE_DEVICES": {"0": "0-1", "1": "0-1"}
1324+
}.
1325+
"""
1326+
_visible_devices_cpus_affinities: dict[str, dict[str, str]] | None = None
1327+
"""
1328+
Recorded visible devices CPUs affinities,
1329+
the key is the runtime visible devices env name,
1330+
the value is the mapping from device index string to CPU cores string.
1331+
For example:
1332+
{
1333+
"NVIDIA_VISIBLE_DEVICES": {"0": "0-7"},
1334+
"AMD_VISIBLE_DEVICES": {"0": "0-7", "1": "8-15"}
13251335
}.
13261336
"""
13271337
_backend_visible_devices_values_alignment: dict[str, dict[str, str]] | None = None
13281338
"""
13291339
Recorded backend visible devices values alignment,
13301340
the key is the runtime visible devices env name,
1331-
the value is the mapping from backend device index to aligned index.
1341+
the value is the mapping from device index string to aligned index string.
13321342
For example:
13331343
{
13341344
"CUDA_VISIBLE_DEVICES": {"0": "0"},
@@ -1363,7 +1373,9 @@ def _prepare(self):
13631373
- Prepare visible devices manufacturers mapping.
13641374
- Prepare visible devices environment variables mapping.
13651375
- Prepare visible devices values mapping.
1366-
- Prepare visible devices topologies mapping.
1376+
- Prepare visible devices NUMA mapping.
1377+
- Prepare visible devices CPUs mapping.
1378+
- Prepare backend visible devices values alignment mapping.
13671379
"""
13681380
if self._visible_devices_manufacturers is not None:
13691381
return
@@ -1372,7 +1384,8 @@ def _prepare(self):
13721384
self._visible_devices_env = {}
13731385
self._visible_devices_cdis = {}
13741386
self._visible_devices_values = {}
1375-
self._visible_devices_topologies = {}
1387+
self._visible_devices_numa_affinities = {}
1388+
self._visible_devices_cpus_affinities = {}
13761389
self._backend_visible_devices_values_alignment = {}
13771390

13781391
group_devices = group_devices_by_manufacturer(
@@ -1398,46 +1411,48 @@ def _prepare(self):
13981411
)
13991412
if ren and ben_list:
14001413
valued_uuid = (
1401-
ren
1402-
in envs.GPUSTACK_RUNTIME_DEPLOY_RUNTIME_VISIBLE_DEVICES_VALUE_UUID
1414+
self.allowed_uuid_values
1415+
and (
1416+
ren
1417+
in envs.GPUSTACK_RUNTIME_DEPLOY_RUNTIME_VISIBLE_DEVICES_VALUE_UUID
1418+
)
14031419
and manu != ManufacturerEnum.ASCEND
14041420
)
1405-
dev_uuids: list[str] = []
1406-
dev_indexes: list[str] = []
1421+
dev_values: list[str] = []
1422+
dev_numa_affinities: dict[str, str] = {}
1423+
dev_cpus_affinities: dict[str, str] = {}
14071424
dev_indexes_alignment: dict[str, str] = {}
14081425
for dev_i, dev in enumerate(devs):
1409-
dev_uuids.append(dev.uuid)
1410-
dev_indexes.append(str(dev.index))
1411-
dev_indexes_alignment[str(dev.index)] = str(dev_i)
1426+
dev_index = str(dev.index)
1427+
dev_value = dev.uuid if valued_uuid else dev_index
1428+
dev_values.append(dev_value)
1429+
dev_numa_affinities[dev_index] = dev.appendix.get("numa", "")
1430+
dev_cpus_affinities[dev_index] = map_numa_node_to_cpu_affinity(
1431+
dev_numa_affinities[dev_index],
1432+
)
1433+
dev_indexes_alignment[dev_index] = str(dev_i)
14121434
# Map runtime visible devices env <-> manufacturer.
14131435
self._visible_devices_manufacturers[ren] = manu
14141436
# Map runtime visible devices env <-> backend visible devices env list.
14151437
self._visible_devices_env[ren] = ben_list
14161438
# Map runtime visible devices env <-> CDI key.
14171439
self._visible_devices_cdis[ren] = cdi
1418-
# Map runtime visible devices env <-> device indexes or uuids.
1419-
self._visible_devices_values[ren] = (
1420-
dev_uuids if valued_uuid else dev_indexes
1421-
)
1422-
# Map runtime visible devices env <-> topology.
1423-
if (
1424-
envs.GPUSTACK_RUNTIME_DEPLOY_CPU_AFFINITY
1425-
or envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY
1426-
):
1427-
topos = get_devices_topologies(devices=devs)
1428-
if topos:
1429-
self._visible_devices_topologies[ren] = topos[0]
1440+
# Map runtime visible devices env <-> device index string or uuid.
1441+
self._visible_devices_values[ren] = dev_values
1442+
# Map runtime visible devices env <-> NUMA affinities.
1443+
self._visible_devices_numa_affinities[ren] = dev_numa_affinities
1444+
# Map runtime visible devices env <-> CPUs affinities.
1445+
self._visible_devices_cpus_affinities[ren] = dev_cpus_affinities
14301446
# Map backend visible devices env <-> devices alignment.
1431-
if not valued_uuid:
1432-
for ben in ben_list:
1433-
valued_alignment = (
1434-
ben
1435-
in envs.GPUSTACK_RUNTIME_DEPLOY_BACKEND_VISIBLE_DEVICES_VALUE_ALIGNMENT
1447+
for ben in ben_list:
1448+
valued_alignment = (
1449+
ben
1450+
in envs.GPUSTACK_RUNTIME_DEPLOY_BACKEND_VISIBLE_DEVICES_VALUE_ALIGNMENT
1451+
)
1452+
if valued_alignment:
1453+
self._backend_visible_devices_values_alignment[ben] = (
1454+
dev_indexes_alignment
14361455
)
1437-
if valued_alignment:
1438-
self._backend_visible_devices_values_alignment[ben] = (
1439-
dev_indexes_alignment
1440-
)
14411456

14421457
if self._visible_devices_env:
14431458
return
@@ -1492,6 +1507,7 @@ def get_visible_devices_materials(
14921507
14931508
"""
14941509
self._prepare()
1510+
14951511
return (
14961512
self._visible_devices_manufacturers,
14971513
self._visible_devices_env,
@@ -1519,20 +1535,34 @@ def get_visible_devices_affinities(
15191535
- A comma-separated string of NUMA affinities.
15201536
15211537
"""
1538+
if not (
1539+
envs.GPUSTACK_RUNTIME_DEPLOY_CPU_AFFINITY
1540+
or envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY
1541+
):
1542+
return "", ""
1543+
15221544
dev_indexes = []
15231545
if resource_value != "all":
1524-
dev_indexes = [int(v.strip()) for v in resource_value.split(",")]
1525-
1526-
cpus_set: list[str] = []
1527-
numas_set: list[str] = []
1528-
for re_ in runtime_env:
1529-
topo = self._visible_devices_topologies.get(re_)
1530-
if topo:
1531-
cs, ns = topo.get_affinities(dev_indexes, deduplicate=False)
1532-
cpus_set.extend(cs)
1533-
numas_set.extend(ns)
1534-
1535-
return ",".join(set(cpus_set)), ",".join(set(numas_set))
1546+
dev_indexes = [v.strip() for v in resource_value.split(",")]
1547+
1548+
if not dev_indexes:
1549+
return "", ""
1550+
1551+
cpus = set[str]()
1552+
numas = set[str]()
1553+
for ren in runtime_env:
1554+
if af := self._visible_devices_cpus_affinities.get(ren):
1555+
for di in dev_indexes:
1556+
if di in af:
1557+
cpus.add(af[di])
1558+
if not envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY:
1559+
continue
1560+
if af := self._visible_devices_numa_affinities.get(ren):
1561+
for di in dev_indexes:
1562+
if di in af:
1563+
numas.add(af[di])
1564+
1565+
return ",".join(cpus), ",".join(numas)
15361566

15371567
def align_backend_visible_devices_env_values(
15381568
self,
@@ -1556,17 +1586,14 @@ def align_backend_visible_devices_env_values(
15561586
If no alignment is needed, return the original `resource_key_values`.
15571587
15581588
"""
1559-
if (
1560-
backend_visible_devices_env
1561-
not in envs.GPUSTACK_RUNTIME_DEPLOY_BACKEND_VISIBLE_DEVICES_VALUE_ALIGNMENT
1562-
):
1563-
return resource_key_values
15641589
self._prepare()
1590+
15651591
alignments = self._backend_visible_devices_values_alignment.get(
15661592
backend_visible_devices_env,
15671593
)
15681594
if not alignments:
15691595
return resource_key_values
1596+
15701597
return ",".join(
15711598
[alignments.get(v, v) for v in resource_key_values.split(",")],
15721599
)
@@ -1582,6 +1609,17 @@ def name(self) -> str:
15821609
"""
15831610
return self._name
15841611

1612+
@property
1613+
def allowed_uuid_values(self) -> bool:
1614+
"""
1615+
Return whether the deployer allows using UUIDs as visible devices values.
1616+
1617+
Returns:
1618+
True if allowed, False otherwise.
1619+
1620+
"""
1621+
return True
1622+
15851623
def close(self):
15861624
if self._pool:
15871625
self._pool.shutdown(cancel_futures=True)

gpustack_runtime/deployer/cdi/__utils__.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ def path_to_cdi_mount(
147147
path: str,
148148
container_path: str | None = None,
149149
options: list[str] | None = None,
150+
ignore_notfound: bool = False,
150151
) -> ConfigMount | None:
151152
"""
152153
Convert a file/directory path to a ConfigMount.
@@ -158,13 +159,15 @@ def path_to_cdi_mount(
158159
Path to the file or directory inside the container.
159160
options:
160161
Mount options.
162+
ignore_notfound:
163+
Whether to proceed even if the path does not exist, instead of returning None.
161164
162165
Returns:
163166
The ConfigMount object.
164167
None if the path does not exist.
165168
166169
"""
167-
if not Path(path).exists():
170+
if not Path(path).exists() and not ignore_notfound:
168171
return None
169172

170173
if container_path is None:

gpustack_runtime/deployer/docker.py

Lines changed: 12 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1075,26 +1075,21 @@ def _create_containers( # noqa: C901
10751075
# Configure runtime device access environment variables.
10761076
if r_v != "all" and privileged:
10771077
for be in backend_env:
1078-
create_options["environment"][be] = (
1079-
self.align_backend_visible_devices_env_values(
1080-
be,
1081-
str(r_v),
1082-
)
1078+
bev = self.align_backend_visible_devices_env_values(
1079+
be,
1080+
str(r_v),
10831081
)
1082+
create_options["environment"][be] = bev
10841083

10851084
# Configure affinity if applicable.
1086-
if (
1087-
envs.GPUSTACK_RUNTIME_DEPLOY_CPU_AFFINITY
1088-
or envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY
1089-
):
1090-
cpus, numas = self.get_visible_devices_affinities(
1091-
runtime_env,
1092-
r_v,
1093-
)
1094-
if cpus:
1095-
create_options["cpuset_cpus"] = cpus
1096-
if numas and envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY:
1097-
create_options["cpuset_mems"] = numas
1085+
cpus, numas = self.get_visible_devices_affinities(
1086+
runtime_env,
1087+
r_v,
1088+
)
1089+
if cpus:
1090+
create_options["cpuset_cpus"] = cpus
1091+
if numas:
1092+
create_options["cpuset_mems"] = numas
10981093

10991094
# Parameterize mounts.
11001095
self._append_container_mounts(

gpustack_runtime/deployer/k8s/deviceplugin/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -323,7 +323,7 @@ def get_device_allocation_policy(
323323

324324
if manufacturer in [
325325
ManufacturerEnum.AMD,
326-
# ManufacturerEnum.ASCEND, # Prioritize using Env policy for Ascend.
326+
ManufacturerEnum.ASCEND,
327327
ManufacturerEnum.HYGON,
328328
ManufacturerEnum.ILUVATAR,
329329
ManufacturerEnum.METAX,

0 commit comments

Comments
 (0)