1515from .. import envs
1616from ..detector import (
1717 ManufacturerEnum ,
18- Topology ,
1918 detect_devices ,
20- get_devices_topologies ,
2119 group_devices_by_manufacturer ,
2220 manufacturer_to_backend ,
2321)
22+ from ..detector .__utils__ import map_numa_node_to_cpu_affinity
2423from .__utils__ import (
2524 adjust_image_with_envs ,
2625 correct_runner_image ,
@@ -1306,29 +1305,40 @@ class Deployer(ABC):
13061305 """
13071306 Recorded visible devices values,
13081307 the key is the runtime visible devices env name,
1309- the value is the list of device indexes or uuids.
1308+ the value is the list of device index strings or uuids.
13101309 For example:
13111310 {
1312- "NVIDIA_VISIBLE_DEVICES": ["0 "],
1311+ "NVIDIA_VISIBLE_DEVICES": ["GPU-11111111-2222-3333-4444-555555555555 "],
13131312 "AMD_VISIBLE_DEVICES": ["0", "1"]
13141313 }.
13151314 """
1316- _visible_devices_topologies : dict [str , Topology ] | None = None
1315+ _visible_devices_numa_affinities : dict [str , dict [ str , str ] ] | None = None
13171316 """
1318- Recorded visible devices topologies ,
1317+ Recorded visible devices NUMA affinities ,
13191318 the key is the runtime visible devices env name,
1320- the value is the corresponding topology .
1319+ the value is the mapping from device index string to NUMA node string .
13211320 For example:
13221321 {
1323- "NVIDIA_VISIBLE_DEVICES": Topology(...),
1324- "AMD_VISIBLE_DEVICES": Topology(...)
1322+ "NVIDIA_VISIBLE_DEVICES": {"0": "0-1"},
1323+ "AMD_VISIBLE_DEVICES": {"0": "0-1", "1": "0-1"}
1324+ }.
1325+ """
1326+ _visible_devices_cpus_affinities : dict [str , dict [str , str ]] | None = None
1327+ """
1328+ Recorded visible devices CPUs affinities,
1329+ the key is the runtime visible devices env name,
1330+ the value is the mapping from device index string to CPU cores string.
1331+ For example:
1332+ {
1333+ "NVIDIA_VISIBLE_DEVICES": {"0": "0-7"},
1334+ "AMD_VISIBLE_DEVICES": {"0": "0-7", "1": "8-15"}
13251335 }.
13261336 """
13271337 _backend_visible_devices_values_alignment : dict [str , dict [str , str ]] | None = None
13281338 """
13291339 Recorded backend visible devices values alignment,
13301340 the key is the runtime visible devices env name,
1331- the value is the mapping from backend device index to aligned index.
1341+ the value is the mapping from device index string to aligned index string .
13321342 For example:
13331343 {
13341344 "CUDA_VISIBLE_DEVICES": {"0": "0"},
@@ -1363,7 +1373,9 @@ def _prepare(self):
13631373 - Prepare visible devices manufacturers mapping.
13641374 - Prepare visible devices environment variables mapping.
13651375 - Prepare visible devices values mapping.
1366- - Prepare visible devices topologies mapping.
1376+ - Prepare visible devices NUMA mapping.
1377+ - Prepare visible devices CPUs mapping.
1378+ - Prepare backend visible devices values alignment mapping.
13671379 """
13681380 if self ._visible_devices_manufacturers is not None :
13691381 return
@@ -1372,7 +1384,8 @@ def _prepare(self):
13721384 self ._visible_devices_env = {}
13731385 self ._visible_devices_cdis = {}
13741386 self ._visible_devices_values = {}
1375- self ._visible_devices_topologies = {}
1387+ self ._visible_devices_numa_affinities = {}
1388+ self ._visible_devices_cpus_affinities = {}
13761389 self ._backend_visible_devices_values_alignment = {}
13771390
13781391 group_devices = group_devices_by_manufacturer (
@@ -1398,46 +1411,48 @@ def _prepare(self):
13981411 )
13991412 if ren and ben_list :
14001413 valued_uuid = (
1401- ren
1402- in envs .GPUSTACK_RUNTIME_DEPLOY_RUNTIME_VISIBLE_DEVICES_VALUE_UUID
1414+ self .allowed_uuid_values
1415+ and (
1416+ ren
1417+ in envs .GPUSTACK_RUNTIME_DEPLOY_RUNTIME_VISIBLE_DEVICES_VALUE_UUID
1418+ )
14031419 and manu != ManufacturerEnum .ASCEND
14041420 )
1405- dev_uuids : list [str ] = []
1406- dev_indexes : list [str ] = []
1421+ dev_values : list [str ] = []
1422+ dev_numa_affinities : dict [str , str ] = {}
1423+ dev_cpus_affinities : dict [str , str ] = {}
14071424 dev_indexes_alignment : dict [str , str ] = {}
14081425 for dev_i , dev in enumerate (devs ):
1409- dev_uuids .append (dev .uuid )
1410- dev_indexes .append (str (dev .index ))
1411- dev_indexes_alignment [str (dev .index )] = str (dev_i )
1426+ dev_index = str (dev .index )
1427+ dev_value = dev .uuid if valued_uuid else dev_index
1428+ dev_values .append (dev_value )
1429+ dev_numa_affinities [dev_index ] = dev .appendix .get ("numa" , "" )
1430+ dev_cpus_affinities [dev_index ] = map_numa_node_to_cpu_affinity (
1431+ dev_numa_affinities [dev_index ],
1432+ )
1433+ dev_indexes_alignment [dev_index ] = str (dev_i )
14121434 # Map runtime visible devices env <-> manufacturer.
14131435 self ._visible_devices_manufacturers [ren ] = manu
14141436 # Map runtime visible devices env <-> backend visible devices env list.
14151437 self ._visible_devices_env [ren ] = ben_list
14161438 # Map runtime visible devices env <-> CDI key.
14171439 self ._visible_devices_cdis [ren ] = cdi
1418- # Map runtime visible devices env <-> device indexes or uuids.
1419- self ._visible_devices_values [ren ] = (
1420- dev_uuids if valued_uuid else dev_indexes
1421- )
1422- # Map runtime visible devices env <-> topology.
1423- if (
1424- envs .GPUSTACK_RUNTIME_DEPLOY_CPU_AFFINITY
1425- or envs .GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY
1426- ):
1427- topos = get_devices_topologies (devices = devs )
1428- if topos :
1429- self ._visible_devices_topologies [ren ] = topos [0 ]
1440+ # Map runtime visible devices env <-> device index string or uuid.
1441+ self ._visible_devices_values [ren ] = dev_values
1442+ # Map runtime visible devices env <-> NUMA affinities.
1443+ self ._visible_devices_numa_affinities [ren ] = dev_numa_affinities
1444+ # Map runtime visible devices env <-> CPUs affinities.
1445+ self ._visible_devices_cpus_affinities [ren ] = dev_cpus_affinities
14301446 # Map backend visible devices env <-> devices alignment.
1431- if not valued_uuid :
1432- for ben in ben_list :
1433- valued_alignment = (
1434- ben
1435- in envs .GPUSTACK_RUNTIME_DEPLOY_BACKEND_VISIBLE_DEVICES_VALUE_ALIGNMENT
1447+ for ben in ben_list :
1448+ valued_alignment = (
1449+ ben
1450+ in envs .GPUSTACK_RUNTIME_DEPLOY_BACKEND_VISIBLE_DEVICES_VALUE_ALIGNMENT
1451+ )
1452+ if valued_alignment :
1453+ self ._backend_visible_devices_values_alignment [ben ] = (
1454+ dev_indexes_alignment
14361455 )
1437- if valued_alignment :
1438- self ._backend_visible_devices_values_alignment [ben ] = (
1439- dev_indexes_alignment
1440- )
14411456
14421457 if self ._visible_devices_env :
14431458 return
@@ -1492,6 +1507,7 @@ def get_visible_devices_materials(
14921507
14931508 """
14941509 self ._prepare ()
1510+
14951511 return (
14961512 self ._visible_devices_manufacturers ,
14971513 self ._visible_devices_env ,
@@ -1519,20 +1535,34 @@ def get_visible_devices_affinities(
15191535 - A comma-separated string of NUMA affinities.
15201536
15211537 """
1538+ if not (
1539+ envs .GPUSTACK_RUNTIME_DEPLOY_CPU_AFFINITY
1540+ or envs .GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY
1541+ ):
1542+ return "" , ""
1543+
15221544 dev_indexes = []
15231545 if resource_value != "all" :
1524- dev_indexes = [int (v .strip ()) for v in resource_value .split ("," )]
1525-
1526- cpus_set : list [str ] = []
1527- numas_set : list [str ] = []
1528- for re_ in runtime_env :
1529- topo = self ._visible_devices_topologies .get (re_ )
1530- if topo :
1531- cs , ns = topo .get_affinities (dev_indexes , deduplicate = False )
1532- cpus_set .extend (cs )
1533- numas_set .extend (ns )
1534-
1535- return "," .join (set (cpus_set )), "," .join (set (numas_set ))
1546+ dev_indexes = [v .strip () for v in resource_value .split ("," )]
1547+
1548+ if not dev_indexes :
1549+ return "" , ""
1550+
1551+ cpus = set [str ]()
1552+ numas = set [str ]()
1553+ for ren in runtime_env :
1554+ if af := self ._visible_devices_cpus_affinities .get (ren ):
1555+ for di in dev_indexes :
1556+ if di in af :
1557+ cpus .add (af [di ])
1558+ if not envs .GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY :
1559+ continue
1560+ if af := self ._visible_devices_numa_affinities .get (ren ):
1561+ for di in dev_indexes :
1562+ if di in af :
1563+ numas .add (af [di ])
1564+
1565+ return "," .join (cpus ), "," .join (numas )
15361566
15371567 def align_backend_visible_devices_env_values (
15381568 self ,
@@ -1556,17 +1586,14 @@ def align_backend_visible_devices_env_values(
15561586 If no alignment is needed, return the original `resource_key_values`.
15571587
15581588 """
1559- if (
1560- backend_visible_devices_env
1561- not in envs .GPUSTACK_RUNTIME_DEPLOY_BACKEND_VISIBLE_DEVICES_VALUE_ALIGNMENT
1562- ):
1563- return resource_key_values
15641589 self ._prepare ()
1590+
15651591 alignments = self ._backend_visible_devices_values_alignment .get (
15661592 backend_visible_devices_env ,
15671593 )
15681594 if not alignments :
15691595 return resource_key_values
1596+
15701597 return "," .join (
15711598 [alignments .get (v , v ) for v in resource_key_values .split ("," )],
15721599 )
@@ -1582,6 +1609,17 @@ def name(self) -> str:
15821609 """
15831610 return self ._name
15841611
1612+ @property
1613+ def allowed_uuid_values (self ) -> bool :
1614+ """
1615+ Return whether the deployer allows using UUIDs as visible devices values.
1616+
1617+ Returns:
1618+ True if allowed, False otherwise.
1619+
1620+ """
1621+ return True
1622+
15851623 def close (self ):
15861624 if self ._pool :
15871625 self ._pool .shutdown (cancel_futures = True )
0 commit comments