#!/usr/bin/env python3
#
# Libvirt hook for qemu
#
# This hook script intercepts qemu VM lifecycle events and activates or
# deactivates the matching Shared NVSwitch GPU partition when GPUs are
# passed through to the VM
#
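#
# libvirt invokes /etc/libvirt/hooks/qemu as:
#   qemu <guest_name> <operation> <sub-operation> <extra>
# and feeds the full domain XML to the hook on stdin.
#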
import json
import logging
from logging.handlers import SysLogHandler
import subprocess
import sys
import xml.etree.ElementTree as etree
LOG = logging.getLogger(__name__)
LOG.setLevel(logging.DEBUG)
handler = SysLogHandler(facility=SysLogHandler.LOG_DAEMON, address='/dev/log')
handler.setLevel(logging.DEBUG)
fmt = logging.Formatter('libvirt-hook: %(levelname)s: %(message)s')
handler.setFormatter(fmt)
LOG.addHandler(handler)
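# With this formatter, hook output lands in syslog as, for example:
#   libvirt-hook: DEBUG: Arguments: ['/etc/libvirt/hooks/qemu', 'vm1', 'prepare', ...]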
def call(args):
    LOG.debug("About to execute %s", ' '.join(args))
    subprocess.check_call(args)
def gpusExist(xmlHostdevs):
    """Return True if any PCI hostdev passed through in the VM XML is an NVIDIA GPU."""
    if not xmlHostdevs:
        LOG.debug("No PCI hostdev devices passed through to the VM")
        return False
    # Collect the BDFs of all NVIDIA 3D controllers present on the host
    gpu_bdfs = set()
    output = subprocess.run(r'lspci | grep 3D | grep NVIDIA | cut -d" " -f1', shell=True, universal_newlines=True, stdout=subprocess.PIPE).stdout
    for line in output.split("\n"):
        if line:
            gpu_bdfs.add(line.lower())
    for pci_hostdev_addr in xmlHostdevs:
        # Form a BDF string from <address bus slot function/> in the VM XML
        pci_hostdev_addr_bus = int(pci_hostdev_addr.get('bus'), 0)
        pci_hostdev_addr_device = int(pci_hostdev_addr.get('slot'), 0)
        pci_hostdev_addr_function = int(pci_hostdev_addr.get('function'), 0)
        pci_hostdev_bdf_from_xml = format(pci_hostdev_addr_bus, '02X') + ':' + format(pci_hostdev_addr_device, '02X') + '.' + format(pci_hostdev_addr_function, '01X')
        if pci_hostdev_bdf_from_xml.lower() in gpu_bdfs:
            return True
    return False
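# Illustrative <hostdev> element matched by the XPath queries below
# (the addresses are examples only):
#   <hostdev mode='subsystem' type='pci' managed='yes'>
#     <source>
#       <address domain='0x0000' bus='0x34' slot='0x00' function='0x0'/>
#     </source>
#   </hostdev>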
def main():
    LOG.debug("Arguments: %s", sys.argv)
    if len(sys.argv) < 3:
        LOG.debug("Nothing to do: too few arguments")
        sys.exit(0)
    guest_name = sys.argv[1]
    action = sys.argv[2]
    if action not in ('prepare', 'release'):
        LOG.debug("Nothing to do: only prepare or release are hooked")
        sys.exit(0)
    FM_IP = "127.0.0.1"
    # libvirt feeds the domain XML to the hook on stdin
    xml = sys.stdin.read()
    LOG.debug("VM libvirt XML: %s", xml)
    root = etree.fromstring(xml)
    # Skip if no GPUs are present in the XML, e.g. a service VM
    if not gpusExist(root.findall("./devices/hostdev[@type='pci']/source/address")):
        LOG.debug("No GPUs detected in VM - exiting hook script")
        sys.exit(0)
    # Extract the current GPU partitions' state and info in JSON format
    try:
        output = subprocess.check_output("/usr/bin/fmpm -l --hostname " + FM_IP, shell=True)
    except subprocess.CalledProcessError as e:
        sys.stderr.write('/usr/bin/fmpm -l --hostname ' + FM_IP + ' failed: ' + str(e.returncode))
        sys.exit(0)
    gpu_partition_json = output.decode("utf-8")
    gpu_partition_json_data = json.loads(gpu_partition_json)
    platform = subprocess.check_output("dmidecode -t 1 | grep Product | awk -F \": \" '{print $2}'", shell=True)
    platform = platform.decode("utf-8").strip()
    LOG.debug("Platform is %s", platform)
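    # The fmpm JSON is assumed (based on the fields consumed below) to look
    # roughly like this; the exact schema may differ:
    #   {"partitionInfo": [{"partitionId": 0, "numGpus": 16, "isActive": 0,
    #                       "gpuInfo": [{"physicalId": 1,
    #                                    "pciBusId": "00000000:34:00.0"}]}]}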
    # Build GPU Module ID <-> GPU BDF mappings in a single pass
    gpus_mod_to_bdf = {}
    gpus_bdf_to_mod = {}
    prior_line = ""
    module_id_string = "Module ID"
    if any(p in platform for p in ("DGX-2", "V100", "A100", "A800")):
        # On these platforms fmpm already reports the module ID and BDF per GPU
        for i in range(int(gpu_partition_json_data["partitionInfo"][0]["numGpus"])):
            mod_id = str(gpu_partition_json_data["partitionInfo"][0]["gpuInfo"][i]["physicalId"])
            bdf = gpu_partition_json_data["partitionInfo"][0]["gpuInfo"][i]["pciBusId"].split(":", 1)[1].strip()
            gpus_mod_to_bdf[mod_id] = bdf
            gpus_bdf_to_mod[bdf] = mod_id
    else:
        # Otherwise pair each "GPU 00000000:..." header line with the
        # "Module ID : N" line that follows it in nvidia-smi -q output
        output = subprocess.run(r'nvidia-smi -q | grep -i "Module ID\|GPU 00000000"', shell=True, universal_newlines=True, stdout=subprocess.PIPE).stdout
        for line in output.split("\n"):
            if module_id_string.casefold() in line.casefold():
                mod_id = line.split(":")[1].strip()
                bdf = prior_line.split(":", 1)[1]
                gpus_mod_to_bdf[mod_id] = bdf
                gpus_bdf_to_mod[bdf] = mod_id
            else:
                prior_line = line
    LOG.debug("GPU Module ID to GPU BDF mapping: %s", gpus_mod_to_bdf)
    LOG.debug("GPU BDF to GPU Module ID mapping: %s", gpus_bdf_to_mod)
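    # For illustration, the nvidia-smi -q lines parsed above look roughly like:
    #   GPU 00000000:34:00.0
    #       Module ID                     : 1
    # yielding entries such as gpus_mod_to_bdf['1'] == '34:00.0'.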
    # Build the NVIDIA GPU Module ID list from the GPUs in the VM XML
    num_gpu_bdf_from_xml = 0
    gpu_mod_list_from_xml = []
    for pci_hostdev_addr in root.findall("./devices/hostdev[@type='pci']/source/address"):
        # Form a BDF string from <address bus slot function/> in the VM XML
        pci_hostdev_addr_bus = int(pci_hostdev_addr.get('bus'), 0)
        pci_hostdev_addr_device = int(pci_hostdev_addr.get('slot'), 0)
        pci_hostdev_addr_function = int(pci_hostdev_addr.get('function'), 0)
        pci_hostdev_bdf_from_xml = format(pci_hostdev_addr_bus, '02X') + ':' + format(pci_hostdev_addr_device, '02X') + '.' + format(pci_hostdev_addr_function, '01X')
        if pci_hostdev_bdf_from_xml in gpus_bdf_to_mod:
            # This PCI hostdev BDF from the XML is an NVIDIA GPU
            LOG.debug("%s is an NVIDIA GPU", pci_hostdev_bdf_from_xml)
            num_gpu_bdf_from_xml += 1
            # Record this GPU's Module ID
            gpu_mod_list_from_xml.append(int(gpus_bdf_to_mod[pci_hostdev_bdf_from_xml]))
    LOG.debug("Number of GPUs passed through in the VM XML is %s", num_gpu_bdf_from_xml)
    LOG.debug("GPU Module IDs %s for GPUs passed through in the VM XML", gpu_mod_list_from_xml)
    # Select the GPU partitions whose GPU count matches the VM XML
    # (reusing the fmpm state already fetched and parsed above)
    gpu_partitions = list(filter(lambda x: x["numGpus"] == num_gpu_bdf_from_xml, gpu_partition_json_data["partitionInfo"]))
    if not gpu_partitions:
        LOG.debug("No supported GPU partition with %s GPUs as passed in the VM XML", num_gpu_bdf_from_xml)
        sys.exit(0)
    found = False
    gpu_partition_id = 0
    # Loop through each GPU partition whose GPU count matches the number of
    # GPUs passed through in the VM XML
    for partition in gpu_partitions:
        # Build the GPU Module ID list for the GPUs contained in this partition
        gpu_mod_list = [gpu["physicalId"] for gpu in partition["gpuInfo"]]
        LOG.debug("GPU Module ID list %s of GPU partition %s", gpu_mod_list, partition["partitionId"])
        if set(gpu_mod_list) == set(gpu_mod_list_from_xml):
            gpu_partition_id = partition["partitionId"]
            found = True
            break
    if not found:
        LOG.debug("No supported GPU partition contains the GPUs passed through in the VM XML")
        sys.exit(0)
    LOG.debug("GPU partition %s contains the GPUs passed through in the VM XML", gpu_partition_id)
    # Look up the matched partition's current activation state
    # (again reusing the fmpm state parsed above)
    partition = list(filter(lambda x: x["partitionId"] == int(gpu_partition_id), gpu_partition_json_data["partitionInfo"]))
    if not partition:
        LOG.debug("Get partition state: no partitionInfo matching partition ID %s", gpu_partition_id)
        sys.exit(0)
    partition_isActive = partition[0]["isActive"]
    LOG.debug("GPU Partition %s isActive = %s", gpu_partition_id, partition_isActive)
    gpus_bdf_list = []
    for gpu in partition[0]["gpuInfo"]:
        gpus_bdf_list.append(gpus_mod_to_bdf[str(gpu["physicalId"])])
    LOG.debug("GPU BDFs in partition %s: %s", gpu_partition_id, gpus_bdf_list)
    # action is 'prepare' during VM creation: activate the GPU partition
    if action == 'prepare':
        if partition_isActive != 0:
            LOG.debug("GPU Partition %s is already active during action = prepare", gpu_partition_id)
            sys.exit(0)
        partition_action = '-a'
    # action is 'release' during VM teardown: deactivate the GPU partition
    if action == 'release':
        if partition_isActive == 0:
            LOG.debug("GPU Partition %s is already inactive during action = release", gpu_partition_id)
            sys.exit(0)
        partition_action = '-d'
    cmd = '/usr/bin/fmpm ' + partition_action + ' ' + str(gpu_partition_id) + ' --hostname ' + FM_IP
    LOG.debug("Action = %s. Executing command: %s", action, cmd)
    rc, out = subprocess.getstatusoutput(cmd)
    if rc != 0:
        LOG.debug('"%s" failed with rc %s', cmd, rc)
        sys.exit(rc)
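# Example resulting fmpm invocations (assuming partition ID 3 and the default FM_IP):
#   prepare -> /usr/bin/fmpm -a 3 --hostname 127.0.0.1
#   release -> /usr/bin/fmpm -d 3 --hostname 127.0.0.1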
if __name__ == '__main__':
    try:
        main()
    except Exception:
        LOG.exception("Unexpected exception occurred")