fuck_gpu.py
"""Keep otherwise-idle GPUs busy: when a GPU's utilization drops below a
threshold, spawn a dummy matrix-multiplication workload on it."""

import threading
import time

import pynvml
import torch

pynvml.nvmlInit()
gpu_count = pynvml.nvmlDeviceGetCount()

# GPU usage threshold (%); dummy tasks run only while usage is below this value.
USAGE_THRESHOLD = 60.0
# Seconds between monitoring passes.
CHECK_INTERVAL = 10

# One flag per GPU; a dummy task keeps running while its flag is True.
running_flags = [False] * gpu_count


def get_gpu_usage(gpu_index):
    """Return the current utilization (%) of the given GPU via NVML."""
    handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
    utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
    return utilization.gpu


def dummy_task(device, gpu_index):
    """Repeatedly multiply two matrices on `device` until the flag is cleared."""
    size = 1024  # Adjust matrix size as needed.
    a = torch.rand((size, size), device=device)
    b = torch.rand((size, size), device=device)
    while running_flags[gpu_index]:
        c = torch.mm(a, b)
        _ = c.sum()


def monitor_gpus():
    try:
        while True:
            for i in range(gpu_count):
                # Pause the dummy task first so the NVML reading reflects
                # real load from other processes, not our own workload.
                running_flags[i] = False
                time.sleep(1)
                usage = get_gpu_usage(i)
                if usage < USAGE_THRESHOLD:
                    device = torch.device(f"cuda:{i}")
                    running_flags[i] = True
                    threading.Thread(
                        target=dummy_task, args=(device, i), daemon=True
                    ).start()
            time.sleep(CHECK_INTERVAL)
    finally:
        pynvml.nvmlShutdown()


if __name__ == "__main__":
    monitor_gpus()
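
A minimal sketch of how the script might be launched, assuming a CUDA-capable machine; the package names below are an assumption about your environment (the `pynvml` module is commonly provided by the `nvidia-ml-py` package, and `torch` must be a CUDA-enabled build):

    pip install nvidia-ml-py torch
    python fuck_gpu.py

Each monitoring pass briefly stops the dummy workload before sampling utilization, so GPUs that are genuinely busy with other processes above the threshold are left alone until the next pass.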