GPU_peak_perf_test.py
import os
import sys
import time

import torch

# Force the math SDP backend; the flash/mem-efficient kernels are not used here.
torch.backends.cuda.enable_math_sdp(True)
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)

if sys.platform.startswith("win32"):
    # On Windows the ZLUDA hijack patches torch so HIP extensions build and load.
    import rocwmma_fattn.zluda_hijack_torch_hip_ext as zluda_hijack_torch_hip_ext
    torch.utils.cpp_extension.IS_HIP_EXTENSION = True
    torch.version.hip = "5.7.0"
    torch.version.cuda = None
else:
    import torch.utils.cpp_extension

# Target arch; extend with ";gfx1101;gfx1102;gfx1103" for other RDNA3 parts.
os.environ["PYTORCH_ROCM_ARCH"] = "gfx1100"

src_Path = os.path.split(os.path.realpath(__file__))[0]
build_path = os.path.join(src_Path, "build")
os.makedirs(build_path, exist_ok=True)
src_code = ["dummy.cpp", "GPU_peak_perf_test.cu"]
src_code = [os.path.join(src_Path, x) for x in src_code]

import torch.utils.cpp_extension
peak_perf_test = torch.utils.cpp_extension.load(
    name="peak_perf_test",
    sources=src_code,
    extra_cuda_cflags=[
        "-Ofast",
        "-save-temps",
        "-mcumode",  # CU mode: -mcumode; WGP mode: -mno-cumode
        "-ffast-math",
    ],
    build_directory=build_path,
)
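
# Note: torch.utils.cpp_extension.load() JIT-compiles the sources into
# build_directory and reuses the cached binary until a source file changes,
# so only the first run pays the compile cost.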

# Raising `round` or `running_blocks` too far can trigger a driver TDR
# (timeout reset) on Windows.
round = 4
matmul_ops = 16 * 16 * 16 * 2  # 2*M*N*K FLOPs per 16x16x16 multiply-accumulate
wmma_insts = round * 500000    # matches the 100000-iteration, 5x-unrolled loop in the .cu kernel
running_blocks = 96            # 96 in CU mode / 48 in WGP mode (gfx1100)
waves_per_block = 16           # 2 SIMD32 per CU / 4 SIMD32 per WGP (use > 4 for latency hiding)
waves = waves_per_block * running_blocks
total_ops = wmma_insts * matmul_ops * waves
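# With these defaults: wmma_insts = 2,000,000, matmul_ops = 8,192 and
# waves = 1,536, so total_ops = 2.0e6 * 8192 * 1536 ≈ 2.52e13 FLOPs over the
# timed region, i.e. the reported figure is roughly 25.2 / elapsed_seconds TFlops.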

# Warm up so the timed loop measures steady-state throughput.
for i in range(3):
    peak_perf_test.forward(running_blocks, waves_per_block)
torch.cuda.synchronize()

t0 = time.time()
for i in range(round):
    peak_perf_test.forward(running_blocks, waves_per_block)
torch.cuda.synchronize()
t1 = time.time() - t0

print("time:", t1 / round)
print("TFlops:", total_ops / t1 * 1e-12)