-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbenchmark.cpp
More file actions
261 lines (217 loc) · 11.2 KB
/
benchmark.cpp
File metadata and controls
261 lines (217 loc) · 11.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
#include <iostream>
#include <chrono>
#include <cassert>
#include <Eigen/Core>
#include "../include/tensor2d.hpp"
#include "../include/tensor3d.hpp"
#include "../include/matmul_cuda.hpp"
/// Benchmarks the hand-written mat_mul against the Eigen-backed
/// implementation for a single (M, K) x (K, N) multiply and prints
/// both wall-clock times plus the manual/Eigen speedup ratio.
///
/// @param M rows of the left operand
/// @param K shared inner dimension
/// @param N columns of the right operand
void benchmark_matmul(size_t M, size_t K, size_t N) {
    std::cout << "Benchmarking matmul with shapes: (" << M << ", " << K << ") * (" << K << ", " << N << ")\n";
    Tensor2D A = Tensor2D(M, K, 1.0f);
    Tensor2D B = Tensor2D(K, N, 2.0f);
    // Manual matmul. steady_clock is guaranteed monotonic, unlike
    // high_resolution_clock, which may alias the adjustable system
    // clock and jump mid-measurement.
    auto start = std::chrono::steady_clock::now();
    // Results are only computed for timing; [[maybe_unused]] silences
    // unused-variable warnings without discarding the call.
    [[maybe_unused]] Tensor2D C_manual = A.mat_mul(B);
    auto end = std::chrono::steady_clock::now();
    auto manual_us = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
    std::cout << "Manual mat_mul: " << manual_us << " us\n";
    // Eigen matmul
    start = std::chrono::steady_clock::now();
    [[maybe_unused]] Tensor2D C_eigen = A.mat_mul_eigen(B);
    end = std::chrono::steady_clock::now();
    auto eigen_us = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
    std::cout << "Eigen mat_mul : " << eigen_us << " us\n";
    // Float division: a 0 us Eigen time yields inf, not UB.
    std::cout << "Speedup : " << static_cast<float>(manual_us) / eigen_us << "x\n\n";
}
void benchmark_batched_matmul(size_t batch, size_t M, size_t K, size_t N) {
std::cout << "Benchmarking batched matmul with shapes: ("
<< batch << ", " << M << ", " << K << ") * ("
<< batch << ", " << K << ", " << N << ")\n";
Tensor3D A(batch, M, K, 1.0f);
Tensor3D B(batch, K, N, 2.0f);
// Manual batched matmul
auto start = std::chrono::high_resolution_clock::now();
Tensor3D C_manual = A.mat_mul(B);
auto end = std::chrono::high_resolution_clock::now();
auto manual_us = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
std::cout << "Manual batched_mat_mul: " << manual_us << " us\n";
// Eigen batched matmul
start = std::chrono::high_resolution_clock::now();
Tensor3D C_eigen = A.mat_mul_eigen(B);
end = std::chrono::high_resolution_clock::now();
auto eigen_us = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
std::cout << "Eigen batched_mat_mul : " << eigen_us << " us\n";
// Result
std::cout << "Speedup : " << static_cast<float>(manual_us) / eigen_us << "x\n\n";
}
void benchmark_batched_matmul_parallel(size_t batch, size_t M, size_t K, size_t N) {
std::cout << "Benchmarking batched matmul parallel with shapes: ("
<< batch << ", " << M << ", " << K << ") * ("
<< batch << ", " << K << ", " << N << ")\n";
Tensor3D A(batch, M, K, 1.0f);
Tensor3D B(batch, K, N, 2.0f);
// Eigen batched matmul
auto start = std::chrono::high_resolution_clock::now();
Tensor3D C_manual = A.mat_mul_eigen(B);
auto end = std::chrono::high_resolution_clock::now();
auto eigen_us = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
std::cout << "Eigen batched_mat_mul : " << eigen_us << " us\n";
// Eigen batched matmul in parallel
start = std::chrono::high_resolution_clock::now();
Tensor3D C_eigen = A.mat_mul_eigen_parallel(B);
end = std::chrono::high_resolution_clock::now();
auto eigen_us_parallel = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
std::cout << "Eigen batched_mat_mul in parallel : " << eigen_us_parallel << " us\n";
// Result
std::cout << "Speedup : " << static_cast<float>(eigen_us) / eigen_us_parallel << "x\n\n";
}
#ifdef USE_CUDA
void benchmark_matmul_cuda_vs_cpu(int N) {
std::cout << "Benchmarking matmul cpu vs. gpu with shape: " << N << " x " << N << std::endl;
// Create CPU tensors
Tensor2D A_cpu = Tensor2D::from_random(N, N, Device::CPU);
Tensor2D B_cpu = Tensor2D::from_random(N, N, Device::CPU);
// CPU baseline
auto cpu_start = std::chrono::high_resolution_clock::now();
Tensor2D C_cpu = A_cpu.mat_mul_eigen(B_cpu);
auto cpu_end = std::chrono::high_resolution_clock::now();
double cpu_ms = std::chrono::duration<double, std::milli>(cpu_end - cpu_start).count();
// Manually copy to GPU tensors
Tensor2D A_gpu(N, N, 0.0f, Device::GPU);
Tensor2D B_gpu(N, N, 0.0f, Device::GPU);
cudaMemcpy(A_gpu.data(), A_cpu.data(), N * N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(B_gpu.data(), B_cpu.data(), N * N * sizeof(float), cudaMemcpyHostToDevice);
// GPU matmul
auto gpu_start = std::chrono::high_resolution_clock::now();
Tensor2D C_gpu = mat_mul_cuda(A_gpu, B_gpu);
auto gpu_end = std::chrono::high_resolution_clock::now();
double gpu_ms = std::chrono::duration<double, std::milli>(gpu_end - gpu_start).count();
// Result
double speedup = cpu_ms / gpu_ms;
std::cout << "CPU time: " << cpu_ms << " ms\n";
std::cout << "GPU time: " << gpu_ms << " ms\n";
std::cout << "Speedup: " << speedup << "x\n\n";
}
/// Measures CPU -> GPU and GPU -> CPU transfer latency for an N x N
/// tensor and verifies the round trip is lossless.
///
/// @param N side length of the square tensor being transferred
void benchmark_device_transfer(size_t N) {
    std::cout << "Benchmarking device transfer with shape: " << N << " x " << N << std::endl;
    // Create original CPU tensor with random data
    Tensor2D original = Tensor2D::from_random(N, N, Device::CPU);
    // Time CPU → GPU transfer
    auto cpu_to_gpu_start = std::chrono::high_resolution_clock::now();
    Tensor2D gpu_tensor = original.to(Device::GPU);
    auto cpu_to_gpu_end = std::chrono::high_resolution_clock::now();
    auto cpu_to_gpu_us = std::chrono::duration_cast<std::chrono::microseconds>(cpu_to_gpu_end - cpu_to_gpu_start).count();
    // Time GPU → CPU transfer
    auto gpu_to_cpu_start = std::chrono::high_resolution_clock::now();
    Tensor2D cpu_tensor = gpu_tensor.to(Device::CPU);
    auto gpu_to_cpu_end = std::chrono::high_resolution_clock::now();
    auto gpu_to_cpu_us = std::chrono::duration_cast<std::chrono::microseconds>(gpu_to_cpu_end - gpu_to_cpu_start).count();
    // Verify shape and values match the original with an explicit
    // runtime check: assert() is compiled out under NDEBUG, which is
    // exactly how benchmark builds are usually configured.
    bool ok = cpu_tensor.shape() == original.shape();
    for (size_t i = 0; ok && i < N; ++i) {
        for (size_t j = 0; j < N; ++j) {
            // Exact float equality is intended here — the round trip
            // is a bit-for-bit copy, not a computation.
            if (cpu_tensor(i, j) != original(i, j)) {
                ok = false;
                break;
            }
        }
    }
    if (!ok) {
        std::cout << "ERROR: roundtrip tensor does not match original!\n\n";
        return;
    }
    // Result
    std::cout << "CPU → GPU transfer: " << cpu_to_gpu_us << " us\n";
    std::cout << "GPU → CPU transfer: " << gpu_to_cpu_us << " us\n";
    std::cout << "Total roundtrip: " << (cpu_to_gpu_us + gpu_to_cpu_us) << " us\n\n";
}
void benchmark_bmm_vs_bmm_cuda(size_t B, size_t M, size_t K, size_t N) {
std::cout << "Benchmarking batched matmul CPU vs GPU with shapes: ("
<< B << ", " << M << ", " << K << ") * ("
<< B << ", " << K << ", " << N << ")\n";
// Create CPU tensors once and reuse
Tensor3D A_cpu = Tensor3D::from_random(B, M, K);
Tensor3D B_cpu = Tensor3D::from_random(B, K, N);
// CPU baseline (mat_mul_eigen_parallel)
auto cpu_start = std::chrono::high_resolution_clock::now();
Tensor3D C_cpu = A_cpu.mat_mul_eigen_parallel(B_cpu);
auto cpu_end = std::chrono::high_resolution_clock::now();
double cpu_ms = std::chrono::duration<double, std::milli>(cpu_end - cpu_start).count();
// Create GPU tensors
Tensor3D A_gpu = A_cpu.to(Device::GPU);
Tensor3D B_gpu = B_cpu.to(Device::GPU);
// GPU batched matmul
auto gpu_start = std::chrono::high_resolution_clock::now();
Tensor3D C_gpu = bmm_cuda(A_gpu, B_gpu);
auto gpu_end = std::chrono::high_resolution_clock::now();
double gpu_ms = std::chrono::duration<double, std::milli>(gpu_end - gpu_start).count();
// Result
double speedup = cpu_ms / gpu_ms;
std::cout << "CPU time (mat_mul_eigen_parallel): " << cpu_ms << " ms\n";
std::cout << "GPU time (bmm_cuda): " << gpu_ms << " ms\n";
std::cout << "Speedup: " << speedup << "x\n\n";
}
/// Benchmarks the fused bmm_add_cuda kernel against the equivalent
/// unfused sequence (bmm_cuda followed by add_cuda) on GPU-resident
/// tensors and reports the unfused/fused speedup.
void benchmark_bmm_add_fused_vs_unfused(size_t B, size_t M, size_t K, size_t N) {
    std::cout << "Benchmarking fused vs unfused bmm_add_cuda with shapes:\n";
    std::cout << " Input: (" << B << ", " << M << ", " << K << ")\n";
    std::cout << " Weight: (" << B << ", " << K << ", " << N << ")\n";
    std::cout << " Bias: (" << B << ", " << M << ", " << N << ")\n";
    // Init CPU tensors and transfer to GPU before any timing starts.
    Tensor3D input_gpu = Tensor3D::from_random(B, M, K).to(Device::GPU);
    Tensor3D weight_gpu = Tensor3D::from_random(B, K, N).to(Device::GPU);
    Tensor3D bias_gpu = Tensor3D::from_random(B, M, N).to(Device::GPU);
    using clock = std::chrono::high_resolution_clock;
    using ms_d = std::chrono::duration<double, std::milli>;
    // Unfused (bmm + add)
    const auto unfused_t0 = clock::now();
    Tensor3D temp = bmm_cuda(input_gpu, weight_gpu);
    Tensor3D unfused_result = add_cuda(temp, bias_gpu);
    const auto unfused_t1 = clock::now();
    const double time_unfused = ms_d(unfused_t1 - unfused_t0).count();
    // Fused
    const auto fused_t0 = clock::now();
    Tensor3D fused_result = bmm_add_cuda(input_gpu, weight_gpu, bias_gpu);
    const auto fused_t1 = clock::now();
    const double time_fused = ms_d(fused_t1 - fused_t0).count();
    // Print results
    std::cout << "Unfused (bmm + add): " << time_unfused << " ms\n";
    std::cout << "Fused (bmm_add_cuda): " << time_fused << " ms\n";
    std::cout << "Speedup: " << time_unfused / time_fused << "x\n\n";
}
#endif
// Entry point for the benchmark suite. Benchmarks are enabled by
// uncommenting the relevant calls; as committed, only the fused vs.
// unfused bmm_add benchmarks run, and only when built with USE_CUDA.
int main() {
// Single matmul: manual loop vs Eigen, across growing sizes.
// benchmark_matmul(16, 16, 16);
// benchmark_matmul(128, 128, 128);
// benchmark_matmul(256, 512, 128);
// benchmark_matmul(512, 512, 512);
// benchmark_matmul(1024, 1024, 1024);
// std::cout << std::endl;
// Batched matmul: manual vs Eigen.
// benchmark_batched_matmul(8, 16, 16, 16);
// benchmark_batched_matmul(16, 64, 64, 64);
// benchmark_batched_matmul(32, 128, 128, 128);
// benchmark_batched_matmul(8, 256, 512, 128);
// benchmark_batched_matmul(4, 512, 512, 512);
// benchmark_batched_matmul(2, 1024, 1024, 1024);
// Batched matmul: serial Eigen vs parallel Eigen.
// benchmark_batched_matmul_parallel(8, 16, 16, 16);
// benchmark_batched_matmul_parallel(16, 64, 64, 64);
// benchmark_batched_matmul_parallel(32, 128, 128, 128);
// benchmark_batched_matmul_parallel(8, 256, 512, 128);
// benchmark_batched_matmul_parallel(4, 512, 512, 512);
// benchmark_batched_matmul_parallel(2, 1024, 1024, 1024);
// std::cout << std::endl;
#ifdef USE_CUDA
// CPU (Eigen) vs GPU matmul and raw device-transfer latency.
// benchmark_matmul_cuda_vs_cpu(512);
// benchmark_matmul_cuda_vs_cpu(1024);
// benchmark_device_transfer(512);
// benchmark_device_transfer(1024);
// Parallel Eigen batched matmul vs CUDA batched matmul.
// benchmark_bmm_vs_bmm_cuda(8, 16, 16, 16);
// benchmark_bmm_vs_bmm_cuda(16, 64, 64, 64);
// benchmark_bmm_vs_bmm_cuda(32, 128, 128, 128);
// benchmark_bmm_vs_bmm_cuda(8, 256, 512, 128);
// benchmark_bmm_vs_bmm_cuda(4, 512, 512, 512);
// benchmark_bmm_vs_bmm_cuda(2, 1024, 1024, 1024);
// std::cout << std::endl;
// Fused kernel vs separate bmm + add kernels (currently active).
benchmark_bmm_add_fused_vs_unfused(8, 16, 16, 16);
benchmark_bmm_add_fused_vs_unfused(16, 64, 64, 64);
benchmark_bmm_add_fused_vs_unfused(32, 128, 128, 128);
benchmark_bmm_add_fused_vs_unfused(8, 256, 512, 128);
benchmark_bmm_add_fused_vs_unfused(4, 512, 512, 512);
benchmark_bmm_add_fused_vs_unfused(2, 1024, 1024, 1024);
std::cout << std::endl;
#endif
}