-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsumArraysOnGPUwithTime.cu
More file actions
101 lines (82 loc) · 2.77 KB
/
sumArraysOnGPUwithTime.cu
File metadata and controls
101 lines (82 loc) · 2.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#include <cuda_runtime.h>
#include <cstdio>
#include <ctime>
#include <iostream>
//#include <format>
#include "helper.h"
#include <chrono>
#include <sys/time.h>
#include <cuda_profiler_api.h>
/**
 * Fill a host array with pseudo-random floats.
 *
 * Every element is computed as (rand() & 0xFFFF) / 10.0f, so values lie in
 * the range [0, 6553.5].
 *
 * The C library generator is seeded from the wall clock exactly once, on the
 * first call. Seeding on every call (as this function used to do) makes two
 * calls issued within the same second produce identical arrays, because
 * time() only has one-second resolution.
 *
 * @param ip   destination buffer with room for at least `size` floats
 * @param size number of elements to write; nothing is written when size <= 0
 */
void initialData(float *ip, int size){
    // Function-local static: the initializer (and therefore srand) runs once.
    static const bool seeded = [] {
        srand((unsigned int) time(nullptr));
        return true;
    }();
    (void) seeded;

    for(int i = 0; i < size; ++i){
        // The mask keeps the low 16 bits of rand() (0xFFFF is two bytes, not one).
        ip[i] = (float)(rand() & 0xFFFF) / 10.0f;
    }
}
/**
 * CPU reference implementation of element-wise vector addition.
 *
 * Writes C[k] = A[k] + B[k] for k = 0 .. N-1, in increasing order of k.
 *
 * @param A first input array (at least N floats)
 * @param B second input array (at least N floats)
 * @param C output array (at least N floats)
 * @param N number of elements to process; nothing is written when N <= 0
 */
void sumArrayOnHost(float *A, float *B, float *C, const int N){
    const float *lhs = A;
    const float *rhs = B;
    float *out = C;

    // Walk the three arrays in lock-step, one element per iteration.
    for (int remaining = N; remaining > 0; --remaining) {
        *out = *lhs + *rhs;
        ++lhs;
        ++rhs;
        ++out;
    }
}
/**
 * Element-wise vector addition kernel: C[i] = A[i] + B[i].
 *
 * Launch layout: 1D grid of 1D blocks. Each thread handles at most one
 * element, so the launch must provide at least N threads in total for every
 * element to be written. Threads whose index is >= N do nothing.
 * No shared memory is used.
 *
 * @param A first input array (at least N floats)
 * @param B second input array (at least N floats)
 * @param C output array (at least N floats)
 * @param N number of elements; N <= 0 makes every thread a no-op
 */
__global__ void sumArrayOnGPU(float *A, float *B, float *C, int N){
    // Use a signed 64-bit index. With an unsigned index the comparison
    // `i < N` converts N to unsigned, so a negative N would become a huge
    // bound and every thread would access memory out of range.
    const long long i = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
    if(i < N) C[i] = A[i] + B[i];
}
/**
 * Host driver: adds two vectors of 2^24 floats on the GPU, times the kernel
 * with a host clock, and compares the result with a CPU reference.
 *
 * Steps: select device 0, allocate and fill host buffers, copy the inputs to
 * the device, launch sumArrayOnGPU, wait for completion, copy the result back,
 * compute the same sum on the host, and pass both results to checkResult.
 *
 * CUDA runtime calls are wrapped in CHECK (from helper.h); how a failure is
 * reported is defined by that macro.
 *
 * @return 0 on normal completion, 1 if a host allocation fails
 */
int main(int argc, char **argv){
    (void) argc;
    printf("%s Starting ... \n", argv[0]);
    // Profiler range markers are best-effort hooks, so their status is not checked.
    cudaProfilerStart();

    // setup device
    int dev = 0;
    cudaDeviceProp deviceProp{};
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    std::cout << "Using Device " << dev << ": " << deviceProp.name << "\n";
    CHECK(cudaSetDevice(dev));

    // set up size of vectors
    int nElem = 1<<24;
    printf("Vector size %d\n", nElem);

    // malloc host memory
    size_t nBytes = static_cast<size_t>(nElem) * sizeof(float);
    auto h_A = (float *) malloc(nBytes);
    auto h_B = (float *) malloc(nBytes);
    auto host_C = (float *) malloc(nBytes);
    auto gpu_C = (float *) malloc(nBytes);
    if (h_A == nullptr || h_B == nullptr || host_C == nullptr || gpu_C == nullptr) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", nBytes);
        // free(nullptr) is a no-op, so releasing all four is safe.
        free(h_A);
        free(h_B);
        free(host_C);
        free(gpu_C);
        return 1;
    }

    // initial data at host side
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    memset(host_C, 0, nBytes);
    memset(gpu_C, 0, nBytes);

    // malloc GPU memory
    float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
    CHECK(cudaMalloc((float **) &d_A, nBytes));
    CHECK(cudaMalloc((float **) &d_B, nBytes));
    CHECK(cudaMalloc((float **) &d_C, nBytes));

    // transfer data
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));

    // Configuration: 32 threads per block, enough blocks to cover nElem (ceil-div).
    // NOTE(review): 32 is kept from the original; larger blocks (128-256) are the
    // usual starting point if this is not a deliberate experiment.
    dim3 nThread {32};
    dim3 nBlock {(nElem + nThread.x - 1) / nThread.x};

    // calculate on Device
    // The launch is asynchronous, so the interval must include the synchronize.
    auto start = std::chrono::high_resolution_clock::now();
    sumArrayOnGPU<<<nBlock, nThread>>>(d_A, d_B, d_C, nElem);
    CHECK(cudaGetLastError());        // launch/configuration errors
    CHECK(cudaDeviceSynchronize());   // errors raised while the kernel ran
    auto end = std::chrono::high_resolution_clock::now();
    // Stop the profiler range only after the kernel has finished, and outside
    // the timed interval so the call does not inflate the measurement.
    cudaProfilerStop();

    // Report in milliseconds rather than raw clock ticks.
    const std::chrono::duration<double, std::milli> elapsedMs = end - start;
    std::cout << "Execution configuration <<<" << nBlock.x << ", " << nThread.x << ">>>.\n";
    std::cout << "Execution time: " << elapsedMs.count() << " ms" << std::endl;
    // std::cout << std::format("Execution configuration <<<{},{}>>>\n", nBlock.x, nThread.x) << std::endl;

    CHECK(cudaMemcpy(gpu_C, d_C, nBytes, cudaMemcpyDeviceToHost));

    // calculate on Host
    sumArrayOnHost(h_A, h_B, host_C, nElem);

    // check Result
    checkResult(host_C, gpu_C, nElem);

    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    CHECK(cudaFree(d_C));
    free(h_A);
    free(h_B);
    free(host_C);
    free(gpu_C);
    CHECK(cudaDeviceReset());
    return 0;
}