CudaCProgramming/sumArraysOnGPUwithTime.cu at main · DoubleRedX/CudaCProgramming · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#include <cuda_runtime.h>
#include <cstdio>
#include <ctime>
#include <iostream>
//#include <format>
#include "helper.h"
#include <chrono>
#include <sys/time.h>
#include <cuda_profiler_api.h>
/**
 * Fill a host buffer with pseudo-random floats.
 *
 * Each element is (rand() & 0xFFFF) / 10.0f, i.e. the low 16 bits of rand()
 * scaled down by ten, so values lie in [0, 6553.5].
 *
 * The C PRNG is seeded from the wall clock only on the first call. Re-seeding
 * on every call would make two calls issued within the same second produce
 * identical buffers, because time() has one-second resolution.
 *
 * @param ip   destination buffer with room for at least `size` floats
 * @param size number of elements to write; nothing is written when <= 0
 */
void initialData(float *ip, int size){
    if (ip == nullptr || size <= 0) return;

    // Seed exactly once per process so consecutive calls continue the same
    // random sequence instead of restarting it.
    static bool seeded = false;
    if (!seeded) {
        srand((unsigned int) time(nullptr));
        seeded = true;
    }

    for(int i=0; i < size; ++i){
        // Keep the low 16 bits (0..65535), then scale to a float.
        ip[i] = (float )(rand() & 0xFFFF) / 10.0f;
    }
}

/**
 * CPU reference for the vector addition: C[k] = A[k] + B[k] for k in [0, N).
 *
 * @param A first input array (at least N elements)
 * @param B second input array (at least N elements)
 * @param C output array (at least N elements)
 * @param N number of elements to process; nothing is written when N <= 0
 */
void sumArrayOnHost(float *A, float *B, float *C, const int N){
    int k = 0;
    while (k < N) {
        const float lhs = A[k];
        const float rhs = B[k];
        C[k] = lhs + rhs;
        ++k;
    }
}

/**
 * Element-wise vector addition on the device: C[i] = A[i] + B[i].
 *
 * Launch layout: 1D grid of 1D blocks (only the .x components are used).
 * Each thread handles at most one element, so the launch must supply at
 * least N threads in total; surplus threads in the last block do nothing.
 *
 * @param A first input array (device pointer, at least N elements)
 * @param B second input array (device pointer, at least N elements)
 * @param C output array (device pointer, at least N elements)
 * @param N number of elements; the kernel is a no-op when N <= 0
 */
__global__ void sumArrayOnGPU(float *A, float *B, float *C, int N){
    // A negative N must not be compared against an unsigned index: it would
    // convert to a huge value and let every thread through the bounds check.
    if (N <= 0) return;

    // Widen before multiplying so blockIdx.x * blockDim.x cannot wrap in 32 bits.
    const size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (i < static_cast<size_t>(N)) C[i] = A[i] + B[i];
}

/**
 * Entry point: adds two vectors of 2^24 floats on the GPU, measures the
 * kernel's wall-clock time from the host, and compares the result against
 * a CPU reference via checkResult().
 *
 * Every CUDA runtime call is wrapped in CHECK (from helper.h; presumably it
 * reports the error and aborts - confirm against helper.h), matching the
 * calls that were already checked. The profiler start/stop calls are left
 * unchecked as best-effort hints for an attached profiling tool.
 *
 * @return 0 on success, 1 if a host allocation fails
 */
int main(int argc, char **argv){
    (void) argc;
    printf("%s Starting ... \n", argv[0]);
    cudaProfilerStart();

    // setup device
    int dev = 0;
    cudaDeviceProp deviceProp{};
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    std::cout << "Using Device " << dev << ": " << deviceProp.name << "\n";
    CHECK(cudaSetDevice(dev));

    // set up size of vectors
    int nElem = 1<<24;
    printf("Vector size %d\n", nElem);

    // malloc host memory
    size_t nBytes = nElem * sizeof(float);
    auto h_A = (float *) malloc(nBytes);
    auto h_B = (float *) malloc(nBytes);
    auto host_C = (float *) malloc(nBytes);
    auto gpu_C = (float *) malloc(nBytes);
    if (h_A == nullptr || h_B == nullptr || host_C == nullptr || gpu_C == nullptr) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", nBytes);
        // free(nullptr) is a no-op, so releasing all four is safe.
        free(h_A);
        free(h_B);
        free(host_C);
        free(gpu_C);
        return 1;
    }

    // initial data at host side
    initialData(h_A, nElem);
    initialData(h_B, nElem);

    memset(host_C, 0, nBytes);
    memset(gpu_C, 0, nBytes);

    // malloc GPU memory
    float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
    CHECK(cudaMalloc((float **) &d_A, nBytes));
    CHECK(cudaMalloc((float **) &d_B, nBytes));
    CHECK(cudaMalloc((float **) &d_C, nBytes));

    // transfer data
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));

    // Configuration: 32 threads per block, grid size rounded up so that
    // every element is covered by a thread.
    dim3 nThread {32};
    dim3 nBlock {(nElem + nThread.x - 1) / nThread.x};

    // calculate on Device
    // The launch is asynchronous, so the clock is stopped only after the
    // device has been synchronized.
    auto start = std::chrono::high_resolution_clock::now();
    sumArrayOnGPU<<<nBlock, nThread>>>(d_A, d_B, d_C, nElem);
    CHECK(cudaGetLastError());        // launch-configuration errors
    CHECK(cudaDeviceSynchronize());   // errors raised while the kernel ran
    auto end = std::chrono::high_resolution_clock::now();
    // Stop profiling after the kernel has finished and outside the timed region.
    cudaProfilerStop();

    // Report in a fixed unit rather than raw clock ticks.
    std::chrono::duration<double, std::milli> elapsedMs = end - start;
    std::cout << "Execution configuration <<<" << nBlock.x << ", " << nThread.x <<  ">>>.\n";
    std::cout << "Execution time: " << elapsedMs.count() << " ms" << std::endl;
//    std::cout << std::format("Execution configuration <<<{},{}>>>\n", nBlock.x, nThread.x) << std::endl;
    CHECK(cudaMemcpy(gpu_C, d_C, nBytes, cudaMemcpyDeviceToHost));

    // calculate on Host
    sumArrayOnHost(h_A, h_B, host_C, nElem);

    // check Result
    checkResult(host_C, gpu_C, nElem);

    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    CHECK(cudaFree(d_C));

    free(h_A);
    free(h_B);
    free(host_C);
    free(gpu_C);

    CHECK(cudaDeviceReset());

    return 0;
}