-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.cu
More file actions
122 lines (88 loc) · 3.22 KB
/
main.cu
File metadata and controls
122 lines (88 loc) · 3.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#include <stdint.h>
#define STB_IMAGE_WRITE_IMPLEMENTATION
#define STB_IMAGE_IMPLEMENTATION
#include "lib/stb/stb_image.h"
#include "lib/stb/stb_image_write.h"
#define N_CHANNELS 3
#define MAX_BLOCK_PER_GRID 65534
#include "include/functions.hpp"
int main()
{
char path_to_image [] = "../images/airplane2.jpg";
char path_to_save [] = "../output/output2.jpg";
int count;
cudaDeviceProp prop;
cudaGetDeviceCount( &count );
for (int i = 0; i < count; i++)
{
cudaGetDeviceProperties( &prop, i );
}
cudaEvent_t start, stop;
float gpuTime = 0.0;
int n_iter = 10;
uint8_t* src;
uint8_t* dst;
uint8_t* dev_src;
uint8_t* dev_dst;
uint8_t* dev_kernel;
uint8_t* dev_transfer_out;
int width, height, ch;
uint8_t kernel [81] = {1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1};
src = stbi_load( path_to_image, &width, &height, &ch, 3 );
int N = width*height*N_CHANNELS;
dst = (uint8_t*)malloc(N * sizeof( uint8_t ));
cudaEventCreate( &start );
cudaEventCreate( &stop );
cudaEventRecord( start, 0 );
// printf("\nCUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
for (int i = 0; i < n_iter; i++)
{
cudaMalloc((void**)&dev_src, N * sizeof( uint8_t ));
cudaMalloc((void**)&dev_dst, N * sizeof( uint8_t ));
cudaMalloc((void**)&dev_transfer_out, N * sizeof( uint8_t ));
cudaMalloc((void**)&dev_kernel, sizeof( kernel ));
cudaMemcpy( dev_src, src, N * sizeof(uint8_t), cudaMemcpyHostToDevice );
cudaMemcpy( dev_kernel, kernel, sizeof(kernel), cudaMemcpyHostToDevice );
int threadsPerBlock = prop.maxThreadsPerBlock;
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
if (blocksPerGrid > MAX_BLOCK_PER_GRID)
{
blocksPerGrid = MAX_BLOCK_PER_GRID;
}
transfer<<< blocksPerGrid, threadsPerBlock >>>( dev_src, dev_transfer_out,
height, width, 2000, 2000 );
Convolution2D<<< blocksPerGrid, threadsPerBlock >>>( dev_transfer_out, dev_dst, height, width,
dev_kernel, 9 );
cudaMemcpy( dst, dev_dst, N * sizeof(uint8_t), cudaMemcpyDeviceToHost );
cudaFree( dev_src );
cudaFree( dev_dst );
cudaFree( dev_transfer_out );
cudaFree( dev_kernel );
}
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
cudaEventElapsedTime( &gpuTime, start, stop );
cudaEventDestroy( start );
cudaEventDestroy( stop );
printf( "\nTime for %d X %d X 3 image = %f milliseconds\n", width, height, gpuTime / n_iter );
int err = stbi_write_jpg( path_to_save, width, height, ch, dst, width*ch );
if (err == 0)
{
printf("\nWrite/Read error\n");
}
stbi_image_free( src );
free( dst );
return 0;
}