-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathatomic_operations.cu
More file actions
70 lines (56 loc) · 1.99 KB
/
atomic_operations.cu
File metadata and controls
70 lines (56 loc) · 1.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#include <stdio.h>
#include "gputimer.h"
#define NUM_THREADS 1000000
#define ARRAY_SIZE 100
#define BLOCK_WIDTH 1000
void print_array(int *array, int size)
{
printf("{ ");
for (int i = 0; i < size; i++) { printf("%d ", array[i]); }
printf("}\n");
}
__global__ void increment_naive(int *g)
{
// which thread is this?
int i = blockIdx.x * blockDim.x + threadIdx.x;
// each thread to increment consecutive elements, wrapping at ARRAY_SIZE
i = i % ARRAY_SIZE;
g[i] = g[i] + 1;
}
__global__ void increment_atomic(int *g)
{
// which thread is this?
int i = blockIdx.x * blockDim.x + threadIdx.x;
// each thread to increment consecutive elements, wrapping at ARRAY_SIZE
i = i % ARRAY_SIZE;
atomicAdd(& g[i], 1);
}
int main(int argc,char **argv)
{
GpuTimer timer;
printf("%d total threads in %d blocks writing into %d array elements\n",
NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE);
// declare and allocate host memory
int h_array[ARRAY_SIZE];
const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);
// declare, allocate, and zero out GPU memory
int * d_array;
cudaMalloc((void **) &d_array, ARRAY_BYTES);
cudaMemset((void *) d_array, 0, ARRAY_BYTES);
// launch the kernel - comment out one of these
timer.Start();
// Instructions: This program is needed for the next quiz
// uncomment increment_naive to measure speed and accuracy
// of non-atomic increments or uncomment increment_atomic to
// measure speed and accuracy of atomic icrements
// increment_naive<<<NUM_THREADS/BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);
increment_atomic<<<NUM_THREADS/BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);
timer.Stop();
// copy back the array of sums from GPU and print
cudaMemcpy(h_array, d_array, ARRAY_BYTES, cudaMemcpyDeviceToHost);
print_array(h_array, ARRAY_SIZE);
printf("Time elapsed = %g ms\n", timer.Elapsed());
// free GPU memory allocation and exit
cudaFree(d_array);
return 0;
}