Parallelizing-FFT/fft-opt1.cu at master · gnarayang/Parallelizing-FFT · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
// One of the possible optimizations to the implementation in fft.cu
//
// The algorithm is as follows:
//
// for (s = 0 to log2(N)) ........ loop1
//      m = 2^i;
//      statements
//      for (j=0;j<N;j+=m) ....... loop2
//          for (k=0 to m/2) ..... loop3
//              statements
//
// The second and third loop are complementary to each other in the sense that
// while loop2 runs for n/m values of j, loop3 runs for m/2 values of k. With
// larger values of m, loop3 does more work, while for smaller values of m, loop2
// does more work, so in this optimization, we plan to separate out the two cases
// and achieve parallelization of both outer and inner loops
//
// The point of separation needs to be experimentally arrived at, i.e, we need to
// test out different cases and choose the best of the lot. As a first commit in
// the optimization, we have adopted a naive approach and check the value of m/2.
// If the value of m/2 is less than the maximum number of threads that can be
// spawned by a block, we parallelize loop2, and loop3 otherwise.
//
//

#include <stdio.h>
#include <cmath>
#include <cuda.h>

typedef float2 Complex;

#define THREADS 32
#define MAX_NO_OF_THREADS_PER_BLOCK 1024

const long long ARRAY_SIZE = 65536;
const long long ARRAY_BYTES = ARRAY_SIZE * sizeof(Complex);

// Bit reversal re-ordering, first step of FFT
__global__ void bit_reverse_reorder(Complex *d_rev, Complex *d_a, int s) {
	  int id = blockIdx.x * blockDim.x + threadIdx.x;
    int rev = __brev(id) >> (32-s);

    if(id < ARRAY_SIZE)
        d_rev[rev] = d_a[id];
}

// Work of the innermost loop, common to both parallelization
__device__ void inplace_fft(Complex *a, int j, int k, int m){

    if (j+k+m/2 < ARRAY_SIZE){

        Complex w, t, u;

        // w^k (w is root of unity)
        w.x = __cosf((2*M_PI*k)/m);
        w.y = -__sinf((2*M_PI*k)/m);

        // u = a[j+k]
        u.x = a[j+k].x;
        u.y = a[j+k].y;

        // t = w*a[j+k+m/2];
        t.x = w.x*a[j+k+m/2].x - w.y*a[j+k+m/2].y;
        t.y = w.x*a[j+k+m/2].y + w.y*a[j+k+m/2].x;

        // a[j+k] = u+t;
        a[j+k].x = u.x + t.x;
        a[j+k].y = u.y + t.y;

        // a[j+k+m/2] = u-t;
        a[j+k+m/2].x = u.x - t.x;
        a[j+k+m/2].y = u.y - t.y;

    }
}

// Parallelization of loop2
__global__ void fft_outer(Complex *a, int m){
    int j = (blockIdx.x * blockDim.x + threadIdx.x)*m;
    if (j < ARRAY_SIZE){
        for (int k=0;k<m/2;k++){
            inplace_fft(a,j,k,m);
        }
    }
}

// Parallelization of loop3
__global__ void fft_inner(Complex *a, int j, int m){
    int k = (blockIdx.x * blockDim.x + threadIdx.x);
    if (k < m/2)
        inplace_fft(a,j,k,m);
}

int main() {

    // Host arrays for input and output
    Complex h_a[ARRAY_SIZE];
    Complex h_rev[ARRAY_SIZE];

    // Input signal, complex part remains zero.
    // Signal is of the form sin(2*M_PI*f*x/N) or cos(2*M_PI*f*x/N)
    // N is sample size and is always a power of 2
    for(int i = 0; i < ARRAY_SIZE; i++) {
    	h_a[i].x = cos((6.0*M_PI*i)/ARRAY_SIZE);
        h_a[i].y = 0.0;
    }

    // No of bits required to represent N
    int s = (int)ceil(log2(ARRAY_SIZE));

    // Device arrays
    Complex *d_a, *d_rev;

    // Memory Allocation
    cudaMalloc((void**) &d_a, ARRAY_BYTES);
    cudaMalloc((void**) &d_rev, ARRAY_BYTES);

    // Copy host array to device
    cudaMemcpy(d_a, h_a, ARRAY_BYTES, cudaMemcpyHostToDevice);

    // First step in FFT, bit-reverse reordering
    // Swap an element at index 'i', with the element
    // which is the bit-string reversal of 'i'
    bit_reverse_reorder<<<(ARRAY_SIZE+THREADS-1)/THREADS, THREADS>>>(d_rev, d_a, s);

    cudaDeviceSynchronize();

    // FFT driver code
    for (int i=1;i<=s;i++){

        // m = 2^i
        int m = 1 << i;

        if (m/2 < MAX_NO_OF_THREADS_PER_BLOCK){

            fft_outer<<<((ARRAY_SIZE/m)+THREADS-1)/THREADS,THREADS>>>(d_rev,m);

        } else {

            for (int j=0;j<ARRAY_SIZE;j+=m){

                fft_inner<<<((m/2)+THREADS-1)/THREADS,THREADS>>>(d_rev,j,m);

            }
        }
    }

    // Copy result array from device to host
    cudaMemcpy(h_rev,d_rev,ARRAY_BYTES,cudaMemcpyDeviceToHost);

    // Free allocated device memory
    cudaFree(d_a);
    cudaFree(d_rev);

    return 0;
}