diff --git a/README.md b/README.md
index b71c458..1de268a 100644
--- a/README.md
+++ b/README.md
@@ -3,11 +3,127 @@ CUDA Stream Compaction
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Nischal K N
+* Tested on: Windows 10, i7-2670QM @ 2.20GHz 8GB, GTX 540 2GB (Personal)

-### (TODO: Your README)
+### SUMMARY
+This project implements scan, stream compaction and sorting algorithms on the GPU and compares their performance against CPU implementations. Two kinds of exclusive scan are implemented: naive and work-efficient. Stream compaction is built on the work-efficient GPU scan, and radix sort is also implemented on top of the work-efficient scan. The performance of these algorithms is measured against the CPU and thrust implementations and documented in the Performance Analysis section below.

-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+### BUILD
+* To set up the CUDA development environment, use [this guide](https://github.com/nischalkn/Project0-CUDA-Getting-Started/blob/master/INSTRUCTION.md#part-1-setting-up-your-development-environment)
+* Once the environment is set up, build and run the project by following [this](https://github.com/nischalkn/Project0-CUDA-Getting-Started/blob/master/INSTRUCTION.md#part-3-build--run)
+
+### PARAMETERS
+* `SIZE` - size of the array to be scanned, compacted and sorted
+* `PROFILE` - `1` or `0`, print execution times
+
+### PERFORMANCE ANALYSIS
+The time taken to perform exclusive scans on arrays of sizes ranging from `2^8` to `2^16` was recorded with a fixed block size of `128` (the program crashed for array sizes beyond `2^16`). The CPU time was too small to be measured accurately for small arrays but grows steadily as the array size increases. The naive scan outperforms the work-efficient scan at larger array sizes because of the overhead of the upsweep and downsweep steps, whose number of passes grows logarithmically with the array size: in the outer stages of the upsweep and downsweep most of the launched threads perform no work, and this overhead causes a dramatic increase in execution time. The thrust implementation was consistently the slowest. The following are the times for the different scans, in milliseconds.
+
+|Scan | 2^8 | 2^10 | 2^12 | 2^14 | 2^16 |
+|:--------------------------------|-----|-------|-------|-------|-------|
+|cpu scan, power-of-two | 0 | 0 | 0 | 0 | 0 |
+|cpu scan, non-power-of-two | 0 |0 | 0 |0 | 0.5004|
+|naive scan, power-of-two | 0.5357 |0.7699| 0.8006| 1.2946| 2.4013|
+|naive scan, non-power-of-two | 0.6124| 0.6252| 0.6991| 1.0961| 2.4055|
+|work-efficient scan, power-of-two| 0.0750| 0.1054| 0.2469| 0.8709| 3.5367|
+|work-efficient scan, non-power-of-two| 0.0745| 0.1050| 0.2465| 0.8685| 3.5334|
+|thrust scan, power-of-two | 6.7073| 6.8454| 7.4349| 10.7565| 28.5635|
+|thrust scan, non-power-of-two | 0.7849| 0.8866| 1.6581| 4.8936| 16.7807|
+
+![](images/scan_array.png)
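+
+As an illustration of the idle-thread overhead described above, here is a minimal sketch of the upsweep step, mirroring the `upSweep` kernel and its launch loop in `efficient.cu` (parameter names simplified; `blockSize` is `128` and `n` is the array length padded to a power of two):
+
+```
+// Upsweep (parallel reduction) step of the work-efficient scan.
+// At stride s = 2^d only indices with k % (2*s) == 2*s - 1 add a partial
+// sum; every other launched thread returns immediately, which is the
+// overhead discussed above for large arrays.
+__global__ void upSweep(int n, int *data, int stride) {
+    int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (k >= n) return;
+    if (k % (stride * 2) == (stride * 2) - 1) {
+        data[k] += data[k - stride];   // only n / (2 * stride) threads do work
+    }
+}
+
+// Host-side launch loop: one pass per level of the reduction tree.
+// for (int d = 0; d < ilog2ceil(n); d++)
+//     upSweep<<<(n + blockSize - 1) / blockSize, blockSize>>>(n, dev_data, 1 << d);
+```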
+
+Another experiment was conducted with various block sizes for a constant array length of `2^16`. The best performance was obtained with a block size of `128`.
+![](images/scan_block.png)
+
+CPU compaction for array sizes below `2^14` takes a negligible amount of time. The work-efficient compaction is the slowest for the same reason mentioned above: many threads are launched that do no work.
+
+|Compact | 2^8 | 2^10 | 2^12 | 2^14 | 2^16 |
+|:----------------------------------------|-----|-------|-------|-------|-----|
+|cpu compact without scan, power-of-two |0 |0 |0 |0.5029 |0.500066667|
+|cpu compact without scan, non-power-of-two |0 |0 |0 |0.5019 |0.50035|
+|cpu compact with scan |0 |0 |0 |0.4998 |0.999733333|
+|work-efficient compact, power-of-two |0.089568 |0.123530667 |0.280234667| 0.946997333 |3.793076667|
+|work-efficient compact, non-power-of-two |0.086741333 |0.118101333| 0.276192 |0.947157333 |3.79866|
+
+![](images/compact_array.png)
+
+Again, running the compaction with different block sizes shows that the best block size is `128`.
+![](images/compact_block.png)
+
+Radix sort was implemented using the work-efficient scan and its performance was compared with `thrust::sort`. For small arrays, radix sort outperforms the thrust sort; however, as the array size increases, the work-efficient scan becomes slow as explained above and reduces the efficiency of the sorting algorithm.
+![](images/sort.png)
+
+### FUTURE IMPROVEMENTS
+* The work-efficient scan can be optimized to reduce the number of threads launched at each stage of the upsweep and downsweep.
+* Early termination of idle threads would also improve scan performance.
+
+### OUTPUT
+```
+
+****************
+** SCAN TESTS **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 35 0 ]
+==== cpu scan, power-of-two ====
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ]
+==== cpu scan, non-power-of-two ====
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604305 1604316 ]
+    passed
+==== naive scan, power-of-two ====
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ]
+    passed
+==== naive scan, non-power-of-two ====
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 0 0 ]
+    passed
+==== work-efficient scan, power-of-two ====
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ]
+    passed
+==== work-efficient scan, non-power-of-two ====
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604305 1604316 ]
+    passed
+==== thrust scan, power-of-two ====
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ]
+    passed
+==== thrust scan, non-power-of-two ====
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604305 1604316 ]
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 1 0 ]
+==== cpu compact without scan, power-of-two ====
+    [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 1 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+    [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 3 1 ]
+    passed
+==== cpu compact with scan ====
+    [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 1 ]
+    passed
+==== work-efficient compact, power-of-two ====
+    [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 1 ]
+    passed
+==== work-efficient compact, non-power-of-two ====
+    [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 3 1 ]
+    passed
+
+*****************************
+***** RADIX SORT TESTS ******
+*****************************
+==== thrust sort small array ====
+    [ 0 1 2 3 4 5 6 7 8 9 ]
+==== Radix Sort small array ====
+    [ 0 1 2 3 4 5 6 7 8 9 ]
+    passed
+==== thrust sort large array ====
+    [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
199 199 ] +==== Radix Sort large array ==== + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 199 199 ] + passed +``` + +### MODIFICATIONS +* radixSort was added to StreamCompaction module containing the sort function. +* sort function was also added to thrust.cu module to perform `thrust::sort`. +* 4 tests are added to the main function for sorting. 2 test to sort an array using `thrust::sort` and 2 tests to sort the array using `StreamCompaction::radixSort`. +* CMakeLists.txt of StreamCompaction was edited to include `radixSort.h` and `radixSort.cu`. diff --git a/images/compact_array.png b/images/compact_array.png new file mode 100644 index 0000000..f5a6a4a Binary files /dev/null and b/images/compact_array.png differ diff --git a/images/compact_block.png b/images/compact_block.png new file mode 100644 index 0000000..f5d8499 Binary files /dev/null and b/images/compact_block.png differ diff --git a/images/scan_array.png b/images/scan_array.png new file mode 100644 index 0000000..d75db25 Binary files /dev/null and b/images/scan_array.png differ diff --git a/images/scan_block.png b/images/scan_block.png new file mode 100644 index 0000000..c2957fa Binary files /dev/null and b/images/scan_block.png differ diff --git a/images/sort.png b/images/sort.png new file mode 100644 index 0000000..b4f819e Binary files /dev/null and b/images/sort.png differ diff --git a/src/main.cpp b/src/main.cpp index 675da35..45c0dee 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -11,10 +11,11 @@ #include #include #include +#include #include "testing_helpers.hpp" int main(int argc, char* argv[]) { - const int SIZE = 1 << 8; + const int SIZE = 1 << 16; const int NPOT = SIZE - 3; int a[SIZE], b[SIZE], c[SIZE]; @@ -43,37 +44,37 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -112,12 +113,41 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + + printf("\n"); + printf("*****************************\n"); + 
printf("***** RADIX SORT TESTS ******\n"); + printf("*****************************\n"); + + int t1[10] = { 4, 1, 6, 3, 8, 9, 2, 0, 5, 7 }; + int t2[10]; + int t3[10]; + printDesc("thrust sort small array"); + StreamCompaction::Thrust::sort(10, t2, t1); + printArray(10, t2, true); + + printDesc("Radix Sort small array"); + StreamCompaction::radixSort::sort(10, t3, t1); + printArray(10, t3, true); + printCmpResult(10, t2, t3); + + genArray(SIZE, a, 200); + zeroArray(SIZE, c); + printDesc("thrust sort large array"); + StreamCompaction::Thrust::sort(SIZE, c, a); + printArray(SIZE, c, true); + + zeroArray(SIZE, b); + printDesc("Radix Sort large array"); + StreamCompaction::radixSort::sort(SIZE, b, a); + printArray(SIZE, b, true); + printCmpResult(10, c, b); } diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..3dfb146 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -9,6 +9,8 @@ set(SOURCE_FILES "efficient.cu" "thrust.h" "thrust.cu" + "radixSort.h" + "radixSort.cu" ) cuda_add_library(stream_compaction diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index fe872d4..4e0d3e6 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,10 @@ namespace Common { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) + return; + bools[index] = idata[index] ? 1 : 0; } /** @@ -32,7 +35,11 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) + return; + if (bools[index] == 1) + odata[indices[index]] = idata[index]; } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 4f52663..afb4101 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -3,6 +3,11 @@ #include #include #include +#include +#include +#include + +#define PROFILE 0 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index e600c29..f4c0dac 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,5 +1,6 @@ #include #include "cpu.h" +#include "common.h" namespace StreamCompaction { namespace CPU { @@ -8,8 +9,21 @@ namespace CPU { * CPU scan (prefix sum). */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + odata[0] = 0; + + #if PROFILE + auto begin = std::chrono::high_resolution_clock::now(); + #endif + + for (int i = 1; i < n; i++) + { + odata[i] = odata[i - 1] + idata[i - 1]; + } + + #if PROFILE + auto end = std::chrono::high_resolution_clock::now(); + std::cout << "Time Elapsed for cpu scan(size " << n << "): " << std::chrono::duration_cast(end - begin).count() << "ns" << std::endl; + #endif } /** @@ -18,8 +32,27 @@ void scan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. 
*/ int compactWithoutScan(int n, int *odata, const int *idata) { - // TODO - return -1; + long j = 0; + + #if PROFILE + auto begin = std::chrono::high_resolution_clock::now(); + #endif + + for (int i = 0; i < n; i++) + { + if (idata[i] != 0) + { + odata[j] = idata[i]; + j++; + } + } + + #if PROFILE + auto end = std::chrono::high_resolution_clock::now(); + std::cout << "Time Elapsed for compact without scan(size " << n << "): " << std::chrono::duration_cast(end - begin).count() << "ns" << std::endl; + #endif + + return j; } /** @@ -28,8 +61,35 @@ int compactWithoutScan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { - // TODO - return -1; + int *temporal; + int *pscan; + long j = 0; + temporal = (int*)malloc(sizeof(int)*n); + pscan = (int*)malloc(sizeof(int)*n); + + #if PROFILE + auto begin = std::chrono::high_resolution_clock::now(); + #endif + for (int i = 0; i < n; i++) + { + temporal[i] = idata[i] ? 1 : 0; + } + scan(n, pscan, temporal); + for (int i = 0; i < n; i++) + { + if (temporal[i] == 1) + { + odata[pscan[i]] = idata[i]; + j++; + } + } + + #if PROFILE + auto end = std::chrono::high_resolution_clock::now(); + std::cout << "Time Elapsed for compact with scan(size " << n << "): " << std::chrono::duration_cast(end - begin).count() << "ns" << std::endl; + #endif + + return j; } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index b2f739b..9595e72 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,32 +3,166 @@ #include "common.h" #include "efficient.h" +#define blockSize 128 + namespace StreamCompaction { namespace Efficient { -// TODO: __global__ + __global__ void upSweep(int n, int *idata, int d) { + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + if (k >= n) + return; + if (k % (d * 2) == (d * 2) - 1) { + idata[k] += idata[k - d]; + } -/** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ -void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); -} + } -/** - * Performs stream compaction on idata, storing the result into odata. - * All zeroes are discarded. - * - * @param n The number of elements in idata. - * @param odata The array into which to store elements. - * @param idata The array of elements to compact. - * @returns The number of elements remaining after compaction. - */ -int compact(int n, int *odata, const int *idata) { - // TODO - return -1; -} + __global__ void downSweep(int n, int *idata, int d) { + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + if (k >= n) + return; + int temp; + if (k % (d * 2) == (d * 2) - 1) { + //printf("kernel: %d", k); + temp = idata[k - d]; + idata[k - d] = idata[k]; // Set left child to this node’s value + idata[k] += temp; + } + + } + + __global__ void makeElementZero(int *data, int index) { + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index == k) { + data[k] = 0; + } + } + + __global__ void copyElements(int n, int *src, int *dest) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) + return; + dest[index] = src[index]; + } + + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
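+ * The input is copied into a buffer padded to the next power of two; an
+ * upsweep (parallel reduction) runs for ilog2ceil(n) passes, the last
+ * element is zeroed, and a downsweep then distributes the partial sums back
+ * down the tree, yielding an exclusive scan.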
+ */ + void scan(int n, int *odata, const int *idata) { + int *dev_idata; + + int paddedArraySize = 1 << ilog2ceil(n); + + dim3 fullBlocksPerGrid((paddedArraySize + blockSize - 1) / blockSize); + + cudaMalloc((void**)&dev_idata, paddedArraySize * sizeof(int)); + checkCUDAError("Cannot allocate memory for idata"); + + cudaMemcpy(dev_idata, idata, n*sizeof(int), cudaMemcpyHostToDevice); + + #if PROFILE + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); + #endif + + for (int d = 0; d < ilog2ceil(paddedArraySize); d++) { + upSweep << > >(paddedArraySize, dev_idata, 1<> >(dev_idata, paddedArraySize - 1); + + for (int d = ilog2ceil(paddedArraySize) - 1; d >= 0; d--) { + downSweep << > >(paddedArraySize, dev_idata, 1<> >(n, dev_boolean, dev_idata); + + copyElements << > >(n, dev_boolean, dev_indices); + + for (int d = 0; d < ilog2ceil(paddedArraySize); d++) { + upSweep << > >(paddedArraySize, dev_indices, 1 << d); + } + + makeElementZero << > >(dev_indices, paddedArraySize - 1); + + for (int d = ilog2ceil(paddedArraySize) - 1; d >= 0; d--) { + downSweep << > >(paddedArraySize, dev_indices, 1 << d); + } + + StreamCompaction::Common::kernScatter << > >(n, dev_odata, dev_idata, dev_boolean, dev_indices); + + #if PROFILE + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + std::cout << "Time Elapsed for Efficient compact (size " << n << "): " << milliseconds << std::endl; + #endif + + cudaMemcpy(odata, dev_odata, n*sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&count, dev_indices + paddedArraySize - 1, sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_boolean); + cudaFree(dev_indices); + return count; + } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3d86b60..7af9804 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,18 +3,83 @@ #include "common.h" #include "naive.h" +#define blockSize 128 + +#if PROFILE +cudaEvent_t start, stop; +#endif + namespace StreamCompaction { namespace Naive { -// TODO: __global__ + __global__ void naiveScan(int n, int *odata, int *idata, int val) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + if (index >= val) { + odata[index] = idata[index - val] + idata[index]; + } + else { + odata[index] = idata[index]; + } + } -/** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ -void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); -} + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
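+ * Each of the ilog2ceil(n) passes adds in the element 2^(d-1) positions to
+ * the left, ping-ponging between dev_idata and dev_odata; the result is then
+ * shifted right by one element (with odata[0] = 0) to convert the inclusive
+ * scan into an exclusive one.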
+ */ + void scan(int n, int *odata, const int *idata) { + int *dev_odata; + int *dev_idata; + int flag = 1; + int val; + + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + + cudaMemcpy(dev_idata, idata, n*sizeof(int), cudaMemcpyHostToDevice); + + #if PROFILE + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); + #endif + + for (int d = 1; d <= ilog2ceil(n); d++) { //ilog2ceil(n) + val = pow(2, d - 1); + if (flag == 1) { + naiveScan << > >(n, dev_odata, dev_idata, val); + flag = 0; + } + else { + naiveScan << > >(n, dev_idata, dev_odata, val); + flag = 1; + } + cudaThreadSynchronize(); + } + + odata[0] = 0; + + #if PROFILE + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + std::cout << "Time Elapsed for Naive scan (size " << n << "): " << milliseconds << std::endl; + #endif + + if (flag == 0) { + cudaMemcpy(odata+1, dev_odata, (n-1)*sizeof(int), cudaMemcpyDeviceToHost); + } + else { + cudaMemcpy(odata+1, dev_idata, (n-1)*sizeof(int), cudaMemcpyDeviceToHost); + } + + cudaFree(dev_idata); + cudaFree(dev_odata); + } } } diff --git a/stream_compaction/radixSort.cu b/stream_compaction/radixSort.cu new file mode 100644 index 0000000..1d21aa2 --- /dev/null +++ b/stream_compaction/radixSort.cu @@ -0,0 +1,121 @@ +#include +#include +#include "common.h" +#include "efficient.h" + +#define blockSize 128 + +namespace StreamCompaction { + namespace radixSort { + + __global__ void makeBarray(int n, int *odata, int *idata, int d) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) + return; + int val = idata[index] & d; + odata[index] = val ? 1 : 0; + } + + __global__ void makeEarray(int n, int *odata, int *idata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) + return; + odata[index] = idata[index] ? 0 : 1; + } + + __global__ void makeTarray(int n, int *odata, int *idata,int totalFalse) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) + return; + odata[index] = index - idata[index] + totalFalse; + } + + __global__ void makeDarray(int n, int *odata, int *b, int *t, int *f) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) + return; + odata[index] = b[index] ? 
t[index] : f[index]; + } + + __global__ void reorder(int n, int *odata, int *idata, int *d) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) + return; + odata[d[index]] = idata[index]; + } + + void sort(int n, int *odata, const int *idata) { + int *dev_idata; + int *dev_odata; + int *dev_b; + int *dev_f; + int *dev_t; + int *dev_d; + + int last_e; + int last_f; + + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAError("Cannot allocate memory for idata"); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAError("Cannot allocate memory for odata"); + cudaMalloc((void**)&dev_b, n * sizeof(int)); + checkCUDAError("Cannot allocate memory for odata"); + cudaMalloc((void**)&dev_f, n * sizeof(int)); + checkCUDAError("Cannot allocate memory for odata"); + cudaMalloc((void**)&dev_t, n * sizeof(int)); + checkCUDAError("Cannot allocate memory for odata"); + cudaMalloc((void**)&dev_d, n * sizeof(int)); + checkCUDAError("Cannot allocate memory for odata"); + + cudaMemcpy(dev_idata, idata, n*sizeof(int), cudaMemcpyHostToDevice); + + #if PROFILE + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); + #endif + + for (int i = 0; i < ilog2ceil(n); i++){ + makeBarray << > >(n, dev_b, dev_idata, 1 << i); + makeEarray << > >(n, dev_f, dev_b); + StreamCompaction::Efficient::scan(n, dev_f, dev_f); + + cudaMemcpy(&last_e, dev_b + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&last_f, dev_f + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + + int totalFalses = (last_e ? 0 : 1) + last_f; + + makeTarray << > >(n, dev_t, dev_f, totalFalses); + makeDarray << > >(n, dev_d, dev_b, dev_t, dev_f); + + reorder << > >(n, dev_odata, dev_idata, dev_d); + + int *temp = dev_odata; + dev_odata = dev_idata; + dev_idata = temp; + } + + #if PROFILE + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + std::cout << "Time Elapsed for radix sort (size " << n << "): " << milliseconds << std::endl; + #endif + + cudaMemcpy(odata, dev_idata, n*sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_b); + cudaFree(dev_f); + cudaFree(dev_t); + cudaFree(dev_d); + } + + } +} diff --git a/stream_compaction/radixSort.h b/stream_compaction/radixSort.h new file mode 100644 index 0000000..5184533 --- /dev/null +++ b/stream_compaction/radixSort.h @@ -0,0 +1,7 @@ +#pragma once + +namespace StreamCompaction { + namespace radixSort { + void sort(int n, int *odata, const int *idata); + } +} diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index d8dbb32..3922eaf 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -3,20 +3,67 @@ #include #include #include +#include #include "common.h" #include "thrust.h" namespace StreamCompaction { namespace Thrust { -/** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ -void scan(int n, int *odata, const int *idata) { - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); -} + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
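+ * The input is copied into a thrust::device_vector, thrust::exclusive_scan
+ * runs on the device (timed with CUDA events when PROFILE is enabled), and
+ * the scanned values are copied back into odata.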
+ */ + void scan(int n, int *odata, const int *idata) { + // TODO use `thrust::exclusive_scan` + // example: for device_vectors dv_in and dv_out: + // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::device_vector dv_in(n); + thrust::device_vector dv_out(n); + thrust::copy(idata, idata + n - 1, dv_in.begin()); + + #if PROFILE + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); + #endif + + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + #if PROFILE + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + std::cout << "Time Elapsed for trust scan (size " << n << "): " << milliseconds << std::endl; + #endif + + thrust::copy(dv_out.begin(), dv_out.end(), odata); + } + + void sort(int n, int *odata, const int *idata) { + thrust::device_vector dv_out(n); + thrust::copy(idata, idata + n, dv_out.begin()); + + #if PROFILE + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); + #endif + + thrust::sort(dv_out.begin(), dv_out.end()); + + #if PROFILE + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + std::cout << "Time Elapsed for trust sort (size " << n << "): " << milliseconds << std::endl; + #endif + + thrust::copy(dv_out.begin(), dv_out.end(), odata); + } } } diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h index 06707f3..6187545 100644 --- a/stream_compaction/thrust.h +++ b/stream_compaction/thrust.h @@ -3,5 +3,6 @@ namespace StreamCompaction { namespace Thrust { void scan(int n, int *odata, const int *idata); + void sort(int n, int *odata, const int *idata); } }
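
For reference, a condensed sketch of the `StreamCompaction::Thrust::sort` wrapper added in `thrust.cu` above (element type `int`; the PROFILE event timing is omitted here):

```
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/sort.h>

namespace StreamCompaction {
namespace Thrust {

// Condensed sketch of the added sort wrapper: copy the host input to the
// device, sort it in place with thrust::sort, and copy the result back.
void sort(int n, int *odata, const int *idata) {
    thrust::device_vector<int> dv(idata, idata + n);  // host -> device
    thrust::sort(dv.begin(), dv.end());               // ascending sort on the GPU
    thrust::copy(dv.begin(), dv.end(), odata);        // device -> host
}

}
}
```

The radix sort tests added to `main.cpp` use this wrapper as the reference against which `StreamCompaction::radixSort::sort` is compared.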