diff --git a/README.md b/README.md index b71c458..d738c01 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,263 @@ -CUDA Stream Compaction -====================== +####University of Pennsylvania +####CIS 565: GPU Programming and Architecture -**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** +##Project 2 - CUDA Stream Compaction -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Xueyin Wan +* Tested on: Windows 10 x64, i7-6700K @ 4.00GHz 16GB, GTX 970 4096MB (Personal Desktop) +* Compiled with Visual Studio 2013 and CUDA 7.5 -### (TODO: Your README) +**SCREENSHOT** +------------- +**BlockSize : 128** +**SIZE : 1 << 24** -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +![alt text](https://github.com/xueyinw/Project2-Stream-Compaction/blob/master/result_showcase/XueyinResultOriginal_pow(2%2C24).gif "Performance One") +### +**FEATURES I IMPLEMENT** +------------- +``` +Part 1: CPU Scan & Stream Compaction +Part 2: Naive GPU Scan Algorithm +Part 3: Work-Efficient GPU Scan & Stream Compaction +Part 4: Thrust Exclusive Scan using Thrust library +Part 5: Radix Sort (Extra Credit) +Part 6: Using std::chrono and CUDA events for comparing the speed of different algorithms +``` +### + +**Dive Into Block Size** +------------- +In order to find the relationship between block size and performance, I modified block size to see different algorithm run time for getting the optimized block size. +Below is my chart based on my code: + +**Case 1:** +#####Power of Two number, `SIZE` = 1 << 24 = 16777216. All the time recorded in `ms`. + +Block Size | Naïve Scan | Efficient Scan | Thrust Scan|CPU Scan +---|---|---|---|--- +16 | 52.045727 | 91.401695 |2.128768|24.0632 +32 | 30.109312 | 53.902912 |2.09424|24.0563 +64 | 25.546721 | 29.845119 |2.081152|24.0908 +128 | 25.994272 | 27.808865 |2.255712|24.0321 +256 | 25.615328 | 27.646433 |2.404192|24.064 +512 | 25.576256 | 29.840576 |2.256288|24.5889 +1024 | 25.609535 | 33.565887 |2.211232|24.0653 + + + +**Case 2:** +#####Non Power of Two number, `SIZE(NPOT)`= 1 << 24 - 3 = 16777213. All the time recorded in `ms`. + +Block Size | Naïve Scan | Efficient Scan | Thrust Scan|CPU Scan| +---|---|---|---|--- +16 | 45.901855 | 89.639648 |2.094752|42.9234 +32 | 30.138912 | 51.030048 |2.29776|43.1142 +64 | 25.93968 | 27.795744 |2.011712|42.6413 +128 | 25.812672 | 24.770847 |2.052608|42.6398 +256 | 25.627424 | 27.607807 |2.223552|41.6099 +512 | 25.609535 | 29.848961 |2.146816|42.1115 +1024 | 26.082048 | 33.715874 |2.04576|42.6002 + + +Now let me draw a graph to explicitly show my result :) +#### +`Notice: ` +This graph is based on Case 1 result, `Array Size` is Power of Two number, `SIZE` = 1 << 24 = 16777216 +### +![alt text](https://github.com/xueyinw/Project2-Stream-Compaction/blob/master/result_showcase/ReadMeAboutBlockSizeChoose1.PNG "Chart1") +### +![alt text](https://github.com/xueyinw/Project2-Stream-Compaction/blob/master/result_showcase/ReadMeAboutBlockSizeChoose2.PNG "Chart2") + +From case 1 and case 2, we could see that when block size is less than 128, the algorithm performance is definitely worse than block size = 128. And after we set block size to 128, we could see that radix sort performance reaches to its highest level. As block size continues to grow, we could notice that Naive Scan, Efficient Scan and Radix Sort are all becoming slower. +So I choose my block size to be `128` in my code. + +**Dive Into Array Size** +------------- +I set block size = `128` in my code, and start to use array size as a parameter to change, in order to compare the performance between different GPU algorithms and CPU algorithm. +Below is my chart based on my code: + +#####`Blocksize` = 128. All the time recorded in `ms`. Max Value for scan in the array is 50 (for this chart) +Array Size | Naïve Scan | Efficient Scan | Thrust Scan|CPU Scan +---|---|---|---|--- +2^8 | 0.031904 | 0.11024 |0.021248|0 +2^12 | 0.047008 | 0.141728 |0.027616|0 +2^16 | 0.13168 | 0.347968 |0.245728|0.5013 +2^20 | 1.297824 | 1.681472 |0.468608|1.5041 +2^24 | 25.53968 | 27.6632 |2.403232|25.0931 + +#####`Blocksize` = 128. All the time recorded in `ms`. Max Value for sort in the array is 2^15 (for this chart) +Array Size | Radix Sort | Std::sort +---|---|--- +2^8 | 1.105344 | 0 +2^12 | 2.223136 | 0 +2^16 | 7.358048 | 4.0105 +2^20 | 42.627296 | 58.1868 +2^24 | 749.649841 | 894.4247 + +Graph for summary: +#### +![alt text](https://github.com/xueyinw/Project2-Stream-Compaction/blob/master/result_showcase/ReadMeAboutArraySizeChoose0.PNG "Chart1") +#### +![alt text](https://github.com/xueyinw/Project2-Stream-Compaction/blob/master/result_showcase/ReadMeAboutArraySizeChoose1.PNG "Chart2") + +From the test result, we could see that for small Array Size, GPU implementation is slower than CPU's. But when the array size grows, they two become close. +Thrust Scan is very fast for large array size. + +#### +For Radix sort, I compare it with std::sort. We could see that when arraysize is small, std::sort is faster. +However, as array size grows, my radix sort on GPU is much faster than std::sort! + +###Answer to Questions + +#### 1. Roughly optimize the block sizes of each of your implementations for minimal run time on your GPU. +Done! See above `Dive Into Block Size` part. + + +#### 2. Compare all of these GPU Scan implementations (Naive, Work-Efficient, and Thrust) to the serial CPU version of Scan. Plot a graph of the comparison (with array size on the independent axis). +Done! See above `Dive Into Array Size` part. +I use CUDA events for timing GPU code. +I use std::chrono for timing CPU code. + +#### 3.To guess at what might be happening inside the Thrust implementation (e.g. allocation, memory copy) +Answer: +I guess the inner mechnism of THRUST requires some initialization operations. After this step, it reaches to better performance. + +#### 4.Can you find the performance bottlenecks? +Answer: +Yes. First I want to mention, When we are doing iterations in scan function, and inside each loop is kernal function like Upsweep, Downsweep. We could see that as the iteration goes on, one phenomenon appears: +There are several threads idling. Since they need to wait those threads which are working to finish there mission, they have to be idling, which causes extra resource allocate. +Paste part of my code here to address this problem: +```java +void scanInDevice(int n, int *devData) { + int blockNum = (n + blockSize - 1) / blockSize; + for (int d = 0; d < ilog2ceil(n) - 1; d++) { //Here we have iterations + upSweep << > >(n, d, devData); // Here we have kernal function + checkCUDAError("upSweep not correct..."); + } + //set last element to zero, refer to slides! + int counter = 0; + cudaMemcpy(&devData[n - 1], &counter, sizeof(int), cudaMemcpyHostToDevice); + + for (int d = ilog2ceil(n) - 1; d >= 0; d--) { + downSweep << > >(n, d, devData); + checkCUDAError("downSweep not correct..."); + } + } +``` +```java +__global__ void upSweep(int N, int d, int *idata) { + int n = (blockDim.x * blockIdx.x) + threadIdx.x; + if (n >= N) { + return; + } + int delta = 1 << d; + int doubleDelta = 1 << (d + 1); + if (n % doubleDelta == 0) { // not each thread is working, right? + //But those "should not be working" threads are still evoked. + idata[n + doubleDelta - 1] += idata[n + delta - 1]; + } + } +``` + +Plan to optimize this (yet several interviews this week I have to say: "lol" D:) +Try to optimize mycode in path tracer project ! + +Also a huge problem: Memory I/O! +We need to malloc memory in device, copy the host content into device then get a result, then transfer back to host memory...... +When we're doing first assignment, we know that for index-continuous threads to access physical-not-continuous memory, it needs extra unnecessary operations and becomes slow. +And in this project, we have a lot of memory I/O operation... So here we found another issue!!! +![alt text](https://github.com/xueyinw/Project2-Stream-Compaction/blob/master/result_showcase/Profiling.PNG "Chart1") +### +From the picture above, we can see that CUDA memory operations occupied especially large part of the entire execution. +So my guess is right. :) + +#### 5. Sample output +More my test result are in the `result_showcase` folder. +Here I show one of them: +####Array Size = 1 << 24. Block Size is 128. +``` +**************** +** SCAN TESTS ** +**************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 42 0 ] +==== cpu scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 411089014 411089056 ] +CPU scan power-of-two number time is 24.032100 ms +==== cpu scan, non-power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 411088950 411088974 ] + passed +CPU scan non-power-of-two number time is 42.639800 ms +==== naive scan, power-of-two ==== +GPU Naive Scan time is 25.994272 ms + passed +==== naive scan, non-power-of-two ==== +GPU Naive Scan time is 25.812672 ms + passed +==== work-efficient scan, power-of-two ==== +GPU Efficient Scan time is 27.808865 ms + passed +==== work-efficient scan, non-power-of-two ==== +GPU Efficient Scan time is 24.770847 ms + passed +==== thrust scan, power-of-two ==== +GPU Thrust Scan time is 2.255712 ms + passed +==== thrust scan, non-power-of-two ==== +GPU Thrust Scan time is 2.052608 ms + passed + +********************************************* +*************** EXTRA CREDIT **************** +************* RADIX SORT TESTS ************** +*************** POWER-OF-TWO **************** +********************************************* + [ 38 7719 21238 2437 8855 11797 8365 32285 10450 30612 5853 28100 1142 ... 7792 2304 ] +==== std sort for comparasion ==== + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + +==== Extra : RadixSort ==== +GPU Radix Sort time is 741.838257 ms + passed + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + + +********************************************* +*************** EXTRA CREDIT **************** +************* RADIX SORT TESTS ************** +************* NON-POWER-OF-TWO ************** +********************************************* + [ 38 7719 21238 2437 8855 11797 8365 32285 10450 30612 5853 28100 1142 ... 7792 2304 ] +==== std sort for comparasion ==== + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + +==== Extra : RadixSort ==== +GPU Radix Sort time is 743.634277 ms + passed + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact without scan power-of-two number time is 37.117100 ms +==== cpu compact without scan, non-power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact without scan non-power-of-two number time is 37.113600 ms +==== cpu compact with scan ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact with scan time is 129.808400 ms +==== work-efficient compact, power-of-two ==== +GPU Efficient Compact time is 27.158209 ms + passed +==== work-efficient compact, non-power-of-two ==== +GPU Efficient Compact time is 27.228865 ms + passed +``` diff --git a/result_showcase/ArraySize = 2^12.txt b/result_showcase/ArraySize = 2^12.txt new file mode 100644 index 0000000..18fbea8 --- /dev/null +++ b/result_showcase/ArraySize = 2^12.txt @@ -0,0 +1,86 @@ +**************** +** SCAN TESTS ** +**************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 24 0 ] +==== cpu scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 99378 99402 ] +CPU scan power-of-two number time is 0.000000 ms +==== cpu scan, non-power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 99347 99371 ] + passed +CPU scan non-power-of-two number time is 0.000000 ms +==== naive scan, power-of-two ==== +GPU Naive Scan time is 0.047008 ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 99378 99402 ] + passed +==== naive scan, non-power-of-two ==== +GPU Naive Scan time is 0.045472 ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 0 0 ] + passed +==== work-efficient scan, power-of-two ==== +GPU Efficient Scan time is 0.141728 ms + passed +==== work-efficient scan, non-power-of-two ==== +GPU Efficient Scan time is 0.167264 ms + passed +==== thrust scan, power-of-two ==== +GPU Thrust Scan time is 0.027616 ms + passed +==== thrust scan, non-power-of-two ==== +GPU Thrust Scan time is 0.019008 ms + passed + +********************************************* +*************** EXTRA CREDIT **************** +************* RADIX SORT TESTS ************** +*************** POWER-OF-TWO **************** +********************************************* + [ 38 3623 758 2437 663 3605 173 3613 2258 1940 1757 3524 1142 ... 3336 120 ] +==== std sort for comparasion ==== +std sort for power-of-two number time is 0.000000 ms + [ 0 1 1 1 3 3 4 5 6 6 7 8 8 ... 4093 4094 ] + +==== Extra : RadixSort ==== +GPU Radix Sort time is 2.223136 ms + passed + [ 0 1 1 1 3 3 4 5 6 6 7 8 8 ... 4093 4094 ] + + +********************************************* +*************** EXTRA CREDIT **************** +************* RADIX SORT TESTS ************** +************* NON-POWER-OF-TWO ************** +********************************************* + [ 38 3623 758 2437 663 3605 173 3613 2258 1940 1757 3524 1142 ... 3336 120 ] +==== std sort for comparasion ==== +std sort for non-power-of-two number time is 0.000000 ms + [ 0 1 1 1 3 3 4 5 6 6 7 8 8 ... 4093 4094 ] + +==== Extra : RadixSort ==== +GPU Radix Sort time is 2.448928 ms + passed + [ 0 1 1 1 3 3 4 5 6 6 7 8 8 ... 4093 4094 ] + + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 1 ] + passed +CPU compact without scan power-of-two number time is 0.000000 ms +==== cpu compact without scan, non-power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 1 ] + passed +CPU compact without scan non-power-of-two number time is 0.000000 ms +==== cpu compact with scan ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 1 ] + passed +CPU compact with scan time is 0.000000 ms +==== work-efficient compact, power-of-two ==== +GPU Efficient Compact time is 0.284064 ms + passed +==== work-efficient compact, non-power-of-two ==== +GPU Efficient Compact time is 0.266176 ms + passed diff --git a/result_showcase/ArraySize = 2^16.txt b/result_showcase/ArraySize = 2^16.txt new file mode 100644 index 0000000..cee1c5d --- /dev/null +++ b/result_showcase/ArraySize = 2^16.txt @@ -0,0 +1,86 @@ +**************** +** SCAN TESTS ** +**************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 35 0 ] +==== cpu scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ] +CPU scan power-of-two number time is 0.501300 ms +==== cpu scan, non-power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604305 1604316 ] + passed +CPU scan non-power-of-two number time is 0.502800 ms +==== naive scan, power-of-two ==== +GPU Naive Scan time is 0.131680 ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ] + passed +==== naive scan, non-power-of-two ==== +GPU Naive Scan time is 0.129920 ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 0 0 ] + passed +==== work-efficient scan, power-of-two ==== +GPU Efficient Scan time is 0.347968 ms + passed +==== work-efficient scan, non-power-of-two ==== +GPU Efficient Scan time is 0.344672 ms + passed +==== thrust scan, power-of-two ==== +GPU Thrust Scan time is 0.245728 ms + passed +==== thrust scan, non-power-of-two ==== +GPU Thrust Scan time is 0.214912 ms + passed + +********************************************* +*************** EXTRA CREDIT **************** +************* RADIX SORT TESTS ************** +*************** POWER-OF-TWO **************** +********************************************* + [ 38 7719 21238 2437 8855 11797 8365 32285 10450 30612 5853 28100 1142 ... 15085 26505 ] +==== std sort for comparasion ==== +std sort for power-of-two number time is 4.010500 ms + [ 1 1 1 1 2 2 2 3 3 4 4 4 4 ... 32766 32767 ] + +==== Extra : RadixSort ==== +GPU Radix Sort time is 7.358048 ms + passed + [ 1 1 1 1 2 2 2 3 3 4 4 4 4 ... 32766 32767 ] + + +********************************************* +*************** EXTRA CREDIT **************** +************* RADIX SORT TESTS ************** +************* NON-POWER-OF-TWO ************** +********************************************* + [ 38 7719 21238 2437 8855 11797 8365 32285 10450 30612 5853 28100 1142 ... 15085 26505 ] +==== std sort for comparasion ==== +std sort for non-power-of-two number time is 3.509900 ms + [ 1 1 1 1 2 2 2 3 3 4 4 4 4 ... 32766 32767 ] + +==== Extra : RadixSort ==== +GPU Radix Sort time is 7.425376 ms + passed + [ 1 1 1 1 2 2 2 3 3 4 4 4 4 ... 32766 32767 ] + + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 1 0 ] +==== cpu compact without scan, power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 1 ] + passed +CPU compact without scan power-of-two number time is 0.000000 ms +==== cpu compact without scan, non-power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 3 1 ] + passed +CPU compact without scan non-power-of-two number time is 0.501300 ms +==== cpu compact with scan ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 1 ] + passed +CPU compact with scan time is 0.000000 ms +==== work-efficient compact, power-of-two ==== +GPU Efficient Compact time is 0.499584 ms + passed +==== work-efficient compact, non-power-of-two ==== +GPU Efficient Compact time is 0.499040 ms + passed diff --git a/result_showcase/ArraySize = 2^20.txt b/result_showcase/ArraySize = 2^20.txt new file mode 100644 index 0000000..f7185e7 --- /dev/null +++ b/result_showcase/ArraySize = 2^20.txt @@ -0,0 +1,86 @@ +**************** +** SCAN TESTS ** +**************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 6 0 ] +==== cpu scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 25680538 25680544 ] +CPU scan power-of-two number time is 1.504100 ms +==== cpu scan, non-power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 25680508 25680512 ] + passed +CPU scan non-power-of-two number time is 1.503500 ms +==== naive scan, power-of-two ==== +GPU Naive Scan time is 1.297824 ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 25680538 25680544 ] + passed +==== naive scan, non-power-of-two ==== +GPU Naive Scan time is 1.293184 ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 0 0 ] + passed +==== work-efficient scan, power-of-two ==== +GPU Efficient Scan time is 1.681472 ms + passed +==== work-efficient scan, non-power-of-two ==== +GPU Efficient Scan time is 1.686784 ms + passed +==== thrust scan, power-of-two ==== +GPU Thrust Scan time is 0.468608 ms + passed +==== thrust scan, non-power-of-two ==== +GPU Thrust Scan time is 0.336704 ms + passed + +********************************************* +*************** EXTRA CREDIT **************** +************* RADIX SORT TESTS ************** +*************** POWER-OF-TWO **************** +********************************************* + [ 38 7719 21238 2437 8855 11797 8365 32285 10450 30612 5853 28100 1142 ... 14656 30864 ] +==== std sort for comparasion ==== +std sort for power-of-two number time is 58.186800 ms + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + +==== Extra : RadixSort ==== +GPU Radix Sort time is 42.627296 ms + passed + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + + +********************************************* +*************** EXTRA CREDIT **************** +************* RADIX SORT TESTS ************** +************* NON-POWER-OF-TWO ************** +********************************************* + [ 38 7719 21238 2437 8855 11797 8365 32285 10450 30612 5853 28100 1142 ... 14656 30864 ] +==== std sort for comparasion ==== +std sort for non-power-of-two number time is 57.672800 ms + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + +==== Extra : RadixSort ==== +GPU Radix Sort time is 42.236992 ms + passed + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact without scan power-of-two number time is 2.507600 ms +==== cpu compact without scan, non-power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact without scan non-power-of-two number time is 2.506600 ms +==== cpu compact with scan ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact with scan time is 7.017800 ms +==== work-efficient compact, power-of-two ==== +GPU Efficient Compact time is 1.964224 ms + passed +==== work-efficient compact, non-power-of-two ==== +GPU Efficient Compact time is 2.022688 ms + passed diff --git a/result_showcase/ArraySize = 2^24.txt b/result_showcase/ArraySize = 2^24.txt new file mode 100644 index 0000000..57390b1 --- /dev/null +++ b/result_showcase/ArraySize = 2^24.txt @@ -0,0 +1,86 @@ +**************** +** SCAN TESTS ** +**************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 42 0 ] +==== cpu scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 411089014 411089056 ] +CPU scan power-of-two number time is 25.093100 ms +==== cpu scan, non-power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 411088950 411088974 ] + passed +CPU scan non-power-of-two number time is 43.115700 ms +==== naive scan, power-of-two ==== +GPU Naive Scan time is 25.539680 ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 411089014 411089056 ] + passed +==== naive scan, non-power-of-two ==== +GPU Naive Scan time is 25.601631 ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 0 0 ] + passed +==== work-efficient scan, power-of-two ==== +GPU Efficient Scan time is 27.663200 ms + passed +==== work-efficient scan, non-power-of-two ==== +GPU Efficient Scan time is 25.619167 ms + passed +==== thrust scan, power-of-two ==== +GPU Thrust Scan time is 2.403232 ms + passed +==== thrust scan, non-power-of-two ==== +GPU Thrust Scan time is 2.132640 ms + passed + +********************************************* +*************** EXTRA CREDIT **************** +************* RADIX SORT TESTS ************** +*************** POWER-OF-TWO **************** +********************************************* + [ 38 7719 21238 2437 8855 11797 8365 32285 10450 30612 5853 28100 1142 ... 7792 2304 ] +==== std sort for comparasion ==== +std sort for power-of-two number time is 894.424700 ms + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + +==== Extra : RadixSort ==== +GPU Radix Sort time is 749.649841 ms + passed + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + + +********************************************* +*************** EXTRA CREDIT **************** +************* RADIX SORT TESTS ************** +************* NON-POWER-OF-TWO ************** +********************************************* + [ 38 7719 21238 2437 8855 11797 8365 32285 10450 30612 5853 28100 1142 ... 7792 2304 ] +==== std sort for comparasion ==== +std sort for non-power-of-two number time is 909.447400 ms + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + +==== Extra : RadixSort ==== +GPU Radix Sort time is 747.378296 ms + passed + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact without scan power-of-two number time is 37.114100 ms +==== cpu compact without scan, non-power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact without scan non-power-of-two number time is 37.096700 ms +==== cpu compact with scan ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact with scan time is 166.425500 ms +==== work-efficient compact, power-of-two ==== +GPU Efficient Compact time is 27.002945 ms + passed +==== work-efficient compact, non-power-of-two ==== +GPU Efficient Compact time is 27.037985 ms + passed diff --git a/result_showcase/ArraySize = 2^8.txt b/result_showcase/ArraySize = 2^8.txt new file mode 100644 index 0000000..ad1ee6d --- /dev/null +++ b/result_showcase/ArraySize = 2^8.txt @@ -0,0 +1,86 @@ +**************** +** SCAN TESTS ** +**************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 26 0 ] +==== cpu scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ] +CPU scan power-of-two number time is 0.000000 ms +==== cpu scan, non-power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6146 6190 ] + passed +CPU scan non-power-of-two number time is 0.000000 ms +==== naive scan, power-of-two ==== +GPU Naive Scan time is 0.031904 ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ] + passed +==== naive scan, non-power-of-two ==== +GPU Naive Scan time is 0.030528 ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 0 0 ] + passed +==== work-efficient scan, power-of-two ==== +GPU Efficient Scan time is 0.110240 ms + passed +==== work-efficient scan, non-power-of-two ==== +GPU Efficient Scan time is 0.101760 ms + passed +==== thrust scan, power-of-two ==== +GPU Thrust Scan time is 0.021248 ms + passed +==== thrust scan, non-power-of-two ==== +GPU Thrust Scan time is 0.012640 ms + passed + +********************************************* +*************** EXTRA CREDIT **************** +************* RADIX SORT TESTS ************** +*************** POWER-OF-TWO **************** +********************************************* + [ 38 39 246 133 151 21 173 29 210 148 221 196 118 ... 156 177 ] +==== std sort for comparasion ==== +std sort for power-of-two number time is 0.000000 ms + [ 0 0 1 1 3 3 3 6 7 7 7 8 9 ... 253 255 ] + +==== Extra : RadixSort ==== +GPU Radix Sort time is 1.105344 ms + passed + [ 0 0 1 1 3 3 3 6 7 7 7 8 9 ... 253 255 ] + + +********************************************* +*************** EXTRA CREDIT **************** +************* RADIX SORT TESTS ************** +************* NON-POWER-OF-TWO ************** +********************************************* + [ 38 39 246 133 151 21 173 29 210 148 221 196 118 ... 156 177 ] +==== std sort for comparasion ==== +std sort for non-power-of-two number time is 0.000000 ms + [ 0 0 1 1 3 3 3 6 7 7 7 8 9 ... 253 255 ] + +==== Extra : RadixSort ==== +GPU Radix Sort time is 1.097664 ms + passed + [ 0 0 1 1 3 3 3 6 7 7 7 8 9 ... 253 255 ] + + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 1 ] + passed +CPU compact without scan power-of-two number time is 0.000000 ms +==== cpu compact without scan, non-power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 3 2 ] + passed +CPU compact without scan non-power-of-two number time is 0.000000 ms +==== cpu compact with scan ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 1 ] + passed +CPU compact with scan time is 0.000000 ms +==== work-efficient compact, power-of-two ==== +GPU Efficient Compact time is 0.221120 ms + passed +==== work-efficient compact, non-power-of-two ==== +GPU Efficient Compact time is 0.216736 ms + passed diff --git a/result_showcase/BlockSize1024.txt b/result_showcase/BlockSize1024.txt new file mode 100644 index 0000000..299109a --- /dev/null +++ b/result_showcase/BlockSize1024.txt @@ -0,0 +1,82 @@ +**************** +** SCAN TESTS ** +**************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 42 0 ] +==== cpu scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 411089014 411089056 ] +CPU scan power-of-two number time is 24.065300 ms +==== cpu scan, non-power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 411088950 411088974 ] + passed +CPU scan non-power-of-two number time is 42.600200 ms +==== naive scan, power-of-two ==== +GPU Naive Scan time is 25.836449 ms + passed +==== naive scan, non-power-of-two ==== +GPU Naive Scan time is 26.082048 ms + passed +==== work-efficient scan, power-of-two ==== +GPU Efficient Scan time is 33.565887 ms + passed +==== work-efficient scan, non-power-of-two ==== +GPU Efficient Scan time is 33.715874 ms + passed +==== thrust scan, power-of-two ==== +GPU Thrust Scan time is 2.211232 ms + passed +==== thrust scan, non-power-of-two ==== +GPU Thrust Scan time is 2.045760 ms + passed + +********************************************* +*************** EXTRA CREDIT **************** +************* RADIX SORT TESTS ************** +*************** POWER-OF-TWO **************** +********************************************* + [ 38 7719 21238 2437 8855 11797 8365 32285 10450 30612 5853 28100 1142 ... 7792 2304 ] +==== std sort for comparasion ==== + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + +==== Extra : RadixSort ==== +GPU Radix Sort time is 860.626038 ms + passed + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + + +********************************************* +*************** EXTRA CREDIT **************** +************* RADIX SORT TESTS ************** +************* NON-POWER-OF-TWO ************** +********************************************* + [ 38 7719 21238 2437 8855 11797 8365 32285 10450 30612 5853 28100 1142 ... 7792 2304 ] +==== std sort for comparasion ==== + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + +==== Extra : RadixSort ==== +GPU Radix Sort time is 851.824646 ms + passed + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact without scan power-of-two number time is 51.122000 ms +==== cpu compact without scan, non-power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact without scan non-power-of-two number time is 37.596400 ms +==== cpu compact with scan ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact with scan time is 147.394200 ms +==== work-efficient compact, power-of-two ==== +GPU Efficient Compact time is 31.792288 ms + passed +==== work-efficient compact, non-power-of-two ==== +GPU Efficient Compact time is 32.711681 ms + passed diff --git a/result_showcase/BlockSize128.txt b/result_showcase/BlockSize128.txt new file mode 100644 index 0000000..421c442 --- /dev/null +++ b/result_showcase/BlockSize128.txt @@ -0,0 +1,82 @@ +**************** +** SCAN TESTS ** +**************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 42 0 ] +==== cpu scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 411089014 411089056 ] +CPU scan power-of-two number time is 24.032100 ms +==== cpu scan, non-power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 411088950 411088974 ] + passed +CPU scan non-power-of-two number time is 42.639800 ms +==== naive scan, power-of-two ==== +GPU Naive Scan time is 25.994272 ms + passed +==== naive scan, non-power-of-two ==== +GPU Naive Scan time is 25.812672 ms + passed +==== work-efficient scan, power-of-two ==== +GPU Efficient Scan time is 27.808865 ms + passed +==== work-efficient scan, non-power-of-two ==== +GPU Efficient Scan time is 24.770847 ms + passed +==== thrust scan, power-of-two ==== +GPU Thrust Scan time is 2.255712 ms + passed +==== thrust scan, non-power-of-two ==== +GPU Thrust Scan time is 2.052608 ms + passed + +********************************************* +*************** EXTRA CREDIT **************** +************* RADIX SORT TESTS ************** +*************** POWER-OF-TWO **************** +********************************************* + [ 38 7719 21238 2437 8855 11797 8365 32285 10450 30612 5853 28100 1142 ... 7792 2304 ] +==== std sort for comparasion ==== + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + +==== Extra : RadixSort ==== +GPU Radix Sort time is 741.838257 ms + passed + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + + +********************************************* +*************** EXTRA CREDIT **************** +************* RADIX SORT TESTS ************** +************* NON-POWER-OF-TWO ************** +********************************************* + [ 38 7719 21238 2437 8855 11797 8365 32285 10450 30612 5853 28100 1142 ... 7792 2304 ] +==== std sort for comparasion ==== + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + +==== Extra : RadixSort ==== +GPU Radix Sort time is 743.634277 ms + passed + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact without scan power-of-two number time is 37.117100 ms +==== cpu compact without scan, non-power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact without scan non-power-of-two number time is 37.113600 ms +==== cpu compact with scan ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact with scan time is 129.808400 ms +==== work-efficient compact, power-of-two ==== +GPU Efficient Compact time is 27.158209 ms + passed +==== work-efficient compact, non-power-of-two ==== +GPU Efficient Compact time is 27.228865 ms + passed diff --git a/result_showcase/BlockSize16.txt b/result_showcase/BlockSize16.txt new file mode 100644 index 0000000..bfc4ca0 --- /dev/null +++ b/result_showcase/BlockSize16.txt @@ -0,0 +1,82 @@ +**************** +** SCAN TESTS ** +**************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 42 0 ] +==== cpu scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 411089014 411089056 ] +CPU scan power-of-two number time is 24.063200 ms +==== cpu scan, non-power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 411088950 411088974 ] + passed +CPU scan non-power-of-two number time is 42.923400 ms +==== naive scan, power-of-two ==== +GPU Naive Scan time is 52.045727 ms + passed +==== naive scan, non-power-of-two ==== +GPU Naive Scan time is 45.901855 ms + passed +==== work-efficient scan, power-of-two ==== +GPU Efficient Scan time is 91.401695 ms + passed +==== work-efficient scan, non-power-of-two ==== +GPU Efficient Scan time is 89.639648 ms + passed +==== thrust scan, power-of-two ==== +GPU Thrust Scan time is 2.128768 ms + passed +==== thrust scan, non-power-of-two ==== +GPU Thrust Scan time is 2.094752 ms + passed + +********************************************* +*************** EXTRA CREDIT **************** +************* RADIX SORT TESTS ************** +*************** POWER-OF-TWO **************** +********************************************* + [ 38 7719 21238 2437 8855 11797 8365 32285 10450 30612 5853 28100 1142 ... 7792 2304 ] +==== std sort for comparasion ==== + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + +==== Extra : RadixSort ==== +GPU Radix Sort time is 2466.394043 ms + passed + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + + +********************************************* +*************** EXTRA CREDIT **************** +************* RADIX SORT TESTS ************** +************* NON-POWER-OF-TWO ************** +********************************************* + [ 38 7719 21238 2437 8855 11797 8365 32285 10450 30612 5853 28100 1142 ... 7792 2304 ] +==== std sort for comparasion ==== + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + +==== Extra : RadixSort ==== +GPU Radix Sort time is 2448.440186 ms + passed + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact without scan power-of-two number time is 37.076700 ms +==== cpu compact without scan, non-power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact without scan non-power-of-two number time is 49.660900 ms +==== cpu compact with scan ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact with scan time is 213.567100 ms +==== work-efficient compact, power-of-two ==== +GPU Efficient Compact time is 111.799934 ms + passed +==== work-efficient compact, non-power-of-two ==== +GPU Efficient Compact time is 100.933853 ms + passed diff --git a/result_showcase/BlockSize256.txt b/result_showcase/BlockSize256.txt new file mode 100644 index 0000000..ef2effc --- /dev/null +++ b/result_showcase/BlockSize256.txt @@ -0,0 +1,82 @@ +**************** +** SCAN TESTS ** +**************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 42 0 ] +==== cpu scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 411089014 411089056 ] +CPU scan power-of-two number time is 24.064000 ms +==== cpu scan, non-power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 411088950 411088974 ] + passed +CPU scan non-power-of-two number time is 41.609900 ms +==== naive scan, power-of-two ==== +GPU Naive Scan time is 25.615328 ms + passed +==== naive scan, non-power-of-two ==== +GPU Naive Scan time is 25.627424 ms + passed +==== work-efficient scan, power-of-two ==== +GPU Efficient Scan time is 27.646433 ms + passed +==== work-efficient scan, non-power-of-two ==== +GPU Efficient Scan time is 27.607807 ms + passed +==== thrust scan, power-of-two ==== +GPU Thrust Scan time is 2.404192 ms + passed +==== thrust scan, non-power-of-two ==== +GPU Thrust Scan time is 2.223552 ms + passed + +********************************************* +*************** EXTRA CREDIT **************** +************* RADIX SORT TESTS ************** +*************** POWER-OF-TWO **************** +********************************************* + [ 38 7719 21238 2437 8855 11797 8365 32285 10450 30612 5853 28100 1142 ... 7792 2304 ] +==== std sort for comparasion ==== + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + +==== Extra : RadixSort ==== +GPU Radix Sort time is 769.176636 ms + passed + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + + +********************************************* +*************** EXTRA CREDIT **************** +************* RADIX SORT TESTS ************** +************* NON-POWER-OF-TWO ************** +********************************************* + [ 38 7719 21238 2437 8855 11797 8365 32285 10450 30612 5853 28100 1142 ... 7792 2304 ] +==== std sort for comparasion ==== + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + +==== Extra : RadixSort ==== +GPU Radix Sort time is 771.496643 ms + passed + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact without scan power-of-two number time is 38.061700 ms +==== cpu compact without scan, non-power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact without scan non-power-of-two number time is 36.591800 ms +==== cpu compact with scan ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact with scan time is 158.915200 ms +==== work-efficient compact, power-of-two ==== +GPU Efficient Compact time is 30.891071 ms + passed +==== work-efficient compact, non-power-of-two ==== +GPU Efficient Compact time is 30.769600 ms + passed diff --git a/result_showcase/BlockSize32.txt b/result_showcase/BlockSize32.txt new file mode 100644 index 0000000..b402d47 --- /dev/null +++ b/result_showcase/BlockSize32.txt @@ -0,0 +1,82 @@ +**************** +** SCAN TESTS ** +**************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 42 0 ] +==== cpu scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 411089014 411089056 ] +CPU scan power-of-two number time is 24.056300 ms +==== cpu scan, non-power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 411088950 411088974 ] + passed +CPU scan non-power-of-two number time is 43.114200 ms +==== naive scan, power-of-two ==== +GPU Naive Scan time is 30.109312 ms + passed +==== naive scan, non-power-of-two ==== +GPU Naive Scan time is 30.138912 ms + passed +==== work-efficient scan, power-of-two ==== +GPU Efficient Scan time is 53.902912 ms + passed +==== work-efficient scan, non-power-of-two ==== +GPU Efficient Scan time is 51.030048 ms + passed +==== thrust scan, power-of-two ==== +GPU Thrust Scan time is 2.094240 ms + passed +==== thrust scan, non-power-of-two ==== +GPU Thrust Scan time is 2.297760 ms + passed + +********************************************* +*************** EXTRA CREDIT **************** +************* RADIX SORT TESTS ************** +*************** POWER-OF-TWO **************** +********************************************* + [ 38 7719 21238 2437 8855 11797 8365 32285 10450 30612 5853 28100 1142 ... 7792 2304 ] +==== std sort for comparasion ==== + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + +==== Extra : RadixSort ==== +GPU Radix Sort time is 1302.962769 ms + passed + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + + +********************************************* +*************** EXTRA CREDIT **************** +************* RADIX SORT TESTS ************** +************* NON-POWER-OF-TWO ************** +********************************************* + [ 38 7719 21238 2437 8855 11797 8365 32285 10450 30612 5853 28100 1142 ... 7792 2304 ] +==== std sort for comparasion ==== + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + +==== Extra : RadixSort ==== +GPU Radix Sort time is 1298.191284 ms + passed + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact without scan power-of-two number time is 37.097700 ms +==== cpu compact without scan, non-power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact without scan non-power-of-two number time is 37.119400 ms +==== cpu compact with scan ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact with scan time is 124.856800 ms +==== work-efficient compact, power-of-two ==== +GPU Efficient Compact time is 49.938145 ms + passed +==== work-efficient compact, non-power-of-two ==== +GPU Efficient Compact time is 50.175392 ms + passed diff --git a/result_showcase/BlockSize512.txt b/result_showcase/BlockSize512.txt new file mode 100644 index 0000000..d4b2095 --- /dev/null +++ b/result_showcase/BlockSize512.txt @@ -0,0 +1,82 @@ +**************** +** SCAN TESTS ** +**************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 42 0 ] +==== cpu scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 411089014 411089056 ] +CPU scan power-of-two number time is 24.588900 ms +==== cpu scan, non-power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 411088950 411088974 ] + passed +CPU scan non-power-of-two number time is 42.111500 ms +==== naive scan, power-of-two ==== +GPU Naive Scan time is 25.576256 ms + passed +==== naive scan, non-power-of-two ==== +GPU Naive Scan time is 25.609535 ms + passed +==== work-efficient scan, power-of-two ==== +GPU Efficient Scan time is 29.840576 ms + passed +==== work-efficient scan, non-power-of-two ==== +GPU Efficient Scan time is 29.848961 ms + passed +==== thrust scan, power-of-two ==== +GPU Thrust Scan time is 2.256288 ms + passed +==== thrust scan, non-power-of-two ==== +GPU Thrust Scan time is 2.146816 ms + passed + +********************************************* +*************** EXTRA CREDIT **************** +************* RADIX SORT TESTS ************** +*************** POWER-OF-TWO **************** +********************************************* + [ 38 7719 21238 2437 8855 11797 8365 32285 10450 30612 5853 28100 1142 ... 7792 2304 ] +==== std sort for comparasion ==== + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + +==== Extra : RadixSort ==== +GPU Radix Sort time is 791.230530 ms + passed + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + + +********************************************* +*************** EXTRA CREDIT **************** +************* RADIX SORT TESTS ************** +************* NON-POWER-OF-TWO ************** +********************************************* + [ 38 7719 21238 2437 8855 11797 8365 32285 10450 30612 5853 28100 1142 ... 7792 2304 ] +==== std sort for comparasion ==== + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + +==== Extra : RadixSort ==== +GPU Radix Sort time is 803.836487 ms + passed + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact without scan power-of-two number time is 37.618200 ms +==== cpu compact without scan, non-power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact without scan non-power-of-two number time is 37.092600 ms +==== cpu compact with scan ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact with scan time is 154.884600 ms +==== work-efficient compact, power-of-two ==== +GPU Efficient Compact time is 32.293793 ms + passed +==== work-efficient compact, non-power-of-two ==== +GPU Efficient Compact time is 32.385727 ms + passed diff --git a/result_showcase/BlockSize64.txt b/result_showcase/BlockSize64.txt new file mode 100644 index 0000000..b06a4be --- /dev/null +++ b/result_showcase/BlockSize64.txt @@ -0,0 +1,82 @@ +**************** +** SCAN TESTS ** +**************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 42 0 ] +==== cpu scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 411089014 411089056 ] +CPU scan power-of-two number time is 24.090800 ms +==== cpu scan, non-power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 411088950 411088974 ] + passed +CPU scan non-power-of-two number time is 42.641300 ms +==== naive scan, power-of-two ==== +GPU Naive Scan time is 25.546721 ms + passed +==== naive scan, non-power-of-two ==== +GPU Naive Scan time is 25.939680 ms + passed +==== work-efficient scan, power-of-two ==== +GPU Efficient Scan time is 29.845119 ms + passed +==== work-efficient scan, non-power-of-two ==== +GPU Efficient Scan time is 27.795744 ms + passed +==== thrust scan, power-of-two ==== +GPU Thrust Scan time is 2.081152 ms + passed +==== thrust scan, non-power-of-two ==== +GPU Thrust Scan time is 2.011712 ms + passed + +********************************************* +*************** EXTRA CREDIT **************** +************* RADIX SORT TESTS ************** +*************** POWER-OF-TWO **************** +********************************************* + [ 38 7719 21238 2437 8855 11797 8365 32285 10450 30612 5853 28100 1142 ... 7792 2304 ] +==== std sort for comparasion ==== + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + +==== Extra : RadixSort ==== +GPU Radix Sort time is 786.153015 ms + passed + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + + +********************************************* +*************** EXTRA CREDIT **************** +************* RADIX SORT TESTS ************** +************* NON-POWER-OF-TWO ************** +********************************************* + [ 38 7719 21238 2437 8855 11797 8365 32285 10450 30612 5853 28100 1142 ... 7792 2304 ] +==== std sort for comparasion ==== + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + +==== Extra : RadixSort ==== +GPU Radix Sort time is 788.341675 ms + passed + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact without scan power-of-two number time is 37.617900 ms +==== cpu compact without scan, non-power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact without scan non-power-of-two number time is 37.122000 ms +==== cpu compact with scan ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +CPU compact with scan time is 126.336900 ms +==== work-efficient compact, power-of-two ==== +GPU Efficient Compact time is 28.904863 ms + passed +==== work-efficient compact, non-power-of-two ==== +GPU Efficient Compact time is 28.872959 ms + passed diff --git a/result_showcase/FD5D5800 b/result_showcase/FD5D5800 new file mode 100644 index 0000000..b0c2771 Binary files /dev/null and b/result_showcase/FD5D5800 differ diff --git a/result_showcase/Profiling.PNG b/result_showcase/Profiling.PNG new file mode 100644 index 0000000..5f81ab0 Binary files /dev/null and b/result_showcase/Profiling.PNG differ diff --git a/result_showcase/ReadMeAboutArraySizeChoose0.PNG b/result_showcase/ReadMeAboutArraySizeChoose0.PNG new file mode 100644 index 0000000..869e448 Binary files /dev/null and b/result_showcase/ReadMeAboutArraySizeChoose0.PNG differ diff --git a/result_showcase/ReadMeAboutArraySizeChoose1.PNG b/result_showcase/ReadMeAboutArraySizeChoose1.PNG new file mode 100644 index 0000000..2357034 Binary files /dev/null and b/result_showcase/ReadMeAboutArraySizeChoose1.PNG differ diff --git a/result_showcase/ReadMeAboutBlockSizeChoose1.PNG b/result_showcase/ReadMeAboutBlockSizeChoose1.PNG new file mode 100644 index 0000000..9e4d059 Binary files /dev/null and b/result_showcase/ReadMeAboutBlockSizeChoose1.PNG differ diff --git a/result_showcase/ReadMeAboutBlockSizeChoose2.PNG b/result_showcase/ReadMeAboutBlockSizeChoose2.PNG new file mode 100644 index 0000000..375052c Binary files /dev/null and b/result_showcase/ReadMeAboutBlockSizeChoose2.PNG differ diff --git a/result_showcase/Result Comparasion.xlsx b/result_showcase/Result Comparasion.xlsx new file mode 100644 index 0000000..e3fa0df Binary files /dev/null and b/result_showcase/Result Comparasion.xlsx differ diff --git a/result_showcase/XueyinResultOriginal_pow(2,24).gif b/result_showcase/XueyinResultOriginal_pow(2,24).gif new file mode 100644 index 0000000..03f5872 Binary files /dev/null and b/result_showcase/XueyinResultOriginal_pow(2,24).gif differ diff --git a/src/main.cpp b/src/main.cpp index 675da35..1825752 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -7,17 +7,22 @@ */ #include +#include +#include +#include #include #include #include #include +#include +#include #include "testing_helpers.hpp" -int main(int argc, char* argv[]) { - const int SIZE = 1 << 8; - const int NPOT = SIZE - 3; - int a[SIZE], b[SIZE], c[SIZE]; +const int SIZE = 1 << 24; +const int NPOT = SIZE - 3; +int a[SIZE], b[SIZE], c[SIZE]; +int main(int argc, char* argv[]) { // Scan tests printf("\n"); @@ -31,51 +36,128 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, b); printDesc("cpu scan, power-of-two"); + auto startTime = std::chrono::high_resolution_clock::now(); StreamCompaction::CPU::scan(SIZE, b, a); + auto endTime = std::chrono::high_resolution_clock::now(); printArray(SIZE, b, true); + std::chrono::duration eclipsed = endTime - startTime; + double delta = eclipsed.count(); + printf("CPU scan power-of-two number time is %f ms\n", delta); zeroArray(SIZE, c); printDesc("cpu scan, non-power-of-two"); + startTime = std::chrono::high_resolution_clock::now(); StreamCompaction::CPU::scan(NPOT, c, a); - printArray(NPOT, b, true); + endTime = std::chrono::high_resolution_clock::now(); + eclipsed = endTime - startTime; + delta = eclipsed.count(); + printArray(NPOT, b, true); printCmpResult(NPOT, b, c); + printf("CPU scan non-power-of-two number time is %f ms\n", delta); zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); - //printArray(SIZE, c, true); + // printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); - //printArray(NPOT, c, true); + // printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); - //printArray(SIZE, c, true); + // printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); - //printArray(NPOT, c, true); + // printArray(NPOT, c, true); printCmpResult(NPOT, b, c); + printf("\n"); + printf("*********************************************\n"); + printf("*************** EXTRA CREDIT ****************\n"); + printf("************* RADIX SORT TESTS **************\n"); + printf("*************** POWER-OF-TWO ****************\n"); + printf("*********************************************\n"); + + genArray(SIZE, a, SIZE); + printArray(SIZE, a, true); + memcpy(b, a, SIZE*sizeof(int)); + printDesc("std sort for comparasion"); + startTime = std::chrono::high_resolution_clock::now(); + std::sort(a, a + SIZE); + endTime = std::chrono::high_resolution_clock::now(); + eclipsed = endTime - startTime; + delta = eclipsed.count(); + printf("std sort for power-of-two number time is %f ms\n", delta); + + printArray(SIZE, a, true); + printf("\n"); + + printDesc("Extra : RadixSort"); + StreamCompaction::Radix::RadixSort(SIZE, b, SIZE); + printCmpResult(SIZE, b, a); + printArray(SIZE, b, true); + printf("\n"); + + printf("\n"); + printf("*********************************************\n"); + printf("*************** EXTRA CREDIT ****************\n"); + printf("************* RADIX SORT TESTS **************\n"); + printf("************* NON-POWER-OF-TWO **************\n"); + printf("*********************************************\n"); + + //zeroArray(SIZE, c); + //printDesc("work-efficient scan, power-of-two"); + //StreamCompaction::Efficient::scan(SIZE, c, a); + //// printArray(SIZE, c, true); + //printCmpResult(SIZE, b, c); + + //zeroArray(SIZE, c); + //printDesc("work-efficient scan, non-power-of-two"); + //StreamCompaction::Efficient::scan(NPOT, c, a); + //// printArray(NPOT, c, true); + //printCmpResult(NPOT, b, c); + + genArray(SIZE, a, SIZE); + printArray(SIZE, a, true); + memcpy(b, a, NPOT * sizeof(int)); + + printDesc("std sort for comparasion"); + startTime = std::chrono::high_resolution_clock::now(); + std::sort(a, a + NPOT); + endTime = std::chrono::high_resolution_clock::now(); + eclipsed = endTime - startTime; + delta = eclipsed.count(); + printf("std sort for non-power-of-two number time is %f ms\n", delta); + + printArray(NPOT, a, true); + printf("\n"); + + printDesc("Extra : RadixSort"); + StreamCompaction::Radix::RadixSort(NPOT, b, SIZE); + printCmpResult(NPOT, b, a); + printArray(NPOT, b, true); + printf("\n"); + printf("\n"); printf("*****************************\n"); printf("** STREAM COMPACTION TESTS **\n"); @@ -88,26 +170,40 @@ int main(int argc, char* argv[]) { printArray(SIZE, a, true); int count, expectedCount, expectedNPOT; - zeroArray(SIZE, b); printDesc("cpu compact without scan, power-of-two"); + startTime = std::chrono::high_resolution_clock::now(); count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - expectedCount = count; + endTime = std::chrono::high_resolution_clock::now(); + eclipsed = endTime - startTime; + delta = eclipsed.count(); + expectedCount = count; printArray(count, b, true); printCmpLenResult(count, expectedCount, b, b); + printf("CPU compact without scan power-of-two number time is %f ms\n", delta); zeroArray(SIZE, c); printDesc("cpu compact without scan, non-power-of-two"); + startTime = std::chrono::high_resolution_clock::now(); count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + endTime = std::chrono::high_resolution_clock::now(); + eclipsed = endTime - startTime; + delta = eclipsed.count(); expectedNPOT = count; printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + printf("CPU compact without scan non-power-of-two number time is %f ms\n", delta); zeroArray(SIZE, c); printDesc("cpu compact with scan"); + startTime = std::chrono::high_resolution_clock::now(); count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); + endTime = std::chrono::high_resolution_clock::now(); + eclipsed = endTime - startTime; + delta = eclipsed.count(); printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); + printf("CPU compact with scan time is %f ms\n", delta); zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..bcc484e 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -9,6 +9,8 @@ set(SOURCE_FILES "efficient.cu" "thrust.h" "thrust.cu" + "radix.h" + "radix.cu" ) cuda_add_library(stream_compaction diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index fe872d4..56a061c 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -24,6 +24,10 @@ namespace Common { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + if (index < n) { + bools[index] = (idata[index] == 0) ? 0 : 1; + } } /** @@ -33,6 +37,19 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + if (index >= n) { + return; + } + //special deal with the last element + if (index == n - 1) { + odata[indices[index]] = idata[index]; + } + + else if (indices[index] != indices[index + 1]) { + odata[indices[index]] = idata[index]; + } + //over } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 4f52663..9562149 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -3,9 +3,13 @@ #include #include #include +#include +#include +#include #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +#define blockSize 128 /** * Check for CUDA errors; print and exit if there was a problem. @@ -31,5 +35,6 @@ namespace Common { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices); + } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index e600c29..8f839f9 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -9,7 +9,15 @@ namespace CPU { */ void scan(int n, int *odata, const int *idata) { // TODO - printf("TODO\n"); + odata[0] = 0; + + + // exclusive prefix sum + for (int i = 1; i < n; i++) { + odata[i] = odata[i - 1] + idata[i - 1]; + } + + // printf("TODO\n"); } /** @@ -19,7 +27,14 @@ void scan(int n, int *odata, const int *idata) { */ int compactWithoutScan(int n, int *odata, const int *idata) { // TODO - return -1; + // return -1; + int counter = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[counter++] = idata[i]; + } + } + return counter; } /** @@ -27,9 +42,39 @@ int compactWithoutScan(int n, int *odata, const int *idata) { * * @returns the number of elements remaining after compaction. */ + +/** +* so helper function scatter here +**/ + +int scatter(int n, int *odata, const int *idata, const int *idataChanged, const int *exclusivePreSum){ + int counter = 0; + for (int i = 0; i < n; i++) { + if (idataChanged[i] == 1) { + odata[exclusivePreSum[i]] = idata[i]; + counter++; + } + } + return counter; +} + int compactWithScan(int n, int *odata, const int *idata) { // TODO - return -1; + // return -1; + int* idataChanged = new int[n]; + int* exclusivePreSum = new int[n]; + + for (int i = 0; i < n; i++) { + idataChanged[i] = (idata[i] == 0) ? 0 : 1; + } + + //odataChanged is the exclusive prefix sum + scan(n, exclusivePreSum, idataChanged); + int counter = scatter(n, odata, idata, idataChanged, exclusivePreSum); + delete[] idataChanged; + delete[] exclusivePreSum; + return counter; + } } diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h index 6348bf3..7b2ef2e 100644 --- a/stream_compaction/cpu.h +++ b/stream_compaction/cpu.h @@ -6,6 +6,8 @@ namespace CPU { int compactWithoutScan(int n, int *odata, const int *idata); + int scatter(int n, int *odata, const int *idata, const int *idataChanged, const int *exclusivePreSum); + int compactWithScan(int n, int *odata, const int *idata); } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index b2f739b..fe3a03d 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -7,13 +7,101 @@ namespace StreamCompaction { namespace Efficient { // TODO: __global__ + __global__ void upSweep(int N, int d, int *idata) { + int n = (blockDim.x * blockIdx.x) + threadIdx.x; + if (n >= N) { + return; + } + int delta = 1 << d; + int doubleDelta = 1 << (d + 1); + if (n % doubleDelta == 0) { + idata[n + doubleDelta - 1] += idata[n + delta - 1]; + } + } + + __global__ void downSweep(int N, int d, int *idata) { + int n = (blockDim.x * blockIdx.x) + threadIdx.x; + if (n >= N) { + return; + } + int delta = 1 << d; + int doubleDelta = 1 << (d + 1); + if (n % doubleDelta == 0) { + int temp = idata[n + delta - 1]; + idata[n + delta - 1] = idata[n + doubleDelta - 1]; + idata[n + doubleDelta - 1] += temp; + } + } + + void scanInDevice(int n, int *devData) { + int blockNum = (n + blockSize - 1) / blockSize; + for (int d = 0; d < ilog2ceil(n) - 1; d++) { + upSweep << > >(n, d, devData); + checkCUDAError("upSweep not correct..."); + } + //set last element to zero, refer to slides! + int counter = 0; + cudaMemcpy(&devData[n - 1], &counter, sizeof(int), cudaMemcpyHostToDevice); + + for (int d = ilog2ceil(n) - 1; d >= 0; d--) { + downSweep << > >(n, d, devData); + checkCUDAError("downSweep not correct..."); + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { // TODO - printf("TODO\n"); + // printf("TODO\n"); + int *temp; + int realN; + + // google for bit operation + // if n is not 2^a number + if (n & (n - 1) != 0) { + // enlarge to be a 2^a number + realN = 1 << (ilog2ceil(n)); + temp = (int*)malloc(realN * sizeof(int)); + memcpy(temp, idata, realN * sizeof(int)); + // update the new added elements to zero + for (int j = n; j < realN; j++) { + temp[j] = 0; + } + + } else { // is 2^a + //do nothing, realN is n + realN = n; + temp = (int*)malloc(realN * sizeof(int)); + memcpy(temp, idata, realN * sizeof(int)); + } + + int arraySize = realN * sizeof(int); + int *devIdata; + + cudaMalloc((void**)&devIdata, arraySize); + checkCUDAError("cudaMalloc devIdata failed"); + cudaMemcpy(devIdata, temp, arraySize, cudaMemcpyHostToDevice); + + //Add performance analysis + cudaEvent_t start, end; + cudaEventCreate(&start); + cudaEventCreate(&end); + cudaEventRecord(start); + + //call scanInDevice Function + scanInDevice(realN, devIdata); + + //Add performance analysis + cudaEventRecord(end); + cudaEventSynchronize(end); + float deltaTime; + cudaEventElapsedTime(&deltaTime, start, end); + printf("GPU Efficient Scan time is %f ms\n", deltaTime); + cudaMemcpy(odata, devIdata, arraySize, cudaMemcpyDeviceToHost); + cudaFree(devIdata); + } /** @@ -27,7 +115,80 @@ void scan(int n, int *odata, const int *idata) { */ int compact(int n, int *odata, const int *idata) { // TODO - return -1; + // return -1; + int *temp; + int realN; + + if (n & (n - 1) != 0) { // if size is not a power of 2 + // enlarge to be a 2^a number + realN = 1 << (ilog2ceil(n)); + temp = (int*)malloc(realN * sizeof(int)); + memcpy(temp, idata, realN * sizeof(int)); + + // update the new added elements to zero + for (int j = n; j < realN; j++) { + temp[j] = 0; + } + + } else { // is 2^a + //do nothing, realN is n + realN = n; + temp = (int*)malloc(realN * sizeof(int)); + memcpy(temp, idata, realN * sizeof(int)); + } + + int arraySize = realN * sizeof(int); + int blockNum = (realN + blockSize - 1) / blockSize; + + int *devIdata; + int *devOdata; + int *devIndex; + + cudaMalloc((void**)&devIdata, arraySize); + checkCUDAError("cudaMalloc devIdata failed"); + cudaMalloc((void**)&devOdata, arraySize); + checkCUDAError("cudaMalloc devOdata failed"); + cudaMalloc((void**)&devIndex, arraySize); + checkCUDAError("cudaMalloc devIndex failed"); + + cudaMemcpy(devIdata, temp, arraySize, cudaMemcpyHostToDevice); + + //Add performance analysis + cudaEvent_t start, end; + cudaEventCreate(&start); + cudaEventCreate(&end); + cudaEventRecord(start); + + StreamCompaction::Common::kernMapToBoolean << > >(realN, devIndex, devIdata); + int lastElem; + cudaMemcpy(&lastElem, devIndex + realN - 1, sizeof(int), cudaMemcpyDeviceToHost); + + scanInDevice(realN, devIndex); + int size; + cudaMemcpy(&size, devIndex + realN - 1, sizeof(int), cudaMemcpyDeviceToHost); + + StreamCompaction::Common::kernScatter << > >(realN, devOdata, devIdata, devIndex, devIndex); + + //Add performance analysis + cudaEventRecord(end); + cudaEventSynchronize(end); + float deltaTime; + cudaEventElapsedTime(&deltaTime, start, end); + printf("GPU Efficient Compact time is %f ms\n", deltaTime); + + cudaMemcpy(odata, devOdata, arraySize, cudaMemcpyDeviceToHost); + + //exclusive scan + if (lastElem == 1) { + size++; + } + + cudaFree(devIdata); + cudaFree(devOdata); + cudaFree(devIndex); + + return size; + } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 395ba10..c798b2f 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -2,6 +2,8 @@ namespace StreamCompaction { namespace Efficient { + void scanInDevice(int n, int *devData); + void scan(int n, int *odata, const int *idata); int compact(int n, int *odata, const int *idata); diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3d86b60..4d573b5 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,18 +3,78 @@ #include "common.h" #include "naive.h" +int* devIdata; +int* devOdata; + namespace StreamCompaction { -namespace Naive { + namespace Naive { -// TODO: __global__ + // TODO: __global__ + __global__ void kernelNaive(int n, int delta, const int *idata, int *odata) { + int index = (blockIdx.x *blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + if (index - delta < 0) { + odata[index] = idata[index]; + } else { + odata[index] = idata[index - delta] + idata[index]; + } + } -/** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ -void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); -} + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ + void scan(int n, int *odata, const int *idata) { + // TODO + // printf("TODO\n"); + cudaMalloc((void**)&devIdata, n * sizeof(int)); + checkCUDAError("cudaMalloc devIdata failed"); -} + cudaMalloc((void**)&devOdata, n * sizeof(int)); + checkCUDAError("cudaMalloc devOdata failed"); + + cudaMemcpy(devIdata, idata, n*sizeof(int), cudaMemcpyHostToDevice); + + //performance check... remember... + + int blockNum = (n + blockSize - 1) / blockSize; + + + //Add performance analysis + cudaEvent_t start, end; + cudaEventCreate(&start); + cudaEventCreate(&end); + cudaEventRecord(start); + + //Naive Parallel Scan + int level = ilog2ceil(n); + int delta; + for (int d = 1; d <= level; d++) { + // pow (2,d-1) + // refer to slides + delta = (1 << (d - 1)); + kernelNaive << < blockNum, blockSize >> >(n, delta, devIdata, devOdata); + std::swap(devIdata, devOdata); + } + // Think twice............. + std::swap(devIdata, devOdata); + //Add performance analysis + cudaEventRecord(end); + cudaEventSynchronize(end); + float deltaTime; + cudaEventElapsedTime(&deltaTime, start, end); + printf("GPU Naive Scan time is %f ms\n", deltaTime); + + // exclusive scan, set odata[0] = 0 seperately + cudaMemcpy(odata + 1, devOdata, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost); + odata[0] = 0; + + cudaFree(devIdata); + cudaFree(devOdata); + + checkCUDAError("naice scan error..."); + } + + } } diff --git a/stream_compaction/radix.cu b/stream_compaction/radix.cu new file mode 100644 index 0000000..c3f4e5d --- /dev/null +++ b/stream_compaction/radix.cu @@ -0,0 +1,136 @@ +#include +#include +#include "common.h" +#include "radix.h" +#include "efficient.h" + +namespace StreamCompaction { +namespace Radix { + + //b array + __global__ void kernTestTrueFalseOnRightKthBit(int n, int k, int* odata, const int* idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + if (((1 << k) & idata[index]) == (1<= n) { + return; + } + odata[index] = 1 - idata[index]; + } + + //t array + __global__ void kernComputeTArray(int n, const int *bArray, const int *idata, int *odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= n) { + return; + } + odata[index] = index - idata[index] + idata[n - 1] + !bArray[n - 1]; + } + + //d array + __global__ void kernComputeDArray(int n, int * dArray, const int *bArray, const int *fArray, const int *tArray) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= n) { + return; + } + dArray[index] = (bArray[index] ? tArray[index] : fArray[index]); + } + + //Reshuffle Index + __global__ void kernReshuffle(int n, int* idata, int *odata, const int *dArray) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= n) { + return; + } + odata[dArray[index]] = idata[index]; + } + + void RadixSort(int n, int* idata, int maxNum) { + int *devIdata; int *devOdata; + int *bArray; int *eArray; int *fArray; int *tArray; int *dArray; + int realN = 0; + + //Where bugs come from...... + if (n & (n - 1) != 0){ + realN = 1 << (ilog2ceil(n)); + } else { + realN = n; + } + + cudaMalloc((void**)&devIdata, sizeof(int) * n); + checkCUDAError("cudaMalloc radix devIdata array failed"); + + cudaMalloc((void**)&devOdata, sizeof(int) * n); + checkCUDAError("cudaMalloc radix devOdata array failed"); + + cudaMalloc((void**)&bArray, sizeof(int) * n); + checkCUDAError("cudaMalloc radix bArray failed"); + + cudaMalloc((void**)&eArray, sizeof(int) * realN); + cudaMemset(eArray, sizeof(int) * realN, 0); + checkCUDAError("cudaMalloc radix eArray failed"); + + //Remember do realN here...... + //cudaMalloc((void**)&fArray, sizeof(int) * c); + //checkCUDAError("cudaMalloc radix fArray failed"); + + cudaMalloc((void**)&tArray, sizeof(int) * n); + checkCUDAError("cudaMalloc radix tArray failed"); + + cudaMalloc((void**)&dArray, sizeof(int) * n); + checkCUDAError("cudaMalloc radix dArray failed"); + + int blockNum = (n + blockSize - 1) / blockSize; + int digitNum = ilog2ceil(maxNum); + + cudaMemcpy(devIdata, idata, n*sizeof(int), cudaMemcpyHostToDevice); + + //Add performance analysis + cudaEvent_t start, end; + cudaEventCreate(&start); + cudaEventCreate(&end); + cudaEventRecord(start); + + for (int i = 0; i < digitNum; i++) { + kernTestTrueFalseOnRightKthBit << < blockNum, blockSize >> >(n, i, bArray, devIdata); + kernNotOperatorOnArray << < blockNum, blockSize >> >(n, eArray, bArray); + + fArray = eArray; + //cudaMemcpy(fArray, eArray, n*sizeof(int), cudaMemcpyDeviceToDevice); + StreamCompaction::Efficient::scanInDevice(realN, fArray); + + //Slides.... + kernComputeTArray << < blockNum, blockSize >> >(n, bArray, fArray, tArray); + kernComputeDArray << < blockNum, blockSize >> >(n, dArray, bArray, fArray, tArray); + kernReshuffle << > >(n, devIdata, devOdata, dArray); + std::swap(devOdata, devIdata); + } + + //Add performance analysis + cudaEventRecord(end); + cudaEventSynchronize(end); + float deltaTime; + cudaEventElapsedTime(&deltaTime, start, end); + printf("GPU Radix Sort time is %f ms\n", deltaTime); + + cudaMemcpy(idata, devIdata, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(devIdata); cudaFree(devOdata); cudaFree(bArray); cudaFree(eArray); + /*cudaFree(fArray);*/ cudaFree(tArray); cudaFree(dArray); + + } + + +} +} diff --git a/stream_compaction/radix.h b/stream_compaction/radix.h new file mode 100644 index 0000000..2536cc5 --- /dev/null +++ b/stream_compaction/radix.h @@ -0,0 +1,7 @@ +#pragma once + +namespace StreamCompaction { +namespace Radix { + void RadixSort(int n, int *idata, int maxNum); +} +} diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index d8dbb32..1377e6b 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -14,8 +14,29 @@ namespace Thrust { */ void scan(int n, int *odata, const int *idata) { // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + thrust::device_vector devIdata(idata, idata + n); + thrust::device_vector devOdata(odata, odata + n); + + // example: for device_vectors dv_in and dv_out: + // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + //Add performance analysis + cudaEvent_t start, end; + cudaEventCreate(&start); + cudaEventCreate(&end); + cudaEventRecord(start); + + thrust::exclusive_scan(devIdata.begin(), devIdata.end(), devOdata.begin()); + + //Add performance analysis + cudaEventRecord(end); + cudaEventSynchronize(end); + float deltaTime; + cudaEventElapsedTime(&deltaTime, start, end); + printf("GPU Thrust Scan time is %f ms\n", deltaTime); + + thrust::copy(devOdata.begin(), devOdata.end(), odata); } }