diff --git a/Book1.xlsx b/Book1.xlsx new file mode 100644 index 0000000..11315bf Binary files /dev/null and b/Book1.xlsx differ diff --git a/README.md b/README.md index b71c458..89a2b87 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,142 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Xiang Deng +* Tested on: Windows 10-Home, i7-6700U @ 2.6GHz 16GB, GTX 1060 6GB (Personal Computer) -### (TODO: Your README) + +* Compare all of these GPU Scan implementations (Naive, Work-Efficient, and Thrust) to the serial CPU version of Scan. Plot a graph of the comparison (with array size on the independent axis). +(The following are the results for timing vs. varing arraysize with blocksize=128) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +![](images/1.PNG) +![](images/7.PNG) + +Based on the figure and data above, regarding scanning, I found the breakpoint for GPU outperforms the GPU is around the arraysize of 2^16, after which the GPU sigificantly speed up than the CPU. +The CPU shows its adavantage for small arraysize. + +![](images/2.PNG) + +![](images/3.PNG) + +![](images/8.PNG) + +Based on the figure and data above, regarding compacting, I found the breakpoint for GPU outperforms the GPU is between the arraysize of 2^16 and 2^20, after which the GPU sigificantly speed up than the CPU. +The CPU still shows its adavantage for small arraysize. + +# Regarding "bottlenecks" : + +Firstly we didn't observe significant improvement from Naive GPU scanning to Efficient Scanning, one possible reasoning that might shed light on this is the +increasing number of sleeping threads as the levels of the balance tree grows higher. + +The memory transfer between CPU and GPU also slows down the performance, one possible way to alleviate this problem might be increasing the data size which reduces the +percentage of such overhead in the meanwhile. + +The switching between the upsweeping and downsweeping trigerrs new kernels to be established, which could be more efficient if the kernels could be reused. + +![](images/4.PNG) + + +* Optimization of blocksize: +Experiments was conducted on various blocksizes from 32 to 1024 with exponential growth. Typically we observed the optimizal value of block size (256) which best +balance the optimal value of scan time as well as compact time for GPU. Since earlier we observed the array size of 2^16 is around the "turning poit", we +used this parameter for the tuning of the blocksize. + +![](images/5.PNG) + +![](images/6.PNG) + +* Extra credits +* 1) I typically found arraysize of 2^16 or greater already makes the GPU outforms the GPU. +* 2) The radix sort was implemented and tested. The testing function (at the end of the main.cpp) generates array size of power of two and not power of two. In both cases, +we compare the sorting result with C++ built in sorting function to verify the correctness. It's correctness has been verified. + +# Test output + +``` +**************** +** SCAN TESTS ** +**************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 6 0 ] +==== cpu scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 25680538 25680544 ] +time lapsed 2.110000 ms +==== cpu scan, non-power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 25680508 25680512 ] + passed +time lapsed 2.130000 ms +==== naive scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 25680538 25680544 ] + passed +time lapsed 1.296384 ms +==== naive scan, non-power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 0 0 ] + passed +time lapsed 1.291712 ms +==== work-efficient scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 25680538 25680544 ] + passed +time lapsed 1.671008 ms +==== work-efficient scan, non-power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 25680508 25680512 ] + passed +time lapsed 1.668096 ms +==== thrust scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 25680538 25680544 ] + passed +time lapsed 0.630000 ms +==== thrust scan, non-power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 25680508 25680512 ] + passed +time lapsed 0.620000 ms + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +time lapsed 3.060000 ms +==== cpu compact without scan, non-power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +time lapsed 3.110000 ms +==== cpu compact with scan ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +time lapsed 9.810000 ms +==== work-efficient compact, power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +time lapsed 2.623488 ms +==== work-efficient compact, non-power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +time lapsed 2.942464 ms + +***************************** +** RADIX SORT TESTS ** +***************************** +==== Array to be sorted power of 2 ==== + [ 38 119 38 37 55 197 165 85 50 12 53 100 142 ... 56 0 ] +==== RADIX SORT POT ==== +size of int is 32 bits + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 199 199 ] +==== C++ SORT POT ==== + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 199 199 ] + passed +==== Array to be sorted not power of 2 ==== + [ 38 1719 1238 437 855 1797 365 285 450 612 1853 100 1142 ... 656 0 ] +==== RADIX SORT NPOT ==== +size of int is 32 bits + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 1999 1999 ] +==== C++ SORT NPOT ==== + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 1999 1999 ] + passed +``` + +## Note +### Modified files +CMakeList.txt : add radixSort.h and radixSort.cu, changed -arch=sm_20 to sm_61 +Two files added: radixSort.h and radixSort.cu \ No newline at end of file diff --git a/images/1.PNG b/images/1.PNG new file mode 100644 index 0000000..49c88d3 Binary files /dev/null and b/images/1.PNG differ diff --git a/images/2.PNG b/images/2.PNG new file mode 100644 index 0000000..a8f5d6b Binary files /dev/null and b/images/2.PNG differ diff --git a/images/3.PNG b/images/3.PNG new file mode 100644 index 0000000..6af9f61 Binary files /dev/null and b/images/3.PNG differ diff --git a/images/4.PNG b/images/4.PNG new file mode 100644 index 0000000..602d89a Binary files /dev/null and b/images/4.PNG differ diff --git a/images/5.PNG b/images/5.PNG new file mode 100644 index 0000000..15f4aa1 Binary files /dev/null and b/images/5.PNG differ diff --git a/images/6.PNG b/images/6.PNG new file mode 100644 index 0000000..0484d0e Binary files /dev/null and b/images/6.PNG differ diff --git a/images/7.PNG b/images/7.PNG new file mode 100644 index 0000000..5f1ff0f Binary files /dev/null and b/images/7.PNG differ diff --git a/images/8.PNG b/images/8.PNG new file mode 100644 index 0000000..9d04fa9 Binary files /dev/null and b/images/8.PNG differ diff --git a/src/main.cpp b/src/main.cpp index 675da35..dcbbf87 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,8 +1,8 @@ /** * @file main.cpp * @brief Stream compaction test program - * @authors Kai Ninomiya - * @date 2015 + * @authors Kai Ninomiya, Xiang Deng + * @date 2015, 2016 * @copyright University of Pennsylvania */ @@ -11,13 +11,18 @@ #include #include #include +#include #include "testing_helpers.hpp" - +#include +#include +#include int main(int argc, char* argv[]) { - const int SIZE = 1 << 8; + const int SIZE = 1 << 20; const int NPOT = SIZE - 3; int a[SIZE], b[SIZE], c[SIZE]; - + float milscs; + int nitercpu = 100; + // Scan tests printf("\n"); @@ -31,50 +36,82 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, b); printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); + auto begin = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < nitercpu; i++){ + StreamCompaction::CPU::scan(SIZE, b, a); + } + auto end = std::chrono::high_resolution_clock::now(); + milscs = (float)std::chrono::duration_cast(end - begin).count() / (float)nitercpu; printArray(SIZE, b, true); + printf("time lapsed %f ms\n", milscs); + zeroArray(SIZE, c); printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); + begin = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < nitercpu; i++){ + StreamCompaction::CPU::scan(NPOT, c, a); + } + end = std::chrono::high_resolution_clock::now(); + milscs = (float)std::chrono::duration_cast(end - begin).count() / nitercpu; printArray(NPOT, b, true); printCmpResult(NPOT, b, c); + printf("time lapsed %f ms\n", milscs); + zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - //printArray(SIZE, c, true); + milscs = StreamCompaction::Naive::scan(SIZE, c, a); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); + printf("time lapsed %f ms\n", milscs); zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - //printArray(SIZE, c, true); + milscs = StreamCompaction::Naive::scan(NPOT, c, a); + printArray(SIZE, c, true); printCmpResult(NPOT, b, c); - + printf("time lapsed %f ms\n", milscs); + + zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); + milscs = StreamCompaction::Efficient::scan(SIZE, c, a); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + printf("time lapsed %f ms\n", milscs); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - //printArray(NPOT, c, true); + milscs = StreamCompaction::Efficient::scan(NPOT, c, a); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); + printf("time lapsed %f ms\n", milscs); + zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - //printArray(SIZE, c, true); + begin = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < nitercpu; i++){ + StreamCompaction::Thrust::scan(SIZE, c, a); + } + end = std::chrono::high_resolution_clock::now(); + milscs = (float)std::chrono::duration_cast(end - begin).count() / nitercpu; + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); + printf("time lapsed %f ms\n", milscs); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - //printArray(NPOT, c, true); + begin = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < nitercpu; i++){ + StreamCompaction::Thrust::scan(NPOT, c, a); + } + end = std::chrono::high_resolution_clock::now(); + milscs = (float)std::chrono::duration_cast(end - begin).count() / nitercpu; + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); + printf("time lapsed %f ms\n", milscs); printf("\n"); printf("*****************************\n"); @@ -91,33 +128,92 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, b); printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + begin = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < nitercpu; i++){ + count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + } + end = std::chrono::high_resolution_clock::now(); + milscs = (float)std::chrono::duration_cast(end - begin).count() / nitercpu; expectedCount = count; printArray(count, b, true); printCmpLenResult(count, expectedCount, b, b); + printf("time lapsed %f ms\n", milscs); zeroArray(SIZE, c); printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + begin = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < nitercpu; i++){ + count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + } + end = std::chrono::high_resolution_clock::now(); expectedNPOT = count; printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + milscs = (float)std::chrono::duration_cast(end - begin).count() / nitercpu; + printf("time lapsed %f ms\n", milscs); zeroArray(SIZE, c); printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); + begin = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < nitercpu; i++){ + count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); + } + end = std::chrono::high_resolution_clock::now(); printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); + milscs = (float)std::chrono::duration_cast(end - begin).count() / nitercpu; + printf("time lapsed %f ms\n", milscs); zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); - //printArray(count, c, true); + count = StreamCompaction::Efficient::compact(SIZE, c, a, milscs); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); + printf("time lapsed %f ms\n", milscs); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - //printArray(count, c, true); + count = StreamCompaction::Efficient::compact(NPOT, c, a, milscs); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + printf("time lapsed %f ms\n", milscs); + + printf("\n"); + printf("*****************************\n"); + printf("** RADIX SORT TESTS **\n"); + printf("*****************************\n"); + genArray(SIZE - 1, a, 200); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + printDesc("Array to be sorted power of 2"); + printArray(SIZE, a, true); + zeroArray(SIZE, b); + printDesc("RADIX SORT POT"); + int msb=31; + printf("size of int is %d bits\n", sizeof(int)*CHAR_BIT); + StreamCompaction::RadixSort::sort(SIZE, b, a, msb); + printArray(SIZE, b, true); + + printDesc("C++ SORT POT"); + memcpy(c, a, SIZE*sizeof(int)); + std::sort(c,c+SIZE); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + genArray(SIZE - 1, a, 2000); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + printDesc("Array to be sorted not power of 2"); + printArray(SIZE, a, true); + zeroArray(SIZE, b); + zeroArray(SIZE, c); + printDesc("RADIX SORT NPOT"); + printf("size of int is %d bits\n", sizeof(int)*CHAR_BIT); + StreamCompaction::RadixSort::sort(NPOT, b, a, msb); + printArray(NPOT, b, true); + + printDesc("C++ SORT NPOT"); + memcpy(c, a, SIZE*sizeof(int)); + std::sort(c,c+NPOT); + printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + } diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..44a1ee9 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -9,9 +9,11 @@ set(SOURCE_FILES "efficient.cu" "thrust.h" "thrust.cu" + "radixSort.h" + "radixSort.cu" ) cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_61 ) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index fe872d4..99aef01 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -24,6 +24,10 @@ namespace Common { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index < n){ + bools[index] = idata[index] != 0; + } } /** @@ -32,7 +36,11 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= n) return; + if (bools[index]){ + odata[indices[index]] = idata[index]; + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 4f52663..5ebcdb9 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -3,7 +3,7 @@ #include #include #include - +#define blockSize 256 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index e600c29..7c326a2 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -9,7 +9,15 @@ namespace CPU { */ void scan(int n, int *odata, const int *idata) { // TODO - printf("TODO\n"); + //printf("TODO\n"); + if (n<=0) return; + for (int i=0; i +#include #include #include "common.h" #include "efficient.h" @@ -7,13 +7,95 @@ namespace StreamCompaction { namespace Efficient { // TODO: __global__ +__global__ void upSweep(int offset, int n, int *idata){ + int index = threadIdx.x + blockIdx.x*blockDim.x; + if (index >=n) return; + int tmp=(offset << 1); + if (index % tmp==0){ + if (index + tmp <=n){ + idata[index+tmp-1] += idata[index+offset-1] ; + } + } +} + +__global__ void downSweep(int offset, int n, int *idata){ + int index = threadIdx.x + blockIdx.x*blockDim.x; + if (index >=n) return; + int tmp=(offset << 1); + if (index % tmp==0){ + + if (index + tmp <= n){ + int t = idata[index + offset -1]; + idata[index+offset-1] = idata[index+ tmp -1]; + idata[index+ tmp -1] += t ; + } + + } +} + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ -void scan(int n, int *odata, const int *idata) { +float scan(int n, int *odata, const int *idata) { + cudaEvent_t t0, t2; + cudaEventCreate(&t0); + cudaEventCreate(&t2); + + float milliscs = 0.0f; + float tmpt; // TODO - printf("TODO\n"); + //printf("TODO\n"); + int levels_max = ilog2ceil(n); + int n_max= 1 << levels_max; + + dim3 numblocks(std::ceil((double) n_max / blockSize)); + int* idata_buff; + //allocate more space than needed + cudaMalloc((void**)&idata_buff, n_max*sizeof(int)); + checkCUDAError("cudaMalloc-idata_buff- failed!"); + //reset all to zeros + cudaMemset(idata_buff, 0, n_max*sizeof(int)); + checkCUDAError("cudaMemset-idata_buff- failed!"); + + /// CPU -->GPU + cudaMemcpy(idata_buff,idata,n*sizeof(int),cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy-idata_buff-failed"); + + cudaEventRecord(t0); + + //upsweep + for (int level=0; level <= levels_max-1; level++){ + upSweep<<>>(1<=0 ; level--){ + downSweep<<>>(1< CPU + cudaMemcpy(odata, idata_buff, n*sizeof(int),cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy-odata-failed"); + cudaFree(idata_buff); + return milliscs; } /** @@ -25,9 +107,64 @@ void scan(int n, int *odata, const int *idata) { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. */ -int compact(int n, int *odata, const int *idata) { - // TODO - return -1; +int compact(int n, int *odata, const int *idata, float &milliscs) { + cudaEvent_t t0, t2; + cudaEventCreate(&t0); + cudaEventCreate(&t2); + + milliscs = 0.0f; + float tmpt; + + + int n_remaing=0; + int * idata_buff; + int * odata_buff; + int * bool_buff; + int * indices_buff; + + dim3 numblocks(std::ceil((double) n/blockSize)); + // + cudaMalloc((void**)&idata_buff,n * sizeof(int)); + checkCUDAError("cudaMalloc-idata_buff-failed"); + cudaMalloc((void**)&odata_buff,n * sizeof(int)); + checkCUDAError("cudaMalloc-odata_buff-failed"); + cudaMalloc((void**)&bool_buff,n * sizeof(int)); + checkCUDAError("cudaMalloc-odata_buff-failed"); + cudaMalloc((void**)&indices_buff,n * sizeof(int)); + checkCUDAError("cudaMalloc-odata_buff-failed"); + + cudaMemcpy(idata_buff, idata, n* sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy-idata_buff-failed"); + cudaMemcpy(odata_buff, odata, n* sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy-odata_buff-failed"); + + cudaEventRecord(t0); + //produce the indices + StreamCompaction::Common::kernMapToBoolean<<>> ( n, bool_buff, idata_buff); + + scan (n, indices_buff, bool_buff); + + StreamCompaction::Common::kernScatter<<>>( n, odata_buff, idata_buff, bool_buff, indices_buff); + + cudaEventRecord(t2); + cudaEventSynchronize(t2); + cudaEventElapsedTime(&tmpt, t0, t2); + milliscs += tmpt; + + //GPU-->CPU + cudaMemcpy(odata,odata_buff,n*sizeof(int),cudaMemcpyDeviceToHost); + + //for (int i =0; i< n; i++){ + // n_remaing+=bool_buff[i]; + //} + cudaMemcpy(&n_remaing,indices_buff+n-1,sizeof(int),cudaMemcpyDeviceToHost); + int extra; + cudaMemcpy(&extra, bool_buff + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(idata_buff); + cudaFree(odata_buff); + cudaFree(bool_buff); + cudaFree(indices_buff); + return n_remaing + extra; } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 395ba10..bd669eb 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -2,8 +2,9 @@ namespace StreamCompaction { namespace Efficient { - void scan(int n, int *odata, const int *idata); + float scan(int n, int *odata, const int *idata); - int compact(int n, int *odata, const int *idata); + int compact(int n, int *odata, const int *idata, float & milliscs); + } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3d86b60..89b356f 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -1,20 +1,97 @@ #include #include +#include + #include "common.h" #include "naive.h" namespace StreamCompaction { -namespace Naive { + namespace Naive { + + + //__global__ + __global__ void scan(int offset, int n, int *odata, const int *idata) { + int index = threadIdx.x + blockIdx.x*blockDim.x; + if (index >= n) return; + + if (index >= offset){ + odata[index] = idata[index] + idata[index - offset]; + } + else{ + odata[index] = idata[index]; + } + } + __global__ void excludesiveShift(int n, int *odata, int *idata){ + int index = threadIdx.x + blockIdx.x* blockDim.x; + if (index >= n) return; + if (index >= 1){ + odata[index] = idata[index - 1]; + } + else { + odata[index] = 0; + } + } + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ + float scan(int n, int *odata, const int *idata) { + cudaEvent_t t0, t2; + cudaEventCreate(&t0); + cudaEventCreate(&t2); + + float milliscs = 0.0f; + float tmpt; + //dim3 numblocks(std::ceil((double) n / blockSize)); + dim3 numblocks((n + blockSize - 1) / blockSize); + int* idata_buff; + int* odata_buff; + + cudaMalloc((void**)&idata_buff, n*sizeof(int)); + checkCUDAError("cudaMalloc-idata_buff- failed!"); + cudaMalloc((void**)&odata_buff, n*sizeof(int)); + checkCUDAError("cudaMalloc-odata_buff-failed!"); + + /// CPU -->GPU + cudaMemcpy(idata_buff, idata, sizeof(int)*n, cudaMemcpyHostToDevice); + cudaMemcpy(odata_buff, idata, sizeof(int)*n, cudaMemcpyHostToDevice); + + + for (int level = 1; level <= ilog2ceil(n); level++) { + int offset; + if (level == 1){ + offset = 1; + } + else { + offset = 2 << (level - 2); + } + + cudaEventRecord(t0); + // for the given level, all threads read from idata_buff + scan << > >(offset, n, odata_buff, idata_buff); + cudaEventRecord(t2); + cudaEventSynchronize(t2); + cudaEventElapsedTime(&tmpt, t0, t2); + milliscs += tmpt; + + //std::swap(idata_buff, odata_buff); + // odata_buff --> idata_buff for next iteration + cudaMemcpy(idata_buff, odata_buff, sizeof(int)*n, cudaMemcpyDeviceToDevice); + } + + cudaEventRecord(t0); + excludesiveShift << > >(n, odata_buff, idata_buff); + cudaEventRecord(t2); + cudaEventSynchronize(t2); + cudaEventElapsedTime(&tmpt, t0, t2); + milliscs += tmpt; -// TODO: __global__ + //GPU --> CPU + cudaMemcpy(odata, odata_buff, sizeof(int)*n, cudaMemcpyDeviceToHost); + cudaFree(idata_buff); + cudaFree(odata_buff); -/** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ -void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); -} + return milliscs; + } -} -} + } +} \ No newline at end of file diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 21152d6..7090b46 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -2,6 +2,6 @@ namespace StreamCompaction { namespace Naive { - void scan(int n, int *odata, const int *idata); + float scan(int n, int *odata, const int *idata); } } diff --git a/stream_compaction/radixSort.cu b/stream_compaction/radixSort.cu new file mode 100644 index 0000000..ecaf03e --- /dev/null +++ b/stream_compaction/radixSort.cu @@ -0,0 +1,130 @@ +#include +#include +#include "common.h" +#include "radixSort.h" +#include "efficient.h" + +namespace StreamCompaction { + namespace RadixSort { + + + //int getNbit(int input, int nth){ + // return (input >> nth) & 1; + //} + + // assume the input and output are bits + __global__ void computeE(int n, int * edata, const int * bdata){ + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index < n) { + //edata[index] = ~(bdata[index]); + edata[index] = 1 - (bdata[index]); + //if (index ==0){ + // odata[index] = ~(0|idata[index]); + //} + //else { + // odata[index] = ~(idata[index-1]|idata[index]); + //} + } + } + __global__ void computeT(int n, int * tdata, const int * fdata, const int totalFalses){ + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index < n){ + tdata[index] = index - fdata[index] + totalFalses; + } + } + + __global__ void computeB(int n, int *bdata, const int *idata, int ith){ + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index < n){ + //bdata[index] = (idata[index]>> ith) & 1; + bdata[index] = (idata[index] & (1<>ith; + } + } + + __global__ void computeD(int n, int *ddata, const int * bdata, const int *tdata, const int * fdata){ + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index < n){ + //printf(" %d \n",bdata[index]); + ddata[index] = bdata[index] ? tdata[index] : fdata[index]; + } + } + + __global__ void scatter(int n, int *odata, const int *idata, const int * ddata){ + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index < n){ + //odata[index]= idata[ddata[index]]; + odata[ddata[index]]= idata[index]; + } + } + /** + * radix sort + */ + void sort(int n, int *odata, const int *idata, int msb) { + dim3 numblocks(std::ceil((double) n / blockSize)); + + int * idata_buff; + int * idata_buff2; + int * bdata_buff; + int * edata_buff; + int * fdata_buff; + int * tdata_buff; + int * ddata_buff; + + + cudaMalloc((void**)&idata_buff,n*sizeof(int)); + checkCUDAError("cudaMalloc-idata_buff- failed!"); + cudaMalloc((void**)&idata_buff2,n*sizeof(int)); + checkCUDAError("cudaMalloc-idata_buff2- failed!"); + cudaMalloc((void**)&bdata_buff,n*sizeof(int)); + checkCUDAError("cudaMalloc-bdata_buff- failed!"); + cudaMalloc((void**)&edata_buff,n*sizeof(int)); + checkCUDAError("cudaMalloc-edata_buff- failed!"); + cudaMalloc((void**)&fdata_buff,n*sizeof(int)); + checkCUDAError("cudaMalloc-fdata_buff- failed!"); + cudaMalloc((void**)&tdata_buff,n*sizeof(int)); + checkCUDAError("cudaMalloc-tdata_buff- failed!"); + cudaMalloc((void**)&ddata_buff,n*sizeof(int)); + checkCUDAError("cudaMalloc-ddata_buff- failed!"); + + /// CPU -->GPU + cudaMemcpy(idata_buff,idata,n*sizeof(int),cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy-idata_buff-failed"); + + for (int i=0; i<= msb; i++){ + int totalFalses; + int totalFalses1 = 0; + int totalFalses2 = 0; + //find b array for each bit + computeB<<>>(n, bdata_buff, idata_buff, i); + computeE<<>>(n, edata_buff, bdata_buff); + StreamCompaction::Efficient::scan(n, fdata_buff, edata_buff); + + cudaMemcpy(&totalFalses1,edata_buff+n-1,sizeof(int),cudaMemcpyDeviceToHost); + cudaMemcpy(&totalFalses2,fdata_buff+n-1,sizeof(int),cudaMemcpyDeviceToHost); + totalFalses = totalFalses1 + totalFalses2; + + computeT<<>>(n, tdata_buff, fdata_buff, totalFalses); + computeD<<>>(n, ddata_buff, bdata_buff, tdata_buff, fdata_buff); + + + //scatter darray for this bit + scatter<<>>(n, idata_buff2, idata_buff, ddata_buff); + cudaMemcpy(idata_buff,idata_buff2,n*sizeof(int),cudaMemcpyDeviceToDevice); + checkCUDAError("cudaMemcpy-idata_buff-failed"); + } + + //GPU --> CPU + cudaMemcpy(odata,idata_buff,n*sizeof(int),cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy-odata-failed"); + //free + cudaFree(idata_buff); + cudaFree(idata_buff2); + cudaFree(bdata_buff); + cudaFree(tdata_buff); + cudaFree(fdata_buff); + cudaFree(edata_buff); + cudaFree(ddata_buff); + } + + } +} diff --git a/stream_compaction/radixSort.h b/stream_compaction/radixSort.h new file mode 100644 index 0000000..fd69bbe --- /dev/null +++ b/stream_compaction/radixSort.h @@ -0,0 +1,8 @@ +#pragma once + +namespace StreamCompaction { +namespace RadixSort { + void sort(int n, int *odata, const int *idata, int msb); + +} +} diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index d8dbb32..1141bdb 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -5,6 +5,8 @@ #include #include "common.h" #include "thrust.h" +#include +//#include namespace StreamCompaction { namespace Thrust { @@ -16,6 +18,15 @@ void scan(int n, int *odata, const int *idata) { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + //auto begin = std::chrono::high_resolution_clock::now(); + + thrust::exclusive_scan(idata , idata +n , odata); + + //auto end = std::chrono::high_resolution_clock::now(); + //float ns = std::chrono::duration_cast(end - begin).count(); + // float ns=0; + //return ns; } }