diff --git a/CMakeLists.txt b/CMakeLists.txt index 16abe08..a51d998 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,6 +9,7 @@ set(CMAKE_CXX_STANDARD 11) list(APPEND CUDA_NVCC_FLAGS_DEBUG -G -g) list(APPEND CUDA_NVCC_FLAGS_RELWITHDEBUGINFO -lineinfo) +list(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_60,code=sm_60) # Crucial magic for CUDA linking find_package(Threads REQUIRED) diff --git a/README.md b/README.md index b71c458..3b17394 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,159 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +Sarah Forcier -### (TODO: Your README) +Tested on GeForce GTX 1070 -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +### Overview +This project provides an introduction to parallel algorithms on integer arrays: +* Scan : Computes an exclusive prefix sum, i.e. for each element the sum of all integers that come before it in the array +* Stream Compaction : Removes 0s from an array of integers +* Radix Sort : Sorts an array of integers from least to most significant bit + +### Parallel Algorithms +#### Naive Scan + +The diagram below illustrates the naive parallel scan algorithm. It starts by adding adjacent values, then adds pairs of those partial sums, and so on. This requires log2(n) iterations and additional memory, since the algorithm cannot work in place. + +![](img/naive.jpg) + +#### Work Efficient Scan + +In contrast to the naive scan, the work-efficient algorithm works in place. The idea is to treat the input data as a balanced binary tree: an up-sweep computes the partial sums at internal nodes, and a root-to-leaves sweep (down-sweep) then builds the scan in place. An actual tree data structure is not required, as the tree can be flattened into an array. + +![](img/upsweep.jpg) + +In the down-sweep stage, the root of the tree is replaced with *0*, and at each step a node passes its own value to its left child and the sum of its value and the old left-child value to its right child, as seen below. + +![](img/downsweep.jpg) + +The first up-sweep pass touches almost *n* elements, so *n* threads must be allocated even though many of them do no work at deeper levels and simply retire early. For down-sweep, an additional optimization can be used to launch only the number of threads needed at each level. + +Because the up-sweep and down-sweep algorithms rely on the array size being a power of two, non-power-of-two arrays must be padded on the right to the next power of two in order to maintain correct indexing. + +#### Stream Compaction + +The stream compaction algorithm starts by creating a temporary boolean array in which each element is *1* if the value at that index in the input array is non-zero and *0* otherwise. This boolean array is then scanned using the implementation above. The scan result gives each surviving element's index in the output: wherever the boolean array holds *1*, the corresponding input value is written to the final array at the index specified by the scanned array.
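+As a concrete picture of the map/scan/scatter pipeline described above, a minimal CUDA sketch is shown below. The kernel and variable names are illustrative rather than the exact ones used in this repository, and the exclusive scan of the flag array is assumed to come from one of the scan implementations above.
+
+```
+// Map each element to a 0/1 flag: 1 if the value should be kept.
+__global__ void kernMapToFlag(int n, int *flags, const int *idata) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n) flags[i] = (idata[i] != 0) ? 1 : 0;
+}
+
+// Scatter: the exclusive scan of the flags gives each kept element its output index.
+__global__ void kernScatterCompact(int n, int *odata, const int *idata,
+                                   const int *flags, const int *indices) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n && flags[i]) odata[indices[i]] = idata[i];
+}
+
+// Host side: flags -> exclusive scan -> indices; the compacted length is
+// flags[n - 1] + indices[n - 1].
+```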
+#### Radix Sort + +The parallel implementation of radix sort begins similarly to the stream compaction algorithm above. First, for bit *k*, a temporary boolean array *b* is created in which each element is *0* if the *k*-th bit of the corresponding input element is *1*, and *1* otherwise. An exclusive scan is run on this boolean array, producing an array *f*. Next, the total number of falses (the number of values whose *k*-th bit is *0*) is calculated by summing the last element of *b* and the last element of *f*. This value is used to create another array *t* in which the value at index *i* equals *i - f[i] + totalFalses*. The last array to be computed contains the destination index of each input element when sorted by the *k*-th bit; its values are *b[i] ? f[i] : t[i]*, and a final scatter moves the elements to those destinations. While this algorithm requires multiple passes and several arrays, it is not necessary to allocate space for each step: some steps can be done in place, and later steps can overwrite earlier ones once they are no longer needed. + +*Images taken from GPU Gems 3, Chapter 39*
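+The per-bit split described above can be summarized in two small kernels. This is a hedged sketch with illustrative names; it assumes *b* and its exclusive scan *f* have already been computed on the device and that *totalFalses* has been read back as described.
+
+```
+// Destination index of each element for the current bit.
+// b[i] == 1 means the k-th bit of the element is 0 (a "false").
+__global__ void kernComputeDestination(int n, int totalFalses, int *dest,
+                                       const int *b, const int *f) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= n) return;
+    int t = i - f[i] + totalFalses;   // destination if the bit is 1
+    dest[i] = b[i] ? f[i] : t;        // falses keep their relative order first
+}
+
+// Scatter the elements to their destinations for this bit.
+__global__ void kernScatterByDestination(int n, int *odata,
+                                         const int *idata, const int *dest) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n) odata[dest[i]] = idata[i];
+}
+```
+
+One such pass runs per bit, from the least significant bit up to the most significant bit that is set anywhere in the input.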
+### Test Program Output + +#### Scan + +``` + [ 0 31 19 29 27 6 28 18 48 16 43 0 5 49 24 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0ms (std::chrono Measured) + [ 0 0 31 50 79 106 112 140 158 206 222 265 265 270 319 343 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0ms (std::chrono Measured) + [ 0 0 31 50 79 106 112 140 158 206 222 265 265 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.011904ms (CUDA Measured) + [ 0 0 31 50 79 106 112 140 158 206 222 265 265 270 319 343 ] + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.0112ms (CUDA Measured) + [ 0 0 31 50 79 106 112 140 158 206 222 265 265 ] + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.018048ms (CUDA Measured) + [ 0 0 31 50 79 106 112 140 158 206 222 265 265 270 319 343 ] + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.017888ms (CUDA Measured) + [ 0 0 31 50 79 106 112 140 158 206 222 265 265 ] + passed +==== thrust scan, power-of-two ==== + elapsed time: 10.8968ms (CUDA Measured) + [ 0 0 31 50 79 106 112 140 158 206 222 265 265 270 319 343 ] + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.013568ms (CUDA Measured) + [ 0 0 31 50 79 106 112 140 158 206 222 265 265 ] + passed +``` + +#### Stream Compaction + +``` + [ 0 3 3 3 1 0 0 0 2 2 3 2 1 3 2 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.000277ms (std::chrono Measured) + [ 3 3 3 1 2 2 3 2 1 3 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.000277ms (std::chrono Measured) + [ 3 3 3 1 2 2 3 2 1 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.000278ms (std::chrono Measured) + [ 3 3 3 1 2 2 3 2 1 3 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.217408ms (CUDA Measured) + [ 3 3 3 1 2 2 3 2 1 3 2 ] + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.201632ms (CUDA Measured) + [ 3 3 3 1 2 2 3 2 1 ] + passed +``` + +#### Radix Sort + +``` + [ 8 15 7 7 13 12 12 8 14 14 3 2 1 3 10 0 ] +==== cpu sort, power-of-two ==== + elapsed time: 0.00111ms (std::chrono Measured) + [ 0 1 2 3 3 7 7 8 8 10 12 12 13 14 14 15 ] +==== cpu sort, non-power-of-two ==== + elapsed time: 0.000555ms (std::chrono Measured) + [ 1 2 3 7 7 8 8 12 12 13 14 14 15 ] +==== radix sort, power-of-two ==== + elapsed time: 0.784896ms (CUDA Measured) + [ 0 1 2 3 3 7 7 8 8 10 12 12 13 14 14 15 ] + passed +==== radix sort, non-power-of-two ==== + elapsed time: 0.656448ms (CUDA Measured) + [ 1 2 3 7 7 8 8 12 12 13 14 14 15 ] + passed +``` + +### Performance Analysis + +| Scan for SIZE < 20K | Scan for SIZE > 4M | +| ------------- | ----------- | +| ![](img/scan20K.png) | ![](img/scan4m.png) | + +The CPU method on non-power-of-two arrays is faster than on power-of-two arrays because the CPU executes serially: the non-power-of-two array has 3 fewer elements, so its for-loops run 3 fewer iterations. On the GPU the relationship is reversed, because warps are allocated in groups of 32 threads: a non-power-of-two array cannot fill its last warp, some threads go unused, and occupancy is lower. For an array with 3 fewer elements than a power of two (as tested here), the difference is marginal, but it is much more apparent for arrays with 1 more element than a power of two. + +| Compact for SIZE < 20K | Compact for SIZE > 4M | +| ------------- | ----------- | +| ![](img/compact20k.png) | ![](img/compact4m.png) | + +The difference in performance between the CPU method, the CPU method with scan, and the GPU work-efficient method illustrates why parallel and serial execution call for different algorithms. The CPU method with scan is a serial version of the GPU work-efficient method: without multiple threads running at once, it must rely on several for-loops, more than the compaction implementation without scan, and it is therefore the slowest of the methods. + +| Radix Sort for SIZE < 20K | Radix Sort for SIZE > 4M | +| ------------- | ----------- | +| ![](img/radix20k.png) | ![](img/radix4m.png) | + +Much like scan and compaction, GPU radix sort is faster for large array sizes even though the parallel version requires more sort passes: the GPU works on individual bits in order to benefit from scan, whereas the CPU version works on decimal digits. The CPU algorithm finds the maximum value in the array to determine how many passes are required. The GPU could need up to 32 passes on a full integer array, while the CPU only loops over the number of digits in the maximum value, so for the sake of a fair comparison the GPU method is given the number of significant bits in the input (as sketched below) and the computation of the maximum is not included in the CPU timing.
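+A minimal sketch of how that pass count can be derived from the input; the `ilog2ceil` helper is the one defined in `common.h`, while the function name and loop are illustrative:
+
+```
+// Number of radix-sort bit passes needed to cover every value in the array.
+// Assumes non-negative inputs; 32 is the worst case for a full int.
+int numBitPasses(int n, const int *data) {
+    int maxVal = 0;
+    for (int i = 0; i < n; ++i) {
+        if (data[i] > maxVal) maxVal = data[i];
+    }
+    return (maxVal > 0) ? ilog2ceil(maxVal + 1) : 1;
+}
+```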
+### Q&A + +#### Compare GPU Scan implementations to the serial CPU version? +The GPU version performs better for scan, stream compaction, and radix sort on large arrays (*SIZE > 2^18*). Since the crossover between the CPU and GPU methods occurs at roughly the same array size for all three, the slow GPU performance on small arrays appears to come from the overhead of creating and distributing threads, an overhead that is only amortized for large arrays. + +#### Where are the performance bottlenecks? +For the work-efficient implementation, the upsweep and downsweep kernel functions require a similar amount of time because they have the same ratio of compute to memory instructions. Most time is spent in memory reads and writes, but since there are not many instructions of either type, it is not possible to hide the memory accesses behind computation. + +#### How does the GPU Scan implementation compare to thrust? +Thrust exclusive scan on power-of-two sized arrays performs poorly but consistently for most array sizes except the largest. The consistency indicates that the thrust implementation may have a large fixed overhead: while the other implementations' runtimes keep increasing with array size, thrust on power-of-two arrays remains around 10ms for all array sizes. However, the thrust method on non-power-of-two arrays behaves more like the other GPU scan implementations, with the same kind of increase but much better performance. One explanation for the poor performance on power-of-two arrays is that the thrust implementation may allocate and compute on extra memory at those sizes but not at non-power-of-two sizes - the opposite of what the work-efficient method does. According to the Nsight performance analysis, thrust also has low warp occupancy (25%) and a high number of registers per thread (78), which could also explain the poor performance at these sizes. However, thrust makes good use of shared memory. diff --git a/img/compact20k.png b/img/compact20k.png new file mode 100644 index 0000000..987cfe6 Binary files /dev/null and b/img/compact20k.png differ diff --git a/img/compact4m.png b/img/compact4m.png new file mode 100644 index 0000000..10e04cf Binary files /dev/null and b/img/compact4m.png differ diff --git a/img/downsweep.jpg b/img/downsweep.jpg new file mode 100644 index 0000000..88be7ff Binary files /dev/null and b/img/downsweep.jpg differ diff --git a/img/naive.jpg b/img/naive.jpg new file mode 100644 index 0000000..bc9f9da Binary files /dev/null and b/img/naive.jpg differ diff --git a/img/radix20k.png b/img/radix20k.png new file mode 100644 index 0000000..26b65cd Binary files /dev/null and b/img/radix20k.png differ diff --git a/img/radix4m.png b/img/radix4m.png new file mode 100644 index 0000000..9bee8c2 Binary files /dev/null and b/img/radix4m.png differ diff --git a/img/scan20K.png b/img/scan20K.png new file mode 100644 index 0000000..dc70d23 Binary files /dev/null and b/img/scan20K.png differ diff --git a/img/scan4m.png b/img/scan4m.png new file mode 100644 index 0000000..fc4477f Binary files /dev/null and b/img/scan4m.png differ diff --git a/img/upsweep.jpg b/img/upsweep.jpg new file mode 100644 index 0000000..22c4da0 Binary files /dev/null and b/img/upsweep.jpg differ diff --git a/performance.xlsx b/performance.xlsx new file mode 100644 index 0000000..187a647 Binary files /dev/null and b/performance.xlsx differ diff --git a/src/main.cpp b/src/main.cpp index 7305641..94fdd03 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -10,23 +10,32 @@ #include #include #include +#include #include +#include #include "testing_helpers.hpp" - -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 20; // must be <= 20 const int NPOT = SIZE - 3; // Non-Power-Of-Two -int a[SIZE], b[SIZE], c[SIZE]; +//int a[SIZE], b[SIZE], c[SIZE], d[SIZE]; int main(int argc, char* argv[]) { // Scan tests + int* a = reinterpret_cast<int*>(malloc(SIZE * sizeof(int))); + int* b = reinterpret_cast<int*>(malloc(SIZE * sizeof(int))); + int* c = reinterpret_cast<int*>(malloc(SIZE * sizeof(int))); + int* d = reinterpret_cast<int*>(malloc(SIZE * sizeof(int))); + printf("\n"); printf("****************\n"); printf("** SCAN TESTS **\n"); printf("****************\n"); - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; + //genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + + for (int i = 0; i < SIZE; ++i) a[i] = i; + //a[SIZE - 1] = 0; + printArray(SIZE, a, true); // initialize b using StreamCompaction::CPU::scan you implement @@ -49,42 +58,63 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); -
//printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); - zeroArray(SIZE, c); + zeroArray(SIZE, c); + printDesc("shared scan, power-of-two"); + StreamCompaction::Efficient_Shared::scan(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient_Shared::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("shared scan, non-power-of-two"); + StreamCompaction::Efficient_Shared::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient_Shared::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + //printDesc("thrust scan, power-of-two"); + StreamCompaction::Thrust::scan(SIZE, c, a); + //printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + //printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -98,46 +128,97 @@ int main(int argc, char* argv[]) { a[SIZE - 1] = 0; printArray(SIZE, a, true); - int count, expectedCount, expectedNPOT; + int countSIZE, countNPOT, expectedSIZE, expectedNPOT; // initialize b using StreamCompaction::CPU::compactWithoutScan you implement // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. 
zeroArray(SIZE, b); printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + countSIZE = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedCount = count; - printArray(count, b, true); - printCmpLenResult(count, expectedCount, b, b); + expectedSIZE = countSIZE; + printArray(expectedSIZE, b, true); + printCmpLenResult(countSIZE, expectedSIZE, b, b); zeroArray(SIZE, c); printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + countNPOT = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedNPOT = count; - printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); + expectedNPOT = countNPOT; + printArray(countNPOT, c, true); + printCmpLenResult(countNPOT, expectedNPOT, b, c); zeroArray(SIZE, c); printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); + countSIZE = StreamCompaction::CPU::compactWithScan(SIZE, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); + printArray(countSIZE, c, true); + printCmpLenResult(countSIZE, expectedSIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); + countSIZE = StreamCompaction::Efficient::compact(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); + printArray(expectedSIZE, c, true); + printCmpLenResult(countSIZE, expectedSIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); + countNPOT = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); + printArray(expectedNPOT, c, true); + printCmpLenResult(countNPOT, expectedNPOT, b, c); + + zeroArray(SIZE, c); + printDesc("shared compact, power-of-two"); + countSIZE = StreamCompaction::Efficient_Shared::compact(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient_Shared::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(expectedSIZE, c, true); + printCmpLenResult(countSIZE, expectedSIZE, b, c); + + zeroArray(SIZE, c); + printDesc("shared compact, non-power-of-two"); + countNPOT = StreamCompaction::Efficient_Shared::compact(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient_Shared::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(expectedNPOT, c, true); + printCmpLenResult(countNPOT, expectedNPOT, b, c); + + // printf("\n"); + // printf("*****************************\n"); + // printf("** RADIX SORT TESTS **\n"); + // printf("*****************************\n"); + + // // Radix Tests + + //int k = 4; + //genArray(SIZE - 1, a, 1 << k); + // printArray(SIZE, a, true); + + // zeroArray(SIZE, b); + // printDesc("cpu sort, 
power-of-two"); + // StreamCompaction::CPU::sort(SIZE, b, a); + // printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + // printArray(SIZE, b, true); + + // zeroArray(SIZE, c); + // printDesc("cpu sort, non-power-of-two"); + // StreamCompaction::CPU::sort(NPOT, c, a); + // printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + // printArray(NPOT, c, true); + + // zeroArray(SIZE, d); + // printDesc("radix sort, power-of-two"); + // StreamCompaction::Radix::sort(SIZE, k + 1, d, a); + // printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + // printArray(SIZE, d, true); + //printCmpResult(SIZE, b, d); + + // zeroArray(SIZE, d); + // printDesc("radix sort, non-power-of-two"); + // StreamCompaction::Radix::sort(NPOT, k + 1, d, a); + // printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + // printArray(NPOT, d, true); + // printCmpResult(NPOT, c, d); system("pause"); // stop Win32 console from closing on exit } diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..1d1f41d 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -7,11 +7,15 @@ set(SOURCE_FILES "naive.cu" "efficient.h" "efficient.cu" + "efficient_shared.h" + "efficient_shared.cu" "thrust.h" "thrust.cu" + "radix.h" + "radix.cu" ) cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 - ) + OPTIONS -arch=sm_60 + ) \ No newline at end of file diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 8fc0211..f85eec5 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -24,16 +24,22 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n || idata[index] == 0) return; + + bools[index] = 1; } /** * Performs scatter on an array. That is, for each element in idata, * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. */ - __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices) { + __global__ void kernScatter(int n, int *odata, const int *idata, const int *indices) { // TODO - } + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n || idata[index] == 0) return; + odata[indices[index]] = idata[index]; + } } -} +} \ No newline at end of file diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 55f1b38..158245d 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -1,17 +1,22 @@ #pragma once -#include -#include - -#include -#include -#include -#include +#include +#include + +#include +#include +#include +#include #include -#include +#include #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +#define blockSize 1024 +#define bankSize 16 +#define log_bankSize 4 +//#define conflictFreeOffset(n) ((n) >> bankSize + (n) >> (2 * log_bankSize)) +#define conflictFreeOffset(n) 0 /** * Check for CUDA errors; print and exit if there was a problem. 
@@ -20,6 +25,7 @@ void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1); inline int ilog2(int x) { int lg = 0; + x = std::max(0, x); while (x >>= 1) { ++lg; } @@ -32,101 +38,101 @@ inline int ilog2ceil(int x) { namespace StreamCompaction { namespace Common { - __global__ void kernMapToBoolean(int n, int *bools, const int *idata); + __global__ void kernMapToBoolean(int n, int *bools, const int *idata); __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices); - - /** - * This class is used for timing the performance - * Uncopyable and unmovable - * - * Adapted from WindyDarian(https://github.com/WindyDarian) - */ - class PerformanceTimer - { - public: - PerformanceTimer() - { - cudaEventCreate(&event_start); - cudaEventCreate(&event_end); - } - - ~PerformanceTimer() - { - cudaEventDestroy(event_start); - cudaEventDestroy(event_end); - } - - void startCpuTimer() - { - if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); } - cpu_timer_started = true; - - time_start_cpu = std::chrono::high_resolution_clock::now(); - } - - void endCpuTimer() - { - time_end_cpu = std::chrono::high_resolution_clock::now(); - - if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); } - - std::chrono::duration duro = time_end_cpu - time_start_cpu; - prev_elapsed_time_cpu_milliseconds = - static_cast(duro.count()); - - cpu_timer_started = false; - } - - void startGpuTimer() - { - if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); } - gpu_timer_started = true; - - cudaEventRecord(event_start); - } - - void endGpuTimer() - { - cudaEventRecord(event_end); - cudaEventSynchronize(event_end); - - if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); } - - cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end); - gpu_timer_started = false; - } - - float getCpuElapsedTimeForPreviousOperation() //noexcept //(damn I need VS 2015 - { - return prev_elapsed_time_cpu_milliseconds; - } - - float getGpuElapsedTimeForPreviousOperation() //noexcept - { - return prev_elapsed_time_gpu_milliseconds; - } - - // remove copy and move functions - PerformanceTimer(const PerformanceTimer&) = delete; - PerformanceTimer(PerformanceTimer&&) = delete; - PerformanceTimer& operator=(const PerformanceTimer&) = delete; - PerformanceTimer& operator=(PerformanceTimer&&) = delete; - - private: - cudaEvent_t event_start = nullptr; - cudaEvent_t event_end = nullptr; - - using time_point_t = std::chrono::high_resolution_clock::time_point; - time_point_t time_start_cpu; - time_point_t time_end_cpu; - - bool cpu_timer_started = false; - bool gpu_timer_started = false; - - float prev_elapsed_time_cpu_milliseconds = 0.f; - float prev_elapsed_time_gpu_milliseconds = 0.f; + const int *idata, const int *indices); + + /** + * This class is used for timing the performance + * Uncopyable and unmovable + * + * Adapted from WindyDarian(https://github.com/WindyDarian) + */ + class PerformanceTimer + { + public: + PerformanceTimer() + { + cudaEventCreate(&event_start); + cudaEventCreate(&event_end); + } + + ~PerformanceTimer() + { + cudaEventDestroy(event_start); + cudaEventDestroy(event_end); + } + + void startCpuTimer() + { + if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); } + cpu_timer_started = true; + + time_start_cpu = std::chrono::high_resolution_clock::now(); + } + + void endCpuTimer() + { + time_end_cpu = 
std::chrono::high_resolution_clock::now(); + + if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); } + + std::chrono::duration duro = time_end_cpu - time_start_cpu; + prev_elapsed_time_cpu_milliseconds = + static_cast(duro.count()); + + cpu_timer_started = false; + } + + void startGpuTimer() + { + if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); } + gpu_timer_started = true; + + cudaEventRecord(event_start); + } + + void endGpuTimer() + { + cudaEventRecord(event_end); + cudaEventSynchronize(event_end); + + if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); } + + cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end); + gpu_timer_started = false; + } + + float getCpuElapsedTimeForPreviousOperation() //noexcept //(damn I need VS 2015 + { + return prev_elapsed_time_cpu_milliseconds; + } + + float getGpuElapsedTimeForPreviousOperation() //noexcept + { + return prev_elapsed_time_gpu_milliseconds; + } + + // remove copy and move functions + PerformanceTimer(const PerformanceTimer&) = delete; + PerformanceTimer(PerformanceTimer&&) = delete; + PerformanceTimer& operator=(const PerformanceTimer&) = delete; + PerformanceTimer& operator=(PerformanceTimer&&) = delete; + + private: + cudaEvent_t event_start = nullptr; + cudaEvent_t event_end = nullptr; + + using time_point_t = std::chrono::high_resolution_clock::time_point; + time_point_t time_start_cpu; + time_point_t time_end_cpu; + + bool cpu_timer_started = false; + bool gpu_timer_started = false; + + float prev_elapsed_time_cpu_milliseconds = 0.f; + float prev_elapsed_time_gpu_milliseconds = 0.f; }; } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 05ce667..e513108 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,25 +1,36 @@ #include #include "cpu.h" -#include "common.h" +#include "common.h" namespace StreamCompaction { namespace CPU { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } + void scan_implementation(int n, int *odata, const int *idata) + { + // TODO + int sum = 0; + for (int i = 0; i < n; ++i) { + odata[i] = sum; + sum += idata[i]; + } + } + /** * CPU scan (prefix sum). * For performance analysis, this is supposed to be a simple for loop. * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. */ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int *odata, const int *idata) + { timer().startCpuTimer(); - // TODO + scan_implementation(n, odata, idata); timer().endCpuTimer(); } @@ -30,9 +41,19 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); + // TODO + int num = 0; + for (int i = 0; i < n; ++i) { + int val = idata[i]; + if (val != 0) { + odata[num] = val; + num++; + } + } + timer().endCpuTimer(); - return -1; + return num; } /** @@ -41,10 +62,96 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. 
*/ int compactWithScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); + // TODO - timer().endCpuTimer(); - return -1; + + int* temp = new int[n + 1]; + temp[n] = 0; + int num = 0; + timer().startCpuTimer(); + // compute temporary array + for (int i = 0; i < n; ++i) { + temp[i] = (idata[i] == 0) ? 0 : 1; + } + // run exclusive scan on temporary array + int sum = 0; + for (int i = 0; i <= n; ++i) { + int val = temp[i]; + temp[i] = sum; + sum += val; + } + + // scatter + for (int i = 1; i <= n; ++i) { + if (temp[i] != temp[i-1]) { + odata[num] = idata[i - 1]; + num++; + } + } + timer().endCpuTimer(); + delete[] temp; + return num; + } + + int findMax(int n, const int *idata) + { + int max = idata[0]; + for (int i = 1; i < n; ++i) { + int val = idata[i]; + if (val > max) { + max = val; + } + } + + return max; + } + + void countSort(int n, int d, int *odata, int *idata) + { + int count[10] = {0}; + + for (int i = 0; i < n; ++i) { + count[(idata[i]/d)%10] ++; + } + + for (int i = 1; i < 10; ++i) { + count[i] += count[i - 1]; + } + + for (int i = n - 1; i >= 0; --i) { + odata[ count[(idata[i]/d)%10] - 1] = idata[i]; + count[(idata[i]/d)%10] --; + } + } + + /** + * CPU sort + */ + void sort(int n, int *odata, const int *idata) + { + // TODO (from GeeksforGeeks Radix Sort) + int* temp = new int[n]; + + for (int i = 0; i < n; ++i) { + odata[i] = idata[i]; + } + + int max = findMax(n, idata); + + timer().startCpuTimer(); + bool eo = true; + for (int d = 1; max / d > 0; d *= 10) { + countSort(n, d, temp, odata); + std::swap(temp, odata); + eo = !eo; + } + timer().endCpuTimer(); + + if (!eo) { + std::swap(temp, odata); + std::memcpy(odata, temp, n * sizeof(int)); + } + delete[] temp; } } } diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h index 236ce11..9ba529e 100644 --- a/stream_compaction/cpu.h +++ b/stream_compaction/cpu.h @@ -11,5 +11,7 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata); int compactWithScan(int n, int *odata, const int *idata); + + void sort(int n, int *odata, const int *idata); } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 36c5ef2..80d8e47 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -5,20 +5,91 @@ namespace StreamCompaction { namespace Efficient { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } + __global__ void kernZeroed(int totalN, int n, int *odata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= totalN) return; + + odata[index] = (index < n) ? 
odata[index] : 0; + } + + __global__ void kernUpSweep(int n, int d, int *odata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + int offset = 1 << (d + 1); + if (index >= n >> (d + 1)) return; + int i = (index + 1) * offset - 1; + if (i >= n) return; + + int val = 1 << d; + odata[i] += odata[i - val]; + if (i == n - 1) odata[i] = 0; + } + + __global__ void kernDownSweep(int n, int d, int *odata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + int offset = 1 << (d + 1); + if (index >= n >> (d + 1)) return; + int i = (index + 1) * offset - 1; + if (i >= n) return; + + int val = 1 << d; + + int temp = odata[i]; + odata[i] += odata[i - val]; + odata[i - val] = temp; + } + + void scan_implementation(int pow2n, int *dev_out) + { + for (int d = 0; d < ilog2ceil(pow2n); ++d) { + dim3 blocksPerGrid((pow2n / (1 << (d + 1)) + blockSize - 1) / blockSize); + kernUpSweep << > > (pow2n, d, dev_out); + checkCUDAError("kernUpSweep failed!"); + } + + for (int d = ilog2ceil(pow2n) - 1; d >= 0; --d) { + dim3 blocksPerGrid((pow2n / (1 << (d + 1)) + blockSize - 1) / blockSize); + kernDownSweep << > > (pow2n, d, dev_out); + checkCUDAError("kernDownSweep failed!"); + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + void scan(int n, int *odata, const int *idata) + { + int *dev_out; + int pow2n = 1 << ilog2ceil(n); + + cudaMalloc((void**)&dev_out, pow2n * sizeof(int)); + checkCUDAError("cudaMalloc dev_out failed!"); + + cudaMemcpy(dev_out, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy dev_out failed!"); + + dim3 blocksPerGrid((pow2n + blockSize - 1) / blockSize); + kernZeroed << > > (pow2n, n, dev_out); + checkCUDAError("kernZeroed failed!"); + + timer().startGpuTimer(); + scan_implementation(pow2n, dev_out); + timer().endGpuTimer(); + checkCUDAError("scan_implementation failed!"); + + cudaMemcpy(odata, dev_out, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpyDeviceToHost failed!"); + + cudaFree(dev_out); } /** @@ -30,11 +101,62 @@ namespace StreamCompaction { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. 
*/ - int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - return -1; + int compact(int n, int *odata, const int *idata) + { + // TODO + int *dbools; + int *dev_in; + int *dev_out; + + int pow2n = 1 << ilog2ceil(n); + int block2n = 1 << ilog2ceil(pow2n / blockSize); + + cudaMalloc((void**)&dbools, pow2n * sizeof(int)); + checkCUDAError("cudaMalloc dbools failed!"); + + cudaMemset(dbools, 0, pow2n * sizeof(int)); + checkCUDAError("cudaMemset dbools failed!"); + + cudaMalloc((void**)&dev_in, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_in failed!"); + + cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy dev_in failed!"); + + cudaMalloc((void**)&dev_out, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_out failed!"); + + timer().startGpuTimer(); + + dim3 blocksPerGrid((n + blockSize - 1) / blockSize); + StreamCompaction::Common::kernMapToBoolean << > > (n, dbools, dev_in); + checkCUDAError("kernMapToBoolean failed!"); + + int num; + cudaMemcpyAsync(&num, dbools + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpyDeviceToHost failed!"); + + int ret = num; + + scan_implementation(pow2n, dbools); // requires power of 2 + + cudaMemcpyAsync(&num, dbools + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpyDeviceToHost failed!"); + ret += num; + + StreamCompaction::Common::kernScatter << > > (n, dev_out, dev_in, dbools); + checkCUDAError("kernScatter failed!"); + + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_out, ret * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpyDeviceToHost failed!"); + + cudaFree(dbools); + cudaFree(dev_in); + cudaFree(dev_out); + + return ret; } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..6653205 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -6,6 +6,8 @@ namespace StreamCompaction { namespace Efficient { StreamCompaction::Common::PerformanceTimer& timer(); + void scan_implementation(int n, int *dev_out); + void scan(int n, int *odata, const int *idata); int compact(int n, int *odata, const int *idata); diff --git a/stream_compaction/efficient_shared.cu b/stream_compaction/efficient_shared.cu new file mode 100644 index 0000000..b773f7b --- /dev/null +++ b/stream_compaction/efficient_shared.cu @@ -0,0 +1,206 @@ +#include +#include +#include "common.h" +#include "efficient.h" + +namespace StreamCompaction { + namespace Efficient_Shared { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + + __global__ void kernScanShared(int n, int *odata, int *sumOfSums) + { + extern __shared__ int temp[]; + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + int offset = 1; + + // load shared memory + int idx = threadIdx.x; + //int bankOffset = conflictFreeOffset(idx); + int bankOffset = 0; + temp[idx + bankOffset] = odata[index]; + //temp[2 * index + 1] = odata[2 * index + 1]; + + // upsweep + for (int d = blockSize >> 1; d > 0; d >>= 1) { + __syncthreads(); + if (idx < d) { + int ai = offset * (2 * idx + 1) - 1; + int bi = offset * (2 * idx + 2) - 1; + ai += conflictFreeOffset(ai); + bi += conflictFreeOffset(bi); + + temp[bi] += temp[ai]; + } + offset *= 2; + } + + if (idx == 0) { + sumOfSums[blockIdx.x] = temp[blockDim.x - 1 + conflictFreeOffset(blockDim.x - 1)]; + temp[blockDim.x - 1 + 
conflictFreeOffset(blockDim.x - 1)] = 0; + } + + // downsweep + for (int d = 1; d < blockSize; d *= 2) { + offset >>= 1; + __syncthreads(); + if (idx < d) { + int ai = offset * (2 * idx + 1) - 1; + int bi = offset * (2 * idx + 2) - 1; + ai += conflictFreeOffset(ai); + bi += conflictFreeOffset(bi); + + int t = temp[ai]; + temp[ai] = temp[bi]; + temp[bi] += t; + } + } + + __syncthreads(); + + odata[index] = temp[idx + bankOffset]; + //odata[2 * index + 1] = temp[2 * index + 1]; + } + + __global__ void kernAddSums(int n, int *odata, int *sumOfSums) + { + __shared__ int sum; + if (threadIdx.x == 0) sum = sumOfSums[blockIdx.x]; + __syncthreads(); + + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + + odata[index] += sum; + } + + /** + * invariant: n must be power-of-2 + */ + void scan_implementation(int pow2n, int block2n, int *dev_out, int *sumOfSums) + { + int numBlocks1 = (pow2n + blockSize - 1) / blockSize; + int numBlocks2 = (block2n + blockSize - 1) / blockSize; + + kernScanShared << > > (pow2n, dev_out, sumOfSums); + checkCUDAError("kernScanShared 1 failed!"); + + kernScanShared << > > (block2n, sumOfSums, sumOfSums); + checkCUDAError("kernScanShared 2 failed!"); + + kernAddSums << > > (pow2n, dev_out, sumOfSums); + checkCUDAError("kernAddSums failed!"); + } + + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ + void scan(int n, int *odata, const int *idata) + { + int *dev_out; + int *sumOfSums; + int pow2n = 1 << ilog2ceil(n); + int block2n = 1 << ilog2ceil(pow2n / blockSize); + + cudaMalloc((void**)&dev_out, pow2n * sizeof(int)); + checkCUDAError("cudaMalloc dev_out failed!"); + + cudaMalloc((void**)&sumOfSums, block2n * sizeof(int)); + checkCUDAError("cudaMalloc sumOfSums failed!"); + + cudaMemset(dev_out, 0, pow2n * sizeof(int)); + checkCUDAError("cudaMemset dev_out failed!"); + + cudaMemcpy(dev_out, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy dev_out failed!"); + + timer().startGpuTimer(); + scan_implementation(pow2n, block2n, dev_out, sumOfSums); + timer().endGpuTimer(); + checkCUDAError("scan_implementation failed!"); + + cudaMemcpy(odata, dev_out, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpyDeviceToHost failed!"); + + cudaFree(dev_out); + cudaFree(sumOfSums); + } + + /** + * Performs stream compaction on idata, storing the result into odata. + * All zeroes are discarded. + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to compact. + * @returns The number of elements remaining after compaction. 
+ */ + int compact(int n, int *odata, const int *idata) + { + // TODO + int *dbools; + int *dev_in; + int *dev_out; + int *sumOfSums; + + int pow2n = 1 << ilog2ceil(n); + int block2n = 1 << ilog2ceil(pow2n / blockSize); + + cudaMalloc((void**)&dbools, pow2n * sizeof(int)); + checkCUDAError("cudaMalloc dbools failed!"); + + cudaMemset(dbools, 0, pow2n * sizeof(int)); + checkCUDAError("cudaMemset dbools failed!"); + + cudaMalloc((void**)&dev_in, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_in failed!"); + + cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy dev_in failed!"); + + cudaMalloc((void**)&sumOfSums, block2n * sizeof(int)); + checkCUDAError("cudaMalloc sumOfSums failed!"); + + cudaMalloc((void**)&dev_out, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_out failed!"); + + timer().startGpuTimer(); + + dim3 blocksPerGrid((n + blockSize - 1) / blockSize); + StreamCompaction::Common::kernMapToBoolean << > > (n, dbools, dev_in); + checkCUDAError("kernMapToBoolean failed!"); + + int num; + cudaMemcpyAsync(&num, dbools + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpyDeviceToHost failed!"); + + int ret = num; + + scan_implementation(pow2n, block2n, dbools, sumOfSums); // requires power of 2 + + cudaMemcpyAsync(&num, dbools + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpyDeviceToHost failed!"); + ret += num; + + StreamCompaction::Common::kernScatter << > > (n, dev_out, dev_in, dbools); + checkCUDAError("kernScatter failed!"); + + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_out, ret * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpyDeviceToHost failed!"); + + cudaFree(dbools); + cudaFree(dev_in); + cudaFree(dev_out); + cudaFree(sumOfSums); + + return ret; + } + } +} diff --git a/stream_compaction/efficient_shared.h b/stream_compaction/efficient_shared.h new file mode 100644 index 0000000..2a1561c --- /dev/null +++ b/stream_compaction/efficient_shared.h @@ -0,0 +1,15 @@ +#pragma once + +#include "common.h" + +namespace StreamCompaction { + namespace Efficient_Shared { + StreamCompaction::Common::PerformanceTimer& timer(); + + void scan_implementation(int n, int *dev_out); + + void scan(int n, int *odata, const int *idata); + + int compact(int n, int *odata, const int *idata); + } +} diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 9218f8e..713eaa3 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -5,21 +5,57 @@ namespace StreamCompaction { namespace Naive { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } - // TODO: __global__ + // TODO + __global__ void kernNaiveScan(int n, int val, int *odata, int *idata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + odata[index] = (index >= val) ? idata[index] + idata[index - val] : idata[index]; + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + // TODO - timer().endGpuTimer(); + int *dev_in; + int *dev_out; + + dim3 blocksPerGrid((n + blockSize - 1) / blockSize); + + cudaMalloc((void**)&dev_in, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_in failed!"); + + cudaMalloc((void**)&dev_out, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_in failed!"); + + cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy dev_in failed!"); + cudaMemcpy(dev_out, odata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy dev_out failed!"); + + timer().startGpuTimer(); + for (int d = 1; d <= ilog2ceil(n); ++d) { + int val = 1 << (d - 1); + kernNaiveScan << > > (n, val, dev_out, dev_in); + std::swap(dev_in, dev_out); + checkCUDAError("kernNaiveScan failed!"); + } + + timer().endGpuTimer(); + + cudaMemcpy(odata + 1, dev_in, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpyDeviceToHost failed!"); + + cudaFree(dev_in); + cudaFree(dev_out); } } } diff --git a/stream_compaction/radix.cu b/stream_compaction/radix.cu new file mode 100644 index 0000000..b87cfcc --- /dev/null +++ b/stream_compaction/radix.cu @@ -0,0 +1,155 @@ +#include +#include +#include "common.h" +#include "efficient.h" + +namespace StreamCompaction { + namespace Radix { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + + /** + * Maps an array to an array of 0s and 1s for bit n. + */ + __global__ void kernMapToBoolean(int pow2n, int n, int k, int *bools, const int *idata) { + // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= pow2n) return; + + int val = 1 << k; + bools[index] = ((idata[index] & val) != 0 || index >= n) ? 0 : 1; + } + + /** + * Computes t array: t[i] = i - f[i] + totalFalses; f = idata, t = odata + */ + __global__ void kernComputeT(int n, int totalFalses, int *odata, const int *idata) { + // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + + odata[index] = index - idata[index] + totalFalses; + } + + /** + * Computes d array: d[i] = e[i] ? f[i] : t[i]; + */ + __global__ void kernComputeD(int n, int *e, const int *f, const int *t) { + // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + + e[index] = (e[index] != 0) ? 
f[index] : t[index]; + } + + /** + * Computes scattered array based on D indices + */ + __global__ void kernScatterD(int n, int *odata, int *indices, const int *idata) { + // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + + odata[indices[index]] = idata[index]; + } + + void sort_implementation(int n, int k, int* last_e, int* last_f, + int *e_arr, int *f_arr, int *t_arr, int *dev_in) + { + dim3 blocksPerGrid((n + blockSize - 1) / blockSize); + + int pow2n = 1 << ilog2ceil(n); + + // Step 1: compute e array + kernMapToBoolean<<>>(pow2n, n, k, f_arr, dev_in); + checkCUDAError("kernMapToBoolean failed!"); + + cudaMemcpy(e_arr, f_arr, n * sizeof(int), cudaMemcpyDeviceToDevice); + checkCUDAError("cudaMemcpyDeviceToDevice failed!"); + + // Step 2: exclusive scan e + StreamCompaction::Efficient::scan_implementation(pow2n, f_arr); + + // Step 3: compute totalFalses + cudaMemcpy(last_e, e_arr + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("last_e cudaMemcpyDeviceToHost failed!"); + + cudaMemcpy(last_f, f_arr + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("last_f cudaMemcpyDeviceToHost failed!"); + + int totalFalses = *last_e + *last_f; + + // Step 4: compute t array + kernComputeT<<>>(n, totalFalses, t_arr, f_arr); + checkCUDAError("kernComputeT failed!"); + + // Step 4: scatter based on address d + kernComputeD<<>>(n, e_arr, f_arr, t_arr); + checkCUDAError("kernComputeD failed!"); + + kernScatterD<<>>(n, f_arr, e_arr, dev_in); + checkCUDAError("kernScatterD failed!"); + } + + /** + * Performs radix sort on idata, storing the result into odata. + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to compact. + * @returns The number of elements remaining after compaction. 
+ */ + void sort(int n, int d, int *odata, const int *idata) + { + // TODO + int *e_arr; + int *f_arr; + int *t_arr; + int *dev_in; + + int pow2n = 1 << ilog2ceil(n); + + int *last_e = (int *)malloc(sizeof(int)); + int *last_f = (int *)malloc(sizeof(int)); + + cudaMalloc((void**)&e_arr, n * sizeof(int)); + checkCUDAError("cudaMalloc e_arr failed!"); + + cudaMalloc((void**)&f_arr, pow2n * sizeof(int)); + checkCUDAError("cudaMalloc f_arr failed!"); + + cudaMalloc((void**)&t_arr, n * sizeof(int)); + checkCUDAError("cudaMalloc t_arr failed!"); + + cudaMalloc((void**)&dev_in, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_in failed!"); + + cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy dev_in failed!"); + + timer().startGpuTimer(); + for (int k = 0; k < d; ++k) { + sort_implementation(n, k, last_e, last_f, e_arr, f_arr, t_arr, dev_in); + std::swap(dev_in, f_arr); + /*cudaMemcpy(dev_in, f_arr, n * sizeof(int), cudaMemcpyDeviceToDevice); + checkCUDAError("cudaMemcpyDeviceToDevice failed!");*/ + } + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_in, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpyDeviceToHost failed!"); + + free(last_e); + free(last_f); + + cudaFree(e_arr); + cudaFree(f_arr); + cudaFree(t_arr); + cudaFree(dev_in); + } + } +} diff --git a/stream_compaction/radix.h b/stream_compaction/radix.h new file mode 100644 index 0000000..cbadf0d --- /dev/null +++ b/stream_compaction/radix.h @@ -0,0 +1,11 @@ +#pragma once + +#include "common.h" + +namespace StreamCompaction { + namespace Radix { + StreamCompaction::Common::PerformanceTimer& timer(); + + void sort(int n, int d, int *odata, const int *idata); + } +} diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 36b732d..35ea3dc 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -8,21 +8,30 @@ namespace StreamCompaction { namespace Thrust { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + + thrust::host_vector host_in = thrust::host_vector(idata, idata + n); + thrust::host_vector host_out = thrust::host_vector(odata, odata + n); + + thrust::device_vector dev_in = thrust::device_vector(host_in); + thrust::device_vector dev_out = thrust::device_vector(host_out); + + // TODO timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); - timer().endGpuTimer(); + thrust::exclusive_scan(dev_in.begin(), dev_in.end(), dev_out.begin()); + timer().endGpuTimer(); + + thrust::copy(dev_out.begin(), dev_out.end(), odata); + } } }