diff --git a/README.md b/README.md index b71c458..1e15a96 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,24 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Alexander Perry +* Tested on: Windows 10, i5-2410M @ 2.30GHz 8GB, NVS 4200M -### (TODO: Your README) +### Analysis +![](./img/power_of_two_release.PNG) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +![](./img/non_power_of_two_release.PNG) +The most obvious thing to note is that longer arrays take longer to process. +Beyond that we can see that algorithms that are supposed to be more efficient are generally less efficient as implemented. +The CPU algorithm is the fastest algorithm, followed by the thrust implementation and then by the naive GPU solution, with the work-efficient GPU solution being the slowest. + +I was able to get occupancy of the GPU for all of my algorithms up to 100\%. There are a maximum of 8 blocks and 1536 threads on my GPU. This leads to an optimum number of threads per block of 192. This is also only 6 warps per block and 48 warps total. + +Based on the timeline, there are many back and forth copies from the GPU to main memory in the thrust implementation. +Even though the thrust implementation uses all of these copies, it is still faster than my GPU attempts. I suspect this is due to memory accesses. + +![](./img/power_of_two.PNG) + +The previous graph is based on a debug build. +An interesting thing to note is that the thrust implementation is less optimal in a debug setting compared to the release build. 
diff --git a/img/non_power_of_two.PNG b/img/non_power_of_two.PNG new file mode 100644 index 0000000..b5e37dd Binary files /dev/null and b/img/non_power_of_two.PNG differ diff --git a/img/non_power_of_two_release.PNG b/img/non_power_of_two_release.PNG new file mode 100644 index 0000000..58a069b Binary files /dev/null and b/img/non_power_of_two_release.PNG differ diff --git a/img/power_of_two.PNG b/img/power_of_two.PNG new file mode 100644 index 0000000..d24e784 Binary files /dev/null and b/img/power_of_two.PNG differ diff --git a/img/power_of_two_release.PNG b/img/power_of_two_release.PNG new file mode 100644 index 0000000..cc4166b Binary files /dev/null and b/img/power_of_two_release.PNG differ diff --git a/src/main.cpp b/src/main.cpp index 7305641..eaa6e44 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,130 +14,131 @@ #include "testing_helpers.hpp" const int SIZE = 1 << 8; // feel free to change the size of array +//const int SIZE = 1 << 3; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int a[SIZE], b[SIZE], c[SIZE]; int main(int argc, char* argv[]) { - // Scan tests - - printf("\n"); - printf("****************\n"); - printf("** SCAN TESTS **\n"); - printf("****************\n"); - - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - // initialize b using StreamCompaction::CPU::scan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. - // At first all cases passed because b && c are all zeroes. 
- zeroArray(SIZE, b); - printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); - - zeroArray(SIZE, c); - printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, b, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, 
c); - - zeroArray(SIZE, c); - printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - printf("\n"); - printf("*****************************\n"); - printf("** STREAM COMPACTION TESTS **\n"); - printf("*****************************\n"); - - // Compaction tests - - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - int count, expectedCount, expectedNPOT; - - // initialize b using StreamCompaction::CPU::compactWithoutScan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. - zeroArray(SIZE, b); - printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedCount = count; - printArray(count, b, true); - printCmpLenResult(count, expectedCount, b, b); - - zeroArray(SIZE, c); - printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedNPOT = count; - printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - zeroArray(SIZE, c); - printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, power-of-two"); - count = 
StreamCompaction::Efficient::compact(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - system("pause"); // stop Win32 console from closing on exit + // Scan tests + + printf("\n"); + printf("****************\n"); + printf("** SCAN TESTS **\n"); + printf("****************\n"); + + genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + printArray(SIZE, a, true); + + // initialize b using StreamCompaction::CPU::scan you implement + // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. + // At first all cases passed because b && c are all zeroes. 
+ zeroArray(SIZE, b); + printDesc("cpu scan, power-of-two"); + StreamCompaction::CPU::scan(SIZE, b, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(SIZE, b, true); + + zeroArray(SIZE, c); + printDesc("cpu scan, non-power-of-two"); + StreamCompaction::CPU::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(NPOT, b, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("naive scan, power-of-two"); + StreamCompaction::Naive::scan(SIZE, c, a); + printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("naive scan, non-power-of-two"); + StreamCompaction::Naive::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan, power-of-two"); + StreamCompaction::Efficient::scan(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan, non-power-of-two"); + StreamCompaction::Efficient::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("thrust scan, power-of-two"); + StreamCompaction::Thrust::scan(SIZE, c, a); + printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, 
c); + + zeroArray(SIZE, c); + printDesc("thrust scan, non-power-of-two"); + StreamCompaction::Thrust::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + printf("\n"); + printf("*****************************\n"); + printf("** STREAM COMPACTION TESTS **\n"); + printf("*****************************\n"); + + // Compaction tests + + genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + printArray(SIZE, a, true); + + int count, expectedCount, expectedNPOT; + + // initialize b using StreamCompaction::CPU::compactWithoutScan you implement + // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. + zeroArray(SIZE, b); + printDesc("cpu compact without scan, power-of-two"); + count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + expectedCount = count; + printArray(count, b, true); + printCmpLenResult(count, expectedCount, b, b); + + zeroArray(SIZE, c); + printDesc("cpu compact without scan, non-power-of-two"); + count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + expectedNPOT = count; + printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + + zeroArray(SIZE, c); + printDesc("cpu compact with scan"); + count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient compact, power-of-two"); + count = 
StreamCompaction::Efficient::compact(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient compact, non-power-of-two"); + count = StreamCompaction::Efficient::compact(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + + system("pause"); // stop Win32 console from closing on exit } diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 8fc0211..6da332f 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,39 +1,47 @@ #include "common.h" void checkCUDAErrorFn(const char *msg, const char *file, int line) { - cudaError_t err = cudaGetLastError(); - if (cudaSuccess == err) { - return; - } + cudaError_t err = cudaGetLastError(); + if (cudaSuccess == err) { + return; + } - fprintf(stderr, "CUDA error"); - if (file) { - fprintf(stderr, " (%s:%d)", file, line); - } - fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err)); - exit(EXIT_FAILURE); + fprintf(stderr, "CUDA error"); + if (file) { + fprintf(stderr, " (%s:%d)", file, line); + } + fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err)); + exit(EXIT_FAILURE); } namespace StreamCompaction { - namespace Common { +namespace Common { - /** - * Maps an array to an array of 0s and 1s for stream compaction. Elements - * which map to 0 will be removed, and elements which map to 1 will be kept. - */ - __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO - } - - /** - * Performs scatter on an array. That is, for each element in idata, - * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. 
- */ - __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices) { - // TODO - } +/** + * Maps an array to an array of 0s and 1s for stream compaction. Elements + * which map to 0 will be removed, and elements which map to 1 will be kept. + */ +__global__ void kernMapToBoolean(int n, int *bools, const int *idata) { + int index = threadIdx.x + blockDim.x * blockIdx.x; + if (index < n) { + bools[index] = idata[index] == 0 ? 0 : 1; + } +} +/** + * Performs scatter on an array. That is, for each element in idata, + * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. + */ +__global__ void kernScatter(int n, int *odata, + const int *idata, const int *bools, const int *indices) { + int index = threadIdx.x + blockDim.x * blockIdx.x; + if (index < n) { + if (bools[index]) { + odata[indices[index]] = idata[index]; } + } +} + +} } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 55f1b38..30d7c28 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -1,5 +1,5 @@ -#pragma once - +#pragma once + #include #include @@ -7,36 +7,36 @@ #include #include #include -#include +#include #include - -#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) -#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) - -/** - * Check for CUDA errors; print and exit if there was a problem. - */ -void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1); - -inline int ilog2(int x) { - int lg = 0; - while (x >>= 1) { - ++lg; - } - return lg; -} - -inline int ilog2ceil(int x) { - return ilog2(x - 1) + 1; -} - -namespace StreamCompaction { - namespace Common { - __global__ void kernMapToBoolean(int n, int *bools, const int *idata); - - __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices); - + +#define FILENAME (strrchr(__FILE__, '/') ? 
strrchr(__FILE__, '/') + 1 : __FILE__) +#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) + +/** + * Check for CUDA errors; print and exit if there was a problem. + */ +void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1); + +inline int ilog2(int x) { + int lg = 0; + while (x >>= 1) { + ++lg; + } + return lg; +} + +inline int ilog2ceil(int x) { + return ilog2(x - 1) + 1; +} + +namespace StreamCompaction { + namespace Common { + __global__ void kernMapToBoolean(int n, int *bools, const int *idata); + + __global__ void kernScatter(int n, int *odata, + const int *idata, const int *bools, const int *indices); + /** * This class is used for timing the performance * Uncopyable and unmovable @@ -127,6 +127,6 @@ namespace StreamCompaction { float prev_elapsed_time_cpu_milliseconds = 0.f; float prev_elapsed_time_gpu_milliseconds = 0.f; - }; - } -} + }; + } +} diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 05ce667..bfac3c0 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,50 +1,86 @@ -#include -#include "cpu.h" - +#include +#include "cpu.h" + #include "common.h" - -namespace StreamCompaction { - namespace CPU { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } - - /** - * CPU scan (prefix sum). - * For performance analysis, this is supposed to be a simple for loop. - * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. - */ - void scan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - } - - /** - * CPU stream compaction without using the scan function. - * - * @returns the number of elements remaining after compaction. 
- */ - int compactWithoutScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - return -1; - } - - /** - * CPU stream compaction using scan and scatter, like the parallel version. - * - * @returns the number of elements remaining after compaction. - */ - int compactWithScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - return -1; - } - } -} + +namespace StreamCompaction { +namespace CPU { + +using StreamCompaction::Common::PerformanceTimer; + +PerformanceTimer& timer() +{ + static PerformanceTimer timer; + return timer; +} + +inline void scan_impl(int n, int *odata, const int *idata) { + int sum = 0; + for (int i = 0; i < n; ++i) { + *odata++ = sum; + sum += *idata++; + } +} + +/** + * CPU scan (prefix sum). + * For performance analysis, this is supposed to be a simple for loop. + * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. + */ +void scan(int n, int *odata, const int *idata) { + timer().startCpuTimer(); + scan_impl(n, odata, idata); + timer().endCpuTimer(); +} + +/** + * CPU stream compaction without using the scan function. + * + * @returns the number of elements remaining after compaction. + */ +int compactWithoutScan(int n, int *odata, const int *idata) { + timer().startCpuTimer(); + int count = 0; + for (int i = 0; i < n; ++i) { + int d = *idata++; + if (d) { + *odata++ = d; + ++count; + } + } + timer().endCpuTimer(); + return count; +} + +void scatter(int n, int *output, const int *map, const int *input) { + for (int i = 0; i < n; ++i) { + output[map[i]] = input[i]; + } +} + +/** + * CPU stream compaction using scan and scatter, like the parallel version. + * + * @returns the number of elements remaining after compaction. 
+ */ +int compactWithScan(int n, int *odata, const int *idata) { + timer().startCpuTimer(); + int *temp = (int*)malloc(sizeof(int)*n); + int *temp2 = (int*)malloc(sizeof(int)*n); + int count = 0; + for (int i = 0; i < n; ++i) { + if (idata[i]) { + temp[i] = 1; + ++count; + } else { + temp[i] = 0; + } + } + scan_impl(n, temp2, temp); + scatter(n, odata, temp2, idata); + timer().endCpuTimer(); + return count; +} + +} // namespace CPU +} // namespace StreamCompaction diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 36c5ef2..7d07494 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -1,40 +1,123 @@ -#include -#include -#include "common.h" -#include "efficient.h" - -namespace StreamCompaction { - namespace Efficient { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } - - /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - } - - /** - * Performs stream compaction on idata, storing the result into odata. - * All zeroes are discarded. - * - * @param n The number of elements in idata. - * @param odata The array into which to store elements. - * @param idata The array of elements to compact. - * @returns The number of elements remaining after compaction. 
- */ - int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - return -1; - } - } -} +#include +#include +#include "common.h" +#include "efficient.h" +#include "iostream" + +#define BLOCK_SIZE 128 + +namespace StreamCompaction { +namespace Efficient { + +using StreamCompaction::Common::PerformanceTimer; + +PerformanceTimer& timer() +{ + static PerformanceTimer timer; + return timer; +} + +__global__ void UpSweep(int n, int offset, int *data) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + int first_index = offset*(index)*2 + offset - 1; + int end_index = first_index + offset; + if (end_index < n) { + data[end_index] += data[first_index]; + } +} + +__global__ void set_zero(int n, int *data) { + data[n - 1] = 0; +} + +__global__ void clear_array(int n, int*data) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index < n) { + data[index] = 0; + } +} + +__global__ void DownSweep(int n, int offset, int *data) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index < n) { + //int left_index = index + (offset >> 1) - 1; + //int right_index = index + offset - 1; + int left_index = offset*index*2 + offset - 1; + int right_index = left_index + offset; + if (left_index < n && right_index < n) { + int temp = data[left_index]; + data[left_index] = data[right_index]; + data[right_index] += temp; + } + } +} + +void scan_impl(int arr_length, int *dev_array) { + dim3 fullBlocksPerGrid((arr_length + BLOCK_SIZE - 1) / BLOCK_SIZE); + for (int offset = 1; offset < arr_length; offset *= 2) { + UpSweep<<>>(arr_length, offset, dev_array); + } + set_zero<<>>(arr_length, dev_array); + for(int offset = arr_length/2; offset >= 1; offset = offset >> 1) { + DownSweep<<>>(arr_length, offset, dev_array); + } +} + +/** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
+ */ +void scan(int n, int *odata, const int *idata) { + dim3 fullBlocksPerGrid((n + BLOCK_SIZE - 1) / BLOCK_SIZE); + int arr_length = (1 << ilog2ceil(n)); + int *dev_array; + cudaMalloc((void**)&dev_array, arr_length * sizeof(int)); + clear_array<<>>(arr_length, dev_array); + cudaMemcpy(dev_array, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + timer().startGpuTimer(); + scan_impl(arr_length, dev_array); + timer().endGpuTimer(); + cudaMemcpy(odata, dev_array, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaFree(dev_array); +} + +/** + * Performs stream compaction on idata, storing the result into odata. + * All zeroes are discarded. + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to compact. + * @returns The number of elements remaining after compaction. + */ +int compact(int n, int *odata, const int *idata) { + dim3 fullBlocksPerGrid((n + BLOCK_SIZE - 1) / BLOCK_SIZE); + int arr_length = (1 << ilog2ceil(n)); + int *dev_array_data; + int *dev_array_bool; + int *dev_array_indices; + int *dev_array_out; + + cudaMalloc((void**)&dev_array_data, arr_length * sizeof(int)); + cudaMalloc((void**)&dev_array_bool, arr_length * sizeof(int)); + cudaMalloc((void**)&dev_array_indices, arr_length * sizeof(int)); + cudaMalloc((void**)&dev_array_out, arr_length * sizeof(int)); + + clear_array<<>>(n, dev_array_data); + cudaMemcpy(dev_array_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); + Common::kernMapToBoolean<<>>(n, dev_array_bool, dev_array_data); + cudaMemcpy(dev_array_indices, dev_array_bool, arr_length * sizeof(int), cudaMemcpyDeviceToDevice); + scan_impl(arr_length, dev_array_indices); + Common::kernScatter<<>>(n, dev_array_out, dev_array_data, dev_array_bool, dev_array_indices); + timer().endGpuTimer(); + cudaMemcpy(odata, dev_array_out, n*sizeof(int), cudaMemcpyDeviceToHost); + int ret; + int ret2; + cudaMemcpy(&ret, dev_array_bool + n 
- 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&ret2, dev_array_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + return ret + ret2; +} + +} +} diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 9218f8e..d0d8ba5 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -1,25 +1,71 @@ -#include -#include -#include "common.h" -#include "naive.h" - -namespace StreamCompaction { - namespace Naive { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } - // TODO: __global__ - - /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - } - } -} +#include +#include +#include "common.h" +#include "naive.h" +#include + +#define BLOCK_SIZE 128 + +namespace StreamCompaction { +namespace Naive { + +using StreamCompaction::Common::PerformanceTimer; + +PerformanceTimer& timer() +{ + static PerformanceTimer timer; + return timer; +} +// TODO: __global__ +__global__ void naive_scan_impl(int n, int offset, int *odata, const int *idata) { + int index = blockIdx.x*blockDim.x + threadIdx.x; + int out_index = index - offset; + if (out_index < 0) { + odata[index] = idata[index]; + } else if (index < n){ + odata[index] = idata[index] + idata[out_index]; + } +} + +__global__ void shift_impl(int n, int *odata, const int *idata) { + int index = blockIdx.x*blockDim.x + threadIdx.x; + if (index == 0) { + odata[0] = 0; + } + if (index < n - 1) { + odata[index + 1] = idata[index]; + } +} + +/** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
+ */ +void scan(int n, int *odata, const int *idata) { + dim3 fullBlocksPerGrid((n + BLOCK_SIZE - 1) / BLOCK_SIZE); + int *dev_array_A; + int *dev_array_B; + cudaMalloc((void**)&dev_array_A, n * sizeof(int)); + cudaMalloc((void**)&dev_array_B, n * sizeof(int)); + cudaMemcpy(dev_array_B, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + bool direction = false; + + timer().startGpuTimer(); + shift_impl<<>>(n, dev_array_A, dev_array_B); + cudaMemcpy(dev_array_B, dev_array_A, sizeof(int), cudaMemcpyDeviceToDevice); + + for (int offset = 1; offset < n; offset *= 2) { + if (direction) { + naive_scan_impl<<>>(n, offset, dev_array_A, dev_array_B); + } else { + naive_scan_impl<<>>(n, offset, dev_array_B, dev_array_A); + } + direction = !direction; + } + timer().endGpuTimer(); + cudaMemcpy(odata, (!direction ? dev_array_A : dev_array_B), sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaFree(dev_array_A); + cudaFree(dev_array_B); +} + +} // namespace Naive +} // namespace StreamCompaction diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 36b732d..22feded 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -1,28 +1,34 @@ -#include -#include -#include -#include -#include -#include "common.h" -#include "thrust.h" - -namespace StreamCompaction { - namespace Thrust { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } - /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
- */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); - timer().endGpuTimer(); - } - } -} +#include +#include +#include +#include +#include +#include +#include "common.h" +#include "thrust.h" + +namespace StreamCompaction { +namespace Thrust { + +using StreamCompaction::Common::PerformanceTimer; + +PerformanceTimer& timer() +{ + static PerformanceTimer timer; + return timer; +} + +/** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ +void scan(int n, int *odata, const int *idata) { + thrust::host_vector hv_in(idata, idata + n); + thrust::device_vector dv_in = hv_in; + timer().startGpuTimer(); + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_in.begin()); + timer().endGpuTimer(); + thrust::copy(dv_in.begin(), dv_in.end(), odata); +} + +} // namespace Thrust +} // namespace StreamCompaction