diff --git a/README.md b/README.md index b71c458..21aefdc 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,93 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Name: Zhan Xiong Chin +* Tested on: Windows 7 Professional, Intel(R) Xeon(R) CPU E5-1630 v4 @ 3.70 GHz 3.70 GHz, GTX 1070 8192MB (SIG Lab) -### (TODO: Your README) +Overview +======== +This implements a GPU-based scan (i.e. computes the prefix sums of an array) and stream compaction (i.e. +moves nonzero elements to the front of array). There are versions for a CPU-based scan, a naive GPU-based scan +(uses O(nlogn) additions) and a work-efficient GPU-based scan (uses O(n) additions). -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +Build Instructions +================== +[See here](https://github.com/CIS565-Fall-2016/Project0-CUDA-Getting-Started/blob/master/INSTRUCTION.md) +Performance analysis +==================== +Milliseconds to calculate prefix sums for arrays of given sizes: + +![](images/milliseconds.png) + +Log2 of nanoseconds for same data: + +![](images/log_nanoseconds.png) + +The timings above did not include the time needed to copy arrays onto device. + +For small arrays, the naive algorithm appears to be the fastest, though this may be due to the inability to +effectively time CPU execution for times smaller than 1 millisecond. The CPU algorithm still beats out the +naive algorithm for most of the timings, suggesting that the extra logn additions needed is slowing down the execution +of the naive algorithm significantly. + +The efficient algorithm is approximately 3 times faster than the naive algorithm and twice as fast as the CPU algorithm. +This is in line with the O(n) operations needed by both of them; the GPU algorithm is making better use of its +multiple cores to achieve this speedup. + +Compared to the thrust-based implementation, all algorithms beat it for small arrays, but it is significantly faster +than all other algorithms for large arrays. Based on the large number of registers used and the small grid size, it +may be using the work-efficient algorithm, but with a larger base than 2 (e.g. ternary or quarternary tree). + +Test output +=========== +``` +S:\cis565\Project2-Stream-Compaction\build>Release\cis565_stream_compaction_test +.exe + +**************** +** SCAN TESTS ** +**************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 10 0 ] +==== cpu scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 205473618 20547362 +8 ] +==== cpu scan, non-power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 205473252 20547325 +5 ] + passed +==== naive scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 205473618 20547362 +8 ] + passed +==== naive scan, non-power-of-two ==== + passed +==== work-efficient scan, power-of-two ==== + passed +==== work-efficient scan, non-power-of-two ==== + passed +==== thrust scan, power-of-two ==== + passed +==== thrust scan, non-power-of-two ==== + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 3 ] + passed +==== cpu compact with scan ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +==== work-efficient compact, power-of-two ==== + passed +==== work-efficient compact, non-power-of-two ==== + passed + +S:\cis565\Project2-Stream-Compaction\build> +``` \ No newline at end of file diff --git a/images/log_nanoseconds.png b/images/log_nanoseconds.png new file mode 100644 index 0000000..974e7a0 Binary files /dev/null and b/images/log_nanoseconds.png differ diff --git a/images/milliseconds.png b/images/milliseconds.png new file mode 100644 index 0000000..26af0aa Binary files /dev/null and b/images/milliseconds.png differ diff --git a/src/main.cpp b/src/main.cpp index 675da35..3565085 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -6,6 +6,7 @@ * @copyright University of Pennsylvania */ +#include #include #include #include @@ -13,111 +14,161 @@ #include #include "testing_helpers.hpp" +#define TIMING 1 + +void comparePerformance(const int SIZE, const int RUNS) { + printf("Timing performance with arrays of size %d, averaged over %d runs\n", SIZE, RUNS); + int * a = new int[SIZE]; + int * b = new int[SIZE]; + genArray(SIZE, a, 2); + + for (int i = 0; i < RUNS; i++) { + std::chrono::time_point start, end; + start = std::chrono::high_resolution_clock::now(); + + StreamCompaction::CPU::scan(SIZE, b, a); + + end = std::chrono::high_resolution_clock::now(); + std::chrono::duration elapsedSeconds = end - start; + printf("CPU scan: %lf milliseconds\n", elapsedSeconds.count() * 1000.0f); + + StreamCompaction::Naive::scan(SIZE, b, a); + + StreamCompaction::Efficient::scan(SIZE, b, a); + + StreamCompaction::Thrust::scan(SIZE, b, a); + } + + delete a; + delete b; +} + +void runTests() { + const int SIZE = 1 << 23; + const int NPOT = SIZE - 17; + int * a = new int[SIZE]; + int * b = new int[SIZE]; + int * c = new int[SIZE]; + + // Scan tests + + printf("\n"); + printf("****************\n"); + printf("** SCAN TESTS **\n"); + printf("****************\n"); + + genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + printArray(SIZE, a, true); + + zeroArray(SIZE, b); + printDesc("cpu scan, power-of-two"); + StreamCompaction::CPU::scan(SIZE, b, a); + printArray(SIZE, b, true); + + zeroArray(SIZE, c); + printDesc("cpu scan, non-power-of-two"); + StreamCompaction::CPU::scan(NPOT, c, a); + printArray(NPOT, b, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("naive scan, power-of-two"); + StreamCompaction::Naive::scan(SIZE, c, a); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("naive scan, non-power-of-two"); + StreamCompaction::Naive::scan(NPOT, c, a); + //printArray(SIZE, c, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan, power-of-two"); + StreamCompaction::Efficient::scan(SIZE, c, a); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan, non-power-of-two"); + StreamCompaction::Efficient::scan(NPOT, c, a); + //printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("thrust scan, power-of-two"); + StreamCompaction::Thrust::scan(SIZE, c, a); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("thrust scan, non-power-of-two"); + StreamCompaction::Thrust::scan(NPOT, c, a); + //printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + printf("\n"); + printf("*****************************\n"); + printf("** STREAM COMPACTION TESTS **\n"); + printf("*****************************\n"); + + // Compaction tests + + genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + printArray(SIZE, a, true); + + int count, expectedCount, expectedNPOT; + + zeroArray(SIZE, b); + printDesc("cpu compact without scan, power-of-two"); + count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + expectedCount = count; + printArray(count, b, true); + printCmpLenResult(count, expectedCount, b, b); + + zeroArray(SIZE, c); + printDesc("cpu compact without scan, non-power-of-two"); + count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + expectedNPOT = count; + printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + + zeroArray(SIZE, c); + printDesc("cpu compact with scan"); + count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); + printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient compact, power-of-two"); + count = StreamCompaction::Efficient::compact(SIZE, c, a); + //printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient compact, non-power-of-two"); + count = StreamCompaction::Efficient::compact(NPOT, c, a); + //printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + + delete a; + delete b; + delete c; +} + + int main(int argc, char* argv[]) { - const int SIZE = 1 << 8; - const int NPOT = SIZE - 3; - int a[SIZE], b[SIZE], c[SIZE]; - - // Scan tests - - printf("\n"); - printf("****************\n"); - printf("** SCAN TESTS **\n"); - printf("****************\n"); - - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - zeroArray(SIZE, b); - printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); - printArray(SIZE, b, true); - - zeroArray(SIZE, c); - printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); - printArray(NPOT, b, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - //printArray(SIZE, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - printf("\n"); - printf("*****************************\n"); - printf("** STREAM COMPACTION TESTS **\n"); - printf("*****************************\n"); - - // Compaction tests - - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - int count, expectedCount, expectedNPOT; - - zeroArray(SIZE, b); - printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - expectedCount = count; - printArray(count, b, true); - printCmpLenResult(count, expectedCount, b, b); - - zeroArray(SIZE, c); - printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - expectedNPOT = count; - printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - zeroArray(SIZE, c); - printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); - //printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - //printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); +#if TIMING == 1 + comparePerformance(1 << 10, 3); + comparePerformance(1 << 13, 3); + comparePerformance(1 << 16, 3); + comparePerformance(1 << 19, 3); + comparePerformance(1 << 22, 3); + comparePerformance(1 << 25, 3); + comparePerformance(1 << 28, 3); +#else + runTests(); +#endif } diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..c8709e7 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -13,5 +13,5 @@ set(SOURCE_FILES cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_50 ) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index fe872d4..2a95413 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,3 +1,5 @@ +#include +#include #include "common.h" void checkCUDAErrorFn(const char *msg, const char *file, int line) { @@ -23,7 +25,11 @@ namespace Common { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + bools[index] = (idata[index] == 0) ? 0 : 1; } /** @@ -32,8 +38,14 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + if (bools[index] != 0) { + odata[indices[index]] = idata[index]; + } } - } + } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 4f52663..79cfb56 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -7,6 +7,8 @@ #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +#define TIMING 1 + /** * Check for CUDA errors; print and exit if there was a problem. */ diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index e600c29..102b403 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,4 +1,5 @@ #include +#include #include "cpu.h" namespace StreamCompaction { @@ -8,8 +9,11 @@ namespace CPU { * CPU scan (prefix sum). */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + int sum = 0; + for (int i = 0; i < n; i++) { + odata[i] = sum; + sum += idata[i]; + } } /** @@ -18,8 +22,20 @@ void scan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. */ int compactWithoutScan(int n, int *odata, const int *idata) { - // TODO - return -1; + int nonZeroCount = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[nonZeroCount] = idata[i]; + nonZeroCount++; + } + } + return nonZeroCount; +} + +void scatter(int n, int * odata, const int * idata, const int * scatterTargets) { + for (int i = 0; i < n; i++) { + odata[scatterTargets[i]] = idata[i]; + } } /** @@ -28,8 +44,17 @@ int compactWithoutScan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { - // TODO - return -1; + int * nonZeroElements = (int *)malloc(n * sizeof(int)); + int * scanCounts = (int *)malloc(n * sizeof(int)); + for (int i = 0; i < n; i++) { + nonZeroElements[i] = (idata[i] == 0) ? 0 : 1; + } + scan(n, scanCounts, nonZeroElements); + scatter(n, odata, idata, scanCounts); + int remainingCount = nonZeroElements[n - 1] + scanCounts[n - 1]; + free(nonZeroElements); + free(scanCounts); + return remainingCount; } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index b2f739b..7c67a6e 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,17 +3,72 @@ #include "common.h" #include "efficient.h" +#define blockSize 128 + namespace StreamCompaction { namespace Efficient { -// TODO: __global__ +__global__ void kernScanUpsweep(int n, int d, int * data) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= (n >> d)) { + return; + } + int k = index << d; + data[k + (1 << d) - 1] += data[k + (1 << (d - 1)) - 1]; +} + +__global__ void kernScanDownsweep(int n, int d, int * data) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= (n >> d)) { + return; + } + int k = index << d; + int t = data[k + (1 << d) - 1]; + data[k + (1 << d) - 1] += data[k + (1 << (d - 1)) - 1]; + data[k + (1 << (d - 1)) - 1] = t; +} /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + int * dev_data; + int logCeil = ilog2ceil(n); + int nCeil = 1 << logCeil; + + cudaMalloc((void**)&dev_data, nCeil * sizeof(int)); + cudaMemset((void*)dev_data, 0, nCeil * sizeof(int)); + cudaMemcpy((void*)dev_data, (void*)idata, n * sizeof(int), cudaMemcpyHostToDevice); + +#if TIMING == 1 + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); +#endif + + for (int i = 1; i <= logCeil; i++) { + int gridSize = ((nCeil >> i) + blockSize - 1) / blockSize; + kernScanUpsweep << > >(nCeil, i, dev_data); + } + + cudaMemset((void*)&dev_data[nCeil - 1], 0, sizeof(int)); + + for (int i = logCeil; i >= 1; i--) { + int gridSize = ((nCeil >> i) + blockSize - 1) / blockSize; + kernScanDownsweep << > >(nCeil, i, dev_data); + } + +#if TIMING == 1 + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + printf("Efficient scan: %f milliseconds\n", milliseconds); +#endif + + cudaMemcpy((void*)odata, (void*)dev_data, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_data); } /** @@ -26,8 +81,37 @@ void scan(int n, int *odata, const int *idata) { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { - // TODO - return -1; + int * dev_bools; + int * dev_idata; + int * dev_odata; + int * dev_indices; + cudaMalloc((void**)&dev_bools, n * sizeof(int)); + cudaMalloc((void**)&dev_indices, n * sizeof(int)); + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + + // Map to booleans + cudaMemcpy((void*)dev_idata, (void*)idata, n * sizeof(int), cudaMemcpyHostToDevice); + StreamCompaction::Common::kernMapToBoolean << > >(n, dev_bools, dev_idata); + int * temp = (int *)malloc(n * sizeof(int)); + cudaMemcpy((void*)temp, (void*)dev_bools, n * sizeof(int), cudaMemcpyDeviceToHost); + + // Do exclusive scan + scan(n, temp, temp); + int compactedCount = temp[n - 1] + ((idata[n - 1] == 0) ? 0 : 1); + + // Scatter + cudaMemcpy((void*)dev_indices, (void*)temp, n * sizeof(int), cudaMemcpyHostToDevice); + StreamCompaction::Common::kernScatter << > >(n, dev_odata, dev_idata, dev_bools, dev_indices); + cudaMemcpy((void*)odata, (void*)dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + + free(temp); + cudaFree(dev_bools); + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_indices); + + return compactedCount; } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3d86b60..0a85b9d 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,17 +3,69 @@ #include "common.h" #include "naive.h" +#define blockSize 128 + namespace StreamCompaction { namespace Naive { -// TODO: __global__ +__global__ void kernNaiveScan(int n, int round, int * odata, int * idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + odata[index] = ( + (index < (1 << (round - 1))) + ? 0 + : idata[index - (1 << (round - 1))] + ) + idata[index]; +} + +__global__ void kernInclusiveToExclusiveScan(int n, int * odata, int * idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + odata[index] = (index == 0 ) ? 0 : idata[index - 1]; +} /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + int * dev_data; + int * dev_data2; + cudaMalloc((void**)&dev_data, n * sizeof(int)); + cudaMalloc((void**)&dev_data2, n * sizeof(int)); + cudaMemcpy((void*)dev_data, (void*)idata, n * sizeof(int), cudaMemcpyHostToDevice); + +#if TIMING == 1 + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); +#endif + + for (int i = 1; i <= ilog2ceil(n); i++) { + kernNaiveScan << > >(n, i, dev_data2, dev_data); + int * tempPtr = dev_data; + dev_data = dev_data2; + dev_data2 = tempPtr; + } + kernInclusiveToExclusiveScan << > >(n, dev_data2, dev_data); + +#if TIMING == 1 + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + printf("Naive scan: %f milliseconds\n", milliseconds); +#endif + + cudaMemcpy((void*)odata, (void*)dev_data2, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_data); + cudaFree(dev_data2); } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index d8dbb32..3e195da 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -13,9 +13,27 @@ namespace Thrust { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::device_vector dev_thrust_idata(idata, idata + n); + thrust::device_vector dev_thrust_odata(n); + +#if TIMING == 1 + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); +#endif + + thrust::exclusive_scan(dev_thrust_idata.begin(), dev_thrust_idata.end(), dev_thrust_odata.begin()); + +#if TIMING == 1 + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + printf("Thrust scan: %f milliseconds\n", milliseconds); +#endif + + thrust::copy(dev_thrust_odata.begin(), dev_thrust_odata.end(), odata); } }