diff --git a/README.md b/README.md index b71c458..fa99094 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,94 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Daniel Krupka +* Tested on: Debian testing (stretch), Intel(R) Core(TM) i7-4710HQ CPU @ 2.50GHz 8GB, GTX 850M -### (TODO: Your README) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +# Project 2 - Stream Compaction +This project's goal was to compare various methods for achieving [stream compaction](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html). +The test program was modified to take block size and array size as arguments, and ran tests for each algorithm on both +power-of-two and non-power-of-two data. Test output was the following: +``` +**************** +** SCAN TESTS ** +**************** + [ 33 36 27 15 43 35 36 42 49 21 12 27 40 ... 6 0 ] +==== cpu scan, power-of-two ==== + [ 0 33 69 96 111 154 189 225 267 316 337 349 376 ... 12852633 12852639 ] +==== cpu scan, non-power-of-two ==== + [ 0 33 69 96 111 154 189 225 267 316 337 349 376 ... 12852608 12852617 ] + passed +==== naive scan, power-of-two ==== + passed +==== naive scan, non-power-of-two ==== + passed +==== work-efficient scan, power-of-two ==== + passed +==== work-efficient scan, non-power-of-two ==== + passed +==== real work-efficient scan, power-of-two ==== + passed +==== real work-efficient scan, non-power-of-two ==== + passed +==== thrust scan, power-of-two ==== + passed +==== thrust scan, non-power-of-two ==== + passed +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 3 2 1 3 1 3 2 0 1 1 2 3 2 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + [ 3 2 1 3 1 3 2 1 1 2 3 2 3 ... 1 3 ] + passed +==== cpu compact without scan, non-power-of-two ==== + [ 3 2 1 3 1 3 2 1 1 2 3 2 3 ... 3 1 ] + passed +==== cpu compact with scan ==== + passed +==== work-efficient compact, power-of-two ==== + passed +==== work-efficient compact, non-power-of-two ==== + passed +==== real work-efficient compact, power-of-two ==== + passed +==== real work-efficient compact, non-power-of-two ==== + passed +``` + +# Analysis - Scanning +A major step of compaction is scanning. I tested a CPU implementation, a naive CUDA +implementation, two efficient CUDA implementations, and the Thrust library's implementation. + +![Scan Comparison 1](images/times_blk256.png "Scan Comparison 1") + +Interestingly, the Thrust implementation fared the worst, though Nvidia's NSight profiler +showed that Thrust was not actually using much GPU time. A likely explanation is that Thrust +may be shuffling data or partitioning the work between CPU and GPU. + +![Scan Comparison 2, no thrust](images/times_blk256_nothrust.png "Scan Comparison 2, no thrust") +![Scan Comparison 2, no thrust](images/times_blk256_nothrust_zoom.png "Scan Comparison 2, no thrust") + +Looking at the other implementations on their own shows that the GPU implementations are substantially +faster than the CPU for large workloads, but somewhat slower on small ones. This makes sense, as for small loads, +the GPU is nowhere near fully saturated. + +# Analysis - Compaction +Moving on to compaction, the CPU fares even worse. +![Compaction Comparison](images/times_all_comp.png "Compaction Comparison") + +Focusing on only the GPU implementations, we see that the more optimized version +begins to perform noticeably better, where the two were mostly indistinguishable +for simple scanning. +![Compaction Comparison](images/times_all_comp_zoom.png "Compaction Comparison") + +For the non-optimized GPU reduction, block size had a substantial effect, with +a 1024 thread block performing 1.3x faster than with 128 threads. +![Block Comparison, unoptimized](images/times_blk_eff.png "Block Comparison, unoptimized") + +The optimized GPU reduction showed much less variance with block size. This likely due to +the active threads no longer being scattered between different warps, allowing for more early +termination and lowering the number of active warps and thus blocks. +![Block Comparison, optimized](images/times_blk_realeff.png "Block Comparison, optimized") diff --git a/cis565_stream_compaction_test.launch b/cis565_stream_compaction_test.launch index 4267429..07b70cd 100644 --- a/cis565_stream_compaction_test.launch +++ b/cis565_stream_compaction_test.launch @@ -8,8 +8,8 @@ - - + + @@ -18,8 +18,8 @@ - + diff --git a/images/times_all_comp.png b/images/times_all_comp.png new file mode 100644 index 0000000..afe4110 Binary files /dev/null and b/images/times_all_comp.png differ diff --git a/images/times_all_comp_zoom.png b/images/times_all_comp_zoom.png new file mode 100644 index 0000000..3939439 Binary files /dev/null and b/images/times_all_comp_zoom.png differ diff --git a/images/times_blk256.png b/images/times_blk256.png new file mode 100644 index 0000000..fbe3097 Binary files /dev/null and b/images/times_blk256.png differ diff --git a/images/times_blk256_nothrust.png b/images/times_blk256_nothrust.png new file mode 100644 index 0000000..81d947c Binary files /dev/null and b/images/times_blk256_nothrust.png differ diff --git a/images/times_blk256_nothrust_zoom.png b/images/times_blk256_nothrust_zoom.png new file mode 100644 index 0000000..2fa02b1 Binary files /dev/null and b/images/times_blk256_nothrust_zoom.png differ diff --git a/images/times_blk_eff.png b/images/times_blk_eff.png new file mode 100644 index 0000000..2eee8e3 Binary files /dev/null and b/images/times_blk_eff.png differ diff --git a/images/times_blk_naive.png b/images/times_blk_naive.png new file mode 100644 index 0000000..642d376 Binary files /dev/null and b/images/times_blk_naive.png differ diff --git a/images/times_blk_realeff.png b/images/times_blk_realeff.png new file mode 100644 index 0000000..dd91f8f Binary files /dev/null and b/images/times_blk_realeff.png differ diff --git a/images/times_cpu_comp.png b/images/times_cpu_comp.png new file mode 100644 index 0000000..ae073ee Binary files /dev/null and b/images/times_cpu_comp.png differ diff --git a/src/main.cpp b/src/main.cpp index 675da35..7f515f8 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -10,13 +10,30 @@ #include #include #include +#include #include #include "testing_helpers.hpp" +#include +#include + + int main(int argc, char* argv[]) { - const int SIZE = 1 << 8; - const int NPOT = SIZE - 3; - int a[SIZE], b[SIZE], c[SIZE]; + double t1,t2; + + int sizeExp = 19; + int blkSize = 256; + if (argc >= 3) { + sizeExp = atoi(argv[1]); + blkSize = atoi(argv[2]); + } + int SIZE = 1 << sizeExp; + int NPOT = SIZE - 3; + int *a = new int[SIZE], *b = new int[SIZE], *c = new int[SIZE]; + + StreamCompaction::Naive::blkSize = blkSize; + StreamCompaction::Efficient::blkSize = blkSize; + StreamCompaction::RealEfficient::blkSize = blkSize; // Scan tests @@ -33,48 +50,72 @@ int main(int argc, char* argv[]) { printDesc("cpu scan, power-of-two"); StreamCompaction::CPU::scan(SIZE, b, a); printArray(SIZE, b, true); + double tCpuScanPot = StreamCompaction::CPU::last_runtime; zeroArray(SIZE, c); printDesc("cpu scan, non-power-of-two"); StreamCompaction::CPU::scan(NPOT, c, a); printArray(NPOT, b, true); printCmpResult(NPOT, b, c); + double tCpuScanNpot = StreamCompaction::CPU::last_runtime; zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); //printArray(SIZE, c, true); printCmpResult(SIZE, b, c); + double tNaiveScanPot = StreamCompaction::Naive::last_runtime; zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); //printArray(SIZE, c, true); printCmpResult(NPOT, b, c); + double tNaiveScanNpot = StreamCompaction::Naive::last_runtime; zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); //printArray(SIZE, c, true); printCmpResult(SIZE, b, c); + double tEffScanPot = StreamCompaction::Efficient::last_runtime; zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); //printArray(NPOT, c, true); printCmpResult(NPOT, b, c); + double tEffScanNpot = StreamCompaction::Efficient::last_runtime; + + + zeroArray(SIZE, c); + printDesc("real work-efficient scan, power-of-two"); + StreamCompaction::RealEfficient::scan(SIZE, c, a); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + double tRealEffScanPot = StreamCompaction::RealEfficient::last_runtime; + + zeroArray(SIZE, c); + printDesc("real work-efficient scan, non-power-of-two"); + StreamCompaction::RealEfficient::scan(NPOT, c, a); + //printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + double tRealEffScanNpot = StreamCompaction::RealEfficient::last_runtime; + zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); //printArray(SIZE, c, true); printCmpResult(SIZE, b, c); + double tThrustScanPot = StreamCompaction::Thrust::last_runtime; zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); //printArray(NPOT, c, true); printCmpResult(NPOT, b, c); + double tThrustScanNpot = StreamCompaction::Thrust::last_runtime; printf("\n"); printf("*****************************\n"); @@ -95,6 +136,7 @@ int main(int argc, char* argv[]) { expectedCount = count; printArray(count, b, true); printCmpLenResult(count, expectedCount, b, b); + double tCpuCompNoscanPot = StreamCompaction::CPU::last_runtime; zeroArray(SIZE, c); printDesc("cpu compact without scan, non-power-of-two"); @@ -102,22 +144,51 @@ int main(int argc, char* argv[]) { expectedNPOT = count; printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + double tCpuCompNoscanNpot = StreamCompaction::CPU::last_runtime; zeroArray(SIZE, c); printDesc("cpu compact with scan"); count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); + double tCpuCompScanPot = StreamCompaction::CPU::last_runtime; zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); //printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); + double tEffCompScanPot = StreamCompaction::Efficient::last_runtime; zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + double tEffCompScanNpot = StreamCompaction::Efficient::last_runtime; + + zeroArray(SIZE, c); + printDesc("real work-efficient compact, power-of-two"); + count = StreamCompaction::RealEfficient::compact(SIZE, c, a); + //printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + double tRealEffCompScanPot = StreamCompaction::RealEfficient::last_runtime; + + zeroArray(SIZE, c); + printDesc("real work-efficient compact, non-power-of-two"); + count = StreamCompaction::RealEfficient::compact(NPOT, c, a); + //printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + double tRealEffCompScanNpot = StreamCompaction::RealEfficient::last_runtime; + + fprintf(stderr, "[%d, %d, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f]\n", + SIZE, blkSize, + tCpuScanPot, tNaiveScanPot, tEffScanPot, tRealEffScanPot, tThrustScanPot, + tCpuCompNoscanPot, tCpuCompScanPot, tEffCompScanPot, tRealEffCompScanPot + ); + + delete a; + delete b; + delete c; + + return 0; } diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..de726ec 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -7,11 +7,13 @@ set(SOURCE_FILES "naive.cu" "efficient.h" "efficient.cu" + "real_efficient.h" + "real_efficient.cu" "thrust.h" "thrust.cu" ) cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_50 ) diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index e600c29..6f952b6 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -4,12 +4,22 @@ namespace StreamCompaction { namespace CPU { +double last_runtime; + /** * CPU scan (prefix sum). */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + double t1 = clock(); + + int t = 0; + for (int i = 0; i < n; i++) { + odata[i] = t; + t += idata[i]; + } + + double t2 = clock(); + last_runtime = 1.0E6*(t2-t1)/CLOCKS_PER_SEC; } /** @@ -18,8 +28,19 @@ void scan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. */ int compactWithoutScan(int n, int *odata, const int *idata) { - // TODO - return -1; + double t1 = clock(); + + int oIdx = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[oIdx] = idata[i]; + oIdx++; + } + } + + double t2 = clock(); + last_runtime = 1.0E6*(t2-t1)/CLOCKS_PER_SEC; + return oIdx; } /** @@ -28,8 +49,30 @@ int compactWithoutScan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { - // TODO - return -1; + double t1 = clock(); + + int *keep = new int[n]; + for (int i = 0; i < n; i++) { + keep[i] = (idata[i] != 0) ? 1 : 0; + } + + int *keepScan = new int[n]; + int nKeep = 0; + scan(n, keepScan, keep); + for (int i = 0; i < n; i++) { + if (!keep[i]) + continue; + + nKeep++; + odata[keepScan[i]] = idata[i]; + } + + double t2 = clock(); + last_runtime = 1.0E6*(t2-t1)/CLOCKS_PER_SEC; + + delete keepScan; + delete keep; + return nKeep; } } diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h index 6348bf3..7e87420 100644 --- a/stream_compaction/cpu.h +++ b/stream_compaction/cpu.h @@ -2,6 +2,8 @@ namespace StreamCompaction { namespace CPU { + extern double last_runtime; + void scan(int n, int *odata, const int *idata); int compactWithoutScan(int n, int *odata, const int *idata); diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index b2f739b..31ad328 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -6,14 +6,99 @@ namespace StreamCompaction { namespace Efficient { -// TODO: __global__ +double last_runtime; +int blkSize = 256; + +// perform reduction +__global__ void kernScanUp(int n, int dPow, int *data) { + int k = blockIdx.x * blockDim.x + threadIdx.x; + if (k % dPow != 0 || k + dPow - 1 >= n) + return; + + data[k + dPow - 1] += data[k + dPow/2 - 1]; +} + +// perform reduction +__global__ void kernScanDown(int n, int dPow, int *data) { + int k = blockIdx.x * blockDim.x + threadIdx.x; + if (k % dPow != 0 || k + dPow - 1 >= n) + return; + + int t = data[k + dPow/2 - 1]; + data[k + dPow/2 - 1] = data[k + dPow - 1]; + data[k + dPow - 1] += t; +} + +// mark nonzeroes +__global__ void kernMark(int n, int *keep, const int *data) { + int k = blockIdx.x * blockDim.x + threadIdx.x; + if (k >= n) + return; + + keep[k] = (data[k] != 0) ? 1 : 0; +} + +__global__ void kernScatter(int n, int *out, const int *keep, const int *scan, const int *data) { + int k = blockIdx.x * blockDim.x + threadIdx.x; + if (k >= n) + return; + + if (keep[k]) { + out[scan[k]] = data[k]; + } +} + +static int getPot(int n) { + unsigned int pot = n; + pot--; + pot |= pot >> 1; + pot |= pot >> 2; + pot |= pot >> 4; + pot |= pot >> 8; + pot |= pot >> 16; + pot++; + + return pot; +} + +static void devScanUtil(int n, int *devData) { + int pot = getPot(n); + + dim3 blkDim(blkSize); + + int dPow = 2; + while (dPow/2 < n) { + dim3 blkCnt((pot + blkDim.x - 1)/blkDim.x); + kernScanUp<<>>(pot, dPow, devData); + dPow *= 2; + } + cudaMemset(&devData[pot-1], 0, sizeof(int)); + + while (dPow > 1) { + dim3 blkCnt((pot + blkDim.x - 1)/blkDim.x); + kernScanDown<<>>(pot, dPow, devData); + dPow /= 2; + } +} /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + int pot = getPot(n); + + int *devData; + cudaMalloc((void**)&devData, pot*sizeof(int)); + cudaMemset(devData, 0, pot*sizeof(int)); + cudaMemcpy(devData, idata, n*sizeof(int), cudaMemcpyHostToDevice); + + double t1 = clock(); + devScanUtil(n, devData); + double t2 = clock(); + last_runtime = 1.0E6 * (t2-t1) / CLOCKS_PER_SEC; + + cudaMemcpy(odata, devData, n*sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(devData); } /** @@ -26,8 +111,45 @@ void scan(int n, int *odata, const int *idata) { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { - // TODO - return -1; + int pot = getPot(n); + + // upload data + int *devData; + cudaMalloc((void**)&devData, n*sizeof(int)); + cudaMemcpy(devData, idata, n*sizeof(int), cudaMemcpyHostToDevice); + + dim3 blkDim(blkSize); + dim3 blkCnt((n + blkDim.x - 1)/blkDim.x); + + // mark values to keep + int *devKeep, *devScan; + cudaMalloc((void**)&devKeep, pot*sizeof(int)); + cudaMalloc((void**)&devScan, pot*sizeof(int)); + cudaMemset(devKeep, 0, pot*sizeof(int)); + + double t1 = clock(); + kernMark<<>>(n, devKeep, devData); + cudaMemcpy(devScan, devKeep, pot*sizeof(int), cudaMemcpyDeviceToDevice); + + // scan boolean array + devScanUtil(n, devScan); + int nKeep; + cudaMemcpy(&nKeep, &devScan[pot-1], sizeof(int), cudaMemcpyDeviceToHost); + + // scatter to output + int *devOut; + cudaMalloc((void**)&devOut, n*sizeof(int)); + kernScatter<<>>(n, devOut, devKeep, devScan, devData); + double t2 = clock(); + last_runtime = 1.0E6 * (t2-t1) / CLOCKS_PER_SEC; + cudaMemcpy(odata, devOut, nKeep*sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(devOut); + cudaFree(devData); + cudaFree(devKeep); + cudaFree(devScan); + + return nKeep; } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 395ba10..fb54bde 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -2,6 +2,9 @@ namespace StreamCompaction { namespace Efficient { + extern double last_runtime; + extern int blkSize; + void scan(int n, int *odata, const int *idata); int compact(int n, int *odata, const int *idata); diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3d86b60..e89b742 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -6,14 +6,67 @@ namespace StreamCompaction { namespace Naive { -// TODO: __global__ +double last_runtime; +int blkSize = 256; + +__global__ void kernScan(int n, int dPow, int *odata, const int *idata) { + int k = blockIdx.x * blockDim.x + threadIdx.x; + if (k >= n) + return; + + if (k >= dPow) + odata[k] = idata[k - dPow] + idata[k]; + else + odata[k] = idata[k]; +} + +__global__ void kernInclToExcl(int n, int *odata, const int *idata) { + int k = blockIdx.x * blockDim.x + threadIdx.x; + if (k >= n-1) + return; + + odata[k+1] = idata[k]; +}; /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + int dPow = 1, dLogPow = 0; + int *devData[2]; + cudaMalloc((void**)&devData[0], n*sizeof(int)); + cudaMalloc((void**)&devData[1], n*sizeof(int)); + cudaMemcpy(devData[0], idata, n*sizeof(int), cudaMemcpyHostToDevice); + cudaMemset(devData[1], 0, n*sizeof(int)); + + double t1 = clock(); + + dim3 blkDim(blkSize); + dim3 blkCnt((n + blkDim.x - 1)/blkDim.x); + + int dst, src; + while (dPow/2 < n) { + src = dLogPow % 2; + dst = 1 - src; + kernScan<<>>(n, dPow, devData[dst], devData[src]); + dPow *= 2; + dLogPow++; + cudaDeviceSynchronize(); + } + + src = dLogPow % 2; + dst = 1 - src; + kernInclToExcl<<>>(n, devData[dst], devData[src]); + cudaDeviceSynchronize(); + + double t2=clock(); + last_runtime = 1.0E6 * (t2-t1) / CLOCKS_PER_SEC; + + cudaMemcpy(odata, devData[dst], n*sizeof(int), cudaMemcpyDeviceToHost); + odata[0] = 0; + + cudaFree(devData[0]); + cudaFree(devData[1]); } } diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 21152d6..060df1d 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -2,6 +2,9 @@ namespace StreamCompaction { namespace Naive { + extern double last_runtime; + extern int blkSize; + void scan(int n, int *odata, const int *idata); } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index d8dbb32..b74f882 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -9,6 +9,8 @@ namespace StreamCompaction { namespace Thrust { +double last_runtime; + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ @@ -16,6 +18,14 @@ void scan(int n, int *odata, const int *idata) { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + thrust::device_vector devIn(idata, idata+n), devOut(n); + double t1 = clock(); + thrust::exclusive_scan(devIn.begin(), devIn.end(), devOut.begin()); + double t2 = clock(); + last_runtime = 1.0E6 * (t2-t1) / CLOCKS_PER_SEC; + + thrust::copy(devOut.begin(), devOut.end(), odata); } } diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h index 06707f3..e59174f 100644 --- a/stream_compaction/thrust.h +++ b/stream_compaction/thrust.h @@ -2,6 +2,8 @@ namespace StreamCompaction { namespace Thrust { + extern double last_runtime; + void scan(int n, int *odata, const int *idata); } }