CIS565-Fall-2016 · krupkad · Sep 25, 2016 · Sep 25, 2016 · Sep 25, 2016 · Sep 25, 2016
diff --git a/README.md b/README.md
@@ -3,11 +3,94 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Daniel Krupka
+* Tested on: Debian testing (stretch), Intel(R) Core(TM) i7-4710HQ CPU @ 2.50GHz 8GB, GTX 850M
 
-### (TODO: Your README)
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+# Project 2 - Stream Compaction
+This project's goal was to compare various methods for achieving [stream compaction](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html).
+The test program was modified to take block size and array size as arguments, and ran tests for each algorithm on both
+power-of-two and non-power-of-two data. Test output was the following:
+```
+****************
+** SCAN TESTS **
+****************
+    [  33  36  27  15  43  35  36  42  49  21  12  27  40 ...   6   0 ]
+==== cpu scan, power-of-two ====
+    [   0  33  69  96 111 154 189 225 267 316 337 349 376 ... 12852633 12852639 ]
+==== cpu scan, non-power-of-two ====
+    [   0  33  69  96 111 154 189 225 267 316 337 349 376 ... 12852608 12852617 ]
+    passed
+==== naive scan, power-of-two ====
+    passed
+==== naive scan, non-power-of-two ====
+    passed
+==== work-efficient scan, power-of-two ====
+    passed
+==== work-efficient scan, non-power-of-two ====
+    passed
+==== real work-efficient scan, power-of-two ====
+    passed
+==== real work-efficient scan, non-power-of-two ====
+    passed
+==== thrust scan, power-of-two ====
+    passed
+==== thrust scan, non-power-of-two ====
+    passed
 
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   3   2   1   3   1   3   2   0   1   1   2   3   2 ...   0   0 ]
+==== cpu compact without scan, power-of-two ====
+    [   3   2   1   3   1   3   2   1   1   2   3   2   3 ...   1   3 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+    [   3   2   1   3   1   3   2   1   1   2   3   2   3 ...   3   1 ]
+    passed
+==== cpu compact with scan ====
+    passed
+==== work-efficient compact, power-of-two ====
+    passed
+==== work-efficient compact, non-power-of-two ====
+    passed
+==== real work-efficient compact, power-of-two ====
+    passed
+==== real work-efficient compact, non-power-of-two ====
+    passed
+```
+
+# Analysis - Scanning
+A major step of compaction is scanning. I tested a CPU implementation, a naive CUDA
+implementation, two efficient CUDA implementations, and the Thrust library's implementation.
+
+![Scan Comparison 1](images/times_blk256.png "Scan Comparison 1")
+
+Interestingly, the Thrust implementation fared the worst, though Nvidia's NSight profiler
+showed that Thrust was not actually using much GPU time. A likely explanation is that Thrust
+may be shuffling data or partitioning the work between CPU and GPU.
+
+![Scan Comparison 2, no thrust](images/times_blk256_nothrust.png "Scan Comparison 2, no thrust")
+![Scan Comparison 2, no thrust (zoomed)](images/times_blk256_nothrust_zoom.png "Scan Comparison 2, no thrust (zoomed)")
+
+Looking at the other implementations on their own shows that the GPU implementations are substantially
+faster than the CPU for large workloads, but somewhat slower on small ones. This makes sense, as for small loads,
+the GPU is nowhere near fully saturated.
+
+# Analysis - Compaction
+Moving on to compaction, the CPU fares even worse.
+![Compaction Comparison](images/times_all_comp.png "Compaction Comparison")
+
+Focusing on only the GPU implementations, we see that the more optimized version
+begins to perform noticeably better, whereas the two were mostly indistinguishable
+for simple scanning.
+![Compaction Comparison](images/times_all_comp_zoom.png "Compaction Comparison")
+
+For the non-optimized GPU reduction, block size had a substantial effect, with
+a 1024-thread block performing about 1.3x faster than a 128-thread block.
+![Block Comparison, unoptimized](images/times_blk_eff.png "Block Comparison, unoptimized")
+
+The optimized GPU reduction showed much less variance with block size. This is likely due to
+the active threads no longer being scattered between different warps, allowing for more early
+termination and lowering the number of active warps and thus blocks.
+![Block Comparison, optimized](images/times_blk_realeff.png "Block Comparison, optimized")
diff --git a/cis565_stream_compaction_test.launch b/cis565_stream_compaction_test.launch
@@ -8,8 +8,8 @@
 <stringAttribute key="org.eclipse.cdt.launch.DEBUGGER_START_MODE" value="run"/>
 <stringAttribute key="org.eclipse.cdt.launch.PROGRAM_NAME" value="build/cis565_stream_compaction_test"/>
 <stringAttribute key="org.eclipse.cdt.launch.PROJECT_ATTR" value="Project2-Stream-Compaction"/>
-<booleanAttribute key="org.eclipse.cdt.launch.PROJECT_BUILD_CONFIG_AUTO_ATTR" value="true"/>
-<stringAttribute key="org.eclipse.cdt.launch.PROJECT_BUILD_CONFIG_ID_ATTR" value=""/>
+<booleanAttribute key="org.eclipse.cdt.launch.PROJECT_BUILD_CONFIG_AUTO_ATTR" value="false"/>
+<stringAttribute key="org.eclipse.cdt.launch.PROJECT_BUILD_CONFIG_ID_ATTR" value="com.nvidia.cuda.ide.toolchain.base.1399573849.1760580076"/>
 <booleanAttribute key="org.eclipse.cdt.launch.use_terminal" value="true"/>
 <listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_PATHS">
 <listEntry value="/Project2-Stream-Compaction"/>
@@ -18,8 +18,8 @@
 <listEntry value="4"/>
 </listAttribute>
 <listAttribute key="org.eclipse.debug.ui.favoriteGroups">
-<listEntry value="org.eclipse.debug.ui.launchGroup.profile"/>
 <listEntry value="org.eclipse.debug.ui.launchGroup.debug"/>
+<listEntry value="org.eclipse.debug.ui.launchGroup.profile"/>
 <listEntry value="org.eclipse.debug.ui.launchGroup.run"/>
 </listAttribute>
 <stringAttribute key="org.eclipse.dsf.launch.MEMORY_BLOCKS" value="&lt;?xml version=&quot;1.0&quot; encoding=&quot;UTF-8&quot; standalone=&quot;no&quot;?&gt;&#10;&lt;memoryBlockExpressionList context=&quot;reserved-for-future-use&quot;/&gt;&#10;"/>

diff --git a/images/times_all_comp.png b/images/times_all_comp.png
diff --git a/images/times_all_comp_zoom.png b/images/times_all_comp_zoom.png
diff --git a/images/times_blk256.png b/images/times_blk256.png
diff --git a/images/times_blk256_nothrust.png b/images/times_blk256_nothrust.png
diff --git a/images/times_blk256_nothrust_zoom.png b/images/times_blk256_nothrust_zoom.png
diff --git a/images/times_blk_eff.png b/images/times_blk_eff.png
diff --git a/images/times_blk_naive.png b/images/times_blk_naive.png
diff --git a/images/times_blk_realeff.png b/images/times_blk_realeff.png
diff --git a/images/times_cpu_comp.png b/images/times_cpu_comp.png
diff --git a/src/main.cpp b/src/main.cpp
@@ -10,13 +10,30 @@
 #include <stream_compaction/cpu.h>
 #include <stream_compaction/naive.h>
 #include <stream_compaction/efficient.h>
+#include <stream_compaction/real_efficient.h>
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
+#include <cstdlib>
+#include <ctime>
+
+
 int main(int argc, char* argv[]) {
-    const int SIZE = 1 << 8;
-    const int NPOT = SIZE - 3;
-    int a[SIZE], b[SIZE], c[SIZE];
+    double t1,t2;
+
+    int sizeExp = 19;
+    int blkSize = 256;
+    if (argc >= 3) {
+      sizeExp = atoi(argv[1]);
+      blkSize = atoi(argv[2]);
+    }
+    int SIZE = 1 << sizeExp;
+    int NPOT = SIZE - 3;
+    int *a = new int[SIZE], *b = new int[SIZE], *c = new int[SIZE];
+
+    StreamCompaction::Naive::blkSize = blkSize;
+    StreamCompaction::Efficient::blkSize = blkSize;
+    StreamCompaction::RealEfficient::blkSize = blkSize;
 
     // Scan tests
 
@@ -33,48 +50,72 @@ int main(int argc, char* argv[]) {
     printDesc("cpu scan, power-of-two");
     StreamCompaction::CPU::scan(SIZE, b, a);
     printArray(SIZE, b, true);
+    double tCpuScanPot = StreamCompaction::CPU::last_runtime;
 
     zeroArray(SIZE, c);
     printDesc("cpu scan, non-power-of-two");
     StreamCompaction::CPU::scan(NPOT, c, a);
     printArray(NPOT, b, true);
     printCmpResult(NPOT, b, c);
+    double tCpuScanNpot = StreamCompaction::CPU::last_runtime;
 
     zeroArray(SIZE, c);
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
     //printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
+    double tNaiveScanPot = StreamCompaction::Naive::last_runtime;
 
     zeroArray(SIZE, c);
     printDesc("naive scan, non-power-of-two");
     StreamCompaction::Naive::scan(NPOT, c, a);
     //printArray(SIZE, c, true);
     printCmpResult(NPOT, b, c);
+    double tNaiveScanNpot = StreamCompaction::Naive::last_runtime;
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, power-of-two");
     StreamCompaction::Efficient::scan(SIZE, c, a);
     //printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
+    double tEffScanPot = StreamCompaction::Efficient::last_runtime;
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, non-power-of-two");
     StreamCompaction::Efficient::scan(NPOT, c, a);
     //printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
+    double tEffScanNpot = StreamCompaction::Efficient::last_runtime;
+
+
+    zeroArray(SIZE, c);
+    printDesc("real work-efficient scan, power-of-two");
+    StreamCompaction::RealEfficient::scan(SIZE, c, a);
+    //printArray(SIZE, c, true);
+    printCmpResult(SIZE, b, c);
+    double tRealEffScanPot = StreamCompaction::RealEfficient::last_runtime;
+
+    zeroArray(SIZE, c);
+    printDesc("real work-efficient scan, non-power-of-two");
+    StreamCompaction::RealEfficient::scan(NPOT, c, a);
+    //printArray(NPOT, c, true);
+    printCmpResult(NPOT, b, c);
+    double tRealEffScanNpot = StreamCompaction::RealEfficient::last_runtime;
+
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, power-of-two");
     StreamCompaction::Thrust::scan(SIZE, c, a);
     //printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
+    double tThrustScanPot = StreamCompaction::Thrust::last_runtime;
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, non-power-of-two");
     StreamCompaction::Thrust::scan(NPOT, c, a);
     //printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
+    double tThrustScanNpot = StreamCompaction::Thrust::last_runtime;
 
     printf("\n");
     printf("*****************************\n");
@@ -95,29 +136,59 @@ int main(int argc, char* argv[]) {
     expectedCount = count;
     printArray(count, b, true);
     printCmpLenResult(count, expectedCount, b, b);
+    double tCpuCompNoscanPot = StreamCompaction::CPU::last_runtime;
 
     zeroArray(SIZE, c);
     printDesc("cpu compact without scan, non-power-of-two");
     count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
     expectedNPOT = count;
     printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
+    double tCpuCompNoscanNpot = StreamCompaction::CPU::last_runtime;
 
     zeroArray(SIZE, c);
     printDesc("cpu compact with scan");
     count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
-    printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
+    double tCpuCompScanPot = StreamCompaction::CPU::last_runtime;
 
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, power-of-two");
     count = StreamCompaction::Efficient::compact(SIZE, c, a);
     //printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
+    double tEffCompScanPot = StreamCompaction::Efficient::last_runtime;
 
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, non-power-of-two");
     count = StreamCompaction::Efficient::compact(NPOT, c, a);
     //printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
+    double tEffCompScanNpot = StreamCompaction::Efficient::last_runtime;
+
+    zeroArray(SIZE, c);
+    printDesc("real work-efficient compact, power-of-two");
+    count = StreamCompaction::RealEfficient::compact(SIZE, c, a);
+    //printArray(count, c, true);
+    printCmpLenResult(count, expectedCount, b, c);
+    double tRealEffCompScanPot = StreamCompaction::RealEfficient::last_runtime;
+
+    zeroArray(SIZE, c);
+    printDesc("real work-efficient compact, non-power-of-two");
+    count = StreamCompaction::RealEfficient::compact(NPOT, c, a);
+    //printArray(count, c, true);
+    printCmpLenResult(count, expectedNPOT, b, c);
+    double tRealEffCompScanNpot = StreamCompaction::RealEfficient::last_runtime;
+
+    fprintf(stderr, "[%d, %d, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f]\n",
+      SIZE, blkSize,
+      tCpuScanPot, tNaiveScanPot, tEffScanPot, tRealEffScanPot, tThrustScanPot,
+      tCpuCompNoscanPot, tCpuCompScanPot, tEffCompScanPot, tRealEffCompScanPot
+      );
+
+    delete[] a;  // a, b, c were allocated with new int[SIZE]; array delete is required
+    delete[] b;
+    delete[] c;
+
+    return 0;
 }
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
@@ -7,11 +7,13 @@ set(SOURCE_FILES
     "naive.cu"
     "efficient.h"
     "efficient.cu"
+    "real_efficient.h"
+    "real_efficient.cu"
     "thrust.h"
     "thrust.cu"
     )
 
 cuda_add_library(stream_compaction
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_50
     )
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
@@ -4,12 +4,22 @@
 namespace StreamCompaction {
 namespace CPU {
 
+double last_runtime;
+
 /**
  * CPU scan (prefix sum).
  */
 void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
+    double t1 = clock();  // starting clock() tick count for timing
+
+    int t = 0;  // running sum of idata[0..i-1]
+    for (int i = 0; i < n; i++) {
+      odata[i] = t;  // exclusive scan: store the sum of all earlier elements
+      t += idata[i];
+    }
+
+    double t2 = clock();
+    last_runtime = 1.0E6*(t2-t1)/CLOCKS_PER_SEC;  // clock() ticks converted to microseconds
 }
 
 /**
@@ -18,8 +28,19 @@ void scan(int n, int *odata, const int *idata) {
  * @returns the number of elements remaining after compaction.
  */
 int compactWithoutScan(int n, int *odata, const int *idata) {
-    // TODO
-    return -1;
+    double t1 = clock();  // starting clock() tick count for timing
+
+    int oIdx = 0;  // next free slot in odata; also the count of elements kept so far
+    for (int i = 0; i < n; i++) {
+      if (idata[i] != 0) {  // keep only non-zero elements, preserving their order
+        odata[oIdx] = idata[i];
+        oIdx++;
+      }
+    }
+
+    double t2 = clock();
+    last_runtime = 1.0E6*(t2-t1)/CLOCKS_PER_SEC;  // clock() ticks converted to microseconds
+    return oIdx;
 }
 
 /**
@@ -28,8 +49,30 @@ int compactWithoutScan(int n, int *odata, const int *idata) {
  * @returns the number of elements remaining after compaction.
  */
 int compactWithScan(int n, int *odata, const int *idata) {
-    // TODO
-    return -1;
+    double t1 = clock();  // starting clock() tick count; timing includes the allocations below
+
+    int *keep = new int[n];  // keep[i] is 1 where idata[i] is non-zero, else 0
+    for (int i = 0; i < n; i++) {
+      keep[i] = (idata[i] != 0) ? 1 : 0;
+    }
+
+    int *keepScan = new int[n];  // exclusive scan of keep[]: output index of each kept element
+    int nKeep = 0;
+    scan(n, keepScan, keep);  // also writes last_runtime; it is overwritten again below
+    for (int i = 0; i < n; i++) {
+      if (!keep[i])
+        continue;
+
+      nKeep++;
+      odata[keepScan[i]] = idata[i];  // scatter the kept element to its compacted slot
+    }
+
+    double t2 = clock();
+    last_runtime = 1.0E6*(t2-t1)/CLOCKS_PER_SEC;  // clock() ticks converted to microseconds
+
+    delete[] keepScan;  // array form required: both buffers were allocated with new[]
+    delete[] keep;
+    return nKeep;
 }
 
 }

diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h
@@ -2,6 +2,8 @@
 
 namespace StreamCompaction {
 namespace CPU {
+    extern double last_runtime;
+
     void scan(int n, int *odata, const int *idata);
 
     int compactWithoutScan(int n, int *odata, const int *idata);