CIS565-Fall-2017 · phyrephox · Sep 12, 2017 · Sep 15, 2017 · Sep 16, 2017 · Sep 17, 2017
diff --git a/README.md b/README.md
@@ -3,11 +3,24 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Alexander Perry
+* Tested on: Windows 10, i5-2410M @ 2.30GHz 8GB, NVS 4200M
 
-### (TODO: Your README)
+### Analysis
+![](./img/power_of_two_release.PNG)
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+![](./img/non_power_of_two_release.PNG)
 
+The most obvious thing to note is that longer arrays take longer to sort.
+Beyond that we can see that algorithms that are supposed to more efficient are generally less efficient as implemented.
+The CPU algorithm is the fastest algorithm, followed by the thrust implementation and then by the naive GPU solution, with the work-efficient GPU solution.
+
+I was able to get occupancy of the GPU for all of my algorithm up to 100\%. There are a maximum of 8 blocks and 1536 threads on my GPU. This leads to an optimum number of threads per block of 192. This is also only 6 warps per block and 48 warps total.
+
+Based on the timeline, there are many back and forth copys from the GPU to main memory in the thrust implementation.
+Even though the thrust implementation uses all of these copys, it is still faster than my GPU attempts. I suspect this is due to memory accesses.
+
+![](./img/power_of_two.PNG)
+
+The previous graph is based on a debug build.
+An interesting thing to note is that the thrust implementation is less optimal in a debug setting compared to the release build.
diff --git a/img/non_power_of_two.PNG b/img/non_power_of_two.PNG
diff --git a/img/non_power_of_two_release.PNG b/img/non_power_of_two_release.PNG
diff --git a/img/power_of_two.PNG b/img/power_of_two.PNG
diff --git a/img/power_of_two_release.PNG b/img/power_of_two_release.PNG
diff --git a/src/main.cpp b/src/main.cpp
@@ -14,130 +14,131 @@
 #include "testing_helpers.hpp"
 
 const int SIZE = 1 << 8; // feel free to change the size of array
+//const int SIZE = 1 << 3; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int a[SIZE], b[SIZE], c[SIZE];
 
 int main(int argc, char* argv[]) {
-    // Scan tests
-
-    printf("\n");
-    printf("****************\n");
-    printf("** SCAN TESTS **\n");
-    printf("****************\n");
-
-    genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
-    a[SIZE - 1] = 0;
-    printArray(SIZE, a, true);
-
-    // initialize b using StreamCompaction::CPU::scan you implement
-    // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct.
-    // At first all cases passed because b && c are all zeroes.
-    zeroArray(SIZE, b);
-    printDesc("cpu scan, power-of-two");
-    StreamCompaction::CPU::scan(SIZE, b, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(SIZE, b, true);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu scan, non-power-of-two");
-    StreamCompaction::CPU::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(NPOT, b, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("naive scan, power-of-two");
-    StreamCompaction::Naive::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("naive scan, non-power-of-two");
-    StreamCompaction::Naive::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient scan, power-of-two");
-    StreamCompaction::Efficient::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient scan, non-power-of-two");
-    StreamCompaction::Efficient::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("thrust scan, power-of-two");
-    StreamCompaction::Thrust::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("thrust scan, non-power-of-two");
-    StreamCompaction::Thrust::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
-    printCmpResult(NPOT, b, c);
-
-    printf("\n");
-    printf("*****************************\n");
-    printf("** STREAM COMPACTION TESTS **\n");
-    printf("*****************************\n");
-
-    // Compaction tests
-
-    genArray(SIZE - 1, a, 4);  // Leave a 0 at the end to test that edge case
-    a[SIZE - 1] = 0;
-    printArray(SIZE, a, true);
-
-    int count, expectedCount, expectedNPOT;
-
-    // initialize b using StreamCompaction::CPU::compactWithoutScan you implement
-    // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct.
-    zeroArray(SIZE, b);
-    printDesc("cpu compact without scan, power-of-two");
-    count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    expectedCount = count;
-    printArray(count, b, true);
-    printCmpLenResult(count, expectedCount, b, b);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu compact without scan, non-power-of-two");
-    count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    expectedNPOT = count;
-    printArray(count, c, true);
-    printCmpLenResult(count, expectedNPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu compact with scan");
-    count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(count, c, true);
-    printCmpLenResult(count, expectedCount, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient compact, power-of-two");
-    count = StreamCompaction::Efficient::compact(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
-    printCmpLenResult(count, expectedCount, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient compact, non-power-of-two");
-    count = StreamCompaction::Efficient::compact(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
-    printCmpLenResult(count, expectedNPOT, b, c);
-
-    system("pause"); // stop Win32 console from closing on exit
+  // Scan tests
+
+  printf("\n");
+  printf("****************\n");
+  printf("** SCAN TESTS **\n");
+  printf("****************\n");
+
+  genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
+  a[SIZE - 1] = 0;
+  printArray(SIZE, a, true);
+
+  // initialize b using StreamCompaction::CPU::scan you implement
+  // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct.
+  // At first all cases passed because b && c are all zeroes.
+  zeroArray(SIZE, b);
+  printDesc("cpu scan, power-of-two");
+  StreamCompaction::CPU::scan(SIZE, b, a);
+  printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+  printArray(SIZE, b, true);
+
+  zeroArray(SIZE, c);
+  printDesc("cpu scan, non-power-of-two");
+  StreamCompaction::CPU::scan(NPOT, c, a);
+  printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+  printArray(NPOT, b, true);
+  printCmpResult(NPOT, b, c);
+
+  zeroArray(SIZE, c);
+  printDesc("naive scan, power-of-two");
+  StreamCompaction::Naive::scan(SIZE, c, a);
+  printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+  //printArray(SIZE, c, true);
+  printCmpResult(SIZE, b, c);
+
+  zeroArray(SIZE, c);
+  printDesc("naive scan, non-power-of-two");
+  StreamCompaction::Naive::scan(NPOT, c, a);
+  printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+  //printArray(SIZE, c, true);
+  printCmpResult(NPOT, b, c);
+
+  zeroArray(SIZE, c);
+  printDesc("work-efficient scan, power-of-two");
+  StreamCompaction::Efficient::scan(SIZE, c, a);
+  printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+  //printArray(SIZE, c, true);
+  printCmpResult(SIZE, b, c);
+
+  zeroArray(SIZE, c);
+  printDesc("work-efficient scan, non-power-of-two");
+  StreamCompaction::Efficient::scan(NPOT, c, a);
+  printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+  //printArray(NPOT, c, true);
+  printCmpResult(NPOT, b, c);
+
+  zeroArray(SIZE, c);
+  printDesc("thrust scan, power-of-two");
+  StreamCompaction::Thrust::scan(SIZE, c, a);
+  printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+  //printArray(SIZE, c, true);
+  printCmpResult(SIZE, b, c);
+
+  zeroArray(SIZE, c);
+  printDesc("thrust scan, non-power-of-two");
+  StreamCompaction::Thrust::scan(NPOT, c, a);
+  printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+  //printArray(NPOT, c, true);
+  printCmpResult(NPOT, b, c);
+
+  printf("\n");
+  printf("*****************************\n");
+  printf("** STREAM COMPACTION TESTS **\n");
+  printf("*****************************\n");
+
+  // Compaction tests
+
+  genArray(SIZE - 1, a, 4);  // Leave a 0 at the end to test that edge case
+  a[SIZE - 1] = 0;
+  printArray(SIZE, a, true);
+
+  int count, expectedCount, expectedNPOT;
+
+  // initialize b using StreamCompaction::CPU::compactWithoutScan you implement
+  // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct.
+  zeroArray(SIZE, b);
+  printDesc("cpu compact without scan, power-of-two");
+  count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
+  printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+  expectedCount = count;
+  printArray(count, b, true);
+  printCmpLenResult(count, expectedCount, b, b);
+
+  zeroArray(SIZE, c);
+  printDesc("cpu compact without scan, non-power-of-two");
+  count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
+  printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+  expectedNPOT = count;
+  printArray(count, c, true);
+  printCmpLenResult(count, expectedNPOT, b, c);
+
+  zeroArray(SIZE, c);
+  printDesc("cpu compact with scan");
+  count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
+  printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+  printArray(count, c, true);
+  printCmpLenResult(count, expectedCount, b, c);
+
+  zeroArray(SIZE, c);
+  printDesc("work-efficient compact, power-of-two");
+  count = StreamCompaction::Efficient::compact(SIZE, c, a);
+  printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+  //printArray(count, c, true);
+  printCmpLenResult(count, expectedCount, b, c);
+
+  zeroArray(SIZE, c);
+  printDesc("work-efficient compact, non-power-of-two");
+  count = StreamCompaction::Efficient::compact(NPOT, c, a);
+  printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+  //printArray(count, c, true);
+  printCmpLenResult(count, expectedNPOT, b, c);
+
+  system("pause"); // stop Win32 console from closing on exit
 }
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
@@ -1,39 +1,47 @@
 #include "common.h"
 
 void checkCUDAErrorFn(const char *msg, const char *file, int line) {
-    cudaError_t err = cudaGetLastError();
-    if (cudaSuccess == err) {
-        return;
-    }
+  cudaError_t err = cudaGetLastError();
+  if (cudaSuccess == err) {
+    return;
+  }
 
-    fprintf(stderr, "CUDA error");
-    if (file) {
-        fprintf(stderr, " (%s:%d)", file, line);
-    }
-    fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
-    exit(EXIT_FAILURE);
+  fprintf(stderr, "CUDA error");
+  if (file) {
+    fprintf(stderr, " (%s:%d)", file, line);
+  }
+  fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
+  exit(EXIT_FAILURE);
 }
 
 
 namespace StreamCompaction {
-    namespace Common {
+namespace Common {
 
-        /**
-         * Maps an array to an array of 0s and 1s for stream compaction. Elements
-         * which map to 0 will be removed, and elements which map to 1 will be kept.
-         */
-        __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
-            // TODO
-        }
-
-        /**
-         * Performs scatter on an array. That is, for each element in idata,
-         * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
-         */
-        __global__ void kernScatter(int n, int *odata,
-                const int *idata, const int *bools, const int *indices) {
-            // TODO
-        }
+/**
+ * Maps an array to an array of 0s and 1s for stream compaction. Elements
+ * which map to 0 will be removed, and elements which map to 1 will be kept.
+ */
+__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
+  int index = threadIdx.x + blockDim.x * blockIdx.x;
+  if (index < n) {
+    bools[index] = idata[index] == 0 ? 0 : 1;
+  }
+}
 
+/**
+ * Performs scatter on an array. That is, for each element in idata,
+ * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
+ */
+__global__ void kernScatter(int n, int *odata,
+  const int *idata, const int *bools, const int *indices) {
+  int index = threadIdx.x + blockDim.x * blockIdx.x;
+  if (index < n) {
+    if (bools[index]) {
+      odata[indices[index]] = idata[index];
     }
+  }
+}
+
+}
 }