CIS565-Fall-2016 · aksris · Sep 18, 2016 · Sep 19, 2016 · Sep 24, 2016 · Sep 24, 2016
diff --git a/README.md b/README.md
@@ -3,11 +3,97 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Akshay Shah
+* Tested on: Windows 10, i7-5700HQ @ 2.70GHz 16GB, GTX 970M 6GB (Personal Computer)
 
-### (TODO: Your README)
+### Stream Compaction
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+Block Size: 256
 
+Array input: vary from 2<sup>18</sup> to 2<sup>26</sup>.
+
+Max size: 2<sup>30</sup>
+
+CUDA Summary
+
+![](images/GPUCUDAstats.PNG)
+
+![](images/cudaCoreStats.PNG)
+
+Memory I/O looks to be the bottleneck over the GPU, meaning changing the variables over to shared memory might make a difference. (not checked, just speculating)
+
+
+#### Output
+```
+****************
+** SCAN TESTS **
+****************
+    [  38  19  38  37   5  47  15  35   0  12   3   0  42 ...  38   0 ]
+==== cpu scan, power-of-two ====
+215 ms total, average : 215 ms.
+==== cpu scan, non-power-of-two ====
+    passed
+==== naive scan, power-of-two ====
+total time to run naive scan, power-of-two: 187.5 in ms, aver: 187.5
+    passed
+==== naive scan, non-power-of-two ====
+    passed
+==== work-efficient scan, power-of-two ====
+total time to run efficient scan, power-of-two: 171.601 in ms, aver: 171.601
+    passed
+==== work-efficient scan, non-power-of-two ====
+    passed
+==== thrust scan, power-of-two ====
+49 ms total thrust, average : 49 ms.
+    passed
+==== thrust scan, non-power-of-two ====
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   2   3   2   1   3   1   1   1   2   0   1   0   2 ...   0   0 ]
+==== cpu compact without scan, power-of-two ====
+182 ms total cpu w/o scan, average : 182 ms.
+    [   2   3   2   1   3   1   1   1   2   1   2   1   1 ...   1   1 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+    [   2   3   2   1   3   1   1   1   2   1   2   1   1 ...   2   2 ]
+    passed
+==== cpu compact with scan ====
+356 ms total cpu w scan, average : 356 ms.
+    [   2   3   2   1   3   1   1   1   2   1   2   1   1 ...   1   1 ]
+    passed
+==== work-efficient compact, power-of-two ====
+total time to run efficient compact, power-of-two: 186.149 in ms, aver: 186.149
+    passed
+==== work-efficient compact, non-power-of-two ====
+    [   2   3   2   1   3   1   1   1   2   1   2   1   1 ...   2   2 ]
+    passed
+
+*****************************
+** RADIX SORT TESTS **
+*****************************
+    [   3   5   0   1   0   2   0   1   6   1   1   2   1 ...   0   0 ]
+==== radix sort, power-of-two ====
+total time to run radix, power-of-two: 67.8532 in ms, aver: 67.8532
+    [   0   0   0   0   0   0   0   0   0   0   0   0   0 ...   6   6 ]
+==== thrust sort, power-of-two ====
+515 ms total thrust sort, average : 515 ms.
+```
+
+![Fig1](images/naivevsefficient.png)
+
+##### A comparison of Stream Compaction over the GPU vs CPU
+
+The time was measured for 1000 iterations
+Exclusive scan was used inside the stream compaction over GPU, which is shown (CPU vs GPU scan) in Fig. 1
+
+![Fig2](images/streamcompact.png)
+
+##### Radix sort
+Implemented Radix sort on the GPU, that uses exclusive scan. The output shows a sorted array.
+Limitation to this implementation is that the max digit that can be used for sorting is 7. To increase the digit limit, you would have to change the lsb iteration number in sort.cu to whatever bit is the maximum.
+There is a comparison between thrust's CPU sort and this GPU sort.
+
+![Fig3](images/sortcpuvgpu.png)
diff --git a/images/GPUCUDAstats.PNG b/images/GPUCUDAstats.PNG
diff --git a/images/cudaCoreStats.PNG b/images/cudaCoreStats.PNG
diff --git a/images/naivevsefficient.png b/images/naivevsefficient.png
diff --git a/images/sortcpuvgpu.png b/images/sortcpuvgpu.png
diff --git a/images/streamcompact.png b/images/streamcompact.png
diff --git a/src/main.cpp b/src/main.cpp
@@ -11,13 +11,23 @@
 #include <stream_compaction/naive.h>
 #include <stream_compaction/efficient.h>
 #include <stream_compaction/thrust.h>
+#include <stream_compaction/sort.h>
 #include "testing_helpers.hpp"
+#include <iostream>
+#include <chrono>
+
+#define ITER 1
 
 int main(int argc, char* argv[]) {
-    const int SIZE = 1 << 8;
+    const size_t SIZE = 1 << 20;
     const int NPOT = SIZE - 3;
-    int a[SIZE], b[SIZE], c[SIZE];
+    //int a[SIZE], b[SIZE], c[SIZE];
+
+	int *a = new int[SIZE];
+	int *b = new int[SIZE];
+	int *c = new int[SIZE];
 
+	float time = 0.f, totalTime = 0.f;
     // Scan tests
 
     printf("\n");
@@ -31,42 +41,64 @@ int main(int argc, char* argv[]) {
 
     zeroArray(SIZE, b);
     printDesc("cpu scan, power-of-two");
-    StreamCompaction::CPU::scan(SIZE, b, a);
-    printArray(SIZE, b, true);
+	auto begin = std::chrono::high_resolution_clock::now();
+	for (int i = 0; i < ITER; ++i){
+		StreamCompaction::CPU::scan(SIZE, b, a);
+	}
+	auto end = std::chrono::high_resolution_clock::now();
+	auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
+	std::cout << (float)duration << " ms total, average : " << (float)duration / ITER << " ms." << std::endl;
+    //printArray(SIZE, b, true);
 
     zeroArray(SIZE, c);
     printDesc("cpu scan, non-power-of-two");
     StreamCompaction::CPU::scan(NPOT, c, a);
-    printArray(NPOT, b, true);
+    //printArray(NPOT, b, true);
     printCmpResult(NPOT, b, c);
 
+	totalTime = 0.f;
     zeroArray(SIZE, c);
     printDesc("naive scan, power-of-two");
-    StreamCompaction::Naive::scan(SIZE, c, a);
-    //printArray(SIZE, c, true);
+    //printArray(SIZE, c, false);
+	for (int i = 0; i < ITER; ++i) {
+		StreamCompaction::Naive::scan(SIZE, c, a, time);
+		totalTime += time;
+	}
+	std::cout << "total time to run naive scan, power-of-two: " << totalTime << " in ms, aver: "  << totalTime / ITER << std::endl;
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("naive scan, non-power-of-two");
-    StreamCompaction::Naive::scan(NPOT, c, a);
+    StreamCompaction::Naive::scan(NPOT, c, a, time);
     //printArray(SIZE, c, true);
     printCmpResult(NPOT, b, c);
 
+	totalTime = 0.f;
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, power-of-two");
-    StreamCompaction::Efficient::scan(SIZE, c, a);
-    //printArray(SIZE, c, true);
+	for (int i = 0; i < ITER; ++i) {
+		StreamCompaction::Efficient::scan(SIZE, c, a, time);
+		totalTime += time;
+	}
+	std::cout << "total time to run efficient scan, power-of-two: " << totalTime << " in ms, aver: " << totalTime / ITER << std::endl;
+    //printArray(SIZE, c, false);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, non-power-of-two");
-    StreamCompaction::Efficient::scan(NPOT, c, a);
+	StreamCompaction::Efficient::scan(NPOT, c, a, time);
     //printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, power-of-two");
-    StreamCompaction::Thrust::scan(SIZE, c, a);
+	begin = std::chrono::high_resolution_clock::now();
+	for (int i = 0; i < ITER; ++i){
+		StreamCompaction::Thrust::scan(SIZE, c, a);
+	}
+	end = std::chrono::high_resolution_clock::now();
+	duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
+	std::cout << (float)duration << " ms total thrust, average : " << (float)duration / ITER << " ms." << std::endl;
     //printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
@@ -91,7 +123,13 @@ int main(int argc, char* argv[]) {
 
     zeroArray(SIZE, b);
     printDesc("cpu compact without scan, power-of-two");
-    count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
+	begin = std::chrono::high_resolution_clock::now();
+	for (int i = 0; i < ITER; ++i){
+		count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
+	}
+	end = std::chrono::high_resolution_clock::now();
+	duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
+	std::cout << (float)duration << " ms total cpu w/o scan, average : " << (float)duration / ITER << " ms." << std::endl;
     expectedCount = count;
     printArray(count, b, true);
     printCmpLenResult(count, expectedCount, b, b);
@@ -105,19 +143,66 @@ int main(int argc, char* argv[]) {
 
     zeroArray(SIZE, c);
     printDesc("cpu compact with scan");
-    count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
+	begin = std::chrono::high_resolution_clock::now();
+	for (int i = 0; i < ITER; ++i){
+		count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
+	}
+	end = std::chrono::high_resolution_clock::now();
+	duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
+	std::cout << (float)duration << " ms total cpu w scan, average : " << (float)duration / ITER << " ms." << std::endl;
     printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
 
+	totalTime = 0.f;
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, power-of-two");
-    count = StreamCompaction::Efficient::compact(SIZE, c, a);
+	for (int i = 0; i < ITER; ++i) {
+		count = StreamCompaction::Efficient::compact(SIZE, c, a, time);
+		totalTime += time;
+	}
+	std::cout << "total time to run efficient compact, power-of-two: " << totalTime << " in ms, aver: " << totalTime / ITER << std::endl;
     //printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, non-power-of-two");
-    count = StreamCompaction::Efficient::compact(NPOT, c, a);
-    //printArray(count, c, true);
+	count = StreamCompaction::Efficient::compact(NPOT, c, a, time);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
+
+	printf("\n");
+	printf("*****************************\n");
+	printf("** RADIX SORT TESTS **\n");
+	printf("*****************************\n");
+
+	// SORT tests
+
+	genArray(SIZE, a, 7);  // Leave a 0 at the end to test that edge case
+	a[SIZE - 1] = 0;
+	printArray(SIZE, a, true);
+
+	totalTime = 0.f;
+	zeroArray(SIZE, b);
+	printDesc("radix sort, power-of-two");
+	for (int i = 0; i < ITER; ++i) {
+		StreamCompaction::Sort::sort(SIZE, b, a, time);
+		totalTime += time;
+	}
+	std::cout << "total time to run radix, power-of-two: " << totalTime << " in ms, aver: " << totalTime / ITER << std::endl;
+	printArray(SIZE, b, true);
+
+
+	zeroArray(SIZE, b);
+	printDesc("thrust sort, power-of-two");
+	begin = std::chrono::high_resolution_clock::now();
+	for (int i = 0; i < ITER; ++i){
+		StreamCompaction::Thrust::sort(SIZE, a);
+	}
+	end = std::chrono::high_resolution_clock::now();
+	float fduration = std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
+	std::cout << (float)fduration << " ms total thrust sort, average : " << (float)fduration / ITER << " ms." << std::endl;
+
+	delete[] a;
+	delete[] b;
+	delete[] c;
 }
diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp
@@ -47,6 +47,19 @@ void genArray(int n, int *a, int maxval) {
     }
 }
 
+void genArraySort(int n, int *a, int maxval) {
+	if (n == 8) {
+		a[0] = 4;
+		a[1] = 7;
+		a[2] = 2;
+		a[3] = 6;
+		a[4] = 3;
+		a[5] = 5;
+		a[6] = 1;
+		a[7] = 0;
+	}
+}
+
 void printArray(int n, int *a, bool abridged = false) {
     printf("    [ ");
     for (int i = 0; i < n; i++) {

diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
@@ -9,9 +9,11 @@ set(SOURCE_FILES
     "efficient.cu"
     "thrust.h"
     "thrust.cu"
+	"sort.h"
+	"sort.cu"
     )
 
 cuda_add_library(stream_compaction
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_52
     )
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
@@ -1,39 +1,45 @@
 #include "common.h"
 
 void checkCUDAErrorFn(const char *msg, const char *file, int line) {
-    cudaError_t err = cudaGetLastError();
-    if (cudaSuccess == err) {
-        return;
-    }
-
-    fprintf(stderr, "CUDA error");
-    if (file) {
-        fprintf(stderr, " (%s:%d)", file, line);
-    }
-    fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
-    exit(EXIT_FAILURE);
+	cudaError_t err = cudaGetLastError();
+	if (cudaSuccess == err) {
+		return;
+	}
+
+	fprintf(stderr, "CUDA error");
+	if (file) {
+		fprintf(stderr, " (%s:%d)", file, line);
+	}
+	fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
+	exit(EXIT_FAILURE);
 }
 
 
 namespace StreamCompaction {
-namespace Common {
-
-/**
- * Maps an array to an array of 0s and 1s for stream compaction. Elements
- * which map to 0 will be removed, and elements which map to 1 will be kept.
- */
-__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
-    // TODO
-}
+	namespace Common {
 
-/**
- * Performs scatter on an array. That is, for each element in idata,
- * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
- */
-__global__ void kernScatter(int n, int *odata,
-        const int *idata, const int *bools, const int *indices) {
-    // TODO
-}
+		/**
+		* Maps an array to an array of 0s and 1s for stream compaction. Elements
+		* which map to 0 will be removed, and elements which map to 1 will be kept.
+		*/
+		__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
+			int index = threadIdx.x + (blockDim.x * blockIdx.x);
 
-}
-}
+			if (index >= n) return;
+			bools[index] = idata[index] != 0 ? 1 : 0;
+		}
+
+		/**
+		* Performs scatter on an array. That is, for each element in idata,
+		* if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
+		*/
+		__global__ void kernScatter(int n, int *odata,
+			const int *idata, const int *bools, const int *indices) {
+			int index = threadIdx.x + (blockDim.x * blockIdx.x);
+
+			if (index >= n) return;
+			if (bools[index] == 1) odata[indices[index]] = idata[index];
+		}
+
+	}
+}