diff --git a/README.md b/README.md
index b71c458..21aefdc 100644
--- a/README.md
+++ b/README.md
@@ -3,11 +3,93 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Name: Zhan Xiong Chin
+* Tested on: Windows 7 Professional, Intel(R) Xeon(R) CPU E5-1630 v4 @ 3.70 GHz 3.70 GHz, GTX 1070 8192MB (SIG Lab)
 
-### (TODO: Your README)
+Overview
+========
+This implements a GPU-based scan (i.e. computes the prefix sums of an array) and stream compaction (i.e. 
+moves nonzero elements to the front of array). There are versions for a CPU-based scan, a naive GPU-based scan 
+(uses O(nlogn) additions) and a work-efficient GPU-based scan (uses O(n) additions).
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+Build Instructions
+==================
+[See here](https://github.com/CIS565-Fall-2016/Project0-CUDA-Getting-Started/blob/master/INSTRUCTION.md)
 
+Performance analysis
+====================
+Milliseconds to calculate prefix sums for arrays of given sizes:
+
+![](images/milliseconds.png)
+
+Log2 of nanoseconds for same data:
+
+![](images/log_nanoseconds.png)
+
+The timings above did not include the time needed to copy arrays onto device. 
+
+For small arrays, the naive algorithm appears to be the fastest, though this may be due to the inability to
+effectively time CPU execution for times smaller than 1 millisecond. The CPU algorithm still beats out the
+naive algorithm for most of the timings, suggesting that the extra logn additions needed is slowing down the execution
+of the naive algorithm significantly.
+
+The efficient algorithm is approximately 3 times faster than the naive algorithm and twice as fast as the CPU algorithm.
+This is in line with the O(n) operations needed by both of them; the GPU algorithm is making better use of its 
+multiple cores to achieve this speedup.
+
+Compared to the thrust-based implementation, all algorithms beat it for small arrays, but it is significantly faster
+than all other algorithms for large arrays. Based on the large number of registers used and the small grid size, it
+may be using the work-efficient algorithm, but with a larger base than 2 (e.g. ternary or quarternary tree). 
+
+Test output
+===========
+```
+S:\cis565\Project2-Stream-Compaction\build>Release\cis565_stream_compaction_test
+.exe
+
+****************
+** SCAN TESTS **
+****************
+    [  38  19  38  37   5  47  15  35   0  12   3   0  42 ...  10   0 ]
+==== cpu scan, power-of-two ====
+    [   0  38  57  95 132 137 184 199 234 234 246 249 249 ... 205473618 20547362
+8 ]
+==== cpu scan, non-power-of-two ====
+    [   0  38  57  95 132 137 184 199 234 234 246 249 249 ... 205473252 20547325
+5 ]
+    passed
+==== naive scan, power-of-two ====
+    [   0  38  57  95 132 137 184 199 234 234 246 249 249 ... 205473618 20547362
+8 ]
+    passed
+==== naive scan, non-power-of-two ====
+    passed
+==== work-efficient scan, power-of-two ====
+    passed
+==== work-efficient scan, non-power-of-two ====
+    passed
+==== thrust scan, power-of-two ====
+    passed
+==== thrust scan, non-power-of-two ====
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   2   3   2   1   3   1   1   1   2   0   1   0   2 ...   0   0 ]
+==== cpu compact without scan, power-of-two ====
+    [   2   3   2   1   3   1   1   1   2   1   2   1   1 ...   2   2 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+    [   2   3   2   1   3   1   1   1   2   1   2   1   1 ...   1   3 ]
+    passed
+==== cpu compact with scan ====
+    [   2   3   2   1   3   1   1   1   2   1   2   1   1 ...   2   2 ]
+    passed
+==== work-efficient compact, power-of-two ====
+    passed
+==== work-efficient compact, non-power-of-two ====
+    passed
+
+S:\cis565\Project2-Stream-Compaction\build>
+```
\ No newline at end of file
diff --git a/images/log_nanoseconds.png b/images/log_nanoseconds.png
new file mode 100644
index 0000000..974e7a0
Binary files /dev/null and b/images/log_nanoseconds.png differ
diff --git a/images/milliseconds.png b/images/milliseconds.png
new file mode 100644
index 0000000..26af0aa
Binary files /dev/null and b/images/milliseconds.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 675da35..3565085 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -6,6 +6,7 @@
  * @copyright University of Pennsylvania
  */
 
+#include <chrono>
 #include <cstdio>
 #include <stream_compaction/cpu.h>
 #include <stream_compaction/naive.h>
@@ -13,111 +14,161 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
+#define TIMING 1
+
+void comparePerformance(const int SIZE, const int RUNS) {
+	printf("Timing performance with arrays of size %d, averaged over %d runs\n", SIZE, RUNS);
+	int * a = new int[SIZE];
+	int * b = new int[SIZE];
+	genArray(SIZE, a, 2);
+
+	for (int i = 0; i < RUNS; i++) {
+		std::chrono::time_point<std::chrono::high_resolution_clock> start, end;
+		start = std::chrono::high_resolution_clock::now();
+
+		StreamCompaction::CPU::scan(SIZE, b, a);
+
+		end = std::chrono::high_resolution_clock::now();
+		std::chrono::duration<double> elapsedSeconds = end - start;
+		printf("CPU scan: %lf milliseconds\n", elapsedSeconds.count() * 1000.0f);
+
+		StreamCompaction::Naive::scan(SIZE, b, a);
+
+		StreamCompaction::Efficient::scan(SIZE, b, a);
+
+		StreamCompaction::Thrust::scan(SIZE, b, a);
+	}
+
+	delete a;
+	delete b;
+}
+
+void runTests() {
+	const int SIZE = 1 << 23;
+	const int NPOT = SIZE - 17;
+	int * a = new int[SIZE];
+	int * b = new int[SIZE];
+	int * c = new int[SIZE];
+
+	// Scan tests
+
+	printf("\n");
+	printf("****************\n");
+	printf("** SCAN TESTS **\n");
+	printf("****************\n");
+
+	genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
+	a[SIZE - 1] = 0;
+	printArray(SIZE, a, true);
+
+	zeroArray(SIZE, b);
+	printDesc("cpu scan, power-of-two");
+	StreamCompaction::CPU::scan(SIZE, b, a);
+	printArray(SIZE, b, true);
+
+	zeroArray(SIZE, c);
+	printDesc("cpu scan, non-power-of-two");
+	StreamCompaction::CPU::scan(NPOT, c, a);
+	printArray(NPOT, b, true);
+	printCmpResult(NPOT, b, c);
+
+	zeroArray(SIZE, c);
+	printDesc("naive scan, power-of-two");
+	StreamCompaction::Naive::scan(SIZE, c, a);
+	printArray(SIZE, c, true);
+	printCmpResult(SIZE, b, c);
+
+	zeroArray(SIZE, c);
+	printDesc("naive scan, non-power-of-two");
+	StreamCompaction::Naive::scan(NPOT, c, a);
+	//printArray(SIZE, c, true);
+	printCmpResult(NPOT, b, c);
+
+	zeroArray(SIZE, c);
+	printDesc("work-efficient scan, power-of-two");
+	StreamCompaction::Efficient::scan(SIZE, c, a);
+	//printArray(SIZE, c, true);
+	printCmpResult(SIZE, b, c);
+
+	zeroArray(SIZE, c);
+	printDesc("work-efficient scan, non-power-of-two");
+	StreamCompaction::Efficient::scan(NPOT, c, a);
+	//printArray(NPOT, c, true);
+	printCmpResult(NPOT, b, c);
+
+	zeroArray(SIZE, c);
+	printDesc("thrust scan, power-of-two");
+	StreamCompaction::Thrust::scan(SIZE, c, a);
+	//printArray(SIZE, c, true);
+	printCmpResult(SIZE, b, c);
+
+	zeroArray(SIZE, c);
+	printDesc("thrust scan, non-power-of-two");
+	StreamCompaction::Thrust::scan(NPOT, c, a);
+	//printArray(NPOT, c, true);
+	printCmpResult(NPOT, b, c);
+
+	printf("\n");
+	printf("*****************************\n");
+	printf("** STREAM COMPACTION TESTS **\n");
+	printf("*****************************\n");
+
+	// Compaction tests
+
+	genArray(SIZE - 1, a, 4);  // Leave a 0 at the end to test that edge case
+	a[SIZE - 1] = 0;
+	printArray(SIZE, a, true);
+
+	int count, expectedCount, expectedNPOT;
+
+	zeroArray(SIZE, b);
+	printDesc("cpu compact without scan, power-of-two");
+	count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
+	expectedCount = count;
+	printArray(count, b, true);
+	printCmpLenResult(count, expectedCount, b, b);
+
+	zeroArray(SIZE, c);
+	printDesc("cpu compact without scan, non-power-of-two");
+	count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
+	expectedNPOT = count;
+	printArray(count, c, true);
+	printCmpLenResult(count, expectedNPOT, b, c);
+
+	zeroArray(SIZE, c);
+	printDesc("cpu compact with scan");
+	count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
+	printArray(count, c, true);
+	printCmpLenResult(count, expectedCount, b, c);
+
+	zeroArray(SIZE, c);
+	printDesc("work-efficient compact, power-of-two");
+	count = StreamCompaction::Efficient::compact(SIZE, c, a);
+	//printArray(count, c, true);
+	printCmpLenResult(count, expectedCount, b, c);
+
+	zeroArray(SIZE, c);
+	printDesc("work-efficient compact, non-power-of-two");
+	count = StreamCompaction::Efficient::compact(NPOT, c, a);
+	//printArray(count, c, true);
+	printCmpLenResult(count, expectedNPOT, b, c);
+
+	delete a;
+	delete b;
+	delete c;
+}
+
+
 int main(int argc, char* argv[]) {
-    const int SIZE = 1 << 8;
-    const int NPOT = SIZE - 3;
-    int a[SIZE], b[SIZE], c[SIZE];
-
-    // Scan tests
-
-    printf("\n");
-    printf("****************\n");
-    printf("** SCAN TESTS **\n");
-    printf("****************\n");
-
-    genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
-    a[SIZE - 1] = 0;
-    printArray(SIZE, a, true);
-
-    zeroArray(SIZE, b);
-    printDesc("cpu scan, power-of-two");
-    StreamCompaction::CPU::scan(SIZE, b, a);
-    printArray(SIZE, b, true);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu scan, non-power-of-two");
-    StreamCompaction::CPU::scan(NPOT, c, a);
-    printArray(NPOT, b, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("naive scan, power-of-two");
-    StreamCompaction::Naive::scan(SIZE, c, a);
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("naive scan, non-power-of-two");
-    StreamCompaction::Naive::scan(NPOT, c, a);
-    //printArray(SIZE, c, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient scan, power-of-two");
-    StreamCompaction::Efficient::scan(SIZE, c, a);
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient scan, non-power-of-two");
-    StreamCompaction::Efficient::scan(NPOT, c, a);
-    //printArray(NPOT, c, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("thrust scan, power-of-two");
-    StreamCompaction::Thrust::scan(SIZE, c, a);
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("thrust scan, non-power-of-two");
-    StreamCompaction::Thrust::scan(NPOT, c, a);
-    //printArray(NPOT, c, true);
-    printCmpResult(NPOT, b, c);
-
-    printf("\n");
-    printf("*****************************\n");
-    printf("** STREAM COMPACTION TESTS **\n");
-    printf("*****************************\n");
-
-    // Compaction tests
-
-    genArray(SIZE - 1, a, 4);  // Leave a 0 at the end to test that edge case
-    a[SIZE - 1] = 0;
-    printArray(SIZE, a, true);
-
-    int count, expectedCount, expectedNPOT;
-
-    zeroArray(SIZE, b);
-    printDesc("cpu compact without scan, power-of-two");
-    count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
-    expectedCount = count;
-    printArray(count, b, true);
-    printCmpLenResult(count, expectedCount, b, b);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu compact without scan, non-power-of-two");
-    count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
-    expectedNPOT = count;
-    printArray(count, c, true);
-    printCmpLenResult(count, expectedNPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu compact with scan");
-    count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
-    printArray(count, c, true);
-    printCmpLenResult(count, expectedCount, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient compact, power-of-two");
-    count = StreamCompaction::Efficient::compact(SIZE, c, a);
-    //printArray(count, c, true);
-    printCmpLenResult(count, expectedCount, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient compact, non-power-of-two");
-    count = StreamCompaction::Efficient::compact(NPOT, c, a);
-    //printArray(count, c, true);
-    printCmpLenResult(count, expectedNPOT, b, c);
+#if TIMING == 1
+	comparePerformance(1 << 10, 3);
+	comparePerformance(1 << 13, 3);
+	comparePerformance(1 << 16, 3);
+	comparePerformance(1 << 19, 3);
+	comparePerformance(1 << 22, 3);
+	comparePerformance(1 << 25, 3);
+	comparePerformance(1 << 28, 3);
+#else
+	runTests();
+#endif
 }
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index cdbef77..c8709e7 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -13,5 +13,5 @@ set(SOURCE_FILES
 
 cuda_add_library(stream_compaction
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_50
     )
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index fe872d4..2a95413 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -1,3 +1,5 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
 #include "common.h"
 
 void checkCUDAErrorFn(const char *msg, const char *file, int line) {
@@ -23,7 +25,11 @@ namespace Common {
  * which map to 0 will be removed, and elements which map to 1 will be kept.
  */
 __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
-    // TODO
+	int index = threadIdx.x + (blockIdx.x * blockDim.x);
+	if (index >= n) {
+		return;
+	}
+	bools[index] = (idata[index] == 0) ? 0 : 1;
 }
 
 /**
@@ -32,8 +38,14 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
  */
 __global__ void kernScatter(int n, int *odata,
         const int *idata, const int *bools, const int *indices) {
-    // TODO
+	int index = threadIdx.x + (blockIdx.x * blockDim.x);
+	if (index >= n) {
+		return;
+	}
+	if (bools[index] != 0) {
+		odata[indices[index]] = idata[index];
+	}
 }
-
 }
+
 }
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index 4f52663..79cfb56 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -7,6 +7,8 @@
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
 
+#define TIMING 1
+
 /**
  * Check for CUDA errors; print and exit if there was a problem.
  */
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index e600c29..102b403 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -1,4 +1,5 @@
 #include <cstdio>
+#include <cstdlib>
 #include "cpu.h"
 
 namespace StreamCompaction {
@@ -8,8 +9,11 @@ namespace CPU {
  * CPU scan (prefix sum).
  */
 void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
+	int sum = 0;
+	for (int i = 0; i < n; i++) {
+		odata[i] = sum;
+		sum += idata[i];
+	}
 }
 
 /**
@@ -18,8 +22,20 @@ void scan(int n, int *odata, const int *idata) {
  * @returns the number of elements remaining after compaction.
  */
 int compactWithoutScan(int n, int *odata, const int *idata) {
-    // TODO
-    return -1;
+	int nonZeroCount = 0;
+	for (int i = 0; i < n; i++) {
+		if (idata[i] != 0) {
+			odata[nonZeroCount] = idata[i];
+			nonZeroCount++;
+		}
+	}
+	return nonZeroCount;
+}
+
+void scatter(int n, int * odata, const int * idata, const int * scatterTargets) {
+	for (int i = 0; i < n; i++) {
+		odata[scatterTargets[i]] = idata[i];
+	}
 }
 
 /**
@@ -28,8 +44,17 @@ int compactWithoutScan(int n, int *odata, const int *idata) {
  * @returns the number of elements remaining after compaction.
  */
 int compactWithScan(int n, int *odata, const int *idata) {
-    // TODO
-    return -1;
+	int * nonZeroElements = (int *)malloc(n * sizeof(int));
+	int * scanCounts = (int *)malloc(n * sizeof(int));
+	for (int i = 0; i < n; i++) {
+		nonZeroElements[i] = (idata[i] == 0) ? 0 : 1;
+	}
+	scan(n, scanCounts, nonZeroElements);
+	scatter(n, odata, idata, scanCounts);
+	int remainingCount = nonZeroElements[n - 1] + scanCounts[n - 1];
+	free(nonZeroElements);
+	free(scanCounts);
+	return remainingCount;
 }
 
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index b2f739b..7c67a6e 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -3,17 +3,72 @@
 #include "common.h"
 #include "efficient.h"
 
+#define blockSize 128
+
 namespace StreamCompaction {
 namespace Efficient {
 
-// TODO: __global__
+__global__ void kernScanUpsweep(int n, int d, int * data) {
+	int index = threadIdx.x + (blockIdx.x * blockDim.x);
+	if (index >= (n >> d)) {
+		return;
+	}
+	int k = index << d;
+	data[k + (1 << d) - 1] += data[k + (1 << (d - 1)) - 1];
+}
+
+__global__ void kernScanDownsweep(int n, int d, int * data) {
+	int index = threadIdx.x + (blockIdx.x * blockDim.x);
+	if (index >= (n >> d)) {
+		return;
+	}
+	int k = index << d;
+	int t = data[k + (1 << d) - 1];
+	data[k + (1 << d) - 1] += data[k + (1 << (d - 1)) - 1];
+	data[k + (1 << (d - 1)) - 1] = t;
+}
 
 /**
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
  */
 void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
+	int * dev_data;
+	int logCeil = ilog2ceil(n);
+	int nCeil = 1 << logCeil;
+
+	cudaMalloc((void**)&dev_data, nCeil * sizeof(int));
+	cudaMemset((void*)dev_data, 0, nCeil * sizeof(int));
+	cudaMemcpy((void*)dev_data, (void*)idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+#if TIMING == 1
+	cudaEvent_t start, stop;
+	cudaEventCreate(&start);
+	cudaEventCreate(&stop);
+	cudaEventRecord(start);
+#endif
+
+	for (int i = 1; i <= logCeil; i++) {
+		int gridSize = ((nCeil >> i) + blockSize - 1) / blockSize;
+		kernScanUpsweep << <gridSize, blockSize >> >(nCeil, i, dev_data);
+	}
+
+	cudaMemset((void*)&dev_data[nCeil - 1], 0, sizeof(int));
+
+	for (int i = logCeil; i >= 1; i--) {
+		int gridSize = ((nCeil >> i) + blockSize - 1) / blockSize;
+		kernScanDownsweep << <gridSize, blockSize >> >(nCeil, i, dev_data);
+	}
+
+#if TIMING == 1
+	cudaEventRecord(stop);
+	cudaEventSynchronize(stop);
+	float milliseconds = 0;
+	cudaEventElapsedTime(&milliseconds, start, stop);
+	printf("Efficient scan: %f milliseconds\n", milliseconds);
+#endif
+
+	cudaMemcpy((void*)odata, (void*)dev_data, n * sizeof(int), cudaMemcpyDeviceToHost);
+	cudaFree(dev_data);
 }
 
 /**
@@ -26,8 +81,37 @@ void scan(int n, int *odata, const int *idata) {
  * @returns      The number of elements remaining after compaction.
  */
 int compact(int n, int *odata, const int *idata) {
-    // TODO
-    return -1;
+	int * dev_bools;
+	int * dev_idata;
+	int * dev_odata;
+	int * dev_indices;
+	cudaMalloc((void**)&dev_bools, n * sizeof(int));
+	cudaMalloc((void**)&dev_indices, n * sizeof(int));
+	cudaMalloc((void**)&dev_idata, n * sizeof(int));
+	cudaMalloc((void**)&dev_odata, n * sizeof(int));
+
+	// Map to booleans
+	cudaMemcpy((void*)dev_idata, (void*)idata, n * sizeof(int), cudaMemcpyHostToDevice);
+	StreamCompaction::Common::kernMapToBoolean << <n + blockSize - 1, blockSize >> >(n, dev_bools, dev_idata);
+	int * temp = (int *)malloc(n * sizeof(int));
+	cudaMemcpy((void*)temp, (void*)dev_bools, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+	// Do exclusive scan
+	scan(n, temp, temp);
+	int compactedCount = temp[n - 1] + ((idata[n - 1] == 0) ? 0 : 1);
+
+	// Scatter
+	cudaMemcpy((void*)dev_indices, (void*)temp, n * sizeof(int), cudaMemcpyHostToDevice);
+	StreamCompaction::Common::kernScatter << <n + blockSize - 1, blockSize >> >(n, dev_odata, dev_idata, dev_bools, dev_indices);
+	cudaMemcpy((void*)odata, (void*)dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+	free(temp);
+	cudaFree(dev_bools);
+	cudaFree(dev_idata);
+	cudaFree(dev_odata);
+	cudaFree(dev_indices);
+
+    return compactedCount;
 }
 
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 3d86b60..0a85b9d 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -3,17 +3,69 @@
 #include "common.h"
 #include "naive.h"
 
+#define blockSize 128
+
 namespace StreamCompaction {
 namespace Naive {
 
-// TODO: __global__
+__global__ void kernNaiveScan(int n, int round, int * odata, int * idata) {
+	int index = threadIdx.x + (blockIdx.x * blockDim.x);
+	if (index >= n) {
+		return;
+	}
+	odata[index] = (
+		(index < (1 << (round - 1))) 
+			? 0 
+			: idata[index - (1 << (round - 1))]
+		) + idata[index];
+}
+
+__global__ void kernInclusiveToExclusiveScan(int n, int * odata, int * idata) {
+	int index = threadIdx.x + (blockIdx.x * blockDim.x);
+	if (index >= n) {
+		return;
+	}
+	odata[index] = (index == 0 ) ? 0 : idata[index - 1];
+}
 
 /**
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
  */
 void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
+	dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+	int * dev_data;
+	int * dev_data2;
+	cudaMalloc((void**)&dev_data, n * sizeof(int));
+	cudaMalloc((void**)&dev_data2, n * sizeof(int));
+	cudaMemcpy((void*)dev_data, (void*)idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+#if TIMING == 1
+	cudaEvent_t start, stop;
+	cudaEventCreate(&start);
+	cudaEventCreate(&stop);
+	cudaEventRecord(start);
+#endif
+
+	for (int i = 1; i <= ilog2ceil(n); i++) {
+		kernNaiveScan << <fullBlocksPerGrid, blockSize >> >(n, i, dev_data2, dev_data);
+		int * tempPtr = dev_data;
+		dev_data = dev_data2;
+		dev_data2 = tempPtr;
+	}
+	kernInclusiveToExclusiveScan << <fullBlocksPerGrid, blockSize >> >(n, dev_data2, dev_data);
+
+#if TIMING == 1
+	cudaEventRecord(stop);
+	cudaEventSynchronize(stop);
+	float milliseconds = 0;
+	cudaEventElapsedTime(&milliseconds, start, stop);
+	printf("Naive scan: %f milliseconds\n", milliseconds);
+#endif
+
+	cudaMemcpy((void*)odata, (void*)dev_data2, n * sizeof(int), cudaMemcpyDeviceToHost);
+	cudaFree(dev_data);
+	cudaFree(dev_data2);
 }
 
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index d8dbb32..3e195da 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -13,9 +13,27 @@ namespace Thrust {
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
  */
 void scan(int n, int *odata, const int *idata) {
-    // TODO use `thrust::exclusive_scan`
-    // example: for device_vectors dv_in and dv_out:
-    // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+	thrust::device_vector<int> dev_thrust_idata(idata, idata + n);
+	thrust::device_vector<int> dev_thrust_odata(n);
+
+#if TIMING == 1
+	cudaEvent_t start, stop;
+	cudaEventCreate(&start);
+	cudaEventCreate(&stop);
+	cudaEventRecord(start);
+#endif
+
+	thrust::exclusive_scan(dev_thrust_idata.begin(), dev_thrust_idata.end(), dev_thrust_odata.begin());
+
+#if TIMING == 1
+	cudaEventRecord(stop);
+	cudaEventSynchronize(stop);
+	float milliseconds = 0;
+	cudaEventElapsedTime(&milliseconds, start, stop);
+	printf("Thrust scan: %f milliseconds\n", milliseconds);
+#endif
+
+	thrust::copy(dev_thrust_odata.begin(), dev_thrust_odata.end(), odata);
 }
 
 }