diff --git a/README.md b/README.md index b71c458..1e15a96 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,24 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Alexander Perry +* Tested on: Windows 10, i5-2410M @ 2.30GHz 8GB, NVS 4200M -### (TODO: Your README) +### Analysis +![](./img/power_of_two_release.PNG) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +![](./img/non_power_of_two_release.PNG) +The most obvious thing to note is that longer arrays take longer to process. +Beyond that we can see that algorithms that are supposed to be more efficient are generally less efficient as implemented. +The CPU algorithm is the fastest algorithm, followed by the thrust implementation and then by the naive GPU solution, with the work-efficient GPU solution being the slowest. + +I was able to get occupancy of the GPU for all of my algorithms up to 100\%. There are a maximum of 8 blocks and 1536 threads on my GPU. This leads to an optimum number of threads per block of 192. This is also only 6 warps per block and 48 warps total. + +Based on the timeline, there are many back and forth copies from the GPU to main memory in the thrust implementation. +Even though the thrust implementation uses all of these copies, it is still faster than my GPU attempts. I suspect this is due to memory accesses. + +![](./img/power_of_two.PNG) + +The previous graph is based on a debug build. +An interesting thing to note is that the thrust implementation is less optimal in a debug setting compared to the release build. 
diff --git a/img/non_power_of_two.PNG b/img/non_power_of_two.PNG new file mode 100644 index 0000000..b5e37dd Binary files /dev/null and b/img/non_power_of_two.PNG differ diff --git a/img/non_power_of_two_release.PNG b/img/non_power_of_two_release.PNG new file mode 100644 index 0000000..58a069b Binary files /dev/null and b/img/non_power_of_two_release.PNG differ diff --git a/img/power_of_two.PNG b/img/power_of_two.PNG new file mode 100644 index 0000000..d24e784 Binary files /dev/null and b/img/power_of_two.PNG differ diff --git a/img/power_of_two_release.PNG b/img/power_of_two_release.PNG new file mode 100644 index 0000000..cc4166b Binary files /dev/null and b/img/power_of_two_release.PNG differ diff --git a/src/main.cpp b/src/main.cpp index 7305641..eaa6e44 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,130 +14,131 @@ #include "testing_helpers.hpp" const int SIZE = 1 << 8; // feel free to change the size of array +//const int SIZE = 1 << 3; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int a[SIZE], b[SIZE], c[SIZE]; int main(int argc, char* argv[]) { - // Scan tests - - printf("\n"); - printf("****************\n"); - printf("** SCAN TESTS **\n"); - printf("****************\n"); - - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - // initialize b using StreamCompaction::CPU::scan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. - // At first all cases passed because b && c are all zeroes. 
- zeroArray(SIZE, b); - printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); - - zeroArray(SIZE, c); - printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, b, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, 
c); - - zeroArray(SIZE, c); - printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - printf("\n"); - printf("*****************************\n"); - printf("** STREAM COMPACTION TESTS **\n"); - printf("*****************************\n"); - - // Compaction tests - - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - int count, expectedCount, expectedNPOT; - - // initialize b using StreamCompaction::CPU::compactWithoutScan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. - zeroArray(SIZE, b); - printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedCount = count; - printArray(count, b, true); - printCmpLenResult(count, expectedCount, b, b); - - zeroArray(SIZE, c); - printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedNPOT = count; - printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - zeroArray(SIZE, c); - printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, power-of-two"); - count = 
StreamCompaction::Efficient::compact(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - system("pause"); // stop Win32 console from closing on exit + // Scan tests + + printf("\n"); + printf("****************\n"); + printf("** SCAN TESTS **\n"); + printf("****************\n"); + + genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + printArray(SIZE, a, true); + + // initialize b using StreamCompaction::CPU::scan you implement + // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. + // At first all cases passed because b && c are all zeroes. 
+ zeroArray(SIZE, b); + printDesc("cpu scan, power-of-two"); + StreamCompaction::CPU::scan(SIZE, b, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(SIZE, b, true); + + zeroArray(SIZE, c); + printDesc("cpu scan, non-power-of-two"); + StreamCompaction::CPU::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(NPOT, b, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("naive scan, power-of-two"); + StreamCompaction::Naive::scan(SIZE, c, a); + printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("naive scan, non-power-of-two"); + StreamCompaction::Naive::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan, power-of-two"); + StreamCompaction::Efficient::scan(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan, non-power-of-two"); + StreamCompaction::Efficient::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("thrust scan, power-of-two"); + StreamCompaction::Thrust::scan(SIZE, c, a); + printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, 
c); + + zeroArray(SIZE, c); + printDesc("thrust scan, non-power-of-two"); + StreamCompaction::Thrust::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + printf("\n"); + printf("*****************************\n"); + printf("** STREAM COMPACTION TESTS **\n"); + printf("*****************************\n"); + + // Compaction tests + + genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + printArray(SIZE, a, true); + + int count, expectedCount, expectedNPOT; + + // initialize b using StreamCompaction::CPU::compactWithoutScan you implement + // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. + zeroArray(SIZE, b); + printDesc("cpu compact without scan, power-of-two"); + count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + expectedCount = count; + printArray(count, b, true); + printCmpLenResult(count, expectedCount, b, b); + + zeroArray(SIZE, c); + printDesc("cpu compact without scan, non-power-of-two"); + count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + expectedNPOT = count; + printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + + zeroArray(SIZE, c); + printDesc("cpu compact with scan"); + count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient compact, power-of-two"); + count = 
StreamCompaction::Efficient::compact(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient compact, non-power-of-two"); + count = StreamCompaction::Efficient::compact(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + + system("pause"); // stop Win32 console from closing on exit } diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 8fc0211..6da332f 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,39 +1,47 @@ #include "common.h" void checkCUDAErrorFn(const char *msg, const char *file, int line) { - cudaError_t err = cudaGetLastError(); - if (cudaSuccess == err) { - return; - } + cudaError_t err = cudaGetLastError(); + if (cudaSuccess == err) { + return; + } - fprintf(stderr, "CUDA error"); - if (file) { - fprintf(stderr, " (%s:%d)", file, line); - } - fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err)); - exit(EXIT_FAILURE); + fprintf(stderr, "CUDA error"); + if (file) { + fprintf(stderr, " (%s:%d)", file, line); + } + fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err)); + exit(EXIT_FAILURE); } namespace StreamCompaction { - namespace Common { +namespace Common { - /** - * Maps an array to an array of 0s and 1s for stream compaction. Elements - * which map to 0 will be removed, and elements which map to 1 will be kept. - */ - __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO - } - - /** - * Performs scatter on an array. That is, for each element in idata, - * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. 
- */ - __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices) { - // TODO - } +/** + * Maps an array to an array of 0s and 1s for stream compaction. Elements + * which map to 0 will be removed, and elements which map to 1 will be kept. + */ +__global__ void kernMapToBoolean(int n, int *bools, const int *idata) { + int index = threadIdx.x + blockDim.x * blockIdx.x; + if (index < n) { + bools[index] = idata[index] == 0 ? 0 : 1; + } +} +/** + * Performs scatter on an array. That is, for each element in idata, + * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. + */ +__global__ void kernScatter(int n, int *odata, + const int *idata, const int *bools, const int *indices) { + int index = threadIdx.x + blockDim.x * blockIdx.x; + if (index < n) { + if (bools[index]) { + odata[indices[index]] = idata[index]; } + } +} + +} } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 55f1b38..30d7c28 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -1,5 +1,5 @@ -#pragma once - +#pragma once + #include #include @@ -7,36 +7,36 @@ #include #include #include -#include +#include #include - -#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) -#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) - -/** - * Check for CUDA errors; print and exit if there was a problem. - */ -void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1); - -inline int ilog2(int x) { - int lg = 0; - while (x >>= 1) { - ++lg; - } - return lg; -} - -inline int ilog2ceil(int x) { - return ilog2(x - 1) + 1; -} - -namespace StreamCompaction { - namespace Common { - __global__ void kernMapToBoolean(int n, int *bools, const int *idata); - - __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices); - + +#define FILENAME (strrchr(__FILE__, '/') ? 
strrchr(__FILE__, '/') + 1 : __FILE__) +#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) + +/** + * Check for CUDA errors; print and exit if there was a problem. + */ +void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1); + +inline int ilog2(int x) { + int lg = 0; + while (x >>= 1) { + ++lg; + } + return lg; +} + +inline int ilog2ceil(int x) { + return ilog2(x - 1) + 1; +} + +namespace StreamCompaction { + namespace Common { + __global__ void kernMapToBoolean(int n, int *bools, const int *idata); + + __global__ void kernScatter(int n, int *odata, + const int *idata, const int *bools, const int *indices); + /** * This class is used for timing the performance * Uncopyable and unmovable @@ -127,6 +127,6 @@ namespace StreamCompaction { float prev_elapsed_time_cpu_milliseconds = 0.f; float prev_elapsed_time_gpu_milliseconds = 0.f; - }; - } -} + }; + } +} diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 05ce667..bfac3c0 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,50 +1,86 @@ -#include -#include "cpu.h" - +#include +#include "cpu.h" + #include "common.h" - -namespace StreamCompaction { - namespace CPU { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } - - /** - * CPU scan (prefix sum). - * For performance analysis, this is supposed to be a simple for loop. - * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. - */ - void scan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - } - - /** - * CPU stream compaction without using the scan function. - * - * @returns the number of elements remaining after compaction. 
- */ - int compactWithoutScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - return -1; - } - - /** - * CPU stream compaction using scan and scatter, like the parallel version. - * - * @returns the number of elements remaining after compaction. - */ - int compactWithScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - return -1; - } - } -} + +namespace StreamCompaction { +namespace CPU { + +using StreamCompaction::Common::PerformanceTimer; + +PerformanceTimer& timer() +{ + static PerformanceTimer timer; + return timer; +} + +inline void scan_impl(int n, int *odata, const int *idata) { + int sum = 0; + for (int i = 0; i < n; ++i) { + *odata++ = sum; + sum += *idata++; + } +} + +/** + * CPU scan (prefix sum). + * For performance analysis, this is supposed to be a simple for loop. + * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. + */ +void scan(int n, int *odata, const int *idata) { + timer().startCpuTimer(); + scan_impl(n, odata, idata); + timer().endCpuTimer(); +} + +/** + * CPU stream compaction without using the scan function. + * + * @returns the number of elements remaining after compaction. + */ +int compactWithoutScan(int n, int *odata, const int *idata) { + timer().startCpuTimer(); + int count = 0; + for (int i = 0; i < n; ++i) { + int d = *idata++; + if (d) { + *odata++ = d; + ++count; + } + } + timer().endCpuTimer(); + return count; +} + +void scatter(int n, int *output, const int *map, const int *input) { + for (int i = 0; i < n; ++i) { + output[map[i]] = input[i]; + } +} + +/** + * CPU stream compaction using scan and scatter, like the parallel version. + * + * @returns the number of elements remaining after compaction. 
+ */ +int compactWithScan(int n, int *odata, const int *idata) { + timer().startCpuTimer(); + int *temp = (int*)malloc(sizeof(int)*n); + int *temp2 = (int*)malloc(sizeof(int)*n); + int count = 0; + for (int i = 0; i < n; ++i) { + if (idata[i]) { + temp[i] = 1; + ++count; + } else { + temp[i] = 0; + } + } + scan_impl(n, temp2, temp); + scatter(n, odata, temp2, idata); + timer().endCpuTimer(); + return count; +} + +} // namespace CPU +} // namespace StreamCompaction diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 36c5ef2..7d07494 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -1,40 +1,123 @@ -#include -#include -#include "common.h" -#include "efficient.h" - -namespace StreamCompaction { - namespace Efficient { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } - - /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - } - - /** - * Performs stream compaction on idata, storing the result into odata. - * All zeroes are discarded. - * - * @param n The number of elements in idata. - * @param odata The array into which to store elements. - * @param idata The array of elements to compact. - * @returns The number of elements remaining after compaction. 
- */ - int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - return -1; - } - } -} +#include +#include +#include "common.h" +#include "efficient.h" +#include "iostream" + +#define BLOCK_SIZE 128 + +namespace StreamCompaction { +namespace Efficient { + +using StreamCompaction::Common::PerformanceTimer; + +PerformanceTimer& timer() +{ + static PerformanceTimer timer; + return timer; +} + +__global__ void UpSweep(int n, int offset, int *data) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + int first_index = offset*(index)*2 + offset - 1; + int end_index = first_index + offset; + if (end_index < n) { + data[end_index] += data[first_index]; + } +} + +__global__ void set_zero(int n, int *data) { + data[n - 1] = 0; +} + +__global__ void clear_array(int n, int*data) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index < n) { + data[index] = 0; + } +} + +__global__ void DownSweep(int n, int offset, int *data) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index < n) { + //int left_index = index + (offset >> 1) - 1; + //int right_index = index + offset - 1; + int left_index = offset*index*2 + offset - 1; + int right_index = left_index + offset; + if (left_index < n && right_index < n) { + int temp = data[left_index]; + data[left_index] = data[right_index]; + data[right_index] += temp; + } + } +} + +void scan_impl(int arr_length, int *dev_array) { + dim3 fullBlocksPerGrid((arr_length + BLOCK_SIZE - 1) / BLOCK_SIZE); + for (int offset = 1; offset < arr_length; offset *= 2) { + UpSweep<<>>(arr_length, offset, dev_array); + } + set_zero<<>>(arr_length, dev_array); + for(int offset = arr_length/2; offset >= 1; offset = offset >> 1) { + DownSweep<<>>(arr_length, offset, dev_array); + } +} + +/** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
+ */ +void scan(int n, int *odata, const int *idata) { + dim3 fullBlocksPerGrid((n + BLOCK_SIZE - 1) / BLOCK_SIZE); + int arr_length = (1 << ilog2ceil(n)); + int *dev_array; + cudaMalloc((void**)&dev_array, arr_length * sizeof(int)); + clear_array<<>>(arr_length, dev_array); + cudaMemcpy(dev_array, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + timer().startGpuTimer(); + scan_impl(arr_length, dev_array); + timer().endGpuTimer(); + cudaMemcpy(odata, dev_array, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaFree(dev_array); +} + +/** + * Performs stream compaction on idata, storing the result into odata. + * All zeroes are discarded. + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to compact. + * @returns The number of elements remaining after compaction. + */ +int compact(int n, int *odata, const int *idata) { + dim3 fullBlocksPerGrid((n + BLOCK_SIZE - 1) / BLOCK_SIZE); + int arr_length = (1 << ilog2ceil(n)); + int *dev_array_data; + int *dev_array_bool; + int *dev_array_indices; + int *dev_array_out; + + cudaMalloc((void**)&dev_array_data, arr_length * sizeof(int)); + cudaMalloc((void**)&dev_array_bool, arr_length * sizeof(int)); + cudaMalloc((void**)&dev_array_indices, arr_length * sizeof(int)); + cudaMalloc((void**)&dev_array_out, arr_length * sizeof(int)); + + clear_array<<>>(n, dev_array_data); + cudaMemcpy(dev_array_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); + Common::kernMapToBoolean<<>>(n, dev_array_bool, dev_array_data); + cudaMemcpy(dev_array_indices, dev_array_bool, arr_length * sizeof(int), cudaMemcpyDeviceToDevice); + scan_impl(arr_length, dev_array_indices); + Common::kernScatter<<>>(n, dev_array_out, dev_array_data, dev_array_bool, dev_array_indices); + timer().endGpuTimer(); + cudaMemcpy(odata, dev_array_out, n*sizeof(int), cudaMemcpyDeviceToHost); + int ret; + int ret2; + cudaMemcpy(&ret, dev_array_bool + n 
- 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&ret2, dev_array_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + return ret + ret2; +} + +} +} diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 9218f8e..d0d8ba5 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -1,25 +1,71 @@ -#include -#include -#include "common.h" -#include "naive.h" - -namespace StreamCompaction { - namespace Naive { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } - // TODO: __global__ - - /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - } - } -} +#include +#include +#include "common.h" +#include "naive.h" +#include + +#define BLOCK_SIZE 128 + +namespace StreamCompaction { +namespace Naive { + +using StreamCompaction::Common::PerformanceTimer; + +PerformanceTimer& timer() +{ + static PerformanceTimer timer; + return timer; +} +// TODO: __global__ +__global__ void naive_scan_impl(int n, int offset, int *odata, const int *idata) { + int index = blockIdx.x*blockDim.x + threadIdx.x; + int out_index = index - offset; + if (out_index < 0) { + odata[index] = idata[index]; + } else if (index < n){ + odata[index] = idata[index] + idata[out_index]; + } +} + +__global__ void shift_impl(int n, int *odata, const int *idata) { + int index = blockIdx.x*blockDim.x + threadIdx.x; + if (index == 0) { + odata[0] = 0; + } + if (index < n - 1) { + odata[index + 1] = idata[index]; + } +} + +/** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
+ */ +void scan(int n, int *odata, const int *idata) { + dim3 fullBlocksPerGrid((n + BLOCK_SIZE - 1) / BLOCK_SIZE); + int *dev_array_A; + int *dev_array_B; + cudaMalloc((void**)&dev_array_A, n * sizeof(int)); + cudaMalloc((void**)&dev_array_B, n * sizeof(int)); + cudaMemcpy(dev_array_B, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + bool direction = false; + + timer().startGpuTimer(); + shift_impl<<>>(n, dev_array_A, dev_array_B); + cudaMemcpy(dev_array_B, dev_array_A, sizeof(int), cudaMemcpyDeviceToDevice); + + for (int offset = 1; offset < n; offset *= 2) { + if (direction) { + naive_scan_impl<<>>(n, offset, dev_array_A, dev_array_B); + } else { + naive_scan_impl<<>>(n, offset, dev_array_B, dev_array_A); + } + direction = !direction; + } + timer().endGpuTimer(); + cudaMemcpy(odata, (!direction ? dev_array_A : dev_array_B), sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaFree(dev_array_A); + cudaFree(dev_array_B); +} + +} // namespace Naive +} // namespace StreamCompaction diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 36b732d..22feded 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -1,28 +1,34 @@ -#include -#include -#include -#include -#include -#include "common.h" -#include "thrust.h" - -namespace StreamCompaction { - namespace Thrust { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } - /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
- */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); - timer().endGpuTimer(); - } - } -} +#include +#include +#include +#include +#include +#include +#include "common.h" +#include "thrust.h" + +namespace StreamCompaction { +namespace Thrust { + +using StreamCompaction::Common::PerformanceTimer; + +PerformanceTimer& timer() +{ + static PerformanceTimer timer; + return timer; +} + +/** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ +void scan(int n, int *odata, const int *idata) { + thrust::host_vector hv_in(idata, idata + n); + thrust::device_vector dv_in = hv_in; + timer().startGpuTimer(); + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_in.begin()); + timer().endGpuTimer(); + thrust::copy(dv_in.begin(), dv_in.end(), odata); +} + +} // namespace Thrust +} // namespace StreamCompaction