diff --git a/README.md b/README.md
index b71c458..fa99094 100644
--- a/README.md
+++ b/README.md
@@ -3,11 +3,94 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Daniel Krupka
+* Tested on: Debian testing (stretch), Intel(R) Core(TM) i7-4710HQ CPU @ 2.50GHz 8GB, GTX 850M
 
-### (TODO: Your README)
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+# Project 2 - Stream Compaction
+This project's goal was to compare various methods for achieving [stream compaction](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html).
+The test program was modified to take block size and array size as arguments, and ran tests for each algorithm on both
+power-of-two and non-power-of-two data. Test output was the following:
+```
+****************
+** SCAN TESTS **
+****************
+    [  33  36  27  15  43  35  36  42  49  21  12  27  40 ...   6   0 ]
+==== cpu scan, power-of-two ====
+    [   0  33  69  96 111 154 189 225 267 316 337 349 376 ... 12852633 12852639 ]
+==== cpu scan, non-power-of-two ====
+    [   0  33  69  96 111 154 189 225 267 316 337 349 376 ... 12852608 12852617 ]
+    passed
+==== naive scan, power-of-two ====
+    passed
+==== naive scan, non-power-of-two ====
+    passed
+==== work-efficient scan, power-of-two ====
+    passed
+==== work-efficient scan, non-power-of-two ====
+    passed
+==== real work-efficient scan, power-of-two ====
+    passed
+==== real work-efficient scan, non-power-of-two ====
+    passed
+==== thrust scan, power-of-two ====
+    passed
+==== thrust scan, non-power-of-two ====
+    passed
 
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   3   2   1   3   1   3   2   0   1   1   2   3   2 ...   0   0 ]
+==== cpu compact without scan, power-of-two ====
+    [   3   2   1   3   1   3   2   1   1   2   3   2   3 ...   1   3 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+    [   3   2   1   3   1   3   2   1   1   2   3   2   3 ...   3   1 ]
+    passed
+==== cpu compact with scan ====
+    passed
+==== work-efficient compact, power-of-two ====
+    passed
+==== work-efficient compact, non-power-of-two ====
+    passed
+==== real work-efficient compact, power-of-two ====
+    passed
+==== real work-efficient compact, non-power-of-two ====
+    passed
+```
+
+# Analysis - Scanning
+A major step of compaction is scanning. I tested a CPU implementation, a naive CUDA
+implementation, two efficient CUDA implementations, and the Thrust library's implementation.
+
+![Scan Comparison 1](images/times_blk256.png "Scan Comparison 1")
+
+Interestingly, the Thrust implementation fared the worst, though Nvidia's NSight profiler
+showed that Thrust was not actually using much GPU time. A likely explanation is that Thrust
+may be shuffling data or partitioning the work between CPU and GPU.
+
+![Scan Comparison 2, no thrust](images/times_blk256_nothrust.png "Scan Comparison 2, no thrust")
+![Scan Comparison 2, no thrust](images/times_blk256_nothrust_zoom.png "Scan Comparison 2, no thrust")
+
+Looking at the other implementations on their own shows that the GPU implementations are substantially
+faster than the CPU for large workloads, but somewhat slower on small ones. This makes sense, as for small loads,
+the GPU is nowhere near fully saturated.
+
+# Analysis - Compaction
+Moving on to compaction, the CPU fares even worse.
+![Compaction Comparison](images/times_all_comp.png "Compaction Comparison")
+
+Focusing on only the GPU implementations, we see that the more optimized version
+begins to perform noticeably better, where the two were mostly indistinguishable
+for simple scanning.
+![Compaction Comparison](images/times_all_comp_zoom.png "Compaction Comparison")
+
+For the non-optimized GPU reduction, block size had a substantial effect, with
+a 1024 thread block performing 1.3x faster than with 128 threads.
+![Block Comparison, unoptimized](images/times_blk_eff.png "Block Comparison, unoptimized")
+
+The optimized GPU reduction showed much less variance with block size. This likely due to
+the active threads no longer being scattered between different warps, allowing for more early
+termination and lowering the number of active warps and thus blocks.
+![Block Comparison, optimized](images/times_blk_realeff.png "Block Comparison, optimized")
diff --git a/cis565_stream_compaction_test.launch b/cis565_stream_compaction_test.launch
index 4267429..07b70cd 100644
--- a/cis565_stream_compaction_test.launch
+++ b/cis565_stream_compaction_test.launch
@@ -8,8 +8,8 @@
 <stringAttribute key="org.eclipse.cdt.launch.DEBUGGER_START_MODE" value="run"/>
 <stringAttribute key="org.eclipse.cdt.launch.PROGRAM_NAME" value="build/cis565_stream_compaction_test"/>
 <stringAttribute key="org.eclipse.cdt.launch.PROJECT_ATTR" value="Project2-Stream-Compaction"/>
-<booleanAttribute key="org.eclipse.cdt.launch.PROJECT_BUILD_CONFIG_AUTO_ATTR" value="true"/>
-<stringAttribute key="org.eclipse.cdt.launch.PROJECT_BUILD_CONFIG_ID_ATTR" value=""/>
+<booleanAttribute key="org.eclipse.cdt.launch.PROJECT_BUILD_CONFIG_AUTO_ATTR" value="false"/>
+<stringAttribute key="org.eclipse.cdt.launch.PROJECT_BUILD_CONFIG_ID_ATTR" value="com.nvidia.cuda.ide.toolchain.base.1399573849.1760580076"/>
 <booleanAttribute key="org.eclipse.cdt.launch.use_terminal" value="true"/>
 <listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_PATHS">
 <listEntry value="/Project2-Stream-Compaction"/>
@@ -18,8 +18,8 @@
 <listEntry value="4"/>
 </listAttribute>
 <listAttribute key="org.eclipse.debug.ui.favoriteGroups">
-<listEntry value="org.eclipse.debug.ui.launchGroup.profile"/>
 <listEntry value="org.eclipse.debug.ui.launchGroup.debug"/>
+<listEntry value="org.eclipse.debug.ui.launchGroup.profile"/>
 <listEntry value="org.eclipse.debug.ui.launchGroup.run"/>
 </listAttribute>
 <stringAttribute key="org.eclipse.dsf.launch.MEMORY_BLOCKS" value="&lt;?xml version=&quot;1.0&quot; encoding=&quot;UTF-8&quot; standalone=&quot;no&quot;?&gt;&#10;&lt;memoryBlockExpressionList context=&quot;reserved-for-future-use&quot;/&gt;&#10;"/>
diff --git a/images/times_all_comp.png b/images/times_all_comp.png
new file mode 100644
index 0000000..afe4110
Binary files /dev/null and b/images/times_all_comp.png differ
diff --git a/images/times_all_comp_zoom.png b/images/times_all_comp_zoom.png
new file mode 100644
index 0000000..3939439
Binary files /dev/null and b/images/times_all_comp_zoom.png differ
diff --git a/images/times_blk256.png b/images/times_blk256.png
new file mode 100644
index 0000000..fbe3097
Binary files /dev/null and b/images/times_blk256.png differ
diff --git a/images/times_blk256_nothrust.png b/images/times_blk256_nothrust.png
new file mode 100644
index 0000000..81d947c
Binary files /dev/null and b/images/times_blk256_nothrust.png differ
diff --git a/images/times_blk256_nothrust_zoom.png b/images/times_blk256_nothrust_zoom.png
new file mode 100644
index 0000000..2fa02b1
Binary files /dev/null and b/images/times_blk256_nothrust_zoom.png differ
diff --git a/images/times_blk_eff.png b/images/times_blk_eff.png
new file mode 100644
index 0000000..2eee8e3
Binary files /dev/null and b/images/times_blk_eff.png differ
diff --git a/images/times_blk_naive.png b/images/times_blk_naive.png
new file mode 100644
index 0000000..642d376
Binary files /dev/null and b/images/times_blk_naive.png differ
diff --git a/images/times_blk_realeff.png b/images/times_blk_realeff.png
new file mode 100644
index 0000000..dd91f8f
Binary files /dev/null and b/images/times_blk_realeff.png differ
diff --git a/images/times_cpu_comp.png b/images/times_cpu_comp.png
new file mode 100644
index 0000000..ae073ee
Binary files /dev/null and b/images/times_cpu_comp.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 675da35..7f515f8 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -10,13 +10,30 @@
 #include <stream_compaction/cpu.h>
 #include <stream_compaction/naive.h>
 #include <stream_compaction/efficient.h>
+#include <stream_compaction/real_efficient.h>
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
+#include <cstdlib>
+#include <ctime>
+
+
 int main(int argc, char* argv[]) {
-    const int SIZE = 1 << 8;
-    const int NPOT = SIZE - 3;
-    int a[SIZE], b[SIZE], c[SIZE];
+    double t1,t2;
+
+    int sizeExp = 19;
+    int blkSize = 256;
+    if (argc >= 3) {
+      sizeExp = atoi(argv[1]);
+      blkSize = atoi(argv[2]);
+    }
+    int SIZE = 1 << sizeExp;
+    int NPOT = SIZE - 3;
+    int *a = new int[SIZE], *b = new int[SIZE], *c = new int[SIZE];
+
+    StreamCompaction::Naive::blkSize = blkSize;
+    StreamCompaction::Efficient::blkSize = blkSize;
+    StreamCompaction::RealEfficient::blkSize = blkSize;
 
     // Scan tests
 
@@ -33,48 +50,72 @@ int main(int argc, char* argv[]) {
     printDesc("cpu scan, power-of-two");
     StreamCompaction::CPU::scan(SIZE, b, a);
     printArray(SIZE, b, true);
+    double tCpuScanPot = StreamCompaction::CPU::last_runtime;
 
     zeroArray(SIZE, c);
     printDesc("cpu scan, non-power-of-two");
     StreamCompaction::CPU::scan(NPOT, c, a);
     printArray(NPOT, b, true);
     printCmpResult(NPOT, b, c);
+    double tCpuScanNpot = StreamCompaction::CPU::last_runtime;
 
     zeroArray(SIZE, c);
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
     //printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
+    double tNaiveScanPot = StreamCompaction::Naive::last_runtime;
 
     zeroArray(SIZE, c);
     printDesc("naive scan, non-power-of-two");
     StreamCompaction::Naive::scan(NPOT, c, a);
     //printArray(SIZE, c, true);
     printCmpResult(NPOT, b, c);
+    double tNaiveScanNpot = StreamCompaction::Naive::last_runtime;
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, power-of-two");
     StreamCompaction::Efficient::scan(SIZE, c, a);
     //printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
+    double tEffScanPot = StreamCompaction::Efficient::last_runtime;
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, non-power-of-two");
     StreamCompaction::Efficient::scan(NPOT, c, a);
     //printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
+    double tEffScanNpot = StreamCompaction::Efficient::last_runtime;
+
+
+    zeroArray(SIZE, c);
+    printDesc("real work-efficient scan, power-of-two");
+    StreamCompaction::RealEfficient::scan(SIZE, c, a);
+    //printArray(SIZE, c, true);
+    printCmpResult(SIZE, b, c);
+    double tRealEffScanPot = StreamCompaction::RealEfficient::last_runtime;
+
+    zeroArray(SIZE, c);
+    printDesc("real work-efficient scan, non-power-of-two");
+    StreamCompaction::RealEfficient::scan(NPOT, c, a);
+    //printArray(NPOT, c, true);
+    printCmpResult(NPOT, b, c);
+    double tRealEffScanNpot = StreamCompaction::RealEfficient::last_runtime;
+
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, power-of-two");
     StreamCompaction::Thrust::scan(SIZE, c, a);
     //printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
+    double tThrustScanPot = StreamCompaction::Thrust::last_runtime;
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, non-power-of-two");
     StreamCompaction::Thrust::scan(NPOT, c, a);
     //printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
+    double tThrustScanNpot = StreamCompaction::Thrust::last_runtime;
 
     printf("\n");
     printf("*****************************\n");
@@ -95,6 +136,7 @@ int main(int argc, char* argv[]) {
     expectedCount = count;
     printArray(count, b, true);
     printCmpLenResult(count, expectedCount, b, b);
+    double tCpuCompNoscanPot = StreamCompaction::CPU::last_runtime;
 
     zeroArray(SIZE, c);
     printDesc("cpu compact without scan, non-power-of-two");
@@ -102,22 +144,51 @@ int main(int argc, char* argv[]) {
     expectedNPOT = count;
     printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
+    double tCpuCompNoscanNpot = StreamCompaction::CPU::last_runtime;
 
     zeroArray(SIZE, c);
     printDesc("cpu compact with scan");
     count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
-    printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
+    double tCpuCompScanPot = StreamCompaction::CPU::last_runtime;
 
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, power-of-two");
     count = StreamCompaction::Efficient::compact(SIZE, c, a);
     //printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
+    double tEffCompScanPot = StreamCompaction::Efficient::last_runtime;
 
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, non-power-of-two");
     count = StreamCompaction::Efficient::compact(NPOT, c, a);
     //printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
+    double tEffCompScanNpot = StreamCompaction::Efficient::last_runtime;
+
+    zeroArray(SIZE, c);
+    printDesc("real work-efficient compact, power-of-two");
+    count = StreamCompaction::RealEfficient::compact(SIZE, c, a);
+    //printArray(count, c, true);
+    printCmpLenResult(count, expectedCount, b, c);
+    double tRealEffCompScanPot = StreamCompaction::RealEfficient::last_runtime;
+
+    zeroArray(SIZE, c);
+    printDesc("real work-efficient compact, non-power-of-two");
+    count = StreamCompaction::RealEfficient::compact(NPOT, c, a);
+    //printArray(count, c, true);
+    printCmpLenResult(count, expectedNPOT, b, c);
+    double tRealEffCompScanNpot = StreamCompaction::RealEfficient::last_runtime;
+
+    fprintf(stderr, "[%d, %d, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f]\n",
+      SIZE, blkSize,
+      tCpuScanPot, tNaiveScanPot, tEffScanPot, tRealEffScanPot, tThrustScanPot,
+      tCpuCompNoscanPot, tCpuCompScanPot, tEffCompScanPot, tRealEffCompScanPot
+      );
+
+    delete a;
+    delete b;
+    delete c;
+
+    return 0;
 }
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index cdbef77..de726ec 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -7,11 +7,13 @@ set(SOURCE_FILES
     "naive.cu"
     "efficient.h"
     "efficient.cu"
+    "real_efficient.h"
+    "real_efficient.cu"
     "thrust.h"
     "thrust.cu"
     )
 
 cuda_add_library(stream_compaction
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_50
     )
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index e600c29..6f952b6 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -4,12 +4,22 @@
 namespace StreamCompaction {
 namespace CPU {
 
+double last_runtime;
+
 /**
  * CPU scan (prefix sum).
  */
 void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
+    double t1 = clock();
+
+    int t = 0;
+    for (int i = 0; i < n; i++) {
+      odata[i] = t;
+      t += idata[i];
+    }
+
+    double t2 = clock();
+    last_runtime = 1.0E6*(t2-t1)/CLOCKS_PER_SEC;
 }
 
 /**
@@ -18,8 +28,19 @@ void scan(int n, int *odata, const int *idata) {
  * @returns the number of elements remaining after compaction.
  */
 int compactWithoutScan(int n, int *odata, const int *idata) {
-    // TODO
-    return -1;
+    double t1 = clock();
+
+    int oIdx = 0;
+    for (int i = 0; i < n; i++) {
+      if (idata[i] != 0) {
+        odata[oIdx] = idata[i];
+        oIdx++;
+      }
+    }
+
+    double t2 = clock();
+    last_runtime = 1.0E6*(t2-t1)/CLOCKS_PER_SEC;
+    return oIdx;
 }
 
 /**
@@ -28,8 +49,30 @@ int compactWithoutScan(int n, int *odata, const int *idata) {
  * @returns the number of elements remaining after compaction.
  */
 int compactWithScan(int n, int *odata, const int *idata) {
-    // TODO
-    return -1;
+    double t1 = clock();
+
+    int *keep = new int[n];
+    for (int i = 0; i < n; i++) {
+      keep[i] = (idata[i] != 0) ? 1 : 0;
+    }
+
+    int *keepScan = new int[n];
+    int nKeep = 0;
+    scan(n, keepScan, keep);
+    for (int i = 0; i < n; i++) {
+      if (!keep[i])
+        continue;
+
+      nKeep++;
+      odata[keepScan[i]] = idata[i];
+    }
+
+    double t2 = clock();
+    last_runtime = 1.0E6*(t2-t1)/CLOCKS_PER_SEC;
+
+    delete keepScan;
+    delete keep;
+    return nKeep;
 }
 
 }
diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h
index 6348bf3..7e87420 100644
--- a/stream_compaction/cpu.h
+++ b/stream_compaction/cpu.h
@@ -2,6 +2,8 @@
 
 namespace StreamCompaction {
 namespace CPU {
+    extern double last_runtime;
+
     void scan(int n, int *odata, const int *idata);
 
     int compactWithoutScan(int n, int *odata, const int *idata);
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index b2f739b..31ad328 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -6,14 +6,99 @@
 namespace StreamCompaction {
 namespace Efficient {
 
-// TODO: __global__
+double last_runtime;
+int blkSize = 256;
+
+// perform reduction
+__global__ void kernScanUp(int n, int dPow, int *data) {
+  int k = blockIdx.x * blockDim.x + threadIdx.x;
+  if (k % dPow != 0 || k + dPow - 1 >= n)
+    return;
+
+  data[k + dPow - 1] += data[k + dPow/2 - 1];
+}
+
+// perform reduction
+__global__ void kernScanDown(int n, int dPow, int *data) {
+  int k = blockIdx.x * blockDim.x + threadIdx.x;
+  if (k % dPow != 0 || k + dPow - 1 >= n)
+    return;
+
+  int t = data[k + dPow/2 - 1];
+  data[k + dPow/2 - 1] = data[k + dPow - 1];
+  data[k + dPow - 1] += t;
+}
+
+// mark nonzeroes
+__global__ void kernMark(int n, int *keep, const int *data) {
+  int k = blockIdx.x * blockDim.x + threadIdx.x;
+  if (k >= n)
+    return;
+
+  keep[k] = (data[k] != 0) ? 1 : 0;
+}
+
+__global__ void kernScatter(int n, int *out, const int *keep, const int *scan, const int *data) {
+  int k = blockIdx.x * blockDim.x + threadIdx.x;
+  if (k >= n)
+    return;
+
+  if (keep[k]) {
+    out[scan[k]] = data[k];
+  }
+}
+
+static int getPot(int n) {
+  unsigned int pot = n;
+  pot--;
+  pot |= pot >> 1;
+  pot |= pot >> 2;
+  pot |= pot >> 4;
+  pot |= pot >> 8;
+  pot |= pot >> 16;
+  pot++;
+
+  return pot;
+}
+
+static void devScanUtil(int n, int *devData) {
+  int pot  = getPot(n);
+
+  dim3 blkDim(blkSize);
+
+  int dPow = 2;
+  while (dPow/2 < n) {
+    dim3 blkCnt((pot + blkDim.x - 1)/blkDim.x);
+    kernScanUp<<<blkCnt,blkDim>>>(pot, dPow, devData);
+    dPow *= 2;
+  }
+  cudaMemset(&devData[pot-1], 0, sizeof(int));
+
+  while (dPow > 1) {
+    dim3 blkCnt((pot + blkDim.x - 1)/blkDim.x);
+    kernScanDown<<<blkCnt,blkDim>>>(pot, dPow, devData);
+    dPow /= 2;
+  }
+}
 
 /**
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
  */
 void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
+  int pot  = getPot(n);
+
+  int *devData;
+  cudaMalloc((void**)&devData, pot*sizeof(int));
+  cudaMemset(devData, 0, pot*sizeof(int));
+  cudaMemcpy(devData, idata, n*sizeof(int), cudaMemcpyHostToDevice);
+
+  double t1 = clock();
+  devScanUtil(n, devData);
+  double t2 = clock();
+  last_runtime = 1.0E6 * (t2-t1) / CLOCKS_PER_SEC;
+
+  cudaMemcpy(odata, devData, n*sizeof(int), cudaMemcpyDeviceToHost);
+  cudaFree(devData);
 }
 
 /**
@@ -26,8 +111,45 @@ void scan(int n, int *odata, const int *idata) {
  * @returns      The number of elements remaining after compaction.
  */
 int compact(int n, int *odata, const int *idata) {
-    // TODO
-    return -1;
+  int pot = getPot(n);
+
+  // upload data
+  int *devData;
+  cudaMalloc((void**)&devData, n*sizeof(int));
+  cudaMemcpy(devData, idata, n*sizeof(int), cudaMemcpyHostToDevice);
+
+  dim3 blkDim(blkSize);
+  dim3 blkCnt((n + blkDim.x - 1)/blkDim.x);
+
+  // mark values to keep
+  int *devKeep, *devScan;
+  cudaMalloc((void**)&devKeep, pot*sizeof(int));
+  cudaMalloc((void**)&devScan, pot*sizeof(int));
+  cudaMemset(devKeep, 0, pot*sizeof(int));
+
+  double t1 = clock();
+  kernMark<<<blkCnt,blkDim>>>(n, devKeep, devData);
+  cudaMemcpy(devScan, devKeep, pot*sizeof(int), cudaMemcpyDeviceToDevice);
+
+  // scan boolean array
+  devScanUtil(n, devScan);
+  int nKeep;
+  cudaMemcpy(&nKeep, &devScan[pot-1], sizeof(int), cudaMemcpyDeviceToHost);
+
+  // scatter to output
+  int *devOut;
+  cudaMalloc((void**)&devOut, n*sizeof(int));
+  kernScatter<<<blkCnt,blkDim>>>(n, devOut, devKeep, devScan, devData);
+  double t2 = clock();
+  last_runtime = 1.0E6 * (t2-t1) / CLOCKS_PER_SEC;
+  cudaMemcpy(odata, devOut, nKeep*sizeof(int), cudaMemcpyDeviceToHost);
+
+  cudaFree(devOut);
+  cudaFree(devData);
+  cudaFree(devKeep);
+  cudaFree(devScan);
+
+  return nKeep;
 }
 
 }
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
index 395ba10..fb54bde 100644
--- a/stream_compaction/efficient.h
+++ b/stream_compaction/efficient.h
@@ -2,6 +2,9 @@
 
 namespace StreamCompaction {
 namespace Efficient {
+    extern double last_runtime;
+    extern int blkSize;
+
     void scan(int n, int *odata, const int *idata);
 
     int compact(int n, int *odata, const int *idata);
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 3d86b60..e89b742 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -6,14 +6,67 @@
 namespace StreamCompaction {
 namespace Naive {
 
-// TODO: __global__
+double last_runtime;
+int blkSize = 256;
+
+__global__ void kernScan(int n, int dPow, int *odata, const int *idata) {
+  int k = blockIdx.x * blockDim.x + threadIdx.x;
+  if (k >= n)
+    return;
+
+  if (k >= dPow)
+    odata[k] = idata[k - dPow] + idata[k];
+  else
+    odata[k] = idata[k];
+}
+
+__global__ void kernInclToExcl(int n, int *odata, const int *idata) {
+  int k = blockIdx.x * blockDim.x + threadIdx.x;
+  if (k >= n-1)
+    return;
+
+  odata[k+1] = idata[k];
+};
 
 /**
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
  */
 void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
+  int dPow = 1, dLogPow = 0;
+  int *devData[2];
+  cudaMalloc((void**)&devData[0], n*sizeof(int));
+  cudaMalloc((void**)&devData[1], n*sizeof(int));
+  cudaMemcpy(devData[0], idata, n*sizeof(int), cudaMemcpyHostToDevice);
+  cudaMemset(devData[1], 0, n*sizeof(int));
+
+  double t1 = clock();
+
+  dim3 blkDim(blkSize);
+  dim3 blkCnt((n + blkDim.x - 1)/blkDim.x);
+
+  int dst, src;
+  while (dPow/2 < n) {
+    src = dLogPow % 2;
+    dst = 1 - src;
+    kernScan<<<blkCnt,blkDim>>>(n, dPow, devData[dst], devData[src]);
+    dPow *= 2;
+    dLogPow++;
+    cudaDeviceSynchronize();
+  }
+
+  src = dLogPow % 2;
+  dst = 1 - src;
+  kernInclToExcl<<<blkCnt,blkDim>>>(n, devData[dst], devData[src]);
+  cudaDeviceSynchronize();
+
+  double t2=clock();
+  last_runtime = 1.0E6 * (t2-t1) / CLOCKS_PER_SEC;
+
+  cudaMemcpy(odata, devData[dst], n*sizeof(int), cudaMemcpyDeviceToHost);
+  odata[0] = 0;
+
+  cudaFree(devData[0]);
+  cudaFree(devData[1]);
 }
 
 }
diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h
index 21152d6..060df1d 100644
--- a/stream_compaction/naive.h
+++ b/stream_compaction/naive.h
@@ -2,6 +2,9 @@
 
 namespace StreamCompaction {
 namespace Naive {
+    extern double last_runtime;
+    extern int blkSize;
+
     void scan(int n, int *odata, const int *idata);
 }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index d8dbb32..b74f882 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -9,6 +9,8 @@
 namespace StreamCompaction {
 namespace Thrust {
 
+double last_runtime;
+
 /**
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
  */
@@ -16,6 +18,14 @@ void scan(int n, int *odata, const int *idata) {
     // TODO use `thrust::exclusive_scan`
     // example: for device_vectors dv_in and dv_out:
     // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+
+    thrust::device_vector<int> devIn(idata, idata+n), devOut(n);
+    double t1 = clock();
+    thrust::exclusive_scan(devIn.begin(), devIn.end(), devOut.begin());
+    double t2 = clock();
+    last_runtime = 1.0E6 * (t2-t1) / CLOCKS_PER_SEC;
+
+    thrust::copy(devOut.begin(), devOut.end(), odata);
 }
 
 }
diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h
index 06707f3..e59174f 100644
--- a/stream_compaction/thrust.h
+++ b/stream_compaction/thrust.h
@@ -2,6 +2,8 @@
 
 namespace StreamCompaction {
 namespace Thrust {
+    extern double last_runtime;
+
     void scan(int n, int *odata, const int *idata);
 }
 }