diff --git a/README.md b/README.md
deleted file mode 100644
index b71c458..0000000
--- a/README.md
+++ /dev/null
@@ -1,13 +0,0 @@
-CUDA Stream Compaction
-======================
-
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
-
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
-
-### (TODO: Your README)
-
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
-
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index cdbef77..e31ca3c 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -13,5 +13,5 @@ set(SOURCE_FILES
 
 cuda_add_library(stream_compaction
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_30
     )
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
deleted file mode 100644
index fe872d4..0000000
--- a/stream_compaction/common.cu
+++ /dev/null
@@ -1,39 +0,0 @@
-#include "common.h"
-
-void checkCUDAErrorFn(const char *msg, const char *file, int line) {
-    cudaError_t err = cudaGetLastError();
-    if (cudaSuccess == err) {
-        return;
-    }
-
-    fprintf(stderr, "CUDA error");
-    if (file) {
-        fprintf(stderr, " (%s:%d)", file, line);
-    }
-    fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
-    exit(EXIT_FAILURE);
-}
-
-
-namespace StreamCompaction {
-namespace Common {
-
-/**
- * Maps an array to an array of 0s and 1s for stream compaction. Elements
- * which map to 0 will be removed, and elements which map to 1 will be kept.
- */
-__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
-    // TODO
-}
-
-/**
- * Performs scatter on an array. That is, for each element in idata,
- * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
- */
-__global__ void kernScatter(int n, int *odata,
-        const int *idata, const int *bools, const int *indices) {
-    // TODO
-}
-
-}
-}
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index 4f52663..7ae460a 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -3,10 +3,14 @@
 #include <cstdio>
 #include <cstring>
 #include <cmath>
+#include <stdio.h>
+#include <time.h>
 
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
 
+#define BLOCK_SIZE 16
+
 /**
  * Check for CUDA errors; print and exit if there was a problem.
  */
@@ -31,5 +35,8 @@ namespace Common {
 
     __global__ void kernScatter(int n, int *odata,
             const int *idata, const int *bools, const int *indices);
+
+	__global__ void inclusiveToExclusive(int n, int *idata, int *odata);
+
 }
 }
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index e600c29..7b82c15 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -1,4 +1,4 @@
-#include <cstdio>
+﻿#include <cstdio>
 #include "cpu.h"
 
 namespace StreamCompaction {
@@ -9,7 +9,14 @@ namespace CPU {
  */
 void scan(int n, int *odata, const int *idata) {
     // TODO
-    printf("TODO\n");
+    odata[0] = 0;
+    printf("The output array is:\n");
+    for(int tempCount = 1; tempCount < n; tempCount++)
+    {
+        odata[tempCount] = idata[tempCount-1] + odata[tempCount-1];
+        printf("%5d",odata[tempCount]);
+    }
+	printf("\n");
 }
 
 /**
@@ -19,7 +26,19 @@ void scan(int n, int *odata, const int *idata) {
  */
 int compactWithoutScan(int n, int *odata, const int *idata) {
     // TODO
-    return -1;
+	time_t start = clock();
+    int countOut=0;
+    for(int tempCount = 0; tempCount <n-1; tempCount++)
+    {
+        if(idata[tempCount]!=0)
+        {
+            odata[countOut]=idata[tempCount];
+            countOut++;
+        }
+    }
+	time_t end = clock();
+	 printf("The running time is: %f ms. \n", double(end-start)*1000/CLOCKS_PER_SEC);
+	return countOut;
 }
 
 /**
@@ -29,7 +48,38 @@ int compactWithoutScan(int n, int *odata, const int *idata) {
  */
 int compactWithScan(int n, int *odata, const int *idata) {
     // TODO
-    return -1;
+	time_t start = clock();
+	int tempCount=0;
+	int sumOutTemp=0;
+	int *tempArray_1=&odata[2*n];
+	int *tempArray_2=&odata[10*n];
+
+	for (tempCount = 0; tempCount < n; tempCount++)
+	{
+		if (idata[tempCount] != 0)
+		{
+			tempArray_1[tempCount] = 1;
+		}
+		else
+		{
+			tempArray_1[tempCount] = 2;
+		}
+	}
+	scan(n, tempArray_2, tempArray_1);
+
+	for (tempCount = 0; tempCount < n-1; tempCount++)
+	{
+		if (tempArray_2[tempCount] != tempArray_2[tempCount + 1])
+		{
+			odata[sumOutTemp] = idata[tempCount];
+			printf("%5d", odata[sumOutTemp]);
+			sumOutTemp++;
+		}
+	}
+	printf("\n");
+	time_t end = clock();
+	 printf("The running time is: %f ms. \n", double(end-start)*1000/CLOCKS_PER_SEC);
+	return sumOutTemp++;
 }
 
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index b2f739b..e1bfa71 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -6,14 +6,57 @@
 namespace StreamCompaction {
 namespace Efficient {
 
-// TODO: __global__
+// TODO: 
+__global__ void EfficientSorting(int n, int *idata, int *odata)
+{
+	extern __shared__ int temp[];
+	int parallelCount = threadIdx.x;
+	int offset = 1;
+	temp[2 * parallelCount] = idata[2 * parallelCount];
+	temp[2 * parallelCount + 1] = idata[2 * parallelCount + 1];
+
+	for (int tempCount = n; tempCount > 0; tempCount*=2)
+	{
+		if (parallelCount < tempCount)
+		{
+			int temp_1 = offset*(2 * parallelCount + 1) - 1;
+			int temp_2 = offset*(2 * parallelCount + 2) - 1;
+			temp[temp_2] += temp[temp_1];
+		}
+	}
+	if (parallelCount == 0)
+	{
+		temp[n - 1] = 0;
+	}
+
+	for (int tempCount_1 = 0; tempCount_1 < n; tempCount_1 *= 2)
+	{
+		if (parallelCount < tempCount_1)
+		{
+			int temp_1 = offset*(2 * parallelCount + 1) - 1;
+			int temp_2 = offset*(2 * parallelCount + 2) - 1;
+			int tempStore = temp[temp_1];
+			temp[temp_1] = temp[temp_2];
+			temp[temp_2] += tempStore;
+		}
+	}
+	odata[2 * parallelCount] = temp[2 * parallelCount];
+	odata[2 * parallelCount + 1] = temp[2 * parallelCount + 1];
+}
+
 
 /**
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
  */
 void scan(int n, int *odata, const int *idata) {
     // TODO
-    printf("TODO\n");
+	int tempCount=0;
+	odata[0]=0;
+	for (tempCount=0;tempCount<n-1;tempCount++)
+	{
+		    odata[tempCount+1]=odata[tempCount]+idata[tempCount];
+			printf("%5d",&odata[tempCount]);
+	}
 }
 
 /**
@@ -27,7 +70,19 @@ void scan(int n, int *odata, const int *idata) {
  */
 int compact(int n, int *odata, const int *idata) {
     // TODO
-    return -1;
+		time_t start = clock();
+	int tempCount=0;
+	int outCount=0;
+	for(tempCount=0;tempCount<n;tempCount++)
+	{
+	     if(idata[tempCount]!=0)
+		 {
+             EfficientSorting<< <n,BLOCK_SIZE>> >(n, idata,odata);
+		 }
+	}
+		time_t end = clock();
+	 printf("The running time is: %f ms. \n", double(end-start)*1000/CLOCKS_PER_SEC);
+    return outCount++;
 }
 
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 3d86b60..ca1458f 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -7,13 +7,61 @@ namespace StreamCompaction {
 namespace Naive {
 
 // TODO: __global__
+__global__ void NaiveGPUScan(int n, int *odata, const int *idata,int step)
+{
+	int parallelCount = threadIdx.x+blockIdx.x*blockDim.x;
+	
+    if(parallelCount<n)
+	{
+	    if(parallelCount>=step)
+		{
+		    odata[parallelCount]=idata[parallelCount-step]+idata[parallelCount];
+		}
+	}
+}
 
 /**
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
  */
-void scan(int n, int *odata, const int *idata) {
+ void scan(int n, int *odata, const int *idata) {
     // TODO
-    printf("TODO\n");
+  time_t start = clock();
+
+
+  int* tempArray_1;
+  int* tempArray_2;
+  int tempCount=0;
+  int step=0;
+
+  cudaMalloc((void**)&tempArray_1, n * sizeof(int));
+  cudaMalloc((void**)&tempArray_2, n * sizeof(int));
+
+  //allocate the device space
+  cudaMemcpy(tempArray_1, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+  cudaMemcpy(tempArray_2, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+  
+
+
+  for (tempCount = 1; tempCount <= ilog2ceil(n); ++tempCount) {
+    step=2^(tempCount-1);
+
+    NaiveGPUScan << <n, BLOCK_SIZE >> >(n, (tempCount % 2) == 0 ? tempArray_1 : tempArray_2,  (tempCount % 2) == 0 ? tempArray_2 : tempArray_1,step);
+  }
+
+  if (ilog2ceil(n) % 2 == 0) {
+    Common::inclusiveToExclusive << <n, BLOCK_SIZE >> >(n, tempArray_2, tempArray_1);
+
+    cudaMemcpy(odata, tempArray_2, n * sizeof(int), cudaMemcpyDeviceToHost);
+  } else {
+    Common::inclusiveToExclusive << <n, BLOCK_SIZE >> >(n, tempArray_1, tempArray_2);
+
+	cudaMemcpy(odata, tempArray_1, n * sizeof(int), cudaMemcpyDeviceToHost);
+  }
+
+  	 time_t end = clock();
+	 printf("The running time is: %f ms. \n", double(end-start)*1000/CLOCKS_PER_SEC);
+    cudaFree(tempArray_1);
+    cudaFree(tempArray_2);
 }
 
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index d8dbb32..cdd5569 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -16,6 +16,19 @@ void scan(int n, int *odata, const int *idata) {
     // TODO use `thrust::exclusive_scan`
     // example: for device_vectors dv_in and dv_out:
     // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+
+	time_t start = clock();
+
+    thrust::device_vector<int> dev_idata(idata, idata + n);
+    thrust::device_vector<int> dev_odata(odata, odata + n);
+
+	thrust::exclusive_scan(dev_idata.begin(), dev_idata.end(), dev_odata.begin());
+
+	 thrust::host_vector<int> host_odata = dev_odata;
+     cudaMemcpy(odata, host_odata.data(), n * sizeof(int), cudaMemcpyHostToHost);
+
+	 time_t end = clock();
+	 printf("The running time is: %f ms. \n", double(end-start)*1000/CLOCKS_PER_SEC);
 }
 
 }