diff --git a/README.md b/README.md deleted file mode 100644 index b71c458..0000000 --- a/README.md +++ /dev/null @@ -1,13 +0,0 @@ -CUDA Stream Compaction -====================== - -**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** - -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) - -### (TODO: Your README) - -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) - diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..e31ca3c 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -13,5 +13,5 @@ set(SOURCE_FILES cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_30 ) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu deleted file mode 100644 index fe872d4..0000000 --- a/stream_compaction/common.cu +++ /dev/null @@ -1,39 +0,0 @@ -#include "common.h" - -void checkCUDAErrorFn(const char *msg, const char *file, int line) { - cudaError_t err = cudaGetLastError(); - if (cudaSuccess == err) { - return; - } - - fprintf(stderr, "CUDA error"); - if (file) { - fprintf(stderr, " (%s:%d)", file, line); - } - fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err)); - exit(EXIT_FAILURE); -} - - -namespace StreamCompaction { -namespace Common { - -/** - * Maps an array to an array of 0s and 1s for stream compaction. Elements - * which map to 0 will be removed, and elements which map to 1 will be kept. - */ -__global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO -} - -/** - * Performs scatter on an array. That is, for each element in idata, - * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. - */ -__global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices) { - // TODO -} - -} -} diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 4f52663..7ae460a 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -3,10 +3,14 @@ #include #include #include +#include +#include #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +#define BLOCK_SIZE 16 + /** * Check for CUDA errors; print and exit if there was a problem. */ @@ -31,5 +35,8 @@ namespace Common { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices); + + __global__ void inclusiveToExclusive(int n, int *idata, int *odata); + } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index e600c29..7b82c15 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,4 +1,4 @@ -#include +#include #include "cpu.h" namespace StreamCompaction { @@ -9,7 +9,14 @@ namespace CPU { */ void scan(int n, int *odata, const int *idata) { // TODO - printf("TODO\n"); + odata[0] = 0; + printf("The output array is:\n"); + for(int tempCount = 1; tempCount < n; tempCount++) + { + odata[tempCount] = idata[tempCount-1] + odata[tempCount-1]; + printf("%5d",odata[tempCount]); + } + printf("\n"); } /** @@ -19,7 +26,19 @@ void scan(int n, int *odata, const int *idata) { */ int compactWithoutScan(int n, int *odata, const int *idata) { // TODO - return -1; + time_t start = clock(); + int countOut=0; + for(int tempCount = 0; tempCount 0; tempCount*=2) + { + if (parallelCount < tempCount) + { + int temp_1 = offset*(2 * parallelCount + 1) - 1; + int temp_2 = offset*(2 * parallelCount + 2) - 1; + temp[temp_2] += temp[temp_1]; + } + } + if (parallelCount == 0) + { + temp[n - 1] = 0; + } + + for (int tempCount_1 = 0; tempCount_1 < n; tempCount_1 *= 2) + { + if (parallelCount < tempCount_1) + { + int temp_1 = offset*(2 * parallelCount + 1) - 1; + int temp_2 = offset*(2 * parallelCount + 2) - 1; + int tempStore = temp[temp_1]; + temp[temp_1] = temp[temp_2]; + temp[temp_2] += tempStore; + } + } + odata[2 * parallelCount] = temp[2 * parallelCount]; + odata[2 * parallelCount + 1] = temp[2 * parallelCount + 1]; +} + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { // TODO - printf("TODO\n"); + int tempCount=0; + odata[0]=0; + for (tempCount=0;tempCount> >(n, idata,odata); + } + } + time_t end = clock(); + printf("The running time is: %f ms. \n", double(end-start)*1000/CLOCKS_PER_SEC); + return outCount++; } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3d86b60..ca1458f 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -7,13 +7,61 @@ namespace StreamCompaction { namespace Naive { // TODO: __global__ +__global__ void NaiveGPUScan(int n, int *odata, const int *idata,int step) +{ + int parallelCount = threadIdx.x+blockIdx.x*blockDim.x; + + if(parallelCount=step) + { + odata[parallelCount]=idata[parallelCount-step]+idata[parallelCount]; + } + } +} /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ -void scan(int n, int *odata, const int *idata) { + void scan(int n, int *odata, const int *idata) { // TODO - printf("TODO\n"); + time_t start = clock(); + + + int* tempArray_1; + int* tempArray_2; + int tempCount=0; + int step=0; + + cudaMalloc((void**)&tempArray_1, n * sizeof(int)); + cudaMalloc((void**)&tempArray_2, n * sizeof(int)); + + //allocate the device space + cudaMemcpy(tempArray_1, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(tempArray_2, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + + + for (tempCount = 1; tempCount <= ilog2ceil(n); ++tempCount) { + step=2^(tempCount-1); + + NaiveGPUScan << > >(n, (tempCount % 2) == 0 ? tempArray_1 : tempArray_2, (tempCount % 2) == 0 ? tempArray_2 : tempArray_1,step); + } + + if (ilog2ceil(n) % 2 == 0) { + Common::inclusiveToExclusive << > >(n, tempArray_2, tempArray_1); + + cudaMemcpy(odata, tempArray_2, n * sizeof(int), cudaMemcpyDeviceToHost); + } else { + Common::inclusiveToExclusive << > >(n, tempArray_1, tempArray_2); + + cudaMemcpy(odata, tempArray_1, n * sizeof(int), cudaMemcpyDeviceToHost); + } + + time_t end = clock(); + printf("The running time is: %f ms. \n", double(end-start)*1000/CLOCKS_PER_SEC); + cudaFree(tempArray_1); + cudaFree(tempArray_2); } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index d8dbb32..cdd5569 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -16,6 +16,19 @@ void scan(int n, int *odata, const int *idata) { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + time_t start = clock(); + + thrust::device_vector dev_idata(idata, idata + n); + thrust::device_vector dev_odata(odata, odata + n); + + thrust::exclusive_scan(dev_idata.begin(), dev_idata.end(), dev_odata.begin()); + + thrust::host_vector host_odata = dev_odata; + cudaMemcpy(odata, host_odata.data(), n * sizeof(int), cudaMemcpyHostToHost); + + time_t end = clock(); + printf("The running time is: %f ms. \n", double(end-start)*1000/CLOCKS_PER_SEC); } }