Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 91 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,97 @@ CUDA Stream Compaction

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

* (TODO) YOUR NAME HERE
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
* Akshay Shah
* Tested on: Windows 10, i7-5700HQ @ 2.70GHz 16GB, GTX 970M 6GB (Personal Computer)

### (TODO: Your README)
### Stream Compaction

Include analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)
Block Size: 256

Array input: vary from 2<sup>18</sup> to 2<sup>26</sup>.

Max size: 2<sup>30</sup>

CUDA Summary

![](images/GPUCUDAstats.PNG)

![](images/cudaCoreStats.PNG)

Memory I/O looks to be the bottleneck over the GPU, meaning changing the variables over to shared memory might make a difference. (not checked, just speculating)


#### Output
```
****************
** SCAN TESTS **
****************
[ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 38 0 ]
==== cpu scan, power-of-two ====
215 ms total, average : 215 ms.
==== cpu scan, non-power-of-two ====
passed
==== naive scan, power-of-two ====
total time to run naive scan, power-of-two: 187.5 in ms, aver: 187.5
passed
==== naive scan, non-power-of-two ====
passed
==== work-efficient scan, power-of-two ====
total time to run efficient scan, power-of-two: 171.601 in ms, aver: 171.601
passed
==== work-efficient scan, non-power-of-two ====
passed
==== thrust scan, power-of-two ====
49 ms total thrust, average : 49 ms.
passed
==== thrust scan, non-power-of-two ====
passed

*****************************
** STREAM COMPACTION TESTS **
*****************************
[ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 0 0 ]
==== cpu compact without scan, power-of-two ====
182 ms total cpu w/o scan, average : 182 ms.
[ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 1 ]
passed
==== cpu compact without scan, non-power-of-two ====
[ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ]
passed
==== cpu compact with scan ====
356 ms total cpu w scan, average : 356 ms.
[ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 1 ]
passed
==== work-efficient compact, power-of-two ====
total time to run efficient compact, power-of-two: 186.149 in ms, aver: 186.149
passed
==== work-efficient compact, non-power-of-two ====
[ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ]
passed

*****************************
** RADIX SORT TESTS **
*****************************
[ 3 5 0 1 0 2 0 1 6 1 1 2 1 ... 0 0 ]
==== radix sort, power-of-two ====
total time to run radix, power-of-two: 67.8532 in ms, aver: 67.8532
[ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 6 6 ]
==== thrust sort, power-of-two ====
515 ms total thrust sort, average : 515 ms.
```

![Fig1](images/naivevsefficient.png)

##### A comparison of Stream Compaction over the GPU vs CPU

The time was measured for 1000 iterations
Exclusive scan was used inside the stream compaction over GPU, which is shown (CPU vs GPU scan) in Fig. 1

![Fig2](images/streamcompact.png)

##### Radix sort
Implemented Radix sort on the GPU, that uses exclusive scan. The output shows a sorted array.
Limitation to this implementation is that the max digit that can be used for sorting is 7. To increase the digit limit, you would have to change the lsb iteration number in sort.cu to whatever bit is the maximum.
There is a comparison between thrust's CPU sort and this GPU sort.

![Fig3](images/sortcpuvgpu.png)
Binary file added images/GPUCUDAstats.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/cudaCoreStats.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/naivevsefficient.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/sortcpuvgpu.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/streamcompact.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
119 changes: 102 additions & 17 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,23 @@
#include <stream_compaction/naive.h>
#include <stream_compaction/efficient.h>
#include <stream_compaction/thrust.h>
#include <stream_compaction/sort.h>
#include "testing_helpers.hpp"
#include <iostream>
#include <chrono>

#define ITER 1

int main(int argc, char* argv[]) {
const int SIZE = 1 << 8;
const size_t SIZE = 1 << 20;
const int NPOT = SIZE - 3;
int a[SIZE], b[SIZE], c[SIZE];
//int a[SIZE], b[SIZE], c[SIZE];

int *a = new int[SIZE];
int *b = new int[SIZE];
int *c = new int[SIZE];

float time = 0.f, totalTime = 0.f;
// Scan tests

printf("\n");
Expand All @@ -31,42 +41,64 @@ int main(int argc, char* argv[]) {

zeroArray(SIZE, b);
printDesc("cpu scan, power-of-two");
StreamCompaction::CPU::scan(SIZE, b, a);
printArray(SIZE, b, true);
auto begin = std::chrono::high_resolution_clock::now();
for (int i = 0; i < ITER; ++i){
StreamCompaction::CPU::scan(SIZE, b, a);
}
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
std::cout << (float)duration << " ms total, average : " << (float)duration / ITER << " ms." << std::endl;
//printArray(SIZE, b, true);

zeroArray(SIZE, c);
printDesc("cpu scan, non-power-of-two");
StreamCompaction::CPU::scan(NPOT, c, a);
printArray(NPOT, b, true);
//printArray(NPOT, b, true);
printCmpResult(NPOT, b, c);

totalTime = 0.f;
zeroArray(SIZE, c);
printDesc("naive scan, power-of-two");
StreamCompaction::Naive::scan(SIZE, c, a);
//printArray(SIZE, c, true);
//printArray(SIZE, c, false);
for (int i = 0; i < ITER; ++i) {
StreamCompaction::Naive::scan(SIZE, c, a, time);
totalTime += time;
}
std::cout << "total time to run naive scan, power-of-two: " << totalTime << " in ms, aver: " << totalTime / ITER << std::endl;
printCmpResult(SIZE, b, c);

zeroArray(SIZE, c);
printDesc("naive scan, non-power-of-two");
StreamCompaction::Naive::scan(NPOT, c, a);
StreamCompaction::Naive::scan(NPOT, c, a, time);
//printArray(SIZE, c, true);
printCmpResult(NPOT, b, c);

totalTime = 0.f;
zeroArray(SIZE, c);
printDesc("work-efficient scan, power-of-two");
StreamCompaction::Efficient::scan(SIZE, c, a);
//printArray(SIZE, c, true);
for (int i = 0; i < ITER; ++i) {
StreamCompaction::Efficient::scan(SIZE, c, a, time);
totalTime += time;
}
std::cout << "total time to run efficient scan, power-of-two: " << totalTime << " in ms, aver: " << totalTime / ITER << std::endl;
//printArray(SIZE, c, false);
printCmpResult(SIZE, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient scan, non-power-of-two");
StreamCompaction::Efficient::scan(NPOT, c, a);
StreamCompaction::Efficient::scan(NPOT, c, a, time);
//printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
printDesc("thrust scan, power-of-two");
StreamCompaction::Thrust::scan(SIZE, c, a);
begin = std::chrono::high_resolution_clock::now();
for (int i = 0; i < ITER; ++i){
StreamCompaction::Thrust::scan(SIZE, c, a);
}
end = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
std::cout << (float)duration << " ms total thrust, average : " << (float)duration / ITER << " ms." << std::endl;
//printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

Expand All @@ -91,7 +123,13 @@ int main(int argc, char* argv[]) {

zeroArray(SIZE, b);
printDesc("cpu compact without scan, power-of-two");
count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
begin = std::chrono::high_resolution_clock::now();
for (int i = 0; i < ITER; ++i){
count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
}
end = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
std::cout << (float)duration << " ms total cpu w/o scan, average : " << (float)duration / ITER << " ms." << std::endl;
expectedCount = count;
printArray(count, b, true);
printCmpLenResult(count, expectedCount, b, b);
Expand All @@ -105,19 +143,66 @@ int main(int argc, char* argv[]) {

zeroArray(SIZE, c);
printDesc("cpu compact with scan");
count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
begin = std::chrono::high_resolution_clock::now();
for (int i = 0; i < ITER; ++i){
count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
}
end = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
std::cout << (float)duration << " ms total cpu w scan, average : " << (float)duration / ITER << " ms." << std::endl;
printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);

totalTime = 0.f;
zeroArray(SIZE, c);
printDesc("work-efficient compact, power-of-two");
count = StreamCompaction::Efficient::compact(SIZE, c, a);
for (int i = 0; i < ITER; ++i) {
count = StreamCompaction::Efficient::compact(SIZE, c, a, time);
totalTime += time;
}
std::cout << "total time to run efficient compact, power-of-two: " << totalTime << " in ms, aver: " << totalTime / ITER << std::endl;
//printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient compact, non-power-of-two");
count = StreamCompaction::Efficient::compact(NPOT, c, a);
//printArray(count, c, true);
count = StreamCompaction::Efficient::compact(NPOT, c, a, time);
printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);

printf("\n");
printf("*****************************\n");
printf("** RADIX SORT TESTS **\n");
printf("*****************************\n");

// SORT tests

genArray(SIZE, a, 7); // Leave a 0 at the end to test that edge case
a[SIZE - 1] = 0;
printArray(SIZE, a, true);

totalTime = 0.f;
zeroArray(SIZE, b);
printDesc("radix sort, power-of-two");
for (int i = 0; i < ITER; ++i) {
StreamCompaction::Sort::sort(SIZE, b, a, time);
totalTime += time;
}
std::cout << "total time to run radix, power-of-two: " << totalTime << " in ms, aver: " << totalTime / ITER << std::endl;
printArray(SIZE, b, true);


zeroArray(SIZE, b);
printDesc("thrust sort, power-of-two");
begin = std::chrono::high_resolution_clock::now();
for (int i = 0; i < ITER; ++i){
StreamCompaction::Thrust::sort(SIZE, a);
}
end = std::chrono::high_resolution_clock::now();
float fduration = std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
std::cout << (float)fduration << " ms total thrust sort, average : " << (float)fduration / ITER << " ms." << std::endl;

delete[] a;
delete[] b;
delete[] c;
}
13 changes: 13 additions & 0 deletions src/testing_helpers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,19 @@ void genArray(int n, int *a, int maxval) {
}
}

void genArraySort(int n, int *a, int maxval) {
if (n == 8) {
a[0] = 4;
a[1] = 7;
a[2] = 2;
a[3] = 6;
a[4] = 3;
a[5] = 5;
a[6] = 1;
a[7] = 0;
}
}

void printArray(int n, int *a, bool abridged = false) {
printf(" [ ");
for (int i = 0; i < n; i++) {
Expand Down
4 changes: 3 additions & 1 deletion stream_compaction/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@ set(SOURCE_FILES
"efficient.cu"
"thrust.h"
"thrust.cu"
"sort.h"
"sort.cu"
)

cuda_add_library(stream_compaction
${SOURCE_FILES}
OPTIONS -arch=sm_20
OPTIONS -arch=sm_52
)
66 changes: 36 additions & 30 deletions stream_compaction/common.cu
Original file line number Diff line number Diff line change
@@ -1,39 +1,45 @@
#include "common.h"

void checkCUDAErrorFn(const char *msg, const char *file, int line) {
cudaError_t err = cudaGetLastError();
if (cudaSuccess == err) {
return;
}

fprintf(stderr, "CUDA error");
if (file) {
fprintf(stderr, " (%s:%d)", file, line);
}
fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
exit(EXIT_FAILURE);
cudaError_t err = cudaGetLastError();
if (cudaSuccess == err) {
return;
}

fprintf(stderr, "CUDA error");
if (file) {
fprintf(stderr, " (%s:%d)", file, line);
}
fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}


namespace StreamCompaction {
namespace Common {

/**
* Maps an array to an array of 0s and 1s for stream compaction. Elements
* which map to 0 will be removed, and elements which map to 1 will be kept.
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
// TODO
}
namespace Common {

/**
* Performs scatter on an array. That is, for each element in idata,
* if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
*/
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices) {
// TODO
}
/**
* Maps an array to an array of 0s and 1s for stream compaction. Elements
* which map to 0 will be removed, and elements which map to 1 will be kept.
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
int index = threadIdx.x + (blockDim.x * blockIdx.x);

}
}
if (index >= n) return;
bools[index] = idata[index] != 0 ? 1 : 0;
}

/**
* Performs scatter on an array. That is, for each element in idata,
* if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
*/
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices) {
int index = threadIdx.x + (blockDim.x * blockIdx.x);

if (index >= n) return;
if (bools[index] == 1) odata[indices[index]] = idata[index];
}

}
}
Loading