From 35b31264963d6fab95229d176300a8e68efcb581 Mon Sep 17 00:00:00 2001 From: ZHAN CHIN Date: Mon, 19 Sep 2016 18:49:28 -0400 Subject: [PATCH 1/4] Finish parts 1 and 2 --- src/main.cpp | 4 +-- stream_compaction/CMakeLists.txt | 2 +- stream_compaction/cpu.cu | 37 ++++++++++++++++++++++----- stream_compaction/naive.cu | 43 +++++++++++++++++++++++++++++--- 4 files changed, 74 insertions(+), 12 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 675da35..d7164c6 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,7 +14,7 @@ #include "testing_helpers.hpp" int main(int argc, char* argv[]) { - const int SIZE = 1 << 8; + const int SIZE = 1 << 16; const int NPOT = SIZE - 3; int a[SIZE], b[SIZE], c[SIZE]; @@ -43,7 +43,7 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..c8709e7 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -13,5 +13,5 @@ set(SOURCE_FILES cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_50 ) diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index e600c29..102b403 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,4 +1,5 @@ #include +#include #include "cpu.h" namespace StreamCompaction { @@ -8,8 +9,11 @@ namespace CPU { * CPU scan (prefix sum). */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + int sum = 0; + for (int i = 0; i < n; i++) { + odata[i] = sum; + sum += idata[i]; + } } /** @@ -18,8 +22,20 @@ void scan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. */ int compactWithoutScan(int n, int *odata, const int *idata) { - // TODO - return -1; + int nonZeroCount = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[nonZeroCount] = idata[i]; + nonZeroCount++; + } + } + return nonZeroCount; +} + +void scatter(int n, int * odata, const int * idata, const int * scatterTargets) { + for (int i = 0; i < n; i++) { + odata[scatterTargets[i]] = idata[i]; + } } /** @@ -28,8 +44,17 @@ int compactWithoutScan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { - // TODO - return -1; + int * nonZeroElements = (int *)malloc(n * sizeof(int)); + int * scanCounts = (int *)malloc(n * sizeof(int)); + for (int i = 0; i < n; i++) { + nonZeroElements[i] = (idata[i] == 0) ? 0 : 1; + } + scan(n, scanCounts, nonZeroElements); + scatter(n, odata, idata, scanCounts); + int remainingCount = nonZeroElements[n - 1] + scanCounts[n - 1]; + free(nonZeroElements); + free(scanCounts); + return remainingCount; } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3d86b60..8514b0a 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,17 +3,54 @@ #include "common.h" #include "naive.h" +#define blockSize 128 + namespace StreamCompaction { namespace Naive { -// TODO: __global__ +__global__ void kernNaiveScan(int n, int round, int * odata, int * idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + odata[index] = ( + (index < (1 << (round - 1))) + ? 0 + : idata[index - (1 << (round - 1))] + ) + idata[index]; +} + +__global__ void kernInclusiveToExclusiveScan(int n, int * odata, int * idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + odata[index] = (index == 0 ) ? 0 : idata[index - 1]; +} /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + int * dev_data; + int * dev_data2; + cudaMalloc((void**)&dev_data, n * sizeof(int)); + cudaMalloc((void**)&dev_data2, n * sizeof(int)); + cudaMemcpy((void*)dev_data, (void*)idata, n * sizeof(int), cudaMemcpyHostToDevice); + + for (int i = 1; i <= ilog2ceil(n); i++) { + kernNaiveScan << > >(n, i, dev_data2, dev_data); + int * tempPtr = dev_data; + dev_data = dev_data2; + dev_data2 = tempPtr; + } + kernInclusiveToExclusiveScan << > >(n, dev_data2, dev_data); + + cudaMemcpy((void*)odata, (void*)dev_data2, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_data); + cudaFree(dev_data2); } } From b3d908319c208c726ba49488ef2a3616508a0629 Mon Sep 17 00:00:00 2001 From: ZHAN CHIN Date: Tue, 20 Sep 2016 21:55:49 -0400 Subject: [PATCH 2/4] Finish work efficient scan and compact --- src/main.cpp | 2 +- stream_compaction/common.cu | 18 ++++++-- stream_compaction/efficient.cu | 78 +++++++++++++++++++++++++++++++--- 3 files changed, 89 insertions(+), 9 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index d7164c6..cbe5f1d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -15,7 +15,7 @@ int main(int argc, char* argv[]) { const int SIZE = 1 << 16; - const int NPOT = SIZE - 3; + const int NPOT = SIZE - 17; int a[SIZE], b[SIZE], c[SIZE]; // Scan tests diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index fe872d4..2a95413 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,3 +1,5 @@ +#include +#include #include "common.h" void checkCUDAErrorFn(const char *msg, const char *file, int line) { @@ -23,7 +25,11 @@ namespace Common { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + bools[index] = (idata[index] == 0) ? 0 : 1; } /** @@ -32,8 +38,14 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + if (bools[index] != 0) { + odata[indices[index]] = idata[index]; + } } - } + } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index b2f739b..abed83f 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,17 +3,56 @@ #include "common.h" #include "efficient.h" +#define blockSize 128 + namespace StreamCompaction { namespace Efficient { -// TODO: __global__ +__global__ void kernScanUpsweep(int n, int d, int * data) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= (n >> d)) { + return; + } + int k = index << d; + data[k + (1 << d) - 1] += data[k + (1 << (d - 1)) - 1]; +} + +__global__ void kernScanDownsweep(int n, int d, int * data) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= (n >> d)) { + return; + } + int k = index << d; + int t = data[k + (1 << d) - 1]; + data[k + (1 << d) - 1] += data[k + (1 << (d - 1)) - 1]; + data[k + (1 << (d - 1)) - 1] = t; +} /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + int * dev_data; + int logCeil = ilog2ceil(n); + int nCeil = 1 << logCeil; + + cudaMalloc((void**)&dev_data, nCeil * sizeof(int)); + cudaMemset((void*)dev_data, 0, nCeil * sizeof(int)); + cudaMemcpy((void*)dev_data, (void*)idata, n * sizeof(int), cudaMemcpyHostToDevice); + for (int i = 1; i <= logCeil; i++) { + int gridSize = ((nCeil >> i) + blockSize - 1) / blockSize; + kernScanUpsweep << > >(nCeil, i, dev_data); + } + + cudaMemset((void*)&dev_data[nCeil - 1], 0, sizeof(int)); + + for (int i = logCeil; i >= 1; i--) { + int gridSize = ((nCeil >> i) + blockSize - 1) / blockSize; + kernScanDownsweep << > >(nCeil, i, dev_data); + } + + cudaMemcpy((void*)odata, (void*)dev_data, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_data); } /** @@ -26,8 +65,37 @@ void scan(int n, int *odata, const int *idata) { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { - // TODO - return -1; + int * dev_bools; + int * dev_idata; + int * dev_odata; + int * dev_indices; + cudaMalloc((void**)&dev_bools, n * sizeof(int)); + cudaMalloc((void**)&dev_indices, n * sizeof(int)); + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + + // Map to booleans + cudaMemcpy((void*)dev_idata, (void*)idata, n * sizeof(int), cudaMemcpyHostToDevice); + StreamCompaction::Common::kernMapToBoolean << > >(n, dev_bools, dev_idata); + int * temp = (int *)malloc(n * sizeof(int)); + cudaMemcpy((void*)temp, (void*)dev_bools, n * sizeof(int), cudaMemcpyDeviceToHost); + + // Do exclusive scan + scan(n, temp, temp); + int compactedCount = temp[n - 1] + ((idata[n - 1] == 0) ? 0 : 1); + + // Scatter + cudaMemcpy((void*)dev_indices, (void*)temp, n * sizeof(int), cudaMemcpyHostToDevice); + StreamCompaction::Common::kernScatter << > >(n, dev_odata, dev_idata, dev_bools, dev_indices); + cudaMemcpy((void*)odata, (void*)dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + + free(temp); + cudaFree(dev_bools); + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_indices); + + return compactedCount; } } From 6954d291327bc83c682cb816b60bad8d30f768b2 Mon Sep 17 00:00:00 2001 From: ZHAN CHIN Date: Mon, 26 Sep 2016 20:16:49 -0400 Subject: [PATCH 3/4] Finish readme and thrust --- README.md | 90 ++++++++++- images/log_nanoseconds.png | Bin 0 -> 13646 bytes images/milliseconds.png | Bin 0 -> 12121 bytes src/main.cpp | 263 ++++++++++++++++++++------------- stream_compaction/common.h | 2 + stream_compaction/efficient.cu | 16 ++ stream_compaction/naive.cu | 15 ++ stream_compaction/thrust.cu | 24 ++- 8 files changed, 296 insertions(+), 114 deletions(-) create mode 100644 images/log_nanoseconds.png create mode 100644 images/milliseconds.png diff --git a/README.md b/README.md index b71c458..768122d 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,91 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Name: Zhan Xiong Chin +* Tested on: Windows 7 Professional, Intel(R) Xeon(R) CPU E5-1630 v4 @ 3.70 GHz 3.70 GHz, GTX 1070 8192MB (SIG Lab) -### (TODO: Your README) +Overview +======== +This implements a GPU-based scan (i.e. computes the prefix sums of an array) and stream compaction (i.e. +moves nonzero elements to the front of array). There are versions for a CPU-based scan, a naive GPU-based scan +(uses O(nlogn) additions) and a work-efficient GPU-based scan (uses O(n) additions). -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +Build Instructions +================== +[See here](https://github.com/CIS565-Fall-2016/Project0-CUDA-Getting-Started/blob/master/INSTRUCTION.md) +Performance analysis +==================== +Milliseconds to calculate prefix sums for arrays of given sizes: +![](images/milliseconds.png) + +Log2 of nanoseconds for same data: +![](images/log_nanoseconds.png) + +The timings above did not include the time needed to copy arrays onto device. + +For small arrays, the naive algorithm appears to be the fastest, though this may be due to the inability to +effectively time CPU execution for times smaller than 1 millisecond. The CPU algorithm still beats out the +naive algorithm for most of the timings, suggesting that the extra logn additions needed is slowing down the execution +of the naive algorithm significantly. + +The efficient algorithm is approximately 3 times faster than the naive algorithm and twice as fast as the CPU algorithm. +This is in line with the O(n) operations needed by both of them; the GPU algorithm is making better use of its +multiple cores to achieve this speedup. + +Compared to the thrust-based implementation, all algorithms beat it for small arrays, but it is significantly faster +than all other algorithms for large arrays. Based on the large number of registers used and the small grid size, it +may be using the work-efficient algorithm, but with a larger base than 2 (e.g. ternary or quarternary tree). + +Test output +=========== +``` +S:\cis565\Project2-Stream-Compaction\build>Release\cis565_stream_compaction_test +.exe + +**************** +** SCAN TESTS ** +**************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 10 0 ] +==== cpu scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 205473618 20547362 +8 ] +==== cpu scan, non-power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 205473252 20547325 +5 ] + passed +==== naive scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 205473618 20547362 +8 ] + passed +==== naive scan, non-power-of-two ==== + passed +==== work-efficient scan, power-of-two ==== + passed +==== work-efficient scan, non-power-of-two ==== + passed +==== thrust scan, power-of-two ==== + passed +==== thrust scan, non-power-of-two ==== + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 3 ] + passed +==== cpu compact with scan ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +==== work-efficient compact, power-of-two ==== + passed +==== work-efficient compact, non-power-of-two ==== + passed + +S:\cis565\Project2-Stream-Compaction\build> +``` \ No newline at end of file diff --git a/images/log_nanoseconds.png b/images/log_nanoseconds.png new file mode 100644 index 0000000000000000000000000000000000000000..974e7a05e5fba95ae763b795ef5ed4b228c8b67c GIT binary patch literal 13646 zcmb`uWl$VV^gg--cXwye1SiO%!6jI5mtX;cyGw9)cL=V*g1dWg4{pKT@9^H=hkxCw zt93tYZOzVfPxt9_p6A$xD#}ZuA`u~hKp<3UDKRAw2x=Ml5Fxw+ewiz`um`@N9F!zQ zK$YVp2fzi4nXsHN2vie|{A36VTqD{_X*z&FXgzNqr~#W|V-V=$mb93#imTpnCZa9L zyxX2~OIXE%JXgu}oGJEvrAxd@Mz5;fu+C5tPO@ql(?HZ#qOP;_FXOak66WtFni0kj zEU^#@t1w#OuuaM}kTeDiLJSBBi$WNSg83aa8X0&L#kb1~I)iY9oAr^iJvcZxD+`(; zDy!>Ny-Kl4zvJZKVZ>|&8{z8V?fA8qvZ_D+`c_7BGv$M z(c=@d7mGadgxe({2NM~WJ6IP_IOd&j=rFWz5>qiSHczmvQ3*%w zV+f2C4aUfj-{+z!eMeAV{zpG-^x`(WzJ!fR+g;n0&~5*ApZDZzP2OzJ@(o~rY}VpU+RUi@ffl=XhjC4P zs28RPC!cH!SHl+R$-Xv`!6MowX#n>opq-Uvv7n#HO21R4C}HTZn&l8P)sicd6y9&( zsDEw`vw-TKFTyA#WQaa--Y#;@j*r0~nlI!_!dCSf#L=IhkWfaaoGHs|yRbmK2Hzn; zGQU)%ZXMR}&M324gl)JCpK)Fmx#|mL2Xt^YV-hASQc-dh#xfkH5}sZ{bAZ6iYPG^XROV+}}9&D)(FALfZjW*CtM8EL%K{BTV;Mb@fZwK$1w3f>Fnp{k;lkPEs zsub541S${y)w<+9G81^0uC3?a-un>(+Wsz)_zf_7ASXKQzO&do7I+&WNhaOe7Ly#VZ-hbIPBo zp6BmQ?C^*-iZVFL*9bsoc0Foe`qVoj9Pj)-VHkQ$w7Y(2>p+%RYv1T&{a+9MZv|@* z)~w@uaYix!@Zzw>83a5Sd@X2SHjrrb4X1hGOYvAL! zG{xE);q634GB*VM0{$@jU+#{<`+^CZfC6|kUQpu{J(27|BMRb*^JM1y9xdG5vv5l$ z+WYPaWgj0Yd)B5%)pzgDQ1qlI7)D;67M%P2PxthM>o_OIg}0wHa1#EEH&_QO@1>}Q z%33kM8d-15beX8;%_X`fZ+w9ni6ZSPCVx*Oq9`wuoGfXl`9e4j=GtAW;->^F6=6zG zpY^de$CtIt)Jk7ttj*^O%C_UrDo(?jPYx|cKatF|%;`3TRh8|0#v*pehR#kE3KWV} z+HS1Ut8lju3!gC;JN$TvFIQZ2(XI~|$F^c$wV0hDHZes)tOVY&ffEQE{9dQ=x{^B= zB^aLd;t)mx%}`_by~X7_1apJ!S;YU>VEli9-6#>GwIP`HRBVtbPSd=1Am5*&Hrope z8gtCzu3j)y1(})Bu@<=IL9~(3a2*ijf7%6-GRj=c7Ju8F= z6(0nu79$|mw8}}TiY4cFRRpNd0Pa`bnGpp_gzSr#u;vH+3OwK;{{6^NUbT8+4pi}( zFb;ry#3gOE2L}faysh7<)h={)rvVQo+&!6~1Mg*r{(nR`AOwZ}Lm3bB_Wh+B#|ghZ zdS<0DR6tS)qEcOz`*azn&!q74a~v9-o z60}miD;AUuP#}YyGnUP@A1r9=`~r*1Y^D>Zi7|3U;MGlvivhdQ~?&oPBe6-j3Ye2 z8NtV68%4$W)qR_9D00FpoUDDl&n1?m&5)z2^{=<0V6V4+W_aW)smH-D*`=+H9qKkW zHzz5wV5hBs2L>bKvWX&j)6Cs$#>${oWml_yyb1Ms6GU9qoB==dPg@UIFefSq^x5AB z^YmqHvi{jyY*bcOzRhip(rMIBsn;6YNJkP5SG%bw>DqW2A$*waY5|2qgKPhtMM2R+GmudB*45yiQH3dLf@&E=v6Oy@jBT5p3&1^ zKx7vY>Ie8MA3A$NTwT9k=sCQWY~C~~U&&#!w|aNqOmC?*-7g_7S2XL{wf^_y$bjp# zmp)aY*phf5p5`7q8$@*);#k>gdDItFKVYg_7UVQgv0O{UvgR{+W{`qb^6j6;Wx3I+ z9r0|GsK@{ephz0_#oeE0^-7-(8D+aVLbO-Gi;3g9i~VvvBzep;G;1ey4X6=vvY|bN z2mAZKJ@4&hWMoWDY5T-Wo8H#?xvyJnh2x|5^pppEm6Pb#dii#C#rj)UL*iLacuBfm zLkCJ&I$GMStc`z*wM0 ztjXC$eLaO|lv{-bk3EZ0KtT5&2OHqI8Q7)Ra-r^!T+wX$)$OxMhML*SCpHv1(AVdA z0DE>CuClerxw1+{J#c9`Y7XvY^R{bQ#S5q5P&EMQfG0!cjiI+V< zbqwe?qt>P@)#kG-$5i%a@703_RVvIE{^Z#ZaT!}!l(x0C)zy7uVv<|9y*jwNbHz;G z=nh2Q1YsK)7@raz?R*j(?MDO|?9yzuZ-k~tt%2`f(t^zKO>(}srv*7hL`2NY&*S6a z(bCW$9=xXpWC3xI_?{ss=SD1qw0G zD-*y}i>XcyiSyl0+DUxmdQ_C<=?{L-uaL`5vvLRuu1!uVs;DHy#Zk?aeM(#>=CCH4pk2)Jq%hFX z&|K-q57$TSOEiEAYH4ZF()wFiqcQEyTjVY+;%kV)_+tFr_12sYMCZF9s>9@4_bGoq zn>b{T)@^aS?KD|lmhNZ5!qJPYm&k6+8n4G)tqH$`lXP=)v$j6DzuXIS-xHxKY)g>b zMq|KpK*bERHG;kE`YQ%(la+IyjDc?*zfn*3eZmhlu>8V;R*G{;xw(g0epTT{{Xkve zhQp37XQn82d>5At#@5!>*chNo*R#yRoF?oOfB)w2P^?T5FM9O5i&T0NRr><~j{6jT z*ENKC%S=b!s@3+AFj2VCs7z86wtUKYiykc(77qtcn8cqRAD@rrm=u#49v)6iOguEd z>8tv~j+79DQF({S_Op(}^Z9Ub`jpx$FTM)JhWeXaPavnjr3H~`hWy_1<8AG)UjgoW zaj~&>d*hjvmCUgos_($i7+Ddsi8o}PQ}xFOSJ^Njn(FTwRhzlZ@83EPU>4cB*QdAPYVqVZD?qSi;J6?QOVT{3 zAS&+Sy-%v>HjwlJ^mS8jQE@cKit;i$U)M% zuUyDg*=!@^t`xe7$xYRI@6%sR<~FvBeaX0(m{0zd<>mj5Q6p9P)$j-iePaeDe9;Ik z)?U|31N9?ottKo+fAOU2URo2GJ}F#@h5`guUtgafM*{K(Nx)I4oO2u~+Yu$YNdnr1jzIYoLaDaD@CatmA>`#2NV-i= zzCB)Re6byBh?ZZ@>kwtQx$USoQusHapbiTe`N>^omY0nS54`8&WYTm;>#}WN)J<{K z`ohgaFgcm`j&0NTGipUNmx(7`=AmEi9-R@1vH$K~h4;nd$4`J~+3rD}n zxI5TrQEIm-;!*n0%;zqN-^%5ECwsLuew|Qtj^nkae1pqGhD1QH=z)sq&&3u-z$MENTjjdi3%DCrcb`7F#6-h}s z50y@ijymn0saI*KsnT{bh|8hgK^FW>9s;+oUGcBQYuy&Vug$1BLJ_}jhlC(J>?KeE zgoBPAy}Qr%(wrOe!R6_0rgHtMBica)vx?7JAZA%5W=l(5G+6eDZj z%zk03`LM)m+Tm#8dUq%0mOK-^Y=goZy+rNB(bZ1ukx_3DOnQ0w1GY58$Gx2Wp_TJzr*%zOJWTYh@9uvTMy4pqxwE;sIXo<5 z_bEita_s1LZGFdf>QL*%=#m8ZBx<8)iz%!Qk7Ynp`c|kA?2;C ztal1t1#_WepS!dMJ$fB()3XXmvxL)msEq+=v_-`P+*b;z{`3>ep2G+U2cC7Gr}#EZ z_*-&SNeA1I@#2X}#~aiM?cNnTCNNmE*u^?BMOsb{w@Eax##&*3A^Mm`VT1e-cXVTy z+@n-)dWnkmp^q{*`VA~uZD{H044qQ>P$V7rL%c6ux*|OU1$US>UB>54H8U$tUVXbVU#Pg2LG!1HZ~vttEGH#2;h$aS;!Pgs10G*0^y~?t^AAW>(FOj zWMz$ozr4vKCN&!D92@}0b%WDIzxUS<8Afw_Chq-mA>QT#!SNhdz86(k#+;C<&_;$g#qs2StwUpKK0amHShe|-h1U6Z^udvGee|HBAtOV9PIYO zc3Byj;gONDsw(uKG?2pn`UCI5+QyE8TNe0V9FS&{Dz1GrSwKEHX5EY1X0g<_vB?z& z1A;Oiv^_>kvCrSZ&Bq#&q76h6b9>6MVX`ah%SbHOa|I8RTO6bF!aC63=gnE1Q-Cwv zE12*AEA$UNan}#0{zBTs?d)YTcM#38MtAHU?yByrY$&I2cEk-&GK}f2EPI7#+0mU? zHT9SbjlTJ9WtXrzlP=$jOrWzTkRTce zV^yOeVm`KO*GSO&<;tziAF}df6z!E+bQzI^4FeT{qQ>AY;qbeGe=+&V+Un$fL1*AW z**P$d8=*eVH(!XNjwSIU{LaZ_28G67jDs2qC(d z))k8fB$AjqMbb~Tc5~HvU)-FoSrndzBFG zBY0{ajru4|@;Pj65LVAJL@}VKHU~i=0mXNSuv38H4)5-Bf zOn&-@iwT^@_E^OV0F&KNE0G&>Nklo9D55+j?*7%QJ(wu0R@DjOmYHG<#C=_{U}G|V zUa{FWAs|d4SE~N-5URN+A+WpjRKGZu`e}B%Yn1djdqtl4XeC7@|9>Vd%O&9t$=}5n zEFxpR%NV%Ot?)T7CF!jx80{HkV`Ezj1*Jsl{TRp6k3G*?T_gJaO7s(39c$ObyI0tXO-ox$B{95MSYRakgnXiWI2}YfH*9p~Cmy0Hm z?-T@9$jTUqQ?LM43J3BLHHTDy@tXG>{>dAfAw8kgU!C{1ud_d{Ssa^}U$CAf{KyjU zq!RyqeRo+9;<1E8=(yM_#n!ToRSWW0@Nsu5VUo?Yjv%F|WmYH~Q`u zLY(PJXYfmYomZWM1~i!A)HL5e>ZJdWbchK#KK_A7LmC7=(u=Hhd)1hqnzFmU*hxsh zVQ1_U^`Mu0C9QyT-^5Lr#PBF`-p8BOA253$^&&*U6He>B)Xf#5kOm{LvTHyASR**M zyTADQq&VJHqn7^Dgrc%|)wU85tETG2l3Krh-0jXN9&}yR}b1n-$4!xO0b*Wy>%2S$5jDE3QaXtBvh~QcA zid@G(S24Ab?^{)lud@iU-c4{C+K^v?H_y-QO3#$))r(!23yFu>I4e0NKc3FPjo;fu z(Lc1#kxbQz9+dQoWyP@<4kH06%~~$0;hs5x^LDH;%hi{=oHCG7Gt=>k+>12q}dF7 z2EkfKNNsJc71#3LzXqQ_e|C50-D$Gj=(ay{wOE-}FXfa}c~R6hkvG%hoYWf@g?NNd zDHE`|gNZ9Cv9PhhE_PHPB93tCd*U&}iBsE~c#By>C-#^$27$jy(!yaH+j`!`K*TR8 zW^t|aI)<#zx3iTXv?y|<^$h!^XjSzMFYu)?>`__+9YY@qN$u!6e^OCs!LSoc9` zc~#eGj>L`?#KE{rMD?v+BNG588LVg2vKBkzx|XRw?0gURa5$1qi^ z%)y(jqkP0jf<=)fWzm1qz}1s*8r1KIA*MuOf0Ns0I|FKmCa9$O(~M?jPk`Y%UpzXU z;$*2ehYTwqdFH`$ByWW7Hmi6c4qTg5&5=NLV&!oEx&;!G3!U$%Hb%r;+9t%~OeGvp znwpU;-EVX(xp2<=3(^AO&|dQ)2(=-u<=Vs{I*)~1^pMvtZEhxm9%7Fh<2AxZ$`OfY zU4Py(ie)f0QgNqKBw$?p2OV_#3%`GGrcLP%2T4cW9G>rKQ(H_H^5J z_>yCC1v#Jq<&3NsrG};-gKI(>1q5sbEoRcDH~!5t#T5h@NH}1OFuiTwrTVD!CS1o$ z#kau#%`N&{n)CNbTC{#F$zuNR#!>L*o28RdNGsZW52UmAm*$=yod~b89QAKFu1BxYR>dORyrPXM7p_ zVtbJOkB$Rti`1VVhJX++O5dTBf$=nZjaId$gzvvy$c=4XYj! z+G2|ejqpN#?>m0#&v{3c=~X>bsn>QXpQSFgX?#TsKx4J%`P+eALoha_Igk%Q5pR7- zcvjRa+n|!>-KE>yaIXl>*-DsG`_x?Lsebf?;mP-d=z?r2!Rt0Xt^EkhnaEdLmsYi5 zjQKVhEvr4kmW|9^>ffbvs|&`(Ia>`~F;CS_@i+n@DnS=hpP^YVCKo}Hiq*>NvwFd* z=Wl92oT~H##gT$ko-S9J^K{O0Nj;2wz4JYk1RCeIY%kkuaeNit|Kc_1dTcr36;j|! zW9?l<;+!`X7$uKePe|XJNFQ1zMVMJ~bWGXpRKYpC8Xxv|%zxla=@bYo8J5L0D-~3x zNh|%6l7m(5?$2)DI~sA#pFzYZls6Jkl^W8>zvy$L4v@c~nwI}@1dbfQ*(Jgg0glyi zjGy%!zbh@q1_+#?e9-}0bT;jG_b-U>n;99JvCZa!GVPY=L1e9$?=_z?nib!+9DkiC z`lr3Mby)c%w|*^T@%w@%?Dg)94j{MLigG`pKjmZFlZoX{V2AIMYQO09u?>Qxg>tvU zwOkO6F16-ElG?PmLtpr`k`Y0j{6K^bBz9*9NuWN<_`1!Tno{94ne7u2t!KG@rM|r_ z_`u)|+1guD{AmC4nC94w?N?ED-=0@+N%w29<|Dw{QJvf8;~ZM^26Y^u>SCB#3rs>n z$m%ygq>Df7Q(xFTw|=}VkpK31=JslN5eX1@HTv7>lh5k-a9%?^A+PD3TMvpcJU$Yu z_DojcQ@V+4F0<@+@KHDW;}&?nBI!}}EWMoA;bbhR>+wuyh1)@B-cgJi+~bM_N(#)0 zkYz4RvlkPCDbb`(#cQ0x;Rr_~3@Z&Ld620`&+E*0g#dipI+T0JnL7?dL-i_e{(!1F z1LJRnZhz4=@K&I{?fRWe&rU~G>s{NjdDRNft?BOKI(0V0-kl+c2O8uL6itl2ew7*^ zp#zWnW>K?6jWr!98ERd8tyS3dsoYuX(z4XccU)O~$oYp) zvhtJVdhG=&xGO!{j)3{$LZd6&Uc;cn9~U*=E!!^frLOEL8PM=L9MYyNw{HFqj-YB8 z*9A^7mBDWN8+RuGOb(?N)y!xS8d=>}m-u8MPPd(@p^R5n)vNg|zV*q&sg=)uqQ~jA ziPoCfr!vg%KQ9xJOBx#K%z2pE^Ogx_DHRH&55GQ#-t_j7Yyw3NnRmoZ&CO}Vv52*| z-r5TW)%S4LRym3fJBw-v%b!IuXOGNg7k?$sLT1m-7b^C<3d|mFo>V7pSek(2Bgsgw zqa5cS0TOAw9E1jNh>4*?C0l`3zs#1khoXSmK~z-K(~SO&-Xlvqw|C>~gAcRps(}VL zCu@<*gNKVZEjKTb;vPxAb?n z8(R>b`}7%ef}PTSB6y=_C{sO6s#gf2b^kCGwqCeaQ8%x_nB=fGj(9MRK~B!{6@v{_ zNY@(iMA4&s_gKrTm&;(<+fCx(Q=rTWZ&lLVjZJMC%f}NG9God{W@H3rxZbV!^FuoM zHd}^|ESp;A2tS0tSCO(cLbiMchhzG(>rp4g=yJ=8&PNb%#eS{pL%#y~A{!eUZSC}) zjo&`6-gb89^_KI0rl+y7umW_TbLT=*eLH&Jfks6a2;JN^y2gME-UG=>#>L70^N4c3 z!(kDSy4Tg!MMXsgLaLuXe{x%|3<7yUX=zNXOmx5%BLu7l0->Ry!3+OcULK*;+kVid zQh}F^1Y#9ny%0$ua*S5dnS|rE@zw!y2P$%8QsHFP_>dMpZ7r?R($bTaCS{<$keMmR zsYW^CCIVkrV0OZjN=WK*ow?AepGDtfZbe4Qsj(X+7+X3E1lah@I=)d*2uudsZaJvd z?QUA6D^CW~>goB}m5VoRk9yF$@~frqVPl<9=y`GVT)K=xdba8YY;fXoxU`AO9{Gt+`2m{ z3vR&q=TUprb3ONrDR(8UUeOlM!i;Lp<>z`|skU$jt z8l!|qvuen*WQ5>%$+Y+{f_b57l6V1JfpDHV%dmFPf!c_k*{1Cd@9eC0|V)S^>w2}f_ z;4yppV4>}D`D-=kCt~^3ECc)!||7hvyx5vCts^&mwqnSrjTdd;hp#%#zP)s9Sd;;pp$0O!R zyiu|hqQGw;XfVqMuO93v!fT}gpmuJ^%nkB|kr1f zXSKhF%NmaXD>87ZO|c9)8y&=I%0h?%`VRVV$8PLBJ}mmZEeRImABzW!7842~S^A3G zPw7J*-h8sQ6%Ys+TEd%Nsy(*ENol~Nk~3N^M;jJo;0L79K1r5&7z5S$89Nc_;02)X z1^JzX9tGeP&nt}H*pF0-0aEGg0VEud9;N3L+WIR<5U?K***gNawE{IqKy+fg>a$Mh zp}^M8Y&!=p4ZhkRN_m&foC6G4+0Ir2w6u^B5kNG8251}2IAutIz!*JBKEw;?cp>N< z6rSSv_%eR4Rk8)t_EY!~K-K%5gMQ}Dk*%?;%OgzZ&P8u)3P*|{0ev3vZ5Gq3TxD_w znqY9Ld2;S`SUhVqz_WH+6FueKtxIdrx{g!Z;Qe;z9w?Bb)*7vWrwaFYGA;wL25~= z7TcBWz=ToT`N=@#Y)3OvqVk9=ETk%qSH5EZKM9CyI!Q2F?gQ4l>Uj00k=1|7nZai2 zynuPkD=Yv>s<)-2_^KLj1M#UdTOexaY=!lkVyA?w8gAEwmI%z7dNhd_&gBBH=nM$x zh?Q;!v>)(C-ps{79taMMjj7RdcT0{$L)30V05GX6CVdlfAzF#sfKGMlnVn{7V0#D$Tn)%cR&C8~5iQj@3M*Ss~S?F0Y`h?P&6 zP*aPCmbQNs6~ACh{j0dX5fTgO>`Pqc)SIzl1ti9nI5RqpZWIlCe;a^+i`Oyx6?PuO zO;-~tpy#usAm$^W2Z0)+oMJ$yu5>RB+bfF)Ak|qyHzTFU1z_)mbk(t;At50L<2I)H zp9+A?Up;p1@gU9sx;w}h14xYoV^HsKPw>u>HhwuK;00Q%@HR|=7Geb5{|Al42-4u9 zpFht-aLHr`d;9y9!WL43luOIX`fjf1&knZqa&)N}jAL)}#y?_VVq(6jV_`1NH3K0T zZr&M%k_5FIKTuq*V+dw<+hgN@?}%8)=(-nc)Q(W^8$jQc0DH>MF7D;{L0*bT)2n9t z@+VJQh*JHPiDCxU4zeV}AveyS7`r=p%}*Ip(IPG*b>Ub$TXA*`O2zo%4_xD7KohWVi(ZyPPj5c3 zlH*ZCXHe>sBr-dAMZY8T!#hTTc7Nn=h{aw^&p+(*gD50Qe+Fdl3q^NUo7f@Xi1#mIpbB7QuQ5a>28}h| zf58%}N*g@yMt<6FqQ|;gwauCg941&tHo6l$Aj4S|G}&?`;D_=*@wW-aEO9s=#~%HX zEAroQrcqVkG{Y$4psRZh-LrXLsfQss3|e+IQtH95@j`0B7&u))kc&9R(uGLHYad}H=z0#bLWo3}o4I+w0;C60@ z7O|t+zp-qfxQ~9^wn)dDK8}Rr)!!mMdMp_USV`6mdggq~?q~2!g#&UttJSOg_4D^J3{rdIKN1T$f zGHkx5Jd0%V7I$TK<_`9-;&uPDO5Skg{R~EysKHt$TGI2ZrwLai!4D!H8B*x)n%`rl zzc9PTGsl4w^1PK{1{h*56a%t;T%B&VXOzJcxRI?mUIA^2ZLS6OHQf)&H;m!&cGifE6FVk*G>zPFY_K@WVs#{ zo^tDVxQqrmN6hR*ruGEJTKZCpa*uG>z<7wdf0=c01z}gD6&i zteF2#n;Em9&g93mnjE-tR4sZHW! zeTn*rU5WYw(#%u)pHOjDc2)CB@>y-CF3M9(h2!+i(dptU5?!G#yJo3o70$quz|*J> z{p+Kx^b79@3Ic=ezJfBFe>)41VW8!8Zi?@{Gra%W_n>u;FxzG{_f<$b;~M%zLCY-R zGv{m*(C{nEliq9HVQaU;qZ_A!f-BW7FQ1E0gq@J5$B%0>F0H6jAV>lP^}Ht2ruzEX z3&+H25dlgnHp9P5)nztDulf$^Mh;-C4xOjq?E-F?=X5=Sjs)&IxFuU)v#(KRL)!fQ z9vK&2A59O!taIuLohCBCtfSdnSou@<#p0*xw0wRkm!xb_vZO^rlo(bX>Sm8v`SXHI z{)dMI?L{-1f>d2qHI#HiZkeruNhDyxUF#;?5d*1nFdTh00pNN6D$;bAQU>&7Y9)kU z&fmR@3uUQj{YyxL(N&bK7s~>3EC)KN|G)ns2%c=>6`skXXW8jM4GRc5K+@v!VwEBW G{{I`Bs{kwj literal 0 HcmV?d00001 diff --git a/images/milliseconds.png b/images/milliseconds.png new file mode 100644 index 0000000000000000000000000000000000000000..26af0aa33e544bb4bb2bd85201c9876ebc9d9230 GIT binary patch literal 12121 zcmbt)by$>Nx3+~6A|)WLC?yS2Qc8$`bPfz1qri|N4HD8JF$_I~bPqYCfOJU=9ZC#2 zbm#Z*yWaD?-}U`*&biL{W3FL8ti7MT*IM^l_gb?}J#jr!(qcKqscr21(9lq0jXtMyp>zHDdA|NAY9+1dct*|#Mf<};Qb;x~VQOkh z!fADFO)V^hR0#sf1$XpWFftOEaFLO?5jZ6Vcj+he%E-*Mf|(ijC@_2^(KS5qkl==I zs1GtroHwF&|G#LA9_r$cKT!!7rQWl)ww3^MG&H2|$P#kDf*FZaSdGGZN`!WoU>T3YJgFIy2-5;;qJ@!NKe?>E8mHiXU!s%(mf=&yWE zlX1i-4#)!ot^IE6ff3*ABsjNPu57=>&tpBGs@OL%OUMm@P=%S##`zoHxDYU8V(%Q> z3YB7RgyLCJW*l`64g;+ufy)hr9ZD8_47zX2Cp9<3K7;oMpbuaq>u1JX(o#!EIrk?I z_Uj&zjHF#|Sjm^L+o)YP!d7FR!Iv4!Qc1yN?$J#WeTpIKaK&Jc4#%+`eaw+^e8SDxAwiHKpC zG6r;ic35nW*pl!Lc8E$~@y`^M=e-{f9Y6eLNQ8ArS-sMtd4VlgHlSDyFPfRDe3K0p z!6+v}?rCBVcHb&C(pL-Me)rahjM~y+VVQPgJHrig0C~&BcUk<7 zO3@kQLyLeBsA(5YClubX!TAv$Ul%UuzENU*3)Boo)=6TD!1GTVV?eX&cD;{tYLH!G z5+yV^9IYH(MJUZ4^(gAbXVcWGV_{2QduucSm&7E|i zWw5iD6Xn?$C0QhfGN9iw}bJ%>MLY1;~pTa9@Ehf-EHeO#j5 zhz%OCOfx>^n2?A>g30zA3`#{xZ#9xPe##&3>&L4c=d{Q_WowjTQ}hY2X`M+K4I<5{ zsa`Zga<)?2(6kmE;QOp+Ss{xj#rQBGw@!zX&htb>?f9Cp)$#7yUu33m5AX%l<7MpA zOqE>F)$~rle}dA0PgJE z44ug|TOzqTHiO!JBxik|5=Td1;ZatspZgqEs53(TT_?o*Kr!OY*Zn=^TYH z|G;?GEar(G<@nd*M0uJKeg~0R>Ae`XSY#{*`o{_;|DK5IFcn*UnheV%H)K&n^f|+* z=$+x8@05d3z04KUclr;a2B(^cSbJqZ#T+@Bpah9AV@jBuZ*JL1sU#n{lX;TyR=%)KIVHb(w!* zAQ1I3X>AcRqft>)0=b8P5sFA|%usaU_cwK0p+ER3rh)HooIh^(b0A-pYUZ`oa#57x zdg=9M@{kW**s9P+(7NPK=(|772#>D(%q6Si%uv>~8D%529mlLwFh{`o!LEPNOcdAH zR`yX*9FAqGoctGj91#B)+o2$UgEf>BG+l?)T${mgTH9Pa@7Ea=HqgfSoA8x3zikGw zn5(wVVNdqli=YAL=m==%?TWg)0~*Bd@;d&CW(ft|Nm^6*j*P&bSPx`{g7)8=j){d2 z8!Cr_C@>~2X@0660R$a0nNYzDCM~XB0rNDeX_1akBjbou&@@vd()`|f_EU?|-u4%i z@osCz{g}=}?ccXi+CNy_q16HtU+(R&&FaSYD9a1IkqDC=W@6j0%;9Mih>9wvni7Us z82y}~czO6v?F;m-ZnI$k+BEv$Z8gD1Zbc9)Wyvk8U3(5lyJ=3wJy2Y<_4=Y6L+At2cKm`yb$DGl=gQBejrM!WJ4Y|pH)bq@2|;LMJmCC>_7c1H_$A+=l}dIZ=0$a3sxJ81p~#MLk=M^5;MrC?=s6jp@l)#s|`Ew(1Xz z1R+}j30`HK!*qfhL-f}CQ9n|szbhmPM5RqiSLh)1oVJ(By1pD*J-f&y)iYh;YbR_n zz>GglE1S)ps_c#zC6;){me#kae5cPMw2KsjlxuQythdVHi45V)|CH~##J#O+!lfY- z&jY`$f7~jm#*|cLj_()1?Ej?ufQa(HMMC~7zM|L#t`+kwHCN$4t6%mM5uXy7&X!>uoK-xH3=7{--=F?q^5R0FjMXo?V;4 zrQPN8o&v!ZUU&$r+Oy#Ev6-Rzb-~25F)x>*Y8g&NbAZPwXx_Dx{2>j>x$2ix?Ilh0 z;3{NsRhDein2X?4jTh`am9SydQ}?3TLM}VnO<}D-j85GaFG^*Kz1IZriNSfRvYuQL zB%I{)G?JCe$Zd49PA@5_*5Toc(q9fd@sMRcr0zU(R-fD=nOcfcWEA7Ke$_4gWOMGtk8^bl`(Q#ll6fx1L> z)r`5whDfQ4xrQ?~xhG4kVr6T=_h+6K4!_C={S(h8 zpjI^(cuQ05$!$1z?6~V}OIJFA19;Sbef&K0BpLUijg}<+%Jk>Kr7{&X3ke+j?&Q8) z45zw3hxUF4wjyceyexz@GLS$}p7l6qR$|Uk9?c9F>EI0xZg`Fgf(Za#$rOlZ%k~xssD`^KiPu0E$g7NbSPjo&zN4PSc3`oFpOg|H!O_Vi5sMo{NoX>!Ak3#UPftZfNL@6to9WU33kw4cw1f z$qF~KpJB@Oe%43jjFurWX}#-sX&wAxaM3w$Y|%5u&>k6M9;d_rzF{h=;&qL1MJ`Bf zBhmEHV^0VniCUa3ITWVa5$D&|qjzI!?mbGJ7o~2=;mIE%5pf~G6qZHhMMBswc*~=- z+GDK|W~-D6iQVjWAm=dvMjvl8+a8?=&T(d>Aa52px0vBje8mqB`AFiR>sD=#dRS}S zfp~8+o%gNcU609**M!VcfdHg2fG-CV%TO@2M0$Q~lKUXzsY_3ZfQ$?QkF_q!IJQ6u z%7xnWWDv}ATVz=o6`K!Ik|7YV)^*#EW|Jr?>gSDoD^uxe83-Gn7x`UMCZMh>R-0c|`+gf59BkKDOp=LvsvC}v&tVf5PcJw~NgZ1-;7Um|~<&V6}K+C!@ydGAJIyuXP2pAqrR zDEfNS7SG}X@Xzo6fm8nH&!zh%PtqH9iQ@MGRXwH%f>Cxb!J`{UtIovLS>M=a_<_mhb1g>dE|CiMJ`2^?SDX#8$$ zR@-IiEPKXG^Rjmzpmm=cGx;W(ETiWa${gcvWQu{~EsGSyQ-lI0RpSkUIgA%svEc)P zdcuHn?0b>+088|mEGOKAmR3TnVytR)AWYO+i&HiTm_qOhIDP;h^CR}oe=NXg-lEEka%#GBsi zBbqM==CwADI3Oy(d8~1>=~2}ycf94f%IVJO0!_~Lq#Nt9(+p}GHhH2(6H3|2(Zp(` zG=Sqs0eHTusMq4;yaNXhbaj2tlYKj)qG&f~7?etkF)Gm0Gv`8a zWZ1lex)bA{F;9V{W#lc#%JyP+K|3EZej>;E(w68?GM<%X1cVLRJb)Q7m? zK!W-2sz-Ix-qNw|GY_{Ve;cVD98ufNEoXX-`D2^!Oc=-F<_2As%BlL zqjF2NH=>2NO8DD1+D@JdK=}X0K82^4GQK-{=#KHVz$*(;bVU%4I4bMN)fu4|zsJ~b zL4~{(V#XhKvx$4AkJyPC&OcrR!90RG3`N|PSooOxGSxYAaGgNCy}hw&VK_^CK-j8W zTDP{Yn{(HdLV3)l62b=xaCVXB=KYIQ{F3*N%`LTTKNai6kDBgC*w_QjD^C8aCE@#O zjCER-T2fRRDM2v1hmpmiqnTt>Bbdcv`P0pe)&6sDDwj^yn>>gI`H#fv`iMOjQxTN; zH>MZYh)RJXk9Yn%C-hDKO$rDxUa*X^Tbw2B*`?LC#nQ-&@@!!9rWK}wB*}4VHjRbcQu4aJkhBrnK1u5Pl~jh@ zU(a8g`2n_HvLs}uv{A1Lv*h4O)EcvxW0p)FN$~sEti*n5?YXnxrbO&I0NN);(#b$+ z;&JLxtKoPUJ~h5%Ds4lbm4H$MG}oQ$2^mbHa%xvX+Pq74c4sNYh$n!`nD}P%dLY*i ztut!Xb<*Ri%c2+WNpJACd+df9@Ko7KI1E-E&9m73_HM7cJ$Qb4AwI?MFQyG?W0cZ>UBYIa_wM`}u8Hm2VzM5Jz z{+~}6&iyoTj}pVi0zDSKV{i%LQo=@e%k{jfA}~0<=id}0Yh-v)2q7N8w%Y1h?X1pU1T@!I#yJF;2T`W#TyVK&V%=7?xaB zylx6`{16E1yc0XQJPsYCFTm1jhb^|R?8D&jWJ!l-kqnZNxM?XLrsi25ibRrxTflnX z8}18Ixs3diq%LOr^qytV#+obxf#-Yc>*(kKkcGjn=MnK8%5sl|UwGtgBgDUhKLu1- z!5aXKrZ()wYu%7e)`1I%iz2I8sH3i1H$wasAD_CGR+Z1SH|*EQ-u`}JVWFIy+0h~s zJ!LZ(15gyxt)9s?ac)m(DU{brsV3HsmaEXqYUKmt^+997Jo(t@uMGnt$f>^3qa@q>s0;aksaG`zP7B-WIsg;F*yGz@pnHfw;XvSnwrnK!7kIYSmM$0%LDs zM@v7nc6`qM^5xOiY+;Nmwe5EY7ZHgj^JeK)oPQBqszvO$RoZqNvmQQ>c)QxaUGTN1 zOO~Cw76Orf<5AVGzmw9lv$MG>i0aD%ddsk|FwxhqHM2%3_Pl|bfc-Z}bbqAhd{`6f z`FQ-aPwuk>5cJHk3s))9Pl-R;*ob+alqoEH5)l!hC- zt%HO?`ZL)vQ*n=v_V%ihoBf|pcK%{8mXAO(!2r^pPzV;y3@_G*XF8p)MYg-jK;+~) z^EGlcAFA{Y3`jU(KG<;^oe4<-2>ZpuU(|4>AN`PhQP6Gs8x0V1(*{aPw}N-0!~@(W z^ers1=NX#k|0**B!uM`U5ivP*AVm;_=+v`me`RocvO<(Wv0v z>+MGS=i1Gh;((JI7^K4Cxqzcssjr*ZIowRE%<%d|+A~(>2q`XbenL-ZBRQKvMx`<3 zo1+W7&ZVW6@{0tbDK8KAcz_le46xQ{mk6-f(~NMozOudcLVw_l0Lk#m%Vas5HV4jV zVv-U1_mjubru=eHWBYBV#ICG6<1}Yu-uhA^qh)*Pk~`Ck=M_P9_cQ6#cg`U8x&W;%5D?MPF)LGQ4*~+$;~2-FDMdvaV@-(Y5$&I z2D6d^M6*IWZz>&Na6tBhO3c;$fe_8sI^3NHj|Vk6J3HsUt9* zJEDspbSZ1qIR3B_#%FI>N4ltebZz8=|npC%qq|7UN{am7vyjK-?FHREeQG`+9 z6O^lQ?M_7UoP4QsakZIek zbG(i&EFtoc{&4O|a839|&k#*5$E>apXxo< zvITjYZcH85*cu#aUk%Q&jXWn$5Ns%~x9mSIY2HzqVY8b`*>oE8vWySq{LfA26H#d{|7#=T^g3$ox?7|Vx_>HZoE zVq^%by=oPb)OXqrA(ZT$J7}dTZNA)?r!R|;70#8b)&Xc{?d}d5%7i`V{(`VwEydC{ z@viQ%4gl2w;Ii@_A}XrzOMah=<9aJr1Jd;Rup0qd{_zu> zv-Eh?mcEd47cnay*X#{6x&XFZokz%zE^b;mu=B1vUiO(iTi@82JNc7eoL$`~kjvsS zD$%^%!!B29k+vqLIpl8)U2j7BuuuJZ2>9N@!ot9SGCG*}SGG~w>q8NN)GPn|G=D2K zMl&vr*5ukrvk2L^mAvxc{$p7Q3I_q~0KvwVWTk_w<(VQkbdJQ5!}PSIuJW!DP_tOV zoXNEh7np0ZJ0-=a{X5BA{vWQIo|TnVUQU7oB-tesJ89{CZ@zb}G@nxUgj5dUdL%Q6 zyy$Xhfe5r<4V6D7ms8;V%K7XVA_dCXlhkJT08mDS7eLbl@TDwrIiQKxc>31~(EUMH z=t|0K!;{`#Yj1ngwn3y|vG?WSy!i37v`T~197R(Ckn@b3%1(=8p&|&dBxs|4#=T-z zi1csiH0Ld;$+PIyzExf^VOM0J^b3E-beoVD%bAo7YJq8 zWleqz_Y}WSO#aEf75feEH54FGH6zJ&(KREPMx<=(%IwtRZV#R6$Suw*z7S=%>b?FIsuz&M`OZut^_-zV=Vmu) zJPs8=k<~{X#yat;b_3p6digWH!Xzi2^19A*R+4_=r0?heZZy?k);`_IFsIFSdhTl3 z-@GK)^T+NgcNRkPfVza+d`E|rf-NKfDB)#W$I$3!Cfr6%9C$bMHDFVY!7C09aN%Tj zyn16@EUDxswAD@)tC~$-HG5v1*I3;hIu(e}SJQWhBClGx>I&7<=jizIdSA<$jMBD7 zQ0MY%)fxt-E%X~vlBzvl(#YpL^~8wl(m$U@#ESB!LRsi{H&?OeFg=+_o~`#mAD&5m zE)`_Rqp*!B4{jywtpJScykGCmVG493VCTgx^QjCj^d*~@KbV&OZ13#wkXF~!P~uIG zjRgikZER#(r9dDm(rSP%;OVQXehqNGdMeY`JzKu^)Aw#cX*qfACs}O2qISx&24bj= zkS>+ZaRAH(zFmzZdUmqQFz{=}^aG^X61qI>Qq+&W5PnkiNrTh6_zHaebA9@cTOneR zv!+9Yrzu1SgNCX03U}K{IE@upN~(IJo`uBem0u>a&;=qWz43Q$U=*q`Ht@f zp)`!8F0#=DdR$Owck#msYc!cDIC_&73Uqu*PlTepYdo@ zz7+Af55Fd|)!f%t5d$dA;k&FKCr%av(*7Y7gg z@b2F2p(2K!l;HI+qh0bVcywmz0p-=Ro`QHr|EIQY`vX^5*HjPe(|0ylbsFlCRx49U zoRZd{YjOS2z@)n2t4!O=d3e)>%DJHruUYSgzITns{(zs5lOo@vv~DieZc)99q4HQ9 zIs7OJ3WbSDZ+DyWXCvimOQNI{pr`=ZtzM&BF=Z$ExACUO$Ad(7-6T={E{@2hj!=8B4RK%LPtVpOUd> zLp|2&g~tB!epiu5?$mG=oIoPjysqpV9h?K@278`QqH^c2kd7lNjEU6aS6G<9HT&pi zZ2$JuBjf$*dcE-(blG3S53o`(&D|aq2{$Za0P4NI8K0}tcv#9(b5+N>f7(j->9k%5 zW0AdL(M)gkI;rB!b*o1tSHlBw>UurMnD}zfN7rD7s7j9hqwtZc#!y>7i_etand|f# zO9|p!GU1Htigd)dJ@jvHR0P=D{-T1x49Ung`htzB`^92eMox@RVbmO>ayz1?TI-5M z+vnx=z`D=ki1uhTv^=wVC7rygO+b58dCx^Fey-IC1mV5&+98O~KG&w%PsBkXIDaWj zdo=A@gWXE<+?TvcCk>c%31zoBj|8cC6t;xwDZI*0>pdB-xpZ9l_32Y?47aYD{^4_W ztK_tb%xbe_@~W=MD|L2=oE~~vQU0a(;uCRk@s@FC4@IB9N9zE_}6h~Zm4&hZnug1g~FO79M*>90#>w(gBH7#9Q#HFnW{#$p_tkm76tFNi@5*?uqN1bKV$>oO66Ocu;fD`j-hW!b5~%SsiEG0U|omUz#RM~s%Q`&|18WmHgq2nh}-iq-HD3u zN4Wbehrrt1U`22wp88#wFbBrv4Sroj1+&0k-R{HZYV0+4grDY(jwe?T@de-V2BU>%_F}?K7rGYXon^2@LKMRWr3sR!A^&)N64#(;MPubz@mK~{bA3zlG9n*+MFm3{5 zGPe4ve}rWI2zQa&(G%11dJ6q`SQmjQk_XjRF4aylN*g(*Pj@+x4Xny5ol#7Lo6$1R zva|Wy-u?Le!EdqFIk+CmCd^pJk}In*MbntepVQAkWLjn~|FH@1v;=C8!>o>y9jou7Z)W3g{H#Phr36z2`0ov!^ob6nPkkmUCgFh<7Hb2d#H+ILs8 zg7eL2`@d_}+~p;0n%x%Q5t6HKYe~J)VB&5h7 zYPgPsMi80`=JwJzw1m!2{lHOy)-(cbh}h?hnIC}){Xf0s_+MQx_)qUnZgQUWYwZ<0 zWAi`VV(%*$-W;Fhe)d`AxU8vZZnHR0!L1@L>6mAqP}kkaK82Uvt}O+F6V;w#+{3l27$wB9m}8X(xZ+iyf{Jm7`UPs@%W80uWTfW)+3kZjG;fznpAX+Zh8uFN8tlWpthdT?s$9YiU*}Qu$u`BW;vKaf-d8>2@JaF-JI|E# zn+-RS0veYP2xJ>iDkDE%o)fqCp^?ZYMtKb>VZoXJX(732I`B(LfbI(Xw6@)>&aQ9neh6&DqjN;*8dq(s8L*+FYq7(QLq%>U>O05&sGl+ct7d(giWe% z`P5A?cZi~BNF@qV2f3YKMA}~zelff~?}^~l%|hHXbF@pE<8kRc`I$cc7D^Q!hRizQ z@!>J|acvTK?)-U6o>%TaZB;Zm$-pn}ED{42+<41q2%%l392+UiySfs0= z&aXwZ8BCGTZp*bWBqERZCeWh%Z~g?F8w_7A*yJKr~_00J6GzLOG~;=i849X#~+WId|6Twh*YjKUzJN&bC`k1ijuWyP)* zfc(9p8)Nsa(qshOUUY}BL-h{K{bv!BMDPU;M+DkXhH^XWWX;gk4)*MVrk3AcDpE{J`hT)U0Qsdz+6 z#g!W3H#H5-vgGBh46u$)F2#$Q$m60RJOMAa0<4mj3Z5UTfz^IIT)5-$X)03gPsyZF zd<0vosKs`h0sq1iMwEs!eXX_aw5bxwJX@^G|2F*fLJ(d@*S9s$MnBE9Q+I*u@VDe; Lm1Rn#jQ#!(P+uaO literal 0 HcmV?d00001 diff --git a/src/main.cpp b/src/main.cpp index cbe5f1d..3565085 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -6,6 +6,7 @@ * @copyright University of Pennsylvania */ +#include #include #include #include @@ -13,111 +14,161 @@ #include #include "testing_helpers.hpp" +#define TIMING 1 + +void comparePerformance(const int SIZE, const int RUNS) { + printf("Timing performance with arrays of size %d, averaged over %d runs\n", SIZE, RUNS); + int * a = new int[SIZE]; + int * b = new int[SIZE]; + genArray(SIZE, a, 2); + + for (int i = 0; i < RUNS; i++) { + std::chrono::time_point start, end; + start = std::chrono::high_resolution_clock::now(); + + StreamCompaction::CPU::scan(SIZE, b, a); + + end = std::chrono::high_resolution_clock::now(); + std::chrono::duration elapsedSeconds = end - start; + printf("CPU scan: %lf milliseconds\n", elapsedSeconds.count() * 1000.0f); + + StreamCompaction::Naive::scan(SIZE, b, a); + + StreamCompaction::Efficient::scan(SIZE, b, a); + + StreamCompaction::Thrust::scan(SIZE, b, a); + } + + delete a; + delete b; +} + +void runTests() { + const int SIZE = 1 << 23; + const int NPOT = SIZE - 17; + int * a = new int[SIZE]; + int * b = new int[SIZE]; + int * c = new int[SIZE]; + + // Scan tests + + printf("\n"); + printf("****************\n"); + printf("** SCAN TESTS **\n"); + printf("****************\n"); + + genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + printArray(SIZE, a, true); + + zeroArray(SIZE, b); + printDesc("cpu scan, power-of-two"); + StreamCompaction::CPU::scan(SIZE, b, a); + printArray(SIZE, b, true); + + zeroArray(SIZE, c); + printDesc("cpu scan, non-power-of-two"); + StreamCompaction::CPU::scan(NPOT, c, a); + printArray(NPOT, b, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("naive scan, power-of-two"); + StreamCompaction::Naive::scan(SIZE, c, a); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("naive scan, non-power-of-two"); + StreamCompaction::Naive::scan(NPOT, c, a); + //printArray(SIZE, c, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan, power-of-two"); + StreamCompaction::Efficient::scan(SIZE, c, a); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan, non-power-of-two"); + StreamCompaction::Efficient::scan(NPOT, c, a); + //printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("thrust scan, power-of-two"); + StreamCompaction::Thrust::scan(SIZE, c, a); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("thrust scan, non-power-of-two"); + StreamCompaction::Thrust::scan(NPOT, c, a); + //printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + printf("\n"); + printf("*****************************\n"); + printf("** STREAM COMPACTION TESTS **\n"); + printf("*****************************\n"); + + // Compaction tests + + genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + printArray(SIZE, a, true); + + int count, expectedCount, expectedNPOT; + + zeroArray(SIZE, b); + printDesc("cpu compact without scan, power-of-two"); + count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + expectedCount = count; + printArray(count, b, true); + printCmpLenResult(count, expectedCount, b, b); + + zeroArray(SIZE, c); + printDesc("cpu compact without scan, non-power-of-two"); + count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + expectedNPOT = count; + printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + + zeroArray(SIZE, c); + printDesc("cpu compact with scan"); + count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); + printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient compact, power-of-two"); + count = StreamCompaction::Efficient::compact(SIZE, c, a); + //printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient compact, non-power-of-two"); + count = StreamCompaction::Efficient::compact(NPOT, c, a); + //printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + + delete a; + delete b; + delete c; +} + + int main(int argc, char* argv[]) { - const int SIZE = 1 << 16; - const int NPOT = SIZE - 17; - int a[SIZE], b[SIZE], c[SIZE]; - - // Scan tests - - printf("\n"); - printf("****************\n"); - printf("** SCAN TESTS **\n"); - printf("****************\n"); - - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - zeroArray(SIZE, b); - printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); - printArray(SIZE, b, true); - - zeroArray(SIZE, c); - printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); - printArray(NPOT, b, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - //printArray(SIZE, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - printf("\n"); - printf("*****************************\n"); - printf("** STREAM COMPACTION TESTS **\n"); - printf("*****************************\n"); - - // Compaction tests - - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - int count, expectedCount, expectedNPOT; - - zeroArray(SIZE, b); - printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - expectedCount = count; - printArray(count, b, true); - printCmpLenResult(count, expectedCount, b, b); - - zeroArray(SIZE, c); - printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - expectedNPOT = count; - printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - zeroArray(SIZE, c); - printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); - //printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - //printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); +#if TIMING == 1 + comparePerformance(1 << 10, 3); + comparePerformance(1 << 13, 3); + comparePerformance(1 << 16, 3); + comparePerformance(1 << 19, 3); + comparePerformance(1 << 22, 3); + comparePerformance(1 << 25, 3); + comparePerformance(1 << 28, 3); +#else + runTests(); +#endif } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 4f52663..79cfb56 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -7,6 +7,8 @@ #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +#define TIMING 1 + /** * Check for CUDA errors; print and exit if there was a problem. */ diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index abed83f..7c67a6e 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -39,6 +39,14 @@ void scan(int n, int *odata, const int *idata) { cudaMalloc((void**)&dev_data, nCeil * sizeof(int)); cudaMemset((void*)dev_data, 0, nCeil * sizeof(int)); cudaMemcpy((void*)dev_data, (void*)idata, n * sizeof(int), cudaMemcpyHostToDevice); + +#if TIMING == 1 + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); +#endif + for (int i = 1; i <= logCeil; i++) { int gridSize = ((nCeil >> i) + blockSize - 1) / blockSize; kernScanUpsweep << > >(nCeil, i, dev_data); @@ -51,6 +59,14 @@ void scan(int n, int *odata, const int *idata) { kernScanDownsweep << > >(nCeil, i, dev_data); } +#if TIMING == 1 + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + printf("Efficient scan: %f milliseconds\n", milliseconds); +#endif + cudaMemcpy((void*)odata, (void*)dev_data, n * sizeof(int), cudaMemcpyDeviceToHost); cudaFree(dev_data); } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 8514b0a..0a85b9d 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -40,6 +40,13 @@ void scan(int n, int *odata, const int *idata) { cudaMalloc((void**)&dev_data2, n * sizeof(int)); cudaMemcpy((void*)dev_data, (void*)idata, n * sizeof(int), cudaMemcpyHostToDevice); +#if TIMING == 1 + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); +#endif + for (int i = 1; i <= ilog2ceil(n); i++) { kernNaiveScan << > >(n, i, dev_data2, dev_data); int * tempPtr = dev_data; @@ -48,6 +55,14 @@ void scan(int n, int *odata, const int *idata) { } kernInclusiveToExclusiveScan << > >(n, dev_data2, dev_data); +#if TIMING == 1 + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + printf("Naive scan: %f milliseconds\n", milliseconds); +#endif + cudaMemcpy((void*)odata, (void*)dev_data2, n * sizeof(int), cudaMemcpyDeviceToHost); cudaFree(dev_data); cudaFree(dev_data2); diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index d8dbb32..3e195da 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -13,9 +13,27 @@ namespace Thrust { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::device_vector dev_thrust_idata(idata, idata + n); + thrust::device_vector dev_thrust_odata(n); + +#if TIMING == 1 + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); +#endif + + thrust::exclusive_scan(dev_thrust_idata.begin(), dev_thrust_idata.end(), dev_thrust_odata.begin()); + +#if TIMING == 1 + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + printf("Thrust scan: %f milliseconds\n", milliseconds); +#endif + + thrust::copy(dev_thrust_odata.begin(), dev_thrust_odata.end(), odata); } } From 3c0d8cabd8f00aefb75f8e840120885633aa5e24 Mon Sep 17 00:00:00 2001 From: ZHAN CHIN Date: Mon, 26 Sep 2016 20:17:36 -0400 Subject: [PATCH 4/4] Format readme --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 768122d..21aefdc 100644 --- a/README.md +++ b/README.md @@ -19,9 +19,11 @@ Build Instructions Performance analysis ==================== Milliseconds to calculate prefix sums for arrays of given sizes: + ![](images/milliseconds.png) Log2 of nanoseconds for same data: + ![](images/log_nanoseconds.png) The timings above did not include the time needed to copy arrays onto device.