From 888e6c8fa028a3df073255aa896f757adfa29984 Mon Sep 17 00:00:00 2001 From: chiwsy Date: Sat, 27 Sep 2014 01:30:13 -0400 Subject: [PATCH 1/6] CPU and Naive --- .gitignore | 5 + .../CIS565_2014_Fall_StreamCompaction.sln | 28 +++ .../CIS565_2014_Fall_StreamCompaction.v12.suo | Bin 0 -> 39424 bytes .../CIS565_2014_Fall_StreamCompaction.vcxproj | 163 ++++++++++++ .../Macros.h | 2 + .../kernel.cu | 236 ++++++++++++++++++ 6 files changed, 434 insertions(+) create mode 100644 CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction.sln create mode 100644 CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction.v12.suo create mode 100644 CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction.vcxproj create mode 100644 CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/Macros.h create mode 100644 CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/kernel.cu diff --git a/.gitignore b/.gitignore index b8bd026..6fb8ab5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ +*.sdf +*.pdb +Debug/ +Release/ + # Compiled Object files *.slo *.lo diff --git a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction.sln b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction.sln new file mode 100644 index 0000000..5779f38 --- /dev/null +++ b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction.sln @@ -0,0 +1,28 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2013 +VisualStudioVersion = 12.0.30723.0 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CIS565_2014_Fall_StreamCompaction", "CIS565_2014_Fall_StreamCompaction\CIS565_2014_Fall_StreamCompaction.vcxproj", "{AC55E4AA-BABB-4FAE-8DD6-FA7720EFAB54}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Win32 = Debug|Win32 + Debug|x64 = Debug|x64 + Release|Win32 = Release|Win32 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {AC55E4AA-BABB-4FAE-8DD6-FA7720EFAB54}.Debug|Win32.ActiveCfg = Debug|Win32 + {AC55E4AA-BABB-4FAE-8DD6-FA7720EFAB54}.Debug|Win32.Build.0 = Debug|Win32 + {AC55E4AA-BABB-4FAE-8DD6-FA7720EFAB54}.Debug|x64.ActiveCfg = Debug|x64 + {AC55E4AA-BABB-4FAE-8DD6-FA7720EFAB54}.Debug|x64.Build.0 = Debug|x64 + {AC55E4AA-BABB-4FAE-8DD6-FA7720EFAB54}.Release|Win32.ActiveCfg = Release|Win32 + {AC55E4AA-BABB-4FAE-8DD6-FA7720EFAB54}.Release|Win32.Build.0 = Release|Win32 + {AC55E4AA-BABB-4FAE-8DD6-FA7720EFAB54}.Release|x64.ActiveCfg = Release|x64 + {AC55E4AA-BABB-4FAE-8DD6-FA7720EFAB54}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction.v12.suo b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction.v12.suo new file mode 100644 index 0000000000000000000000000000000000000000..d79471c3dad1a093f92993b81f93e91367f1fcbd GIT binary patch literal 39424 zcmeHQdvF`ac|TARCCV>ZmTlQ_WQ3A!H5MT70X`&B76FhFWzk|yJ)DRN3<5_AA_y=z zKq4c_b<)Oen>bP2#A%zxm24+=9mk1NCuyBN6t!;Z)~#H%(?^*VtqMY+*g|9r?mo{k{W_Rfgw&P}P)iWiO~{ zpmhMhD#}Fq7oyE5HkFb8Z=2{d9q*V65cllQbl2kD`+z%v8-SYtzPldRmB4j?2x$DQ z!}G0%-)daf0}S75_%#?;`3=jB_bW{P&8B-9&d!z1iQ&3&Z3EhYe#6iD{SNrw2kZoT z03YB70>EuRAFvD94S0b)!2Q5AfHkhg{(Lq2|K-@5wf>JbR89R~wqMByWZx%$Tnotl z&v5MjyvqL1d-8zg0Kb#{|8_i6Zny(r7&`I*_J0{a{bc{=y>tuN|8F+ZyOjF>$C2kx z06z=-9PmltG2n6F=Yd}Uei8T-@M+));4{EyfiuA8fG2@p0zMCX0eA}dBJj(=uK>Ra zu-?u1b;Iu*uFn9^0?z?o0lo@654-^U2Jj;AHQ*&c#_u=%RZv(FZ zzX$vd@Vfxt|30qgf$swsfIk5K5O^K-Q8u#pOn>_j3|ddE2gAp zByYcvzc~+~|31j(QYOzY3A@Xn-vZhk@95L@DU=0{t8F zucr+9+(&NnRkq!Dlsxk~~u{Heepk=rolJ%=zO zVQWYzCm{W*m>0@AGt)r+9cF%KemC2_Isc{5r)*$#EdP7p$+?Q%{Fh6g?Z)b4{wed6 zLqCj?^(dW6hcW_q6_?VAs|UWkYr>UxotV=^#M@EawPB{LA{5<@GTPsF@!dx$w85Ec zc>r}E2gVS4QgNeIvIWHP6v3Q`tz-iC+5NB$_5V|dhx0?b`F|B*SzmMyA0C&s zUA))p-}0C9-w)u*{%n{2_Yjuy1YH0*8bO~QGjg42|JI!9;jO<;m+|^y*fW6QU$^{mcNK;wTN>*BmR_pQ!gz^8agyz-az6oR@kh>Vb5sNo5GM zIQpq(VtMY%(Z5iqaE449j&w;SDSBQE{VXatu{M)3ePoj3qKa_l7O~CXzxN_8mVt=0 z$d8%-)y98a|F~4gfAU=^VGTnT=|{`=VubN3L8TevP!mR>AX=p^Ap}Lg&^gouLMOnz zCk5}$#GyAnj%&lvqOs5J27evFlR0)jG@Sux-`EZ6fc3yp#K)c=6g^*44ukp;_^$>+ zc=CuRJj}STxFc`Kj@^FRISv_Jv8*`$-QOV6kjekJ(VHnvxly`lyyq;Ej@JOt2S}>8 zoz+p(Liv;9@MeYMY>z1{W9S-4MAVowppHi4(Nr`M?~imhhE-2z$4H0Q)!N}vU9C;3 z>gsHZcwM7yZKG<3r){LIZKTl|h-#CueMnylDJ2DB%*6S~oASv#rX-Z5e|{4Y1ngwr zOsK%U7;i-qU=-vp%mvmbOOA#0E(SG>r7DPxc}WwYbp!5)RuMSk}|xI*nefc7Bu1bXP)U&_Dh;C&}xH~;J_FR;PD`9#|v7o%Jf zrEXX5*1u-EFJ1oU(8KbglkH8ezYYm0B#B*1}x{n}E9j@-yo6*heYzQoq0vh;7UTxPh$z`61KJ5nASh*ET%21Jsap z0^5M?0N-`vO5F?ZcjDRu_y9i;0D6HwU>C3(=m+)y_XB$Ye)j;b`+z~gd;|y@&mmk#jQa?#9PvkiG2kc=1&#sZKn$1w;y?nJ1U?Fo zM{7U|;B4bKZ~~YDrh$(ECxKJIY2XpyXMisOh5Elu`QO31L^g%PZBh3oLe%8{qQzu8 zfGm4nZGU703{5`}nozqN2i2q&)l#96nA)FGCmNko6S264sbnnaiX((J849c0dXnSr z!DM1mO{S*B#B+N{)6|KP*mO5u86R!S`D|!fL!6ng6IwWtj73KpcXY|voO&{}WZ2~C zJ{XFn)gAgAHz#cG)MQe1ARojTM_}9Vi9~W-By@OJG_^ZDGVJdk^0s;X2Le6AAaqO( zr(DhQuGuw|N~)m=e_{f8MrPl+H7BS3L@YfKKNO9mj^-&HZCfCbM!oIm^mJ{VdFa|Ii&mE6j7LuV zLK6_0?vF>*sk{XG|;^Xe_P zCRWS*l-o#pbaXBSH!?3x|J13I9ln;1X8gI@nmx^~rlvrXtD~dc3Sy+uGCP^7Z(9uGYSuUROsT(B|svX>V`#^!D}mysf8C>Ag9-d!DTe zV0<6Oct_1WXKlJZ(w_El*rH{3VYYAeYlh6hSxI57r&+gYU;9{j?Q5>~@h7VX`<~Js zd-a)%x7K}q)sN1S6JqL4mL2(2#H7_2J|`8k-Jf5;f~hsyS)OeB2{ zK$_vI3+)ZK&X~O(PpPJ`gZ{&mn@qfgzAA-2WuQW?9Qy||{oN?_h{$^!tt^Jp=tF75 zgby_`#oS!;PySK8oU^%toYNSUQqyug6hr0HHU5FQOJ?+))dAEk`vdR+%gp}%lwS7<)D8pi?(cs5L zMW+~C(_nZsv2q%Vjx9OEFk`kh&j0*UQ~kQ1{M+9><$C(^vw^yQU@~Q+l-JplF5bN8 zh=>g6;|{IhXyxJr7nbYr!^#69!AxKXiQrQ|veggWffIUyLCpD`V$|eXsBX!|bu{OU zUVj{4rtqa)^Bl$uy$!v~3qP*8g;5MF0Lm=K@yq^5X(ov4G~%Qs3j5;jZ~bd*YWd$Z zKEGjYz2lR&yL~5fKZ~$+RCvP8PJbGFfGcv;y3Ow9D*svO51{NP zMDL%@T7Q~N65L|spE_8(`F|Q=sjH;3T5nicY;F86WW8dYD=xnIqqY6zw?8zmMe6NF z{?_dON}qpvQFs-!|H~42WW5x+n&sbO`v2BK5#e1q^ZyUwyK_do#ptuF3OUkuBESZ~ zF8`~v{l^y2t}B85!wCBEDN_jvMmF zKFAoyFtOpx!)oJu(6I5(Pg(b+{SVh`ZwBnr??Tv5=$~WuQrrJwcmGR^QD){#73}|T za4nv7PiIcsZ1)%NLhk=yUm(~VYwQ0@Z~eaz|D7-XzZY$-g8X0Dja-X3?Y94g=Kov| zt~CGOSVI133nTe|1?)c^b%>AlLFJUcb^MX`c=oI>X$z#!|7kN+3HpoC{*$r~?WFAH z|51dM_MZ=3q~@f2w<*VhQ-kL7AZ)X^cf>7z3&l>o#WRq6xP~j`y<8=-z6(ssfx}{l zGSBOY;j8^ZV={?5u3$>FGV4()m4i82o*=Zh`Wh$qz49w6r}-O%#h4dL=nmYqApI%m z!)RAU{Z_T6(_=~}Wx-O?xzP1b*`<{9a#voh{Rf-}9GjpT4EOO~`xzkazb1Ae=ty|zX)=EjN9_pu648tBu-&t(li|DB0p4V>b zd9*UGRu0i>N+;!~m#|_*31%^@BI`E()mpNY8Kum%5+(cLqP*9hZ1@OP)-_0UUc?sw z4cgXo-twr8nj3(GC~aaAu+HacnT5DJ_Y6|<6AMSf`*MS-czyiW0eA=;B zgFDT8#gh+Tw_=wz=}I#yUo%g{c>%$(FRs4FaFbkm%jJf%ln@A z>;F07eEVzr86h2oX{*aJV0DP|&M3>^6NloPyA^0fB6$+IVjhUgH3PS-A#v_iaHzlM$b&Rlu?zb;;Qu9so=D*v+Xu1nw8xN+Ba{`tqJ?-`je7B+{J zj?r}&-o3D4?{i;{y!^!tZ+DjVJ$GG|HX3jwp*5H$tXsH$ZUCB!82qBb6prgL)Ln2_ zw%))La6^5$XghdgW%;Zd$th_9{6BY)7n>&h3Mc zh z<6&%JOB_43fLqtNt*;t(%h}yBRHp>Dl8?Aj+@rNc zU}-W8b32^b>uR_6Hq3lcefxE*FGZ$CCvNq&crxRC{tWJY0B6C}W|Z!(hW?-OI@i<8 z>lD?z?qgl2s0RK2d})XtDXzGjPxY7o(7Cr?x^u-}KJZt8mj2JbEhc4L#x}cxje3FQ z7dejE9f>h`h~t`>r|8Zq7(wQ`Ms8gztF6Ff@9Cax1*Tfo^Mp9V-bS&8?plr`v4JQh z%*1HQeK4w}L$RS$IucE|1L{b6Y)nnM2SZ7mkDG0t>mdL&OAtXj= zY;oF+E>olBw~!-89QVxRaAGnMSK}#hHi*a^i+-kv*_POX8)(d`tX}cIXxetaWy4u{ zoI8+%L1mS$aYG#SYz)hp`sIxDy=23GPxFeX}GEFY*SMbS7IZv#!b8mK{PpsJ{WGW2!JR`6A2DY7{4 z>@p5mLd)ezN7jCPOP4s^$Lf6>c+T!*#Ih3pXKD$ZxGUvMwgV+%r;i*RAFQMwQlXJuZfLf)$0VTGxG*Fek?meJ+l< zdlS%G-Fx8_D!s-y1~6~=_@ne^*)`(fy_5I38wcdKxqAa8MaOFZ=mTWQ%F={aj^mh3 zYa#Zl{OA9F_pPH=g|;7^ zoBe0aew4ck^6w$2*^lP4Wu^6FB{{VlGg^6u;8L+4%^XI))a^&-WDZsVOU|>k?JYtu zn|;UU55B$UKkjmzO@8j^$?hN3-|a^C7bivNCN_)R{%46<@UlE2i6 + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {AC55E4AA-BABB-4FAE-8DD6-FA7720EFAB54} + CIS565_2014_Fall_StreamCompaction + + + + Application + true + MultiByte + v120 + + + Application + true + MultiByte + v120 + + + Application + false + true + MultiByte + v120 + + + Application + false + true + MultiByte + v120 + + + + + + + + + + + + + + + + + + + + true + + + true + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + true + Console + cudart.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + + + echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" +copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" + + + + + Level3 + Disabled + WIN32;WIN64;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + true + Console + cudart.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + + + echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" +copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" + + + 64 + + + + + Level3 + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + true + true + true + Console + cudart.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + + + echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" +copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" + + + + + Level3 + MaxSpeed + true + true + WIN32;WIN64;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + true + true + true + Console + cudart.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + + + echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" +copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" + + + 64 + + + + + + + + + + + + + \ No newline at end of file diff --git a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/Macros.h b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/Macros.h new file mode 100644 index 0000000..9f1db26 --- /dev/null +++ b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/Macros.h @@ -0,0 +1,2 @@ +#define arraySize 100 +#define blockSize 128 \ No newline at end of file diff --git a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/kernel.cu b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/kernel.cu new file mode 100644 index 0000000..ab9ba02 --- /dev/null +++ b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/kernel.cu @@ -0,0 +1,236 @@ + +#include "cuda_runtime.h" +#include "device_launch_parameters.h" + +#include + +#include +#include +#include + +#include "Macros.h" + +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////Serial Version On CPU/////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +void CPU_PrefixSumEx(int* out, const int* in, const int& size){ + out[0] = 0; + //for (int i = 0; i < INT_MAX>>4; i++){}; + for (int i = 1; i <= size; i++) + out[i] = in[i - 1]+out[i-1]; +} + + +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////Naive Parrallel///////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +__global__ void GPU_PrefixSumEx_Naive(int* out, const int* in, int size,int i){ + int id = (blockIdx.x*blockDim.x)+threadIdx.x; + + if (id > i&&id<=size){ + out[id] += in[id - i-1]; + } +} + + +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////Optimized Parrallel///////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +__global__ void GPU_PrefixSumEx_Optimized(int* out, const int* in, const int& size,int i){ + int id = (blockIdx.x*blockDim.x) + threadIdx.x; + if (i == 0 && id < size) { + out[id + 1] = in[id]; + //out[0] = 0; + } + else if (id >= i&&id <= size){ + out[id] =in[id]+in[id - i]; + } + else if (id < i) out[id] = in[id]; + + +} + +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////Optimized Parrallel with Scatter//////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +__global__ void GPU_PrefixSumEx_Optimized_WithScatter(int* out, const int* in, const int& size){ + +} + +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +///////////////////Optimized Parrallel with Scatter and Bank Conflicts////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +__global__ void GPU_PrefixSumEx_Optimized_WithScatter_BankConflicts(int* out, const int* in, const int& size){ + +} + + +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////Helper functions//////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +void printResult(const int* in, const int& size){ + for (int i = 0; i < size;i++){ + printf("%d ",in[i]); + } + printf("\n"); +} + +void RandArray(int* arr, const int& size){ + for (int i = 0; i < size; i++) + arr[i] = rand() % 201-100; +} +bool GPU_MemHelper(const int* src, int* &dst, const int& size, const cudaMemcpyKind& type = cudaMemcpyHostToDevice){ + cudaError_t cudaStatus; + if (type == cudaMemcpyHostToDevice) + cudaStatus=cudaMalloc((void**)&dst, size*sizeof(int)); + //if (!cudaStatus) return false; + cudaStatus = cudaMemcpy(dst, src, size*sizeof(int), type); + return cudaStatus; +} +bool verifyResult(const int* a, const int* b,const int& size){ + for (int i = 0; i < size; i++) + if (a[i] != b[i]) { + printf("The %dth is expected %d, but got %d!\n" ,i, a[i], b[i]); + return false; + } + return true; +} + +template +void swap(T& a, T& b){ + T c = a; + a = b; + b = c; +} +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////Main functions////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// + +int main() +{ + //const int arraySize = 6; + srand(time(NULL)); + int a[arraySize];// = { 3, 4, 6, 7, 9, 10 }; + RandArray(a, arraySize); + //const int b[arraySize] = { 10, 20, 30, 40, 50 }; + int c[arraySize+1] = { 0 }; + + std::chrono::time_point start, end; + + ///////////////////////////////////////////// + //////////////CPU calling//////////////////// + ///////////////////////////////////////////// + start = std::chrono::high_resolution_clock::now(); + CPU_PrefixSumEx(c, a, arraySize); + end = std::chrono::high_resolution_clock::now(); + + printf("CPU_Version:\t %f ms:\n", std::chrono::duration(end - start).count()*1000.0f); + //printResult(c, arraySize+1); + + ///////////////////////////////////////////// + //////////////GPU calling//////////////////// + ///////////////////////////////////////////// + cudaSetDevice(0); + + ///////////////////////////////////////////// + //////////////Optimized calling////////////// + ///////////////////////////////////////////// + { + + int *src, *res, *buff; + int *host_res = new int[arraySize + 1]; + memset(host_res, 0, (arraySize + 1)*sizeof(int)); + GPU_MemHelper(a, src, arraySize); + + + + GPU_MemHelper(host_res, buff, arraySize + 1); + GPU_MemHelper(host_res, res, arraySize + 1); + //cudaMalloc((void**)&res, (arraySize + 1)*sizeof(int)); + + + start = std::chrono::high_resolution_clock::now(); + //GPU_PrefixSumEx_Optimized << < (int)ceil((arraySize + 1) / (float)blockSize), blockSize >> >(res, src, arraySize, 0); + GPU_MemHelper(res, host_res, arraySize + 1, cudaMemcpyDeviceToHost); + if (verifyResult(a, host_res, arraySize)) printf("GPU first step verifed OK!\n"); + + + for (int i = 1; i <= arraySize; i <<= 1){ + //GPU_PrefixSumEx_Optimized << < (int)ceil((arraySize + 1) / (float)blockSize), blockSize >> >(buff, res, arraySize, i); + swap(res, buff); + } + end = std::chrono::high_resolution_clock::now(); + GPU_MemHelper(res, host_res, arraySize + 1, cudaMemcpyDeviceToHost); + if (verifyResult(c, host_res, arraySize + 1)) printf("GPU Naive verifed OK!\n"); + printf("GPU_Optimized_Version:\t %f ms:\n", std::chrono::duration(end - start).count()*1000.0f); + free(host_res); + cudaFree(res); + cudaFree(src); + cudaDeviceReset(); + } + + ///////////////////////////////////////////// + //////////////Naive calling////////////////// + ///////////////////////////////////////////// + { + int *src, *res; + int *host_res = new int[arraySize + 1]; + memset(host_res, 0, (arraySize + 1)*sizeof(int)); + GPU_MemHelper(a, src, arraySize); + GPU_MemHelper(host_res, res, arraySize + 1); + start = std::chrono::high_resolution_clock::now(); + for (int i = 0; i <= arraySize; i++) + GPU_PrefixSumEx_Naive << < (int)ceil((arraySize + 1) / (float)blockSize), blockSize >> >(res, src, arraySize, i); + end = std::chrono::high_resolution_clock::now(); + GPU_MemHelper(res, host_res, arraySize + 1, cudaMemcpyDeviceToHost); + if (verifyResult(c, host_res, arraySize + 1)) printf("GPU Naive verifed OK!\n"); + printf("GPU_Naive_Version:\t %f ms:\n", std::chrono::duration(end - start).count()*1000.0f); + free(host_res); + cudaFree(res); + cudaFree(src); + //cudaDeviceReset(); + } + + + + + + + ///////////////////////////////////////////// + //////////////GPU Reset////////////////////// + ///////////////////////////////////////////// + cudaDeviceReset(); + + return 0; +} + + + From d438246340826dca7f6794b415ce683b3144ace6 Mon Sep 17 00:00:00 2001 From: chiwsy Date: Tue, 30 Sep 2014 01:26:59 -0400 Subject: [PATCH 2/6] Almost Done --- .../CIS565_2014_Fall_StreamCompaction.v12.suo | Bin 39424 -> 45568 bytes .../CIS565_2014_Fall_StreamCompaction.vcxproj | 2 + .../Macros.h | 13 +- .../config.cfg | 4 + .../kernel.cu | 449 ++++++++++++++---- 5 files changed, 382 insertions(+), 86 deletions(-) create mode 100644 CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/config.cfg diff --git a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction.v12.suo b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction.v12.suo index d79471c3dad1a093f92993b81f93e91367f1fcbd..14e0692b849b5040948c30bbc292b676bd57bc59 100644 GIT binary patch delta 6741 zcmdT|3vg7`8NPS337Zh|Aml|JY#@=m*lgYrlCU8wkG$WZWDz!-C9G_+ne1H%2{em3 z9XnP__LtEa74T83XqC8t)G9`8JB3;$_-ZR-Q9I69I}UB3Bealy|J}W?M676St!L-E z=bZn%|MS1jx(}$_e^l)(7h}gOB2{7rrK#pcG@V;oYCHLwSRe?(#1a}-4U6y4Sye-* zpaVq@(4LKHq*u)le?+dJhEOpVG)(|~*28`RG9jEqb<$`gN>fNP{CNcOdBA)i9f$%l zfZ0F{kOFXjPVp!w`s;I%Cjrqw6VM89pIQE#%c1`AMU-dy%5=9nEiE{p&vJjc0(l8g z3Utz#I-bs`6Pj=ExAF`tP+keF0&0L-pbn@98h}P18E6A|bxu649rX^N7>L8nrDN;u z9#x&6$){FVeHkVR@{ ztkAFmXaQy7kXKMKigZ3XQ!In^OQDG}4XupI4L^YfmIzLsx!RxtK`*x(Qq-D7pv3{o zEEVseauVRw3`^j^&n`GoatStJ0IFdYEW##ydj%`Zs1YRzs6}ln<4I${%`LqWG{9(CD8eKtN4^x=YL%Y%iq+Lrzkswt>+`)R}z@a z%w&hCoRXyEM)}&#BsXbTfZ5w!ZcJ%Y$#W#_2APG1leWWbcUddun_WX)ww|>#mUXXK zLVNVJv_B{8>3ppi<*nV<@zk4ZKYI4#cRqY5?ocV|3z}3ni*(#Mm}Y@yNf5>%Sv)O5 z{;#&wyt=K@(HW8Pz+1cO#|4<77&_jV5M%DcB7H8qWLsu;ST@OPMQ6v*{n=R&)5yGq zQrc6{8txGK1(R=|0=>FEGfG!eer`e3i5*L3X-{pdvAj#o-+77MGmdF)&LVe0B_FYJ zrTQzdB{z^Qis9#+S*6rD4&gFs(=(&Ez&8`C5Lc&{tL4^v(lg8RK`j6Z0ahz^=p_J$ z3}wLez+zwtP!4cNQ3)&s=%4ACnLN|gq6T)D+@^r)GK{`u@5(4DRj4lX+6-**_ud*X zY8%Sif$svNz`a;;YEyjiP`%AE=x{nVNV*Q2(`B}|O0FK8L)VO=P1@S#aM+!?HroI` zF55tp*|N#pYbA5PaWt6KuHtisw^^Oi)yL{uU5@28yVYox%sfL{IrnX~N|LR=*U2rF zo7SychJ{dHZnN5ZL>0P)%^W;k&Ec9L*rb)NZWHb5E?`3-C;CfwJmqDj(L|!blj%_h z%GwO)pv5oef1hEo^!4o3Fys zT zc?bme39#<3;F=rtA%u8be;<(;=O4%T@=xQuhN1~kixKFhiuX}d;AXg{aK-*?0f(}ms6C&JvC}>=`BWSbs~DW%VZ-Q)PM6 zpku%~C~eg?$VTV0w>77*JkPm!RlihNP+6Ym@9rrxYS@&pSSdDBbdsxVG}sE#Mmkq^ zEL47T%ns-gY(b@s&Xv}L3SWNoOnEeBK%)GFw2V4D7aHK+X9>6;q!!p!J7C5Etpckp zYzd96O{449&Yi&;gG7^E`E=T(@lNE<7jL}nz(=1PEIsn(EB${k_CEP1iF?3U$muRf z!DFb~petzs^e-bFB`=AjG7O5>k;~8+0%7nuCOTt4BoLJ!%mL1yX=~L;I7CVgf{_W7 zn^X4Hjrzi=slf2)%CIOPD57Xp5*b#iHEgE&38!3S;V>%60H+Q}XcM%fMOco!3X)Ze z?+Q>_WwhcF8ifXgZHZ(^RnyN^dU89}qJ}2t)_O`p=-Avto|Q%ovy_dzvIt43O`7>2 zWCaktXHvt#upM@?Z-^3arAf&``=qGE>7LW~ggq==lSS8t-bPdpaW0 zhoHRd3+3@T2(=yav~xEO+kO4%hn*#!&a~+Fq>B$$KXW|#$+^PbqmffDi@1;^`pqqc zWa!+bIw*SI?i>)w(^*X;x3<%`3HE-?&7@je>2-G*M6KcnIsb+{xPjNc77yNr!!s8; zfKFgFa0;NFjNGrmv-iH}b+4@y>zVT-5cLAw&kcw}itx=B7QqB_xe%p%{ERTmi_nYP zs!-D7R|QlfV$owH|Eh&Ty3mzHM{Z0`V1sFdb?S!s?uCu=-SHBEQq2io!}krMr^bzk zgoYk$DI-J28knj$_!SdQ%tmbxUz7Z5!}U{i%k`gk_o-qCge{LI?wIn9-XV%&qIXB# z2R^xjZUx;c7`;=Q&}rOONljT{-rLq67CnO#G}*nBM!UW8lWrMr*8BV&g`&5=$0#mf zjw|UGDChtdMYe1HM59s;osha&Z5K>e+j2gKt{b7*G1mQ5Y8& z%A(Dwn;KoEWN@}on=98l;dF}}rO+wY`frm}z&(t`7ehYEAQ9^zPwb3`0Q~hsll6~N zdhbbUYBP8ZH%*A7Z`$qMy{$wH7?B?^B2G3T?{5E4HQly&>)kH|jw%col`w4&`hd1r zwB0=|@{1>fYa(%IV9@Gx+8q7!m3PnOc(8X~8P3ANg>n}Dmx|-|A_U7zgbG-(X76d! zPVuS+XtHGiy|ku$cAYSIw5BTSo||ePeS7bYmWzk?(fKvToX?Ww$?-A439&)F31Vcy z+D`9i*V3T>in3~1_rxV+u8oCex<${OFF=+479^_%pEf`>w8s>g1$`1_Hll&5K_}Me z6ur=XkzRO@H${kFLwJ+wd@l7L2Oi#azdVxEbssi$DZIHe$g^=m{?EegoCBAA z87I1OX~H=Hwd|(e%?m4h+uZ`2R{;KV&}o)mvMxDHwSmsztO@tA5r4mM`zFW*M`3+X zZ1bYT1m*@O38CurXDS)CydhhF%WqP8(WavTtDbJmnl;*0z1W1Pvv<(kr{y<}Q=2(d zR+43^Z(ZiH*?UZvpHbBV1E$SII=!hQzd&!2TdQ48$VN#Sfwq|73(DP zV6Rnj@}5G<`(1W>rBJw0k_Hy% zc?EiXA%Ywyco;$xBZeTnvQV!eL-vXCmLsxR*N?HQPbcxHML1+Ys|EE(uuJ@I@3C>Gs)y<8-CcSP6>U^-MvI>TaxLo zP7gf1=bZPw@5i~%IqyaJOja(-<4%z@U163*5@pFN632sAvy*@SvB;4mDdeDl91yqY zuH0Y}i{W7}ot(@yp#2H?V;kwZT)gNVSVn-wNWzdYmH;gSSOJ)U$AB;A=H?R7#sZ4~ znDdeXJ1ydiLz@op_?n2%6lwJbcrUC+Kj1I@N~eu^rimRnsg8)N0_|F09Z*Xpd5_T- zhRiHCtR;X4coKLDC_bOoHDoGeGkUEIAx=1;&7Z?k}k|i2~@HeR{@TCjZg~lavJ) zGmzcPlAi$=u)@+n^d+b+8p#+HZ_(WhB>gHln05LT-y?q}_9mBzH|TCyi{YFZ7Zg|) zNZ`Q3(j@rr4;3Fils1f#T3R`cvBt&b0tywlqPsQgU~WY`A6Br1-p-b?*e|Mi!xDX{ ztVp`S8iZR;We**kT502Sc~R<=UXc2ve5nA|6M!x}7b!)R!Q^T_MD$0n-8B9D3bJ_Z z<112W5DY)yUa_=oi@Dnxc|%t3Ka$E z6Gx$m1H~8*`j674K@+vTnM5n))Fhwph5kO9^_k9Ie@}(YH`q=Ci9` zq2jzIOJSY!Ubf}d!HtWoclOnF|B>D(Ytv@HTKu+nO_s@AK~|@g?$lV-$_=3h4EHrb zva8C-S2Yul`O0*;NPgE?^eJdE-E;jq%uv4EcmQ+w`@quxgywRy{_Fh(GpoO0ovNh| zQlF*FRd)JSvYm!(jV58mZXU{N&YOj@x@J^-i~OEOqYZ;|0WD`|JS|#n*#yThjXXIT zz44AC}EZ;)0xABv|q!c0~}U}o=yKZiCOcKs7HKpW#!=b3BW@E=8C}qZZu>^ zy0|=XO*w>$>oxN0d!*oMH403mnfl)v`JEL2JS|_83h-U7b&%jrMI@_Y_n2VK4DC}v zO-GFC)~2NfP09QAZb9$9d_sL|^AW9!K6^IVQd(T&d7`S!v43B6ZEM-uG79Z%qqepL z+h*-fS|f2`+}+bXRJyu#Xt1aA#p-@OnRYXkOlOm2I+a3gS(GDN5{n_8@^xkF*1Jph z3)<9vTn@_W@7nhW&TRTl$1^x1&2=26OM6|aGU19M$1$tTE%iuUsDjU<+I_Vu2S%6E zq349EoG6XbY8e-Cw>taU8lnDq*ez1IPDUg&4y6i6pkIun;dZo9^3%Q4+hC#Vjc4iA z1Gj@+AywH`DNgShlP7y)fgMpO!U+!0*3hWhX=)6q`L7!gO zP6n?_7)W?GX-{20b5q+HzZ8LcJ&nEb93##Jn~>C`lV(9Tm6T;S;9%j_4R`<^$ik0i zR4@EVaOGz5hTdy8lP*_Dr*i6+-O`@bW(fZ=&?hyI^A;v@Qt3A%@$~k&^jf~=cS-${ z4|mRXe4_P0YsWtj#rWw!U;ab=ecCt4Fh)Hx7Lyn86?xaB!Pe`-$umrTQk)&ko>yJD$io@t$}Vp`HuVLJ9RU;;?&^8uF;CyF?od?XIwiG0jH?Bs7Nh(?XbbbUH`<;L{9jySVC7N za1I|Q{GFn+)B%naYD_vh6(Js2XqcxU6gt#kS1SX&p@&!)%M621(#iD{I-~ppSp$-{ diff --git a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction.vcxproj b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction.vcxproj index ef1a2aa..f9f0513 100644 --- a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction.vcxproj +++ b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction.vcxproj @@ -105,6 +105,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" 64 + compute_30,sm_30 @@ -148,6 +149,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" 64 + compute_30,sm_30 diff --git a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/Macros.h b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/Macros.h index 9f1db26..36bede4 100644 --- a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/Macros.h +++ b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/Macros.h @@ -1,2 +1,11 @@ -#define arraySize 100 -#define blockSize 128 \ No newline at end of file +int arraySize = 512; +int scatterSize = 10; +//const __device__ int size = arraySize + 1; +int blockSize = 128; +int blkHost; +//const __device__ int blocks = (arraySize)/blockSize+1; +//typedef std::chrono::duration> microSecond; + +#define NUM_BANKS 32 +#define LOG_NUM_BANKS 5 +#define CONFLICT_FREE_OFFSET(n) ((n) >> NUM_BANKS + (n) >> (2 * LOG_NUM_BANKS)) \ No newline at end of file diff --git a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/config.cfg b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/config.cfg new file mode 100644 index 0000000..d42d8c9 --- /dev/null +++ b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/config.cfg @@ -0,0 +1,4 @@ +#This is a configure file of the GPU +#The last line begin without '#' should have space separated numbers: +#arraySize blockSize +3000000 128 3000000 \ No newline at end of file diff --git a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/kernel.cu b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/kernel.cu index ab9ba02..d050ef5 100644 --- a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/kernel.cu +++ b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/kernel.cu @@ -1,15 +1,80 @@ #include "cuda_runtime.h" #include "device_launch_parameters.h" - +#include +#include +#include +#include + +//print and file +#include +#include +#include #include -#include +//#include #include +#include #include #include "Macros.h" + + +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////Helper functions//////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +void printResult(const int* in, const int& size){ + for (int i = 0; i < size; i++){ + printf("%d ", in[i]); + } + printf("\n"); +} + +void RandArray(int* arr, const int& size){ + for (int i = 0; i < size; i++){ + arr[i] = rand() % 201-100; + + } + +} + +void RandLotsZeros(int* arr, const int& size){ + for (int i = 0; i < size; i++){ + arr[i] = rand() % 2; + if (arr[i] != 0) arr[i] = rand() % 21 - 10; + } +} + +bool GPU_MemHelper(const int* src, int* &dst, const int& size, const cudaMemcpyKind& type = cudaMemcpyHostToDevice){ + cudaError_t cudaStatus; + if (type == cudaMemcpyHostToDevice) + cudaStatus = cudaMalloc((void**)&dst, size*sizeof(int)); + //if (!cudaStatus) return false; + cudaStatus = cudaMemcpy(dst, src, size*sizeof(int), type); + return cudaStatus; +} +template +bool verifyResult(const T& a, const T& b, const int& size){ + for (int i = 0; i < size; i++) + if (a[i] != b[i]) { + printf("The %dth is expected %d, but got %d!\n", i, a[i], b[i]); + //return false; + } + return true; +} + +template +void swap(T& a, T& b){ + T c = a; + a = b; + b = c; +} + //////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////// @@ -24,7 +89,18 @@ void CPU_PrefixSumEx(int* out, const int* in, const int& size){ out[i] = in[i - 1]+out[i-1]; } - +void CPU_Scatter(int* out, int & outsize, const int* in, const int& size){ + //out[0] = 0; + //for (int i = 0; i < INT_MAX>>4; i++){}; + //for (int i = 1; i <= size; i++) + //out[i] = (in[i - 1]!=0) + out[i - 1]; + for (int i = 0; i < size; i++) + if (*(in + i) == 0) continue; + else { + *(out+outsize) = *(in + i); + outsize++; + } +} //////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////// @@ -32,10 +108,10 @@ void CPU_PrefixSumEx(int* out, const int* in, const int& size){ //////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////// -__global__ void GPU_PrefixSumEx_Naive(int* out, const int* in, int size,int i){ +__global__ void GPU_PrefixSumEx_Naive(int* out, const int* in, int i,int size=arraySize){ int id = (blockIdx.x*blockDim.x)+threadIdx.x; - if (id > i&&id<=size){ + if (id > i&&id>= 1; + if (thid == 0){ + multiBlock[blockIdx.x] = cin[size - 1]; + //out[arraySize] = cin[it-1]; + cin[size-1] = 0; + } + int upbound = it; + while (it>1){ + it >>= 1; + lid >>= 1; + if (lid <= upbound){ + int rid = lid-it; + int tmp = cin[lid - 1]; + cin[lid - 1] += cin[rid - 1]; + cin[rid-1] = tmp; + } + __syncthreads(); + } + + //swap(cin, cout); + out[id] = cin[threadIdx.x]; + + __syncthreads(); + /*for (int i = 1; i < blocks; i <<= 1) + if (blockIdx.x >= i) cout[threadIdx.x] += multiBlock[blockIdx.x - i];*/ + } +} +__global__ void GPU_final(int* out, int*in, int size){ + out[size] = out[size - 1] + in[size - 1]; +} +__global__ void GPU_append(int *out, int *val, int loc,int pos=0){ + out[loc] = val[pos]; +} +__global__ void GPU_add(int *out, int *src, int size = arraySize){ + if (blockIdx.x == 0) return; + //int blockid = (blockIdx.x*blockDim.x); + //int thid = threadIdx.x; int id = (blockIdx.x*blockDim.x) + threadIdx.x; - if (i == 0 && id < size) { - out[id + 1] = in[id]; - //out[0] = 0; + if (id < size){ + out[id] += src[blockIdx.x]; + } +} +void rec_GPU_PO(int* out, int* src, int size = arraySize, int* multiBlock = NULL){ + int blocks = (size-1) / blockSize+1; + if (!multiBlock) + cudaMalloc((void**)&multiBlock, (blocks+1)*sizeof(int)); + GPU_PrefixSumEx_Optimized << < blocks, blockSize, blockSize*sizeof(int) >> >(out, src, multiBlock,blockSize); + cudaThreadSynchronize(); + if (blocks == 1){ + GPU_append << <1, 1 >> >(out, multiBlock, size); + return; } - else if (id >= i&&id <= size){ - out[id] =in[id]+in[id - i]; + else{ + rec_GPU_PO(multiBlock, multiBlock, blocks); + GPU_add<<>>(out, multiBlock, size); + cudaThreadSynchronize(); + GPU_append << <1, 1 >> >(out, multiBlock, size, blocks); } - else if (id < i) out[id] = in[id]; - + if (size == arraySize){ + GPU_final << <1, 1 >> >(out, src, size); + } + cudaFree(multiBlock); } + //////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////// -////////////////////////////////Optimized Parrallel with Scatter//////////////////////////////// +////////////////////////////////Optimized Parrallel for Scatter///////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////// -__global__ void GPU_PrefixSumEx_Optimized_WithScatter(int* out, const int* in, const int& size){ +__global__ void GPU_PrefixSumEx_Optimized_WithScatter(int* out, int* src, int* multiBlock, int size, int dataSize = scatterSize){ + extern __shared__ int cache[]; + int *cin = cache; + //int *cout = &cache[blockSize]; + int blockid = (blockIdx.x*blockDim.x); + int thid = threadIdx.x; + int id = (blockIdx.x*blockDim.x) + threadIdx.x; -} + if (id < dataSize){ + cin[threadIdx.x] = (src[id]!=0); + //cin[2 * threadIdx.x+1] = src[blockid + 2 * threadIdx.x+1]; + __syncthreads(); + //swap(in, out); + int lid = (1 + thid) << 1; + int it = 1; -//////////////////////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////// -///////////////////Optimized Parrallel with Scatter and Bank Conflicts////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////// -__global__ void GPU_PrefixSumEx_Optimized_WithScatter_BankConflicts(int* out, const int* in, const int& size){ + for (it = 1; it < size; it *= 2){ + + if (lid <= size + 1){ + int rid = lid - it; + cin[lid - 1] += cin[rid - 1]; + } + lid <<= 1; + __syncthreads(); + //swap(cin, cout); + } + //it >>= 1; + if (thid == 0){ + multiBlock[blockIdx.x] = cin[size - 1]; + //out[arraySize] = cin[it-1]; + cin[size - 1] = 0; + } + int upbound = it; + while (it>1){ + it >>= 1; + lid >>= 1; + if (lid <= upbound){ + int rid = lid - it; + int tmp = cin[lid - 1]; + cin[lid - 1] += cin[rid - 1]; + cin[rid - 1] = tmp; + } + __syncthreads(); + } + + //swap(cin, cout); + out[id] = cin[threadIdx.x]; + + __syncthreads(); + /*for (int i = 1; i < blocks; i <<= 1) + if (blockIdx.x >= i) cout[threadIdx.x] += multiBlock[blockIdx.x - i];*/ + } } +void rec_GPU_POS(int* out, int* src, int size = scatterSize, int* multiBlock = NULL){ + int blocks = (size - 1) / blockSize + 1; + if (!multiBlock) + cudaMalloc((void**)&multiBlock, (blocks + 1)*sizeof(int)); + GPU_PrefixSumEx_Optimized_WithScatter << < blocks, blockSize, blockSize*sizeof(int) >> >(out, src, multiBlock, blockSize); + cudaThreadSynchronize(); + if (blocks == 1){ + GPU_append << <1, 1 >> >(out, multiBlock, size); + return; + } + else{ + rec_GPU_PO(multiBlock, multiBlock, blocks); + GPU_add << > >(out, multiBlock, size); + cudaThreadSynchronize(); + GPU_append << <1, 1 >> >(out, multiBlock, size, blocks); + } + + if (size == scatterSize){ + GPU_final << <1, 1 >> >(out, src, size); + } + cudaFree(multiBlock); +} +__global__ void GPU_scatter_cp(int* out, int* aux, int* in, int size = scatterSize){ + int id = (blockIdx.x*blockDim.x) + threadIdx.x; + if (id < size&&in[id] != 0){ + out[aux[id]] = in[id]; + } + __syncthreads(); +} //////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////// -////////////////////////////////Helper functions//////////////////////////////////////////////// +///////////////////Optimized Parrallel with Bank Conflicts resolving//////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////// -void printResult(const int* in, const int& size){ - for (int i = 0; i < size;i++){ - printf("%d ",in[i]); - } - printf("\n"); -} +__global__ void GPU_PrefixSumEx_Optimized_WithScatter_BankConflicts(int* out, const int* in, const int& size){ -void RandArray(int* arr, const int& size){ - for (int i = 0; i < size; i++) - arr[i] = rand() % 201-100; -} -bool GPU_MemHelper(const int* src, int* &dst, const int& size, const cudaMemcpyKind& type = cudaMemcpyHostToDevice){ - cudaError_t cudaStatus; - if (type == cudaMemcpyHostToDevice) - cudaStatus=cudaMalloc((void**)&dst, size*sizeof(int)); - //if (!cudaStatus) return false; - cudaStatus = cudaMemcpy(dst, src, size*sizeof(int), type); - return cudaStatus; -} -bool verifyResult(const int* a, const int* b,const int& size){ - for (int i = 0; i < size; i++) - if (a[i] != b[i]) { - printf("The %dth is expected %d, but got %d!\n" ,i, a[i], b[i]); - return false; - } - return true; } -template -void swap(T& a, T& b){ - T c = a; - a = b; - b = c; -} + //////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////// @@ -133,26 +328,43 @@ void swap(T& a, T& b){ //////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////// - int main() { + std::ifstream fin("config.cfg"); + char buff[80]; + while (fin.getline(buff, 80)){ + if (buff[0] == '#') continue; + std::istringstream is(buff); + is >> arraySize >> blockSize>>scatterSize; + } + fin.close(); + + blkHost = (int)ceil((arraySize) / (float)blockSize); //const int arraySize = 6; srand(time(NULL)); - int a[arraySize];// = { 3, 4, 6, 7, 9, 10 }; + int *a=new int[arraySize];// = { 3, 4, 6, 7, 9, 10 }; RandArray(a, arraySize); //const int b[arraySize] = { 10, 20, 30, 40, 50 }; - int c[arraySize+1] = { 0 }; - - std::chrono::time_point start, end; + int *c=new int [arraySize+1]; + memset(c, 0, (arraySize + 1)*sizeof(int)); + //std::chrono::time_point start, end; + LARGE_INTEGER large_interger; + __int64 start, end; + double diff; + QueryPerformanceFrequency(&large_interger); + diff = large_interger.QuadPart; ///////////////////////////////////////////// //////////////CPU calling//////////////////// ///////////////////////////////////////////// - start = std::chrono::high_resolution_clock::now(); + QueryPerformanceCounter(&large_interger); + start = large_interger.QuadPart; + CPU_PrefixSumEx(c, a, arraySize); - end = std::chrono::high_resolution_clock::now(); + QueryPerformanceCounter(&large_interger); + end = large_interger.QuadPart; - printf("CPU_Version:\t %f ms:\n", std::chrono::duration(end - start).count()*1000.0f); + printf("CPU_Version:\t %f ms:\n", 1000 * (end - start)/diff); //printResult(c, arraySize+1); ///////////////////////////////////////////// @@ -165,38 +377,105 @@ int main() ///////////////////////////////////////////// { - int *src, *res, *buff; + int *src, *res; int *host_res = new int[arraySize + 1]; memset(host_res, 0, (arraySize + 1)*sizeof(int)); GPU_MemHelper(a, src, arraySize); - GPU_MemHelper(host_res, buff, arraySize + 1); + //GPU_MemHelper(host_res, buff, arraySize + 1); GPU_MemHelper(host_res, res, arraySize + 1); - //cudaMalloc((void**)&res, (arraySize + 1)*sizeof(int)); - - start = std::chrono::high_resolution_clock::now(); - //GPU_PrefixSumEx_Optimized << < (int)ceil((arraySize + 1) / (float)blockSize), blockSize >> >(res, src, arraySize, 0); - GPU_MemHelper(res, host_res, arraySize + 1, cudaMemcpyDeviceToHost); - if (verifyResult(a, host_res, arraySize)) printf("GPU first step verifed OK!\n"); + QueryPerformanceCounter(&large_interger); + start = large_interger.QuadPart; + rec_GPU_PO(res, src); + - for (int i = 1; i <= arraySize; i <<= 1){ - //GPU_PrefixSumEx_Optimized << < (int)ceil((arraySize + 1) / (float)blockSize), blockSize >> >(buff, res, arraySize, i); - swap(res, buff); - } - end = std::chrono::high_resolution_clock::now(); + QueryPerformanceCounter(&large_interger); + end = large_interger.QuadPart; GPU_MemHelper(res, host_res, arraySize + 1, cudaMemcpyDeviceToHost); - if (verifyResult(c, host_res, arraySize + 1)) printf("GPU Naive verifed OK!\n"); - printf("GPU_Optimized_Version:\t %f ms:\n", std::chrono::duration(end - start).count()*1000.0f); + + if (verifyResult(c, host_res, arraySize + 1)) printf("GPU Second Step verifed OK!\n"); + printf("GPU_Optimized_Version:\t %f ms:\n", 1000 * (end - start) / diff); free(host_res); cudaFree(res); cudaFree(src); cudaDeviceReset(); } + + ///////////////////////////////////////////// + //////////////CPU scatter calling//////////// + ///////////////////////////////////////////// + int *Host_before_Scatter = new int[scatterSize]; + RandLotsZeros(Host_before_Scatter, scatterSize); + //printResult(Host_before_Scatter, scatterSize); + int *Host_after_Scatter = new int[scatterSize]; + int sizeb=0; + QueryPerformanceCounter(&large_interger); + start = large_interger.QuadPart; + CPU_Scatter(Host_after_Scatter, sizeb, Host_before_Scatter, scatterSize); + QueryPerformanceCounter(&large_interger); + end = large_interger.QuadPart; + printf("CPU_Scatter:\t %f ms:\n", 1000 * (end - start) / diff); + //printResult(Host_after_Scatter, sizeb); + //printf("scatterSize-sizeb=%d\n", scatterSize - sizeb); + ///////////////////////////////////////////// + //////////////GPU scatter calling//////////// + ///////////////////////////////////////////// + { + int* Dev_before_Scatter; + int* allzero = new int[scatterSize + 1]; + memset(allzero, 0, (scatterSize + 1)*sizeof(int)); + GPU_MemHelper(Host_before_Scatter, Dev_before_Scatter, scatterSize); + int* Dev_aux_Scatter; + GPU_MemHelper(allzero, Dev_aux_Scatter, scatterSize+1); + int* Dev_after_Scatter; + GPU_MemHelper(allzero, Dev_after_Scatter, scatterSize+1); + + QueryPerformanceCounter(&large_interger); + start = large_interger.QuadPart; + + rec_GPU_POS(Dev_aux_Scatter, Dev_before_Scatter); + GPU_scatter_cp << <(int)ceil((scatterSize) / (float)blockSize) , blockSize >> >(Dev_after_Scatter, Dev_aux_Scatter, Dev_before_Scatter); + + QueryPerformanceCounter(&large_interger); + end = large_interger.QuadPart; + + int* host_res=new int[scatterSize]; + GPU_MemHelper(Dev_after_Scatter, host_res, scatterSize, cudaMemcpyDeviceToHost); + if (verifyResult(Host_after_Scatter, host_res, sizeb)) printf("GPU Scatter verifed OK!\n"); + printf("GPU_Scatter_Version:\t %f ms:\n", 1000 * (end - start) / diff); + free(host_res); + cudaFree(Dev_after_Scatter); + cudaFree(Dev_aux_Scatter); + cudaFree(Dev_before_Scatter); + cudaDeviceReset(); + } + ///////////////////////////////////////////// + /////////GPU Thrust::scatter calling///////// + ///////////////////////////////////////////// + { + int* host_res = new int[scatterSize]; + struct notzero{ + __host__ __device__ bool operator()(const int x){ + return x != 0; + } + }; + + QueryPerformanceCounter(&large_interger); + start = large_interger.QuadPart; + thrust::copy_if(Host_before_Scatter, Host_before_Scatter + scatterSize, host_res, notzero()); + QueryPerformanceCounter(&large_interger); + end = large_interger.QuadPart; + + if (verifyResult(Host_after_Scatter, host_res, sizeb)) printf("Thrust Scatter verifed OK!\n"); + printf("Thrust::GPU_Scatter_Version:\t %f ms:\n", 1000 * (end - start) / diff); + free(host_res); + cudaDeviceReset(); + } ///////////////////////////////////////////// //////////////Naive calling////////////////// ///////////////////////////////////////////// @@ -206,17 +485,19 @@ int main() memset(host_res, 0, (arraySize + 1)*sizeof(int)); GPU_MemHelper(a, src, arraySize); GPU_MemHelper(host_res, res, arraySize + 1); - start = std::chrono::high_resolution_clock::now(); + QueryPerformanceCounter(&large_interger); + start = large_interger.QuadPart; for (int i = 0; i <= arraySize; i++) - GPU_PrefixSumEx_Naive << < (int)ceil((arraySize + 1) / (float)blockSize), blockSize >> >(res, src, arraySize, i); - end = std::chrono::high_resolution_clock::now(); + GPU_PrefixSumEx_Naive << < (int)ceil((arraySize + 1) / (float)blockSize), blockSize >> >(res, src, i); + QueryPerformanceCounter(&large_interger); + end = large_interger.QuadPart; GPU_MemHelper(res, host_res, arraySize + 1, cudaMemcpyDeviceToHost); if (verifyResult(c, host_res, arraySize + 1)) printf("GPU Naive verifed OK!\n"); - printf("GPU_Naive_Version:\t %f ms:\n", std::chrono::duration(end - start).count()*1000.0f); + printf("GPU_Naive_Version:\t %f ms:\n", 1000 * (end - start) / diff); free(host_res); cudaFree(res); cudaFree(src); - //cudaDeviceReset(); + cudaDeviceReset(); } From e2bcedd2c2b9628630d33f3400db23e98208d920 Mon Sep 17 00:00:00 2001 From: chiwsy Date: Tue, 30 Sep 2014 18:35:11 -0400 Subject: [PATCH 3/6] Comment update --- .../CIS565_2014_Fall_StreamCompaction/config.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/config.cfg b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/config.cfg index d42d8c9..b4fea95 100644 --- a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/config.cfg +++ b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/config.cfg @@ -1,4 +1,4 @@ #This is a configure file of the GPU #The last line begin without '#' should have space separated numbers: -#arraySize blockSize +#arraysize_for_PrefixSum blockSize arraysize_for_Scattering 3000000 128 3000000 \ No newline at end of file From df57fba3903ba4db9047b0edcd04fddcdb44cc18 Mon Sep 17 00:00:00 2001 From: chiwsy Date: Tue, 30 Sep 2014 23:00:19 -0400 Subject: [PATCH 4/6] All set --- .../CIS565_2014_Fall_StreamCompaction.v12.suo | Bin 45568 -> 49152 bytes .../CIS565_2014_Fall_StreamCompaction.vcxproj | 2 +- .../Macros.h | 2 +- .../config.cfg | 2 +- .../kernel.cu | 253 ++++++++++++++++-- 5 files changed, 237 insertions(+), 22 deletions(-) diff --git a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction.v12.suo b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction.v12.suo index 14e0692b849b5040948c30bbc292b676bd57bc59..bb11066b1422028b59cdacfa10452cfc41a67aa0 100644 GIT binary patch delta 2146 zcmcJQeN0UC2#XfmiO4$kV^NdK45R>wN0JZ|l7eI- z=}?lFka`eXnTQ@qMp_VZxPwLww5mF6*$>l&DMfq3y+^~k4D~+5fRrP&dIf45;vgSm zNA=7dYMA1h&`RE$Q9pyMk2NKv9>K=5$iql0(uTAnrJ<)hBJb3`v-j5a1LxZsI9iq@ z4n41-rj3&5uunP_bqtb*L?K(lN#i6G&QNP(BNEP>uq_=mg`F};A%AXFK_XgR@L;I;TzgS>1}GR*|KA@N@m7zo9J=yYf}_Ngd>Tj_Ql z)KEsL5`u=czG(SRr-q5~HMErMiJxcMVD2c4G)3-Ce3OZ04|^57W;f5=vOTlFJm5`p z(UbZvoPQ4?5qCN?R7^@Fv~Jed8WVP`n}+Nmbq{t_u#ypaEf>==I^TuiDRg3~z78|| zlh=xrOtNwaSUuo*WMr}~pZRk$nenar&R>##vJQDqnm=*24Sp(H^Nb9RUd@1(t2r=~ zZ#V9C+E%iScVDkCWG?nQkN*e@`Btfvi}7`kZp)$z2w!@H`3tSg63jYf9%;m7Nz++A zjzE2pL-jm0#*1?5A~UGt605D? zIMXi&sLxbDg zpyN-Oq2v4#cq@IZ36?AGXc-OCbxMPD1-p?$2nn3Gw?jvHOO)qyR}1=U%&*dhnlGMc zgC_e$n94aUmZ_Dey`7SS)31Pwd71w6w$~CTf@@asLIsQcoGDBj}jr4)IjA^nuRbvxmnG z)BO;XtiT){HK@5XAFRf=n|1~DAIFPmGLWi_~NWh=08SvSA|86X=YGBbvgolzKU zRR|%$3!F!8;U=d@NTcX>l&Ddx5&WP=h?j`s2R;}QBOgY>jKcu_pROe}CWJ(NlV6|b zefnO`d2S~VNC^BQ@2L>!1MX7U(lAoxYXAA7C?i$w7guOluD3`fk|a3*9oGv@kZ+Kh z;8TjBDN$<*RKEj#?5Iwe=KOqf+?qL2-3?g|Twp2PFlDw^LaPEEFbh3r&3=(V5@%meem0Zu>U48A%$I3o zQmkMW&L`wFiE=6m$RS%3IpC)R-72iJ zL_E&59J#P-+P6vqCS07OhKIIi9>aulh=;wYr9)lop3cKm6^=wP40sm*H@v2&H8`mabPj^b$dx)fKe5c8syRRu$SbZ{$Vqd#^ciuQ4*i zC351+lC$$~VP4?!nj-Oj%~dda1O9yubb;sNR;ti@z^15ebKHMhp>G3yfD7`lar+TC zcBqE?5F9aHeS&K(GN?<jLaP)Yc{XK z-w*ETqv>T^%r&D0O~xNKMd%A;Dr;+_W37sDz4@A;`khu|$EvU?qWr$bDwDmf&*)qC zqR@7Q*hr@C)QL{%X=#qsB6Uby3QQ{vZ1$D8C9kwrYL!&!8ELEJ$7jNc|D+94C%%Fk zvAk2_elED#j_xF1PdNpaFHG;2+9T_@K4XW@qk&Df!JgM;8r)*1u>VDpdn=8=W{(&w zcL?YGa+JfsSMjG~DOd)s0%~&=-^< 64 - compute_30,sm_30 + compute_20,sm_20 diff --git a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/Macros.h b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/Macros.h index 36bede4..1ccfec0 100644 --- a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/Macros.h +++ b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/Macros.h @@ -8,4 +8,4 @@ int blkHost; #define NUM_BANKS 32 #define LOG_NUM_BANKS 5 -#define CONFLICT_FREE_OFFSET(n) ((n) >> NUM_BANKS + (n) >> (2 * LOG_NUM_BANKS)) \ No newline at end of file +#define CONFLICT_FREE_OFFSET(n) (NUM_BANKS*((n) >> LOG_NUM_BANKS)+((n)+((n) >> LOG_NUM_BANKS))%NUM_BANKS) \ No newline at end of file diff --git a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/config.cfg b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/config.cfg index b4fea95..d789b9f 100644 --- a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/config.cfg +++ b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/config.cfg @@ -1,4 +1,4 @@ #This is a configure file of the GPU #The last line begin without '#' should have space separated numbers: #arraysize_for_PrefixSum blockSize arraysize_for_Scattering -3000000 128 3000000 \ No newline at end of file +80000 128 80000 \ No newline at end of file diff --git a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/kernel.cu b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/kernel.cu index d050ef5..e0e0cab 100644 --- a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/kernel.cu +++ b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/kernel.cu @@ -131,21 +131,21 @@ __global__ void GPU_PrefixSumEx_Optimized(int* out, int* src,int* multiBlock,int int blockid = (blockIdx.x*blockDim.x); int thid = threadIdx.x; int id = (blockIdx.x*blockDim.x) + threadIdx.x; - + if (id < dataSize){ cin[threadIdx.x] = src[id]; //cin[2 * threadIdx.x+1] = src[blockid + 2 * threadIdx.x+1]; __syncthreads(); //swap(in, out); - int lid = (1+thid)<<1; - int it=1; - + int lid = (1 + thid) << 1; + int it = 1; + for (it = 1; it < size; it *= 2){ - - if (lid<= size+1){ - + + if (lid <= size + 1){ + int rid = lid - it; - cin[lid-1] += cin[rid-1]; + cin[lid - 1] += cin[rid - 1]; } lid <<= 1; __syncthreads(); @@ -155,35 +155,36 @@ __global__ void GPU_PrefixSumEx_Optimized(int* out, int* src,int* multiBlock,int if (thid == 0){ multiBlock[blockIdx.x] = cin[size - 1]; //out[arraySize] = cin[it-1]; - cin[size-1] = 0; + cin[size - 1] = 0; } int upbound = it; while (it>1){ it >>= 1; lid >>= 1; if (lid <= upbound){ - int rid = lid-it; + int rid = lid - it; int tmp = cin[lid - 1]; cin[lid - 1] += cin[rid - 1]; - cin[rid-1] = tmp; + cin[rid - 1] = tmp; } __syncthreads(); } //swap(cin, cout); out[id] = cin[threadIdx.x]; - + __syncthreads(); - /*for (int i = 1; i < blocks; i <<= 1) - if (blockIdx.x >= i) cout[threadIdx.x] += multiBlock[blockIdx.x - i];*/ } } + __global__ void GPU_final(int* out, int*in, int size){ out[size] = out[size - 1] + in[size - 1]; } + __global__ void GPU_append(int *out, int *val, int loc,int pos=0){ out[loc] = val[pos]; } + __global__ void GPU_add(int *out, int *src, int size = arraySize){ if (blockIdx.x == 0) return; //int blockid = (blockIdx.x*blockDim.x); @@ -316,11 +317,163 @@ __global__ void GPU_scatter_cp(int* out, int* aux, int* in, int size = scatterSi //////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////// -__global__ void GPU_PrefixSumEx_Optimized_WithScatter_BankConflicts(int* out, const int* in, const int& size){ +__global__ void GPU_PrefixSumEx_Optimized_BankConflicts(int* out, int* src, int* multiBlock, int size, int dataSize = arraySize){ + extern __shared__ int cache[]; + int *cin = cache; + //int *cout = &cache[blockSize]; + int blockid = (blockIdx.x*blockDim.x); + int thid = threadIdx.x; + int id = (blockIdx.x*blockDim.x) + threadIdx.x; + int offset = 1; + + if (id < dataSize){ + + cin[CONFLICT_FREE_OFFSET(thid)] = src[id]; + + __syncthreads(); + //swap(in, out); + int lid = (1 + thid) << 1; + int it = 1; + + for (it = 1; it < size; it *= 2){ + + if (lid <= size + 1){ + + int rid = lid - it; + cin[CONFLICT_FREE_OFFSET(lid - 1)] += cin[CONFLICT_FREE_OFFSET(rid - 1)]; + } + lid <<= 1; + __syncthreads(); + //swap(cin, cout); + } + //it >>= 1; + if (thid == 0){ + multiBlock[blockIdx.x] = cin[CONFLICT_FREE_OFFSET(size - 1)]; + //out[arraySize] = cin[it-1]; + cin[CONFLICT_FREE_OFFSET(size - 1)] = 0; + } + int upbound = it; + while (it>1){ + it >>= 1; + lid >>= 1; + if (lid <= upbound){ + int rid = lid - it; + int tmp = cin[CONFLICT_FREE_OFFSET(lid - 1)]; + cin[CONFLICT_FREE_OFFSET(lid - 1)] += cin[CONFLICT_FREE_OFFSET(rid - 1)]; + cin[CONFLICT_FREE_OFFSET(rid - 1)] = tmp; + } + __syncthreads(); + } + + //swap(cin, cout); + out[id] = cin[CONFLICT_FREE_OFFSET(thid)]; + + __syncthreads(); + + } +} + +void rec_GPU_POBR(int* out, int* src, int size = arraySize, int* multiBlock = NULL){ + int blocks = (size - 1) / blockSize + 1; + if (!multiBlock) + cudaMalloc((void**)&multiBlock, (blocks + 1)*sizeof(int)); + GPU_PrefixSumEx_Optimized_BankConflicts << < blocks, blockSize, blockSize*sizeof(int) >> >(out, src, multiBlock, blockSize); + cudaThreadSynchronize(); + if (blocks == 1){ + GPU_append << <1, 1 >> >(out, multiBlock, size); + return; + } + else{ + rec_GPU_POBR(multiBlock, multiBlock, blocks); + GPU_add << > >(out, multiBlock, size); + cudaThreadSynchronize(); + GPU_append << <1, 1 >> >(out, multiBlock, size, blocks); + } + + if (size == arraySize){ + GPU_final << <1, 1 >> >(out, src, size); + } + cudaFree(multiBlock); +} + +__global__ void GPU_PrefixSumEx_Optimized_WithScatter_BankConflicts(int* out, int* src, int* multiBlock, int size, int dataSize = arraySize){ + extern __shared__ int cache[]; + int *cin = cache; + //int *cout = &cache[blockSize]; + int blockid = (blockIdx.x*blockDim.x); + int thid = threadIdx.x; + int id = (blockIdx.x*blockDim.x) + threadIdx.x; + int offset = 1; + + if (id < dataSize){ + + cin[CONFLICT_FREE_OFFSET(thid)] = (src[id]!=0); + + __syncthreads(); + //swap(in, out); + int lid = (1 + thid) << 1; + int it = 1; + + for (it = 1; it < size; it *= 2){ + if (lid <= size + 1){ + + int rid = lid - it; + cin[CONFLICT_FREE_OFFSET(lid - 1)] += cin[CONFLICT_FREE_OFFSET(rid - 1)]; + } + lid <<= 1; + __syncthreads(); + //swap(cin, cout); + } + //it >>= 1; + if (thid == 0){ + multiBlock[blockIdx.x] = cin[CONFLICT_FREE_OFFSET(size - 1)]; + //out[arraySize] = cin[it-1]; + cin[CONFLICT_FREE_OFFSET(size - 1)] = 0; + } + int upbound = it; + while (it>1){ + it >>= 1; + lid >>= 1; + if (lid <= upbound){ + int rid = lid - it; + int tmp = cin[CONFLICT_FREE_OFFSET(lid - 1)]; + cin[CONFLICT_FREE_OFFSET(lid - 1)] += cin[CONFLICT_FREE_OFFSET(rid - 1)]; + cin[CONFLICT_FREE_OFFSET(rid - 1)] = tmp; + } + __syncthreads(); + } + + //swap(cin, cout); + out[id] = cin[CONFLICT_FREE_OFFSET(thid)]; + + __syncthreads(); + + } } +void rec_GPU_POSBR(int* out, int* src, int size = scatterSize, int* multiBlock = NULL){ + int blocks = (size - 1) / blockSize + 1; + if (!multiBlock) + cudaMalloc((void**)&multiBlock, (blocks + 1)*sizeof(int)); + GPU_PrefixSumEx_Optimized_WithScatter_BankConflicts << < blocks, blockSize, blockSize*sizeof(int) >> >(out, src, multiBlock, blockSize); + cudaThreadSynchronize(); + if (blocks == 1){ + GPU_append << <1, 1 >> >(out, multiBlock, size); + return; + } + else{ + rec_GPU_POBR(multiBlock, multiBlock, blocks); + GPU_add << > >(out, multiBlock, size); + cudaThreadSynchronize(); + GPU_append << <1, 1 >> >(out, multiBlock, size, blocks); + } + if (size == scatterSize){ + GPU_final << <1, 1 >> >(out, src, size); + } + cudaFree(multiBlock); +} //////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////// @@ -376,7 +529,7 @@ int main() //////////////Optimized calling////////////// ///////////////////////////////////////////// { - + cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); int *src, *res; int *host_res = new int[arraySize + 1]; memset(host_res, 0, (arraySize + 1)*sizeof(int)); @@ -390,7 +543,7 @@ int main() QueryPerformanceCounter(&large_interger); start = large_interger.QuadPart; - rec_GPU_PO(res, src); + rec_GPU_POBR(res, src); QueryPerformanceCounter(&large_interger); @@ -405,6 +558,36 @@ int main() cudaDeviceReset(); } + { + cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); + int *src, *res; + int *host_res = new int[arraySize + 1]; + memset(host_res, 0, (arraySize + 1)*sizeof(int)); + GPU_MemHelper(a, src, arraySize); + + + + //GPU_MemHelper(host_res, buff, arraySize + 1); + GPU_MemHelper(host_res, res, arraySize + 1); + + QueryPerformanceCounter(&large_interger); + start = large_interger.QuadPart; + + rec_GPU_PO(res, src); + + + QueryPerformanceCounter(&large_interger); + end = large_interger.QuadPart; + GPU_MemHelper(res, host_res, arraySize + 1, cudaMemcpyDeviceToHost); + + if (verifyResult(c, host_res, arraySize + 1)) printf("GPU Second Step verifed OK!\n"); + printf("GPU_Optimized_Bank_Conflicts_Resolved_Version:\t %f ms:\n", 1000 * (end - start) / diff); + free(host_res); + cudaFree(res); + cudaFree(src); + cudaDeviceReset(); + } + ///////////////////////////////////////////// //////////////CPU scatter calling//////////// ///////////////////////////////////////////// @@ -426,6 +609,7 @@ int main() //////////////GPU scatter calling//////////// ///////////////////////////////////////////// { + cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); int* Dev_before_Scatter; int* allzero = new int[scatterSize + 1]; memset(allzero, 0, (scatterSize + 1)*sizeof(int)); @@ -454,6 +638,37 @@ int main() cudaFree(Dev_before_Scatter); cudaDeviceReset(); } + + { + cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); + int* Dev_before_Scatter; + int* allzero = new int[scatterSize + 1]; + memset(allzero, 0, (scatterSize + 1)*sizeof(int)); + GPU_MemHelper(Host_before_Scatter, Dev_before_Scatter, scatterSize); + int* Dev_aux_Scatter; + GPU_MemHelper(allzero, Dev_aux_Scatter, scatterSize + 1); + int* Dev_after_Scatter; + GPU_MemHelper(allzero, Dev_after_Scatter, scatterSize + 1); + + QueryPerformanceCounter(&large_interger); + start = large_interger.QuadPart; + + rec_GPU_POSBR(Dev_aux_Scatter, Dev_before_Scatter); + GPU_scatter_cp << <(int)ceil((scatterSize) / (float)blockSize), blockSize >> >(Dev_after_Scatter, Dev_aux_Scatter, Dev_before_Scatter); + + QueryPerformanceCounter(&large_interger); + end = large_interger.QuadPart; + + int* host_res = new int[scatterSize]; + GPU_MemHelper(Dev_after_Scatter, host_res, scatterSize, cudaMemcpyDeviceToHost); + if (verifyResult(Host_after_Scatter, host_res, sizeb)) printf("GPU Scatter Bank Conflicts Resolved verifed OK!\n"); + printf("GPU Scatter Bank Conflicts Resolved Version:\t %f ms:\n", 1000 * (end - start) / diff); + free(host_res); + cudaFree(Dev_after_Scatter); + cudaFree(Dev_aux_Scatter); + cudaFree(Dev_before_Scatter); + cudaDeviceReset(); + } ///////////////////////////////////////////// /////////GPU Thrust::scatter calling///////// ///////////////////////////////////////////// @@ -479,7 +694,7 @@ int main() ///////////////////////////////////////////// //////////////Naive calling////////////////// ///////////////////////////////////////////// - { + /*{ int *src, *res; int *host_res = new int[arraySize + 1]; memset(host_res, 0, (arraySize + 1)*sizeof(int)); @@ -498,7 +713,7 @@ int main() cudaFree(res); cudaFree(src); cudaDeviceReset(); - } + }*/ From 7bcceea20e51676216529dbd87084c5570e89aff Mon Sep 17 00:00:00 2001 From: chiwsy Date: Wed, 1 Oct 2014 17:13:09 -0400 Subject: [PATCH 5/6] Finished --- .../CIS565_2014_Fall_StreamCompaction.v12.suo | Bin 49152 -> 50176 bytes .../CIS565_2014_Fall_StreamCompaction.vcxproj | 2 +- .../kernel.cu | 44 +++++- .../output.log | 6 + Performance.xlsx | Bin 0 -> 64740 bytes README.md | 137 +----------------- performance.py | 20 +++ 7 files changed, 73 insertions(+), 136 deletions(-) create mode 100644 CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/output.log create mode 100644 Performance.xlsx create mode 100644 performance.py diff --git a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction.v12.suo b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction.v12.suo index bb11066b1422028b59cdacfa10452cfc41a67aa0..3db652b824c1b6af7304038a07df70537a3c0d41 100644 GIT binary patch delta 2165 zcmchYe{54#6vumSyYjlR(*0!Me!Q{rYsl=zHV}(;w&0w=ibGJGVU-PREE}w^b0}^V zi4Y_L?+_)CpAQoGLkae&;SY%r(2zg`(M0@3(0?>hiI|{)MC$jfEuzr`H1Q;#o_o)| z=broSx%V_4x5n37H+#w~8F5)G4zwk>mP{tuEGRE)(wiOYg)WyX+Zwvx{pz%+rDQ0c6Wu+}x(6SdOfMpa3+iio~a|52`1PGPnHAVoA`w%p5 zitQ}C3hNT+f^@qax-Q-3F_t#;)wGTGm9nOOtI?mBvN@C4>A~U9FrDs9&rkkPf2QABaREZCfs5oCSZjd6OoFYOpb#6H*_+)8Al;ofsGSM z2Xjgv3!%YNZ}sFktDHqU$ecW9yb|zwRnY(E!Ou^}<=FhGW6-~egLiabUJ)ou9ez;H zj;QRiN4C&^zh zgW@BO|BWrK7i(EF5Pi3~s(779G=3@E{phX&PlIQ`YVa%=0E2)xnW%3TRPG%^9+^#A zWl(`_=+0&y_YU)N!?!oLmQf<b%bPag@ zt;6*4fUL7nsn^YcMIXh=TluT>y*8oH6?LL$ph{UrKBrBofu{t#BQ7JKg8d{a{BTt$ zP;s{@1J?wd9Vnz=yNgb};HvDm#4MU6Y>5C3XK=~?qf@UB z$ch{GtA6s2ndP#(0}R!BXd@tvF)}eNAe2m{<`}ec{PS6_V%$boMzuHuzw0{ykC`jy zT4mYu-~Qp2OWY_Jvu#`RxWIlBd%}Gz7zIi+2+-GcBvO0?h7a?Yqv52- zq2=nVaHK2RJ}11SwHE9hwHeuT1njYEol@CY87P2oYvi+Zen mINwg*4hwyxjU!i?Q?K?>yev+>>h~PMpZuHm?pi$O};dMFU7L(l0xf+-Df^N~>3-&YQ;6u_4PAB)Wa zy#laP4YbOWt_P(waEd`opvkpLGbnsL%?Bt}b*=0B(9N=Nd@&_wQKhzM`kc=%)|YkndzA zWj@a6ftbdR0#~ASX7Mbg0|||biHp!0F5-0>4cA5yu125K?x2ant^LGe4 z<@$nVu-yF1k|La@RzQ{Ar$lJQ-99h0fJT#g@?)h&%S z`5$Evlj`!d+uw%OOHT8~Lw1j`Pm+A`sCo3s5n=R7qu)y-M#jt>tdM44_(zTQ9ynpT zo>>sa%TiDI9b1%E-$%vf?jX8QerWt230F+G(7%XV1MODV$!b@zcVM zlIeDRS|#Ph c`46_9|LXF<_pvA5{WZnTtS9MsvGI}iCo 64 - compute_20,sm_20 + compute_30,sm_30 diff --git a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/kernel.cu b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/kernel.cu index e0e0cab..2ec85cb 100644 --- a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/kernel.cu +++ b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/kernel.cu @@ -75,6 +75,12 @@ void swap(T& a, T& b){ b = c; } +void printusage(){ + printf("This is the help of this program!\n"); + printf("You can use the configure.cfg to configure the program as well!\n"); + printf("If you use cmd parameters, you have three option:\n"); + printf("-b : blockSize\n-p : prefix sum array size\n-s : scattering array size\n"); +} //////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////// @@ -481,7 +487,7 @@ void rec_GPU_POSBR(int* out, int* src, int size = scatterSize, int* multiBlock = //////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////// -int main() +int main(int argc, char** argv) { std::ifstream fin("config.cfg"); char buff[80]; @@ -491,7 +497,29 @@ int main() is >> arraySize >> blockSize>>scatterSize; } fin.close(); - + if (argc > 1){ + for (int i = 1; i < argc;){ + if (!strcmp(argv[i], "-b")){ + blockSize = atoi(argv[i + 1]); + i += 2; + } + else if (!strcmp(argv[i], "-p")){ + arraySize = atoi(argv[i + 1]); + i += 2; + } + else if (!strcmp(argv[i], "-s")){ + scatterSize = atoi(argv[i + 1]); + i += 2; + } + else{ + printf("Unrecognized Parameter: %s\n", argv[i]); + printusage(); + exit(-1); + } + } + } + std::ofstream fout("output.log", std::ios::app); + fout << blockSize << '\t' << arraySize << '\t' << scatterSize << '\t'; blkHost = (int)ceil((arraySize) / (float)blockSize); //const int arraySize = 6; srand(time(NULL)); @@ -518,6 +546,7 @@ int main() end = large_interger.QuadPart; printf("CPU_Version:\t %f ms:\n", 1000 * (end - start)/diff); + fout << 1000 * (end - start) / diff << '\t'; //printResult(c, arraySize+1); ///////////////////////////////////////////// @@ -552,6 +581,7 @@ int main() if (verifyResult(c, host_res, arraySize + 1)) printf("GPU Second Step verifed OK!\n"); printf("GPU_Optimized_Version:\t %f ms:\n", 1000 * (end - start) / diff); + fout << 1000 * (end - start) / diff << '\t'; free(host_res); cudaFree(res); cudaFree(src); @@ -582,6 +612,7 @@ int main() if (verifyResult(c, host_res, arraySize + 1)) printf("GPU Second Step verifed OK!\n"); printf("GPU_Optimized_Bank_Conflicts_Resolved_Version:\t %f ms:\n", 1000 * (end - start) / diff); + fout << 1000 * (end - start) / diff << '\t'; free(host_res); cudaFree(res); cudaFree(src); @@ -602,6 +633,8 @@ int main() QueryPerformanceCounter(&large_interger); end = large_interger.QuadPart; printf("CPU_Scatter:\t %f ms:\n", 1000 * (end - start) / diff); + + fout << 1000 * (end - start) / diff << '\t'; //printResult(Host_after_Scatter, sizeb); //printf("scatterSize-sizeb=%d\n", scatterSize - sizeb); @@ -632,6 +665,7 @@ int main() GPU_MemHelper(Dev_after_Scatter, host_res, scatterSize, cudaMemcpyDeviceToHost); if (verifyResult(Host_after_Scatter, host_res, sizeb)) printf("GPU Scatter verifed OK!\n"); printf("GPU_Scatter_Version:\t %f ms:\n", 1000 * (end - start) / diff); + fout << 1000 * (end - start) / diff << '\t'; free(host_res); cudaFree(Dev_after_Scatter); cudaFree(Dev_aux_Scatter); @@ -663,6 +697,7 @@ int main() GPU_MemHelper(Dev_after_Scatter, host_res, scatterSize, cudaMemcpyDeviceToHost); if (verifyResult(Host_after_Scatter, host_res, sizeb)) printf("GPU Scatter Bank Conflicts Resolved verifed OK!\n"); printf("GPU Scatter Bank Conflicts Resolved Version:\t %f ms:\n", 1000 * (end - start) / diff); + fout << 1000 * (end - start) / diff << '\t'; free(host_res); cudaFree(Dev_after_Scatter); cudaFree(Dev_aux_Scatter); @@ -688,13 +723,14 @@ int main() if (verifyResult(Host_after_Scatter, host_res, sizeb)) printf("Thrust Scatter verifed OK!\n"); printf("Thrust::GPU_Scatter_Version:\t %f ms:\n", 1000 * (end - start) / diff); + fout << 1000 * (end - start) / diff << '\n'; free(host_res); cudaDeviceReset(); } ///////////////////////////////////////////// //////////////Naive calling////////////////// ///////////////////////////////////////////// - /*{ + { int *src, *res; int *host_res = new int[arraySize + 1]; memset(host_res, 0, (arraySize + 1)*sizeof(int)); @@ -713,7 +749,7 @@ int main() cudaFree(res); cudaFree(src); cudaDeviceReset(); - }*/ + } diff --git a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/output.log b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/output.log new file mode 100644 index 0000000..b858342 --- /dev/null +++ b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/output.log @@ -0,0 +1,6 @@ +Blocksize PrefixArraySize ScatterArraySize CPUPrefixTime GPUOPTPrefixTime GPUOPTPrefixBankConfResolvedTime CPUScatteringTime GPUScatteringTime GPUScatteringTimeBankConfResolvedTime GPU::ThrustScatteringTime +128 80000 80000 0.1779 0.919437 0.827493 0.349814 0.997696 0.926707 0.320306 +128 80000 80000 0.214678 1.07767 1.06141 0.335274 0.882659 1.1679 0.316457 +128 80000 80000 0.18517 1.15849 0.869402 0.339123 0.856145 0.874106 0.320734 +128 80000 80000 0.175762 1.253 1.02036 0.336557 0.874962 1.0473 0.31774 +128 80000 80000 0.175334 0.998551 0.9498 0.451593 0.83904 0.868547 0.316885 diff --git a/Performance.xlsx b/Performance.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..720c662cf8e80a8d17d791b7edaea49aa365db1d GIT binary patch literal 64740 zcmeFYbx>Vhw)P7_f(9o*aCdiiclY4#?jGD7g1aZUTkzm68+UgJes|Ju_uG9s-KX!Z z`l`;U`|$_6sJ&}bbFF7R;~8WAro0pwI4TGv2owki2m#1>(3`QAHy|M0m>?i1AOoOU zLbf(e#x_nLmEG-(9d&5ktgZ0#dO*o@K!6|mzd!$ruRyPcOw1BJV!PU*Z>U3MsDA^% z8saz?Z@{JS^aX?s6Q8h}2yk5U+95>0*sBOVrhf1Vabx7ec$<3`bVRF=hOk_Y6Kx-3 z7RfaGo4D`P?>`$q{Wv#t_#n1q7k42Dqux)Kv(-8aV2cPzfpH3wgKdv&>{tGXF@1n8 zS%sqE-}P}$F==_da_MV$F;@pDYqqSdI^`!&ebQYsB3&AvTrkRU{Fc4%PunD$Uxm7- zXjTuWcSr>S?XPdHw~@RsugWMd6F)d*@`R1}XAa(Q$`K?SbO?VXR(is8RY8}qzFV2Z zZH4jV#ld{iD9c=Zv%Y&YY|7YCg?e7W8VqW7dQtrT-5U}MW%`D~&-5c3o+59&vs)E@v_F&xF zwZ7KR@p;+PrSJ&}&ifL<0T%sPDiz=$>D`@Ql zEC|T!D;S9UKR6bOZig`hCkOMCV4=> zS0?-j48>TTDR060udi9yXK?1gKXZ42y#j>LQRM9DB_9Ht)aPVmzT}fnR;Un z_A^oW^^4ENYH(yPcv(%e50RSK;D*I?8bxG)QYE_hGgG)u^Yg$dNutl~`QjxjNawJY8%&gY;@%90M>zJ-G0 zD1t_ z$6t}Caw4jrau(2v!xaH#j?N>Q-NGd1_>6hXBzw*4cpp;bJTz}w&izHvP4U1JY)UsY zCtDIv23l`LJ{R_D1{{shV}kR3D0Dkqj|s1N4h)p(C|^xIm42 zCkEnIgxYU<>chIX%MDg$ zfXadg2LV9-+(y$GH9EZi_?K!*dggWaZ~fD}~(y$B33 z4wvs5iAs57j*#4@Av+gWttOwgro?KudajB4SkyPo5;o)?)|z9A>dafMRCYLS2Vr1F%1L#9$@7{_LmpgI#&cM-Cyo>AYt zMO>2>^#oIO*O%Q%CjzrX|OEL8=_za3iCfM@`%r=2*Qm?c~SC3R$j=+(|s23Ugh! z^cFbt&Lu8GBVcTQCp}(}31aEvCEzH>Bre_Pf{sF;UXvBT?Jc z0MC-~uN4XEX9T0=aCRI#D~>C-MSib71is z!)2i3a{`<1Zv|kZ&JpVd0Rp0d1OkHbrvMzyjE$Wf>3+R1|8$}Zb%4zlE21~ugfHSX zZu)zLl&)KYc~?U%R2hJ`sud@?D!Ph97~B2BI}sgnQRtdkA;NKv49qM{#k`_yA8o}w z2~=ak+B7Ld7o0epZH6ISlhZZdN?XBsigI?Y{?d`&5pVRuNAs5>y6JN2jucY@%D3D` zJ0m=5Dn49ki6OO3y(IC?gCHD3o5*VCwQi}2?gX2LJp1vo{K7X)&} z+xW?s=Wa+Eg_)>?QB;GS++^+tW5v!vd@yf|2z+AbVsA1fXB^3{4jBCHEox*5WSQ=m znpecW>&D!ZARFn+#r0ArYK4o{ThM`romtSOO zc=AMsjVoeu72@yrJ&OIzx?K>#k$rn-{p3u8J+etIdtsJ3_9m1CIih&JRRYY!4^)6sH7jV75z!4Wp@u%s^5UBN}( zfaj>&kF`iL)O*Ck{C9pDWb1eD&gF~|ncuCyz7B(MHjRNeS>iltLxM}{c*?U?Mj({Z zPtKMAGkUojFU|I>zo^gFQU!@SH# zlA|N+uM|JiTuN#Y?Yo8R4>i8mT2W{G!jeFAXgH#YyWC-5BT8gl&mx>E;?I!PiPdSD zMopYMFGlz!)st(AUwQ$lzt?EXfYrnby3VbqW5y+--J&AYKRJs*syut1CMGzQiSCfL z+>|x_t=7kNPe>Zg74?-iYaZlmu|(H*`5n`TXib*xi9e|8zMb$P+vG4DryF8XzG_gBiTh+yU-J z@_b9f))XyAZW$iBW=+_XUth(y{V&fKFqN`msMbj%v7HriFBTDGoB{oq^zz+Xz6a-V`GXm-adGL zz)T=#HVYgCJJ^umBW>rS^){?3HZp%@t(!pmcgoFx_p_zMYO^$BAdBe59uj2-frNoAz*2r)a zT)pl6R?8yto9Jj@2jJ^$5|vEg0MXTmSe#01eGvpX96KxLl!u-z^QaJ9$U;eo2*5(K zB&&}4*u7yQU6lSCZ>>fB8KFpG^{U9n82{} zi|*}k1JzPG>MI1TnMbz>xLc!0^iLE3s~@bO+Swu+EFF8uVWMEvmHZN_IMv-o#LjhD zc4gG~@kDRjN%(rIJ0;)QA@U4WC>Cqq)X1J??<(FDs!1)~?d-?}wUsfDi;(5+_>|D{_q{^8aR z0H9l+Nx%B{xC()|-HCrE&Z|mr;dBOY6CoSf3nEcV0IEAXU;}dagD%N;0%=Y(GIX%1 z`74U>I~wPFa4m3txgNW>-f?A#3%LjVfb1^dF>Eo zFA5A!gvU`%Mrl=!^yZ~^yh1c@8v>LL*V*{hB(pq z#+GBYeNx0dxpuQ7J7@Y$_hdq^da`!fNPdjy_9+o%@>9S`O2Bu(T*#z1grr3n+htv~ zkM!%GpQX{cWsNVhfMUK{3spG}O5i#2wtLN8s8!n0M}YCw6t`)FK1sQl$uhCo*PqB; z)NCwnmHK;m34?LOwGaBgdEwlHQ%!6cOY_cd#x(-Ki|L{^jj8Ffl}krEWj-sdggT1J zaiX)Vp7r;<+HmKR2gx@g`Uw=m0adp947y(g?L!OpMrws|5Fve& z6phq^W$BZwM$hiFWE;Euehs!HX(N2|TUB*>^Jgd%btH|RaYyb@S{E8qj%-@h73m_Z>8YeQ2Ib)%aMSI@ZRCB$8d{7^#!UM0vvuus>Awh?lR% zY+_7DWUG)KeI~N?=uh>ic1^M`e-5AzC8T;@q1k|}=@H-{ZRMkEtqC}8IGy)X21OO9 zv9D{ZJ+vv+izE!-F_37S(%UCd&`V5mHmFF#Ym?Wr5A66s|4gtf$|TDEPJi)Xz+l;e zPdxpH{#o!zKnKYszqMOyWGPN$S!IQywbp65co||icgBR9yQz#-sXj8|+QLM`B@Uk? z;aVf5RVBSV;7ZWYw`)_go0UgOTF?7&0NK9W@!QwIl)$=`aV{LV5N2l9Mf?SRJvZwZ zXzXa1iVDj5jayyq5Jd=mVQ86e=QVPb%PHUK`2EEUk9xH!M#KcL<)vckckk?x)A#)P zIwzkndReyXRrSk-1gcCLR7JUPW~B@k%G^CHtNNt!Yt7=@rZoCdtecno;i6TWw z1bitkU*--#M)*wjKtEMx2&;fBrkeBx4KfMNK2oy5`IMXxRj-rIz$zO8KufUTp&zUL zRDoh!9O%K$n6icJl+$!a$(p#Fhan zWJYysR|EkI-&okIzd}5>RhO;qvF}B2=K+Xan*t;-Yl`GRkMcir1|lb z<;gR=!D}K)pA^&aEmk;Ea8N_+IMpAWXZ5r22-iv^TiudABj`H3-o>ccd>BPXH2V0^ z;Ej${Qy4<3w0hu;5M9#iR;;_9R8)qbTq)=t5ZyX-I{}6(GC2sTv>NwO%FGP58c}4v zVc4hADCUVxmB&QnWcvv!#jy#jYq(l*CHG+}w5IR}C6?LUQVf~dAcqq!287YM9{cBL zLFV}`+FdZ)EXiUck}tS5Y~pZb0?L?myVhd7j#3INXv0|Z9RAPnyWN*{BSv4SeO>$x zZr1!23u`U5IH|{RBj_|74@pS}{Kj()=+2LVlvYf0#OrzL;ExJkc?|xA0i%5><8B99 zGZnCBBKoJe{Fes)S7>JVTl*dqGYa#K9x>=F;0*EBX37IGu1Ht-obm=V+2}FQ8oB18 z*w&UdAYk3vH09Z(Xo;^_VVUA`Ju_Gl9e)th_U*iR6Z60H~AZWZu}}l(_+DrQ~WQ&`~rl#iP81k{?5U4-piyszTh0 z$ihq^$4}WO02g|v^Ig_BDL(4Ns2_u)jL3#!5`l?lIGrJiyMsB0lHMv6AAehZ{4a9& zdV%j${<(T12Ypv_8&k(Wh8KVQ`RD4}V#k52hYJC&Uf!W`#RUY@f(E;~jow-gbeG#kE_xxr(v9iIMdYCgfa=z3& z8|I)E3jPd}?0DZ=9l*Qb+c9mjnhU8QMxNkQKMc-rtO76D9~v-eMgrbx}**3H++Wza}{UC;>%Jpaj}O*HsR|sE2Of0*H;a)&QR)a`4>%i^iR4 zrRf9QVIM?r+o!)KLqIw7114{<#!mF>A-bV6`-Zrt&WVGIbZN^_WKu6t5DfL=D56>5 zC_ZH>T0y4A6(Ve37GBx+CmJ8C@H=*ADQlp9K;y%d_hsmtv!340$5kTMj;88R*+9RsLEOV9X(=s6%MU{I3EZA}G zYFVJ7%`CAqNZ?%!7LunW>~la3IXA2BsFC|>f@wuN6~bl2h-#d&td^K*BQcadXzomA zRFfo`E=qoKg-pbgR+z;SRN2FicKs!R>FZ#ZF>TRb62SQ>0nI-okjyv~@NmlTnmw?& zlgpc>{rlM+mSNn^ee{g{}3~)bP0Qel}pa~nb{&ayOuprjQz*cMX!!-b480Z2t z39+^&-=7_^dou~|R?XGFzX9jivOd#s1P_#etM@Mn zIQ^0UC)QsiaADt@Xlw%nEC;^qy4oX$q-%8-twMvW^7(#3RbqM{nZs5q^??Y%45;0FQW7uhW-E=8n zH!Ti73?*DoVk!LjMhCHtFma$1m)c@{^LSGa)b9Cwe|Plf`#lI63W=` zMlgd4J1HeW+4ts(M$M7$W^<$%b1@=@NWzuo>6iD;Hsl8k+frcbeOm_Wn2DRE^N=s; z#_Uy_YC|Z$;WdLz$99UnAAC?C`@TGMyn)!m&Z9w=GEn*$w|pjO{iXj2W!Q4k(?vI_ zh5IyLSQR`w|NV`!1=-Jufo9+rN$=^R6a7O29BopEFSYiBKRl8h(}Xt;`~b;rQa z%j!LXX8_KIn$6`u*ypYk(X98$2<9c8@}jPsKZ|ZVz-X6ugU9QiYglOT z5!BN!?DUjt@;|oYjvg8lGX`c2EvY-Lu5TcyNa*r&qhZu%)Vaz1C`58l;X~FNT6{4K zYlIpfVsri8f}YwxA=*q&(Z`@@ZKA_T;Cpb-m45yCWlMI&k) zU(YtGv{yAywC2DR*6(MIuB%xO%j|n1?aI&RApQz4rKnWyqyNbdzmGIMI5RxnQT%)h8M| zeSR2`!NrqdZw<##kf)ZQs1_Dc2AE?u7%=;H-y&XixJK-z>E*#&jVcby_yJ3!`Gck| zJ9BY*xOOk~*I*(WIv&!KcFGEd)~NM#?QD~u1U7D9T^dxvhdqc9xQ#k}D=i|S!ns?e z;rrV`?0q~lS!q7mRBB>IdRdzVd%*^}D0X%@LS7eWeI&|0xR<7i&CJcE-Qu}+6H7{N z@tryAqQkr{FHJA6JN@p=mn=edua%g+S!ajr9z+T%q&Gh#b7$(4NLfzaQHH$Ke0Tx( z+z>SOMi**uJ#eFqgHmPn%~5Jhjo0@K>hDG`7@dL!U(YFHa<8hePD?+RwvFUBIb|{} zeALHPs2Ma^fjN4m?7CiB<2RquP_fS>50g7cY2uzonKta&ixpG*u@{)IHjpFU!J`a+ zAl$g5g2(a*9K-+fZjr%oWgtH#Ew}wrK+=bA9KY465r>mx+MI2I)fW}>lNd@4E`o7pF>6lpd z_BLV}FT5cN`e-8LNh9pNcy?tKMb(R=Y52xc>N_Q|-gnjKjNEL!Hp<72qkeDWURKBn zDmdj5{3u?}FkT(n2sqz@#iDP*W^@>|aei>;&I*p6;N3}~6%{m;A-BFqt3M)EQUrG@ zuxCin=HZCiui|WiH0fXi1{kRnI}DtZgbJ3z#|Vb&38K@M6}P$C4=ossIZWOnS0_9+ z>3zIVj84v9e0s^ID_Pj=q~+ChQ}d>n*6N33NTBLVXa%i5a|vCLYEeU{$uqb4k}->2`*G3}`K+z?jC4F-g@6maLbl z&Mg?I5*u&D<@8T^!~Gb;c5pwX{2>@lbl@n>O@}P>34igZ$;M8ykBu^S3n73QdTKWz zV+sljeq3UO9zIW$h^G(<3x3U2$rMo>?IgX->;shWY>~WNx<`qm-{4oL$i!05w-iJi z5i{HH;!%5dj=?E?r&T5A`s-7pY)^`PUpzerSk@X;PX*rewmyt6YINoH4s zoN>Svh{6@f9JP6C0X>FV)f{6$cAG$J}auePe1w+)B_k(KB?*372Rg?FT-e)HPZCfI0cQvBm*+0`(@z%osSc_#cuI|LUf$D zNlwmz?5uCkv=%co2*v-ldoiSAg&ALPj0qUa9DRVz0Z^%KocX707 z`>-^J6!Ii!*EDg^7alcSnTp@o|R-z%$%SH|EH(2-`i^z>xngT7gG-KB66CY6nB~&Bq^sV8c2N z7#JYnef$IZXn<-;ZC5b26qQF!BSeU`1u4PxP(mkL>eH?anjkB#TGtN9!xX}iA^V|3YpIz zLFf(SYkHAp2KHV=+NHQ&_i@=Y((Oi&J1^IFgp}blSPFrM{46HND9T~e8%#JxvPA(UNgq&Y=sb+KN>lqWtGYgfws|5?9-d&rb0nh_ zlc_3G^8+c3#isF-#0{R$G$~{9orf7Y=cLcf$oTD#o9IHyjk$^!~9A z_+M{0$IoWwSVijRj_}I+eH^IM58j({H0m$`SQIT1wc-f;@`)n+#GeQZsZ0;bTC?&R zDJBdP3&u#mT_GoPd>m9}}#*p}&w6(V8`+R?aLc$J1 zvn6#x=>`&=PLJdhJcYMoQ_1FsYyN@0Cda| z{mE82{QGIUh!aTWZ#PGdFlu@Se)Tl&nS)>EwPG?QW4(VOF z8E;9%AuSyUeg5p7x2xDE;fuP#sPXF8D|5_qWVvIVfB`rv(i&4C7eK-I{jO%7NP*<9-Ob75FALh#HdS(>Ac zdn3_0=JWhcLWi2|cDZUYPzE>a!}z_pcZ+u?9jv8T(;z0!V%F9YdM?GVLkrg8xLobg zLPJn5a}Pwmkx?h2ekUGqFT&a*p^72pBV`6MY=*pjLiBNc;_Rp%{uB#EwVk>Mc?At7 zU3Cz}M|j5Q&VZ{L4E*^h$fYnHt}edyoL&y=<9m`@tL71)Ww|~uAKBI++SrR7h#g z{L(k5a%&iUaVP`19~u#z0+|LaVl&yZJ`C(Je+X@rrH4x(KLv4^IXPWm26>ynF8Bg* z9xYEFnGiVh$bYEKc7B^VaTg#J&*9hjp;zF&&_`F(jgwZ8vwWrVtyIx9?cw@u>LOD< z=;^yu8`4NxK9ufooFShF86G8!>~@BJAyfA2IM<0@SbAIzUdgs|#{74uyy5bf*(vOL z+!w*n6EV9SVdbZ+F~Q{F`Tbqaa_LTXa^LXzR~EShja~UQn+9Z&JvT}nXLL7u)O9wz z>xUV14LZ!lMZJZ#p3l+>m_Va-sF=V0+bB!v!yYY+`4(~Q7 zR0rGQn4lePkRD+y&OXpd4iw@&rP?wJ_-igl=PpkTknPm( z8dSf|@}L*ITaNEMqm$O3qPkd%24s_Yt}jwuK%A4Jll09!`{pS4K6+1cvexZvBakhreY9yo;!iqpsez z@9t(CI^El6WgLqiZhelForn~mp@L}@(2y-WhSJDYmZ}?Q_d)lR#oshRR9u2{t-IfY zz*)g|&|G&~p+`2S-?PqWxHBZoy$1)Vnv{!me{Q>;pZ@XtK{I6ql13KLT55nJrr)$X z4D7$GCH0rJ_^rJ?=5rJD;}t17jYnz~Z_}Qv5=l7Kx2+QfXc8;t?9VKIS<594ulay! zuK7q%B%VdZH@UZc{M%lnYMK0 z>s;|wXGL{}RYnrfS~fFKDqMJLqs-3DwB&uE9!(H zsejfi2|G1seSNy&?Gr?NN6WiS5|^H3a%l^3+VdX6nx!nH9}I&s1)pUKbtRF`b62*j zy}%LS*7=kIYnPg+)>069K%}-(zaVV|fxw3YoVYJ?c*-6vf4YAkV1xSLW5AnI{1O}6 zN`#tox}}@tAtb0vwoOb9nsYPI%maK5S|-Fp52Ie2D#$| zB0B$B&}>4qfJAW3C17E}Om=b`Gu}p4iN}l?i(Id)&olcHy$^TiWJKV*txv$^QNE&& zj-<%sMt?zizQJBYWER0`6TpTl)ZMfnIAb zk{xvP-Wg0SPl6Vz+;PV#YsZc8_ket+GZEP-!^;WhvNVL_-n*5-j3a_v+!p~vBBQRz zkgns6Gj;~;yA@OUf{@N5;URj*V8y6TQ%{2BZqjrf$tdAFQa6|5x1K5Wi*p-i!hXK5 zV)kt7jaz5lA--IWh_9{LJ-&?Y>Q^!}(dr}165D6dkx%yFm|?=$cM9Gj3|os5s5iR2 zPWvK^p`H@4#68A=1PVm7R0*qeZ=lZN&4`&dee@m$5aF$-Lmzdm24wjTb;9O}bPZlg zs4g2+y8!@cmANVu!g&$dlBP8czwfj#=>WXQlR5}_r))Nn=dqWLJkCD+>i8X&Eskq) z8OuDYWRB<1#wq;$6YE~Uc!jDK`gfeT;ut)15jL42Rc`wFWCzvxJ?k@2eGJ>&fjG;8 z*-vZ%&q8igq}eX%;RXsB@2Sdm6}yvMQlIOz=Y-ktOAc(h)@YqdP(BU{tVQUWL1^i$ zr`H9y${q}$D3RYuKGAK>bB}LV&&aGn-W;n?h!!!VX_D0#b;`vp5E0k?%!F~moNXjxMyf!OuHd@-Ky3%e2*_BPWDcF^c*&g4 zHoI8nl5-bqaj(pwLA?`(0;x_Wv<%2)Yd3)7>?buv(~XNs{oM2j1gL8rrw31stIH!2%6_1h`YO#pe z*W;Kbz&DW9I+$uz@HFOEB$g;Kbl}MuG?krI+9H?1$F8C0eoQfY+PGdK$pa2eyZ~tl z<9)VOQ(nRJUmvcv$RzDiN!{4L*ZJ@roA>5|wpuBc6bT50(UfPc zOq0e1z(Y<>N|1dCNskwNJzdeO3MJTw{()JLLj!79T>Po=P5b_$IOw{FRpYedchzuS zXwp$}cRd^iRjZbr_I@asMwcJjWHua=-Mx={Ur(NY25b9>6LjX4LFm(LyA$z4=5sj* z6OG#q3(`|iu8%^f2RD&CRDEbeSAE2lIU^FTtjtB&5eyPj>`oR!9ZKV6YVsmJ1>wh? zl)G|2O1?BIetLar`$Wmjviw^9`-2d)ylO2B!1`DIFQWCYRtz}q`x&i&)xVOT^$*=y z4Y5^xnH^(-R`m1n&jL7{PNIl!KL4`-mV90rJgVtOfTr6Ulbe~m2@%r3@cbq+wm&1B zY>gE}Njsp%2xYuQx?p8Xcqv(RAs_L$s9CPF1eg6- z{hBm55H40;{eJk|Uw`%TqKZDF_1kJ?+hzZ1Koz7+`ij7I zPEJGfn7czrhOHgnjadl#D7((CJHD_lvRHfeCSiZu<9LmU=iDCk#|Fs9v(y^hLWtGI zO%QloN>AnkbT_)1Z+haFX6h$AXB=HGVJmoh?Y&-r2EtLAXoSu}88LtDAi@Eg)scY@ zx>k?=9-Li^3Vexr?nun*y*(`WYfIE89x0X{ipnhc*)XcaY;U; z##vT;mEksN@qu1O9&{Vhp`VKm1Y&dl58XKC%Qm2R9Cw?({L@@73BM<*rFUN9tGT7Hkkz>K|y2q$wN zz#!qKQww9d&9ti_`>8s>X9>o8+V2b;BNfuif9BP=HFtJ)ge7p;W4`=8V>A9LM|J8_jzEyG!?oyDV8@M=ObH*lgE^V8;c$;w zhSbGlAD)mPj0M_`e=zRxs1?5}#7N0LBlRt;`Y~*Tt-nro+Zy`)zJEC7mX!quYmI0k z3HRuNg@t!}D|>W>0tvK~#299#W=vhp>bTGA32_>+CE>QU1TyCvl8SUnEpOe4os zlu^A&eiQ{|TF`I?6V8THQld&F+uTr28PO_Ix{y1R|DdGkm?*@eF+D@Sf1VMb^|>3E z+=+8!R23d&B13dxq20-_O`S={u53*tqGzj@Z*LD-e7c$k1c^&P{BTP_Q#SWVE$t-v zfxxFXYc`Xx1c70ZjnDcW6@Us>)r@jZXY&l!KgSmp!XbBX9M4E8u3I}xURwI-8Je{Q zrf85k{@t-%V!#R+V1uED?Yef)fQ#%96>DlZZ~r7q%FA~f9Mm=!kCKmpzv4A^U(H1v z6opvBAj*EtsWcfIy0&DWKY_p*W+zT)M*Q`p_X1yQ;p=@@$eniZxyxP=89dD4)!TUg zOWr*ALyu6_#GcTO2Wa7s9W;4gqCqCrY$Nx(f_vaW?L6OJpe|KMd83}?Imws!v)1Uu zev<`L3_66}Zbr56gU7g0*cZth)T4F!8cjHV<1s|PuMTFp7OjBTgj4YrCD*?`#%f%s zedeC`+seW_dt|{9 z+jxfF%<5f_nB5KtIhlYB%uUH@-duqa%4wsO_B6kL06)(n_RGbV_4}MHi@DECGfh!Nm}E*cG5%Zp@4(3L4LU6iqGuk$~Mo_5Qiex{^Y5T6Znn&SZB>s z&GOF68Q<0LEQCK=hwHXo7ZK8$*==B-#ru^k7NIdRE0`egThlRr{{@p5bLde!Y@ObJ|7|IWyj24H`zrM2lGMJfbzvBUGa` z6Jg}F+b=zYwnLFNJ2>-}Ve2Z+7FY-+j`l}1%CTu6B{Ysf^#;J;=F6c5@5}Lmv@qvY8 zB30M!-HHKDZG6SP7@;a=xhU*DMXp7?=mb?y7nZ<#-Pi9X0ukkDrUDsB=sf1m{?#(2 zjV-2a7v(Q}|0-uMN1|Hy0`2a4sta6FSfU0C@2?h5frk>Edad_< z5o*@I2(@I$UxZqz4hE1=V+Imx20|7|#?3I*Q8-#WXsa@fp&geUzv+4DC- z?eFj;k^jYbk}@j6&VM9c5?J9b30CZFpKwm05%!h{q<36;uIfs@(&krpN)z8lj9*NigDz6RDqmonIe!=UH$Fq zW6I+L`+)C!0e0MfP`ZEH%=!zX`^TSuuKxED_y(YekY-GU#)~O0ow})R4mDr_z4>%aanA;rbv89S8?na!&?!tlZ4e3_ znwwi9@ZJ8K;Sd|+V!#7!{Hub|GrB?o+xy?6ngYlpaUH;`Kn=VK#K6}-&UXHBwD0d( zaphC%@(pbqbShvmm}6&`!XhgHsS|_- zi9ylHMz)Cvi0%@qA(OS@sKZ#n7|#k6tYD*qQ|>iG2imC&MGV(K6&T6^RnQKz23MjgJz0k( zIK7KsWt$!UTB|c36)Zc(ZBla~7Q_hllUViu63g1GZQ^mm*EA(%%H=Mzz|rDAZ_=vy z5>x#3A^5je_-jk$mk0cpX8>HJ*XBJ^Tln7sQJ$@>KGZw#zXl>IJR!PM)Fww&p}+9j zoB*??41{0Rkm@*-1YXX7JmTKUH4hiaYd2i|;;nT4DX|qJ0U-Vzr(&!y=|Fh zCE)#j=#{+2Bcf#JbHwkhAo4cA_CG3ze_n-ud;}DELdB>5&9RM&>}m2-RQGe)9?ic$ zw(;)IV;j!(+KHxE48U?2@|SX`1mjt_aLr8cs~jF}1p&)pDX<*kl*%Yr{e3ya`tOfz zWRAoMs1!bwssCK#KWm^%tQG7((!j4B``>AxC^YonG5d>Fwp;jBs74Y2&2!~d-s{#`1D5c%H=?W-$MLH^W0)qfX>{#^Zk@d578@Smyp|7|3) z{A*+Me>M^^{hh15eCt9HbL@S1N3bdoL z#ChvGzR2zQY8Wn-1|&sMnr>z&<&k7Bik4h(QyplVh}?c*2Po4kS&Qt&7B9)mG@}kp zRpL53v29W8aPb&W?O1eoP83@G8vAac-gme1$yUiN*H#}-6FR=pNI8)lSu8?biY=E> zGXP&LO3pXAWB%;>;*pI=KpujKR}&y9CC7B%7;wJvHiH%YCL$apos(ER%1Om0JcURI z1&Yq52dtTuTc#(Y(8)zc>bpc$FDg-)t1rytu##PY=KC5r?eR(L-Tg&!IBvSrtYIZv zap)8Bd)ddFVI}T@ATE-~z!FDP-K;zUk}nDlRtWA;8?sV6M}?U8m3YVNW)lv2z1)D9 z06TKOiO~Cws4X7L*lC;U>j(|^`rV1yiB<%AAx(tMqCNaxHa^gc8wG+64$x5BXgGVl zR-f%&8tZ6JKWs1;qYytCJN~JO;{}WXpBNKU+vZjsLv={%9(+OF_fz;UI79ZKZ8>`; z6A=WMi}u?sHhaQ2#&p%TIiiE_mVdCc_MJyzTeW(+tVOduY((04yxa7Cq`K(9-`UDY z8m6l=xkr)ly>IrWitFUzy{w8^IbDc&=Vuj*VAVt`%=7KtNkX*G+J*mabWQhGXa1UN z4>?@x{!^}Doml4wF#+42V}|(?1|hx%1ys+#WUXx+fY^rKNU$TD!v_Dp@1Bd057*U? zx4WOS0!P-Kv9A(_deUtz?0X-FGPj>j2Q%>Yd;(4eBi17(dB1u>BA5xWG#DQ1&m8uK zG-wa7L=*906wGuTFJ?uw&9B=w-FgA8>}}r#zQ+q}+9LpTHvtAc@lt_I1>%bt?c(Tj zkd>{Hw}vRKiy#75)~=5&2yemR;9FBJHt!~}>~hbCEHBd%StzrL?WnX1xbN>&tj)4T zRDFoG4iJxT!G~^{%IgOOLx^g&FMxyIR;d;=1nC%(jM*D^CD{`5RI^;GIK#SX5mq2l z;hs;r=5iS16_>qNu6RchV0kq0Xx>~=T1tVf@|C~sWM;$WhrzoRau229#lwJ^I{RgG zzE?hm=67H4Um1NyYn5rGDkT}3dUVkYW@fRJ$|+cLvW!1uO(1C8iz}75uf;Fhm?IFY z!RyFhL`{+A!afu72wtXV2v`Q20Asg)@-9Q5mDG$-{&GXl%@*$zN&~(!mk{gMl>%WnpmkNild=x898ppp&bp8Jew*iB!&_OnfL2!dFZRfP1(`ZEDO&(L zVh7#V*KV{o*G6d`QUb?>_9uG&E{1hOxVNHGb(QtPFvKUEk4LWJhx)~F8!jJktQ^?s z#=Hu6SG?F0`}E{~(x+O{Q5B%S=u=!RxtJ?`EBHU?Q!5@6eecw#7l4dMrVV|{M<62x z%AUK)iK7MFB|n-JFZj7jUK=j2t_1`Lp_Mvtjj?^f_KX?+AI9D}IMROI8jkH`V%xTD z+qR7fCdR}Q+sVW>CblNFZ5viZ|vsZ=M`eW%xb{noYMj0_{O5}^iA zpKkw``m}LW@*}cdrp#P{(`2bsT+xK=X`OgON|D!%l2B=SSmgWDdPs;%6 zQ=3Wm|EWH8XDlBM$C`o?h#wC6qdp~~Vbhk|V2C_lg71C2tm4<_;vfO2PgVYO<<4L&nadHW&;z{{5;41s81>WNqwpZP@i6~6(%>72Iyy`3-u9%I0hU(jvZ$>~Axo1vK>m>3@l#fEYQ%(R+0U!^rDx0w=*Py#tmz%WuIr zx5eGC%PHuzX#E+b6l7x>7O+dy44)|Kt7)egeMTrAf9Xo@lp_Us3HI!S;!ZB~=OL+c z9@P0PPN7g+!cz7rfl%io6q}^Sk70Rv`@q=>Km5B3k7jo|F4&i629S5W;#I1Gat6^F zV}&{czxb(nqSRm4L{Mi-&_90m)q6usO#r?b7U^2dCy$uIbPMYiLB2^SP%IHV(kEUg zx&q~h>E-W)coxjapN?$vxf{D&>%8b$eq;!A0`*y}?OcMTAMOI~iqxgNu)0VI`m-&M zi+`aX3KL)!q1L5vRXpNk+fYE~-IN;#0pl?erAls2pUONU;6b5+h-@^*h&mLUVx?hW z`m-yCblaP=`Ki($297FjW`$5VK6_qb_WL^(*RX{7g7Rc=OQjWtZNV7&QI7GkTnCdS!RDJHTgup0;yR z#6$cGSsBCCikzAlt-1;;O5T%03pJhUC)(G!ruJj-Dle`jEcl7&fY#5XNJwp7f_oLW zpWEkOrb^$W3&99s%<70REuS-)q*|)9LKXqr^3%-kJt&loc;E{_U1)}e@LM>dpJV1d zsed+0SxFcyR=z?tdS*>8#GVN$AEW7S|Hz{RH`%Ac0Afj@|Fb{hFL&_2{1L=s@rG2C z)`dsp6U2N#LQ2U%5on=Nm6=857hk_`6br^GDd+oz>XAa!oepA7T5jw{Z6t6s+v(-; zfoj2I8(xkMgga5!{kTMVJHNi@gBMpu&W@LMOy05}zqWh%STCL+0!}0~Tl2Vk~a*1=rzq2G*97j+wnM zKOy9=EqfWZBFa_RCp_WL&zD}0HX&y;TFecq_TrbxP|*N(k}UYJLPg+Gd0fy(l5MOi z3d(9p#Jq+~G?}El(-XkFtS9Wso=zFh!+_^|QAfS)ZU&t}BkKfJW>H0jA?|VK@cHyo z_o+F}dwhiyG)vwBBB^Red3$}W77*~uw>ewFx`^d=>%w_tHO>gq=Zp}`E@1D`)|xud z;D}Fcg!#=c6=n;`#=88PVPV%o_C1$^k^u>V4vLbGFS>nLI!Zsv6j;B|lh0RbS$(=B z-D!RdUsEiR9bnHCHwLTK;_(^bt;e-sx_+~5=KI$8FrZpA$PSCWF%12lBPqwsmVVcv z6jWdyU6Y^m3!5fuCd%c*EE0!9TSthYpX(Z)nXRKsAfLgr&+VvsiZSPYfJ*`)_;$Z@ z2BVzB0PNG3$#2*@tTTz>t@%dmafIp9_}Jhd?^dcIC`dzqv-beNPzAVQ{UPIBT)k|~ zT>cz13spM*m})j)_W-7vXxnD^^gv~HP;g~JLET&$x-3$VH{{_2B9DvYTEyR#F3DYT zpm$7P(dhxcTB!|?2lk^pxcF#zicf?CpIn+&i#i@mhiCdHp z>`=2PYw3rtXu(yub`E4#BLoNsSKCss-8b%N5mm6py_Mu3{`%${TU(dJuZT!>KCy~9 zMy%bNNmOOv&j6^16SrE+xc{ zf^K~3BUa9JQb($%{1XY=x=^XpdgV!C@wdk*QWjBxa-Ea9HFLemKu@ld^z;jV)uoJY zDBs-pgSp+6Ns>z&l+s{<6bbHla7w1IWF@vu_Y6dL0rhT^v~y4FEgYMh<7mp#XGSXg zy$cWFWWw($h#YLw>+kz-$Jf_Wu?{}OMyvW(s_aVQN+mG86`@*#Vwu?c$=Tv)p2kFM zG(XL%omlV=e3}G=ioUw=nXQ**^&t}<*8Ac#cXnVWWo;T<14zgJmvwUj)zriSTyZ%7 zYM}l5iUTaWW~M5x&VLe9{#^J(<(@yexSb@|h)NfI;V2%@f(PnG*<>mNWX&3 zoZh`qlN*z@OGZ?swI?yD&P{7;KOrc_jfO4MC2kx3gas@cfLrXw?aZqkNCeXO=cCzn>bsI9~WN_#II5Ght4kfa8MA!Dz?s=K)J7F1$cy*g5K z618cMn`@W~so_5tp+z?A)Fy!Cru>?W_-)SKFNr(z&zr2}78=|Sz+3r8J^LTbX@9x< z{{r9PzveXCl3D=x8vg}fE&%wV1IPa_@cof*x~Mn}P|q#{)U%T~lUID!Zu1ckc375G zrZEY(m?Y$-)U9iKPum{{yJY`mPNM*r)0`fR=9A*i@37;*B3A&?evnpW^{g*k-?XNk_|KH{`9=D0~gZlq&PAmVnIn6=MZR@W& zjpqND)97dZZBDz80hrS;#*TlCMY7^6_ELqS4De$8Hc5>w$9i#kYPoTo2rRg;Yf?qzzX` zo3FK6ju`6{F26GYK_8KgG~~9uvqu+IWSZm(Ra3|YWYArK#R1&!cS1Z!cjS?M`(a|` z7hg2l^z7SiY~zX**9o0yWK;XIt47&1$NX>H{JLRQ1p0O@si8Wy9J$-#q1K1Jgv#qX4;=ntRUrmTpy1{2&2gwSqD zl*Tv2nE1ClJ2s@zKAyc`m7X34-+z7o^^br%d{u0?_u~AHV{*CVaML$kywkTYlE(ayJ%BmEn4QBJg7gqe{o2$>l2 zz12NAI_c}8hU6Ld6dGA_wj07W{PDYk@U-^cQGkGa!5>!CSyef3XaV(LA#5b)o32^P zG6d6gr#KeAKXNZaeQhIheiv@EXH(CY9rjA5fX**wUr-p$PL(F75gGN9NT!_fD+sZtjU6_~NrF(P(FgG-SV1b-_2*~%8MCoQ@VghxHJ@GE2cuAJ>hB()~j zPZ{beEOG6d2>Xx~fM33(j@sFG;;1Q9w(*OeNV1IxE(?|No_FWzk=dy8c#L|#{-{$= z*LD(aCHZB_{_c|Z=f%!$h&S608g0Yxxd+a=kyPs_c@r%wj&u-PHnv>U&FiLZo|=pu z)DMqhAw@~+q8|%LoNoc`gX!6nn~ZZkf=Y!rb(NC?dwRl#RydfFjR96@vQ{w^&3{bu zWW_ak>Iy*$laK^U`ncdiD}41vgmP|D!>X=>_6gSvR9?|PEZY5c02D_{ectBb$Yml5 zg`9;*wF0vxDc-8fY`lM7*Y)o28v~J~~JrzNY79Jqi`Wv`hi%-z$=E$)2f%go6>NyJ#>M2iiy9&?3D6qHpsgHqlm5w zZ`Eymql(q=kHpLvmFw;Cz@{6QD?W=9l5U~g$RDp*$~s5%##vV*oDVzsx9+IrCzZpK z@Scd#TSLIZhCWq^hcOyb*2y+0 zvItigC>M!k*1hwG@dX&h+|#x?TS~q%m?YkwhAOl&!=ut&vA2fd=vdMmZKkqWJG+Y} zjhcDsRYmBJfz2)$-P%i-yVTi8sCImKy9EpmkK4myo}_;oTCAyf;0 zD!Pw-*Tsj`&_&Z)C-EloLw% z3ZNja)6;v7!WDPf$XkP@OU2KfGU^54w;}lG@+=t?Z+^k}NsN{R-6m3)LU~QV5O_(z11?uyJ$_&1mO4+!%&0IMNY>kt3nzlpK*xdk8~sOm2I z01&g0Y1s_$sP%RIMzOOY0k~gFCO=;Ae&mz#C{?iwBi*J4^(ov%#q!&iJG+KqF;J#y zSDkB;$TFv6n6fo?ZD!(;FNFUKVoDbH4i<)%FMr{gCD=x%wp1yFO5_G#NwGIB2+t*4 zr&`$P64}pUqo(R8s}lk1HDMpZy#wv5aXsKtVAGJP7XWn%35CEI?=8#h9Xo58J}IQz zQOgED1a6~1hXWa4ur-_4w4%h%A0Z^Eoc_Aseq*qqg-oiiQ%X_AWr?~Q^=?D9?xKat z9{V|b{iPrS+r6yu2(wO?W+!!oOss3ZJ_f&-cF5L3)OMeneu<+av2y>cFTW$}E5QdR zr+{PZptC4LDoJ>&?V_05X%*yJz?Z^GATWD}M9Bu~{H7(ov#NH=% zW*8#lvgI6}8A19A5mXeb82h}5pBJhlyRFxSNO87}I$VreoJ|J>0B#E&74 zEy>zRqvNKHywx+tnodrbzy@!5#2_Lf2MJ2ol_4I!JAfrkoU%>`NYg%N>uwqc1qPdF zD5kd`K3{!UDeQSErgx#0)e)~b0rQTQ86`ez&}lA2Fl)qweMgj}DIY7_Z1614ker?a-{ol{D=ny1b*oD`Hh4sFP9#b`TN zgc!nQ@j+MMduTFN4c~O8xc%W6A%CbfMwh@3)(amm^phn}=;r24AsmR4VJSNXC~lmf z>VN=lS(PFAM_n(Txs9OqHY_iYlQ9G6duZZj7wP0p#*%%!$#2*1zEncLd_1Q6<{<0Z zOmGXEAittK^R;AJT)HgEzb%E_I^-OtDfD_GCS8mb?c?fme_QHktpxzuj}MK$je!*^ z`)zqX9^AELU~XxU&4f8kxO`y>_-kwacZ;GrDEQP;!k4@N0x;7o!u9<=n(TbRNE--QYB z-2G@rk;ox_+u!bRGj?2!)Xpa5o)DdTZnjZ4R=XR5{C2F7FR(49zx9AqyPr} zs(GE;G!W6GW>?I?hDxp|l&W&Y z$y>lrApY}c{TImo*JxdV_Gh%VCLJc9$p6D@{I}8C`Cq)ok9;(pnt*pTy6y7paS?-$%Ql!hdK? zRzmf|a-X!Jub#2xo)B85&8k5hrLWrEudOqwg^#P+zC#Z;lth9l>o#I^_HM@Tfve9$ zT$K>)puPejv8so8dmRGM8U~8r3#PmKXaVDO^$_wa_F02e)d}`JcvHhycY_R3=LH;5 zND918d}~>lLl5tuR`SqXiY<$pKz6XG^X!8Sn74cj5^(=-`R(V~DoyHlQjAs*lb|*c zwbrxnm#pG(?X`WrNiEx=ka(BXeGQM*?7FgJbWJ$Zo+J{(-}>)Il_Z8y*E;IfaIx@7 znQuO$MdKl2rqtrK?4n6?H28}mRh)f5^-NjxXg^MkzYai#bf(b1!QSDwT`9!8h9v!S z+Cw?5Verd@$;D}*%kz5mqdpp=sxMNGWl`Ylc zSk8*q>dftW@@-^&b+=$+kw3w)j+K-Q5Bw@5tjkH5t4tFh`piZpRwVfj+rd)bz7erV zN`(yd6EVW$!Rwd6LDuy?%_DIrLU>%_5M8xez?*2OV9estjBsFd<<1Ukx z7YlJGO~z%H3zoyrI8iWQUs?{wG1w$&cIq$3eay)iUc$zn90$a`F!*AMSkOLDJ|yZ| zrRs~+$i=l`#<;+%8M{pP&|g1x6&QYf-Lo@aYljoV`B6@ilhXVBK4_vsit*oWDWj7ZfM*2z6hQ;<$Ui6W;FFNO+joz|}2 z=xE5g{5BL15atnnoXPQ zCg|On=#7Bo(xd`o0QvNlEo4e54&KY%3fTm|+^iLSyIuDVC^rC(2AhOUW*GA|ZV(R_ z9(+I>XoIljYLNcf*+^>Z#Swwk8gQY$ur3}A9L9e0eR&!v zh&AxLtpV#IyQ+aAm=2e84Z_X$#SgF>WU4warJNLJf?{%`bcE`V(ezL|0&*B z<;gpjM?Ff{Qs(Zu4DMHQiQU%R_U{)ZdXG_nqN=?>D2g%gC}n8bGYPp+oEvPIBLn4j zMX{1^5FrU4s~9(jYTV6OU;R0ul{UY+*DWS2WT>A1=Fd13t#Pfo{WEGGNTTBC>Qpnt zO!~p1mtm((S*Q@Sm-8ehJkgAm1AIvb>V*yNy^Qs&-4?x7GViD`>d-snfDy_cQA+osE|^Y)EzU6=Ib z`?Kz=UO^!naHF;BTO$3q^?$5*Y)hGLB zmT~n=I`W2-Icd3S6I%_v>I_A)J!c%q>a)^W+S(I8&9$>g=s2*9TP(3@4>p3SzKbO0ITd7NkQfC(|4-138zFioaxf1Og7- z>FMVNaqj7aUk2t=!pHI-HS!b9u6U)}>|>KiA|nSzj9r&*(xa0?!+bIuI+$>6wrvsx z=Zf&(7-6@Zhc$YO-y;TW%Y5)B%*|7Ez}Q+!Yy1otmySUr;;eDTri(+pl8S*M8%fJQ zr9s!e>O(BaegmabUnIi%V4~}ewYMDcl=V@Ab2k!uo#ZdBOjP)S{O;%)9%@Vdz!Ypy zLcPS2k77?DD>04em{i{-)F(%4Woj z0P@>!RmLM#*2nN(O^@jCzZnip)cML`kkpr8(yf+%<-u62S=$Y1*LW7iQ$Z@}HejOP z22cjb5EOoPe@KK(g-e-q<{>=s->HNBVzV@lV~?B1INJ(v4QhxK!OT3%mm{ATZDYsF zYFMnBc*&$#*S%n}u$hoYy=39$A(+&4lBdfqsaHYd>1wi((SwiY)6cJAokUgx7deDK3;Ef6|gN#N>M|M#z1t6cuTE28=NOFyR2VUgo_3I9L=w(bm7M z_kSvm{w;(5S6KJo+gFrYPfW(&7DWNV3#KL}hZ8f!39$5LnWJ8#Z!q-PU zMxt(rQd|@khquAYWlwd)ZwjqW=xSLb#&#&daw2FEvh_PP?BCv_(6g=PsOe`oQ$jQ| zF7w$5*mDFb>Fd5$#tKt{Vt)T>ho3~9lB`A{;q={xR7MG?E@hrd{Yo3JqOC$SvnfHo zRgCFifoc*)g@v~Kb35+UH#uC8+<`)gQcgL7SH0%rB{iwBRjUZgFq*Ic<+bryGMUF- zJ3~$9*@K+^-q4m+SijK6&i~NW&e-s$82yjXnegpP+9u+u#RHC=K&)@tGv;uK>Qh{} z2G0fRZ;zvr1N?qt^N3#0#C3g20kd#QB?hWU<(~CrdjmkIP-s-UgUWTTModzuKEaf- zHhWndo@QBj`ZGofB2iJb=ST%6w3x=G!q^Bx-n>lM#{{n!>Vb4 zz#B2Z&a-A5)cG*vRiiu8L{=ek*yTkb?%3$!)#bB0*M^l7kR0(RUErTj(m-m z9+|yAhtg9QobnH{5hg3GV(89LzXt!>u0+Q|X~0)fXlIgQ!?F1WSVsN7M}k1HdZci0 zAfRu6N~r(zf%)s0NYvDGDPTwY(=r0u_c|&Zj%c=XcJ{Kmdf}Sh^eSu1WsXh)L%OlI z_y|LUR3}ycRkCLCcc;dozEt>Xtblyl?f4k=m*=^AKqpl8Wk3G}K?$H`gifHpN@ZTE zOMyB5w)C}^=5cqu`qx^v*mEbhvp$a@QFZY}ZB=B(NZ|_JGgWCEr6b&S%y%FtuP}3MG5JIp4oP>Pb|C>XCV(iMtceBl#N;8o|dx6+dLB;_2Wg;^_9eN zH|x{l9+=YRm>}Po{Du7^6swj&RdHf5nK@YagAnvML z>i%QD6w|lzObrNMOtjOlI0C=O7nd77| zBAzP>wAsJ*?mfK4@tWPsnr_bc`ewF`&-;G+ew$trJB0nFA_`*VM#0PoXogaOi6F0X zk@l0-+jN^ecifW*IOV-6(=T=|JUz=U)6Uesm9t&~Fo&JtYP-Ur0L)X}6=tB&$lF&N`LA-P%q!upDp#IZ{p> zZ!^*kTpmZi{0iMhL{L|gb9r^Xhc#-vXq+QAj$_P%N+M!wZ~TFzlL|m37D=Tq*GqzZe8yrOdQGuF+j93Af zrMpqHA`ntzO(XuGB9w*T;#aS9QCqa=ezcIDr@-Kh+74Gmz*+KvJhAsAyNsT30GS_Q z?ik^s2r_>0<_FE@ngL9U_n|TQ%>!&uM+Eu^$tNEb!EA>!v!8!Cy!g$5RvAwDi&A7h z(>0-WOCWo!#FKmXHh(anzM>8B1)>T;c+SM|^fr4@Aw-1~oZ)g-0}9SsB$3KT%#t5p zc3n8pEv>Ne6gThl%R#*9EpgJL+j9t5@!~r~0(GX@?xIxP9n)v>!jQYMGo0gtnR6&2 zmAqF%Ao0F=HT{#t`|;kzKkMRN>1%yF0PvmqpBw%DO5^@_Jpa{6e4-SsQi>E$R-0Y6 z6?7eE9#nGSIjO-WxKJFStXJlZP|AAH`oQu4O7~k|v&;qWZP}z~ts``ARbONfr}@Bc z@B3@@9=UT?n^dGpdSoEO+GU5k0PVMTvk&63>ie|YL10+1PRA)gNz|nI{q{J6RkY{` z>~yB6n-tEI4&UTS-Oh0>p6iGO+ODCyRDG`+x_$ZM5>knZ`xcFpXWLc2k~npC34Fgn zPJNww%I!iQK@#Zf)~^zb@E3iEllJ=kx6~d^F0GQCk`O>Z|Ah&a!_Zs8b;`=q`5V%X z^EXMC4eX~(KMaLvFkltT)E~IKP}geT-G@RVi(ImAgQ5ndwolVGuXznxa6ni;R@>Gr zQ|dPko%!DuJ`$2SQwQR6W=c9*QJ+)JmNGHy&#aeocFe}=hg?9fJ;R$&dBbsKyGvX~ z2n*)i=#QI2#t2vJIgRr!xe+_;fBb2Q>I1Yy$yj7da=%>bbKrNiUkB#{>2@vx5v~FS z_5gj9LiJ7p0TTM5asyYP{5kBl4v7ilP8I^+^5i-qL()dYGmg|&3dg+2^l>R&wGhmx zQ{Y>{%LQ8!{lJ(^MHjAaQKl@W6fwi(A4lfmYcD3}e%IQYoh)fKv%76*rlfLzt4Nq% z8^Vfm(^0{QapO_Jh<5Ws`kHvjpv_3&lW-2cix?1bBxE;z^$~sJJPX@h-1XolM~@%IiQ7vm={&a*C3g*` zVqRuVlZMl)o|4t6_0ZU(Aqy);gUA-r?dYmKv8{~5j1M>B35!r?mX`GQ{xB)8VZU^5 zo=2BL8Zc$VL?m03w=HpEy&(peUn9GDg_tDHz0h z4CTO8Aa3*uwFV_l#CXgh^Tw{5j(PNgVc1>At&PzG`Q05x!~P={_1#q(ToEVCbl{kk z-{Ia%(6UYhvM$yAqjmG%Rpsi^Ap!5xLp_XAHc8bysC9DNN7S^(M!|Kb3+yX4g1k8p zBAwcac8y${6b@j`vy9jh<5iyL- z4`|Z4R2l*slLm_a7C@&}Vnh_u8aT7IXU87owN;F=gP3lK>5bS^e~MpLUVgZ}P7r6Dba)jP-WfTVt91AZ$ky4}b<{+qg1 zv?n#zcr#e5-=5nBAN!uEUCe>UhWWJXdZV#j-hxgT%v)TiA@hOHf{U`fCo0u7ED*=vcg??gT#kV#z0lKs;t`TWP4>jKEu zLVK5D(+ai-+Jo}D$w1R=tr1%Hj}rn|h!S%z$!pE<&^D#Q^7;zyi$seSHp@EDOcRBC z%RM5ki`!o*j+6}jS(I(%kX{G0vl1TwDZ}3~DKJR5s6DAhWeVN<5@a)*9U)dnQ@d~3 z+sXlb$I$G|!QK46p560O;Roq;w_Z~jB{NjBKQtbf5uxBl#Z<9$85){lV4Km@REo5A zxh>*8{k;mG%f|P-4D_?Q?_NF)l|vF5{IXd8)hBfU=#v6E&@5tb?AuXo#$r@Hk2%lt ze#pO6P5x*4{THzf|*MEbi_Wg9m)1;JFlEt?eG_ll6W3>AI0k8?SU z?bwwd`|!Lvhhl4l~o_Vcm^r61Bv&)sYgvi6euMd8bUR|peAxBXB z7MuGf<(Yp<{ixtCir&g(SNqn6s9Pxe_04w8903KaN@%%-Kdv|FNqL1kmXW6Y*n4K> z*^YXJaO;Qi^@ew3{45n*1g@jc6`nhgpZiKfOfg}E2ij7=6}!J(X7)Kkba_m6P58o; zK1?73w8`xlav29@MVIFU8S2^IUuWu%9x5CVAG`RVk-`)&$$+Nlf&Knf=!e?<{%Ho( z=7HsJ|F=tpaS|`+Djeu89rKu8FOQ$#2Zn6;0kjclKns zB{(n~)ga?+WxAug9eGH8D;6hLUD=G36}WoJohY&t8@K(dsuxE}=^t^s+xg`*5-+OW zq2WWwkPa>b47`ggvr=v*D?_BR`9w$v`VE`unUa{hdplD4o)>G9BmqWAsVJ%PuoV+E za0mtSYS%koUX@qzF4Y{wfDxhDP)c7@+1UI@^GTuUhH5vpUpq|*IYODj6M~gw8ku(8 zuSimjEKbfhQ0)tuDhgGw>#d?2u~^+rbvhtu*^mf@ce^j+V2VTRf8ULhNo=}6@<%6^nlGT?>DbPDC=mM)Hyrk>KJKA9!<4Cjs}_)&NfD$v`80%vzyYd zWLvE@S=@?e49;5T4nt9tl~*8zdoxzdtBk#oP@EW3oU~wG*+6rTv!rYipxxLyo0c?) zxx@NB3q*!Z9ND5jZOUdgY9{U)U{)}}Bpxj->R=Jp>4PZf+s-tir#PBy0Y^r=Av-dI z#S#D&6=pYE&xCe6y#kjRuz}FpL#g$`W82Pe>Hov>ZiVg6wNRj=SK&BE_OyB; zh!M2NPsR&X&l(L~9c^e*=d5ffA4yDos)prpmR}I$@*9J-o=MF+k$G$;$SKH}vLqRR*n|HWvHv;;Gc|P*G62Enf5{Q=8ZgQ3$*(LC*KRYiC9CXH{mF~6 zjQQPF32!g>M8H36-JAA}uCp)K?v9TncRt=9UA8-F+D5#%JZi&q!;-uKZ%j$+>tas1 zN9`%AojQJ9V2bUbmoS_Zx2?JB=WL_gjG@8p~J+q9%gSLJ$Q8=}vHM2+ZI z$4#-PPIp7hGnYRltYZ1`Jl-t=dxjJcr*mhV9s*nKdc=EK6PsM^tSb(O6o-dHzUnno z#+cwp!Tz~IE3NN|?^{!qjA}K82Q-J9!Is{s?UIH4FijX-+*lp@OMzdu=mrEY1oVih zx^$G{hi!*NKi4f9PmY3X@s^mp=~uuRy7s~zSUpgi0WEeA}p8Ln|kNiBn&G8YHv>(?=x>S;m&zr zOLyq$-@q>lSfZoA9iIjeii>&>S53Gb%=Uz>EP);lXx@y;8nTRNvKAzMcj6AjHslnQ zV~37B(Qayu(-x86=aUkl!7J4GSUCOqk=_pr8udd6`B@{~rqrsa?f4uJ zm)>I;TYc8-E037~6gx7ck

OqleKoK?e>J6^@3h?nYmBScR&&Aw^j_@3o)%`nTEz zO{r7|W&$bpb4yRjoKK&vy{bIEIADr+5LjYStI~ zS#P)?#$aQ7{PF}l$j&%oLtme9qstAH*?M~PEQzdMpO^`*DU=&1EwW1DS@u)e}^>Qf5{HSu2^DJ6y%ZF*%LR@);{1AjJ>yYpd^28*tM#c*Mn z_IaJ4FaHKxmZqm-n$_+IEJC@>3qZ275mCPx(~sVPvhmT*GMz6M;`q+WP3K@!iY_ct z{K9UL#ljxcyfow}bxdI6+|;QZwKMEodrIe`&J1pNlq>}@(P@RMXxWW67d3jW69hpLYI^PHAg%H zDg5xFNyUcZO%w>af0asAS%giC!0=lkP+iR7t3w0^_)4hl#GHER@9D_9P~*~`Ms3!< zIAhFGKV}@~1Qo)|_8UKytSuaojLU65?>#db`l%hm&w@Z2pAn1>RRZdMy@t{3B?qnd z#FS{28j>~TI12lk#*XLFZ`%59^j$aTRdwrAs6K&)j(UObsJoc)*8z$WldDvU13sqb zF8iul11Cdo6r>OXeW$qm;;x&72lfM}O_MhZZt{em(7VIm8%75_fUfhvLpk0M>CL#y z>8ldHJgYpDIVe~g_CmNKxVZ;hGRga1b3D+Diy1o6!xFZFaX6jmM2hTB>B{_Us0md6MgvHnpYr zK51ClX-y-@AW2`qq5^`8cJ;h zhrd3Ued7n3Z)ipLfrQm3*Id;0Jrbn6I3s0riPIwCSShr0cVIryV-uJeM13kC^=2Z% zoETgCr4Fov+jsadZ(OAjjHwV6Fyhs)3z4jJh2HkLMr1trEBW4(IO9T}___q=9;p>C zPtk(=@&1(Hal<2gtvC>miPCYlgu0I(p_6wl{7Fszo)sjA-?Z$$|G-c7?o+%)ICRs^crfCWl+0}_Jt|3)1+4b=Vk841MQ5(DIE1LQ{! z0#a0kJQoYGqem<2%61~muuGk-0Qh(Nd6GR38F|>)bcamghqm)@T}8{m z!DrXVD%gL>X&q+b%;vYKjgs?&p488*wV4loDVS^9bvJ`(v^dCW)) z26V8EvS1#vd{%vhSupc8)g!IG+-G5BMuKP{$~|>4&0*u*x15d(h7TKDgWf~Vu|`kG z^V~%*`Q2L=0S3tCq!Oei9>kIht-Go_)z~TyDQ~U7?x?KjEJRSp80{(8xY}ePz58@9 zW(TQWKSevisNmf7+Pk91w1(F~M-=;+PsWz$0-J~6l@_kew8DJFhUYuyE@SKpq;}yg z?@th0#pbfDSXH4hwJ&GPQP#;2u*hG2t3_5eV3VXi5uGX=ZOiI#xQ{=2TGF}WRDums z!Fy)7qv^M6siR#Dy6_TG>H(+es*iVx2cm8Y&}Th81+v4F(|( zHg=e=75nHm&QVA4qB!xXTUgs=l@HSQMoIKoCfn@7Y%}KaF}G!vk=xC7GsSw<UOE+EW6eI2NqS>kX9MBq=WS$6Ye=GtH5$6QtIhqfC%MRK^x1cgx^ zuAP*s6$CXHiP#aG($*}Pqq*x14km0^vV>BGg4<(mH{JI`AJ1>@*e!bAc=mIzoM>O* zc#3zdD#0zOVHDv=aP#+8C3D2X2N`q}>0*n9rf%*+=%iJ`i3cA^-nsp-2@=oMb?rdp zScziNECjJpS;B_mL{6~m*!LKOTi{}7(69r&6p?Rew)UvE5oF8Ub5O#IR;=8IpMSu< zDR#GFOhb%p{&F`O6(tO2xANdT2VRNAjUAq7AE6Lj={`Q4Ks(w z6iYDR;B=j8T9oh|RTqj;nPW_NQ}RAX`hcxJsAB88)+cYD3=nWLo+TIvPyQy~)(NQ@ zTQMvDrPzHUk5^{o4*xT7fDq3xalj(*y^eoc##lb}PN8!a%kVcre5bZ?R5t;QYHq^L z2LZto$Av}%3i*VgGaKbBIb&i?w009;sq7BcOTmMG=1QPkJ}ayN zXf6An(fV(m-~Z%F7=Iv1{^3eK!q2S$T*=Hllcta|>?;lo?*<<69zSum!KbLh((Exv`Be)y9jB6kVP+3WTh zE=5{7pDqT$HBs!4DMPaJrz^TvQ5G6;G|liYUNW!ajg2|v4_e&R&?gdpk%dQUrHe^> z2L$LSl>|2!r_A=8-}hBbizZ#=P7S^oi>Fv zU?rI|DxHn<`13k{2y(*nX){V<36^M10uQKOp70FsNgzpRj?@41lL1t-+YKUD=M>OB zt5{Ud_cLQ*AWSv&*m^ERe^AZGTV*&B^b}W=;a2k;+ZA3v&v5hwYg`ezF*wKJJ!6@L zHp(~1mx+nBADuijY-y%{7)d8hld*q+Ts&+kD7=YP{}5Y^C)+b#>zFf-?3h`YHi>!}zpOHJGZWLX0Sw#WhMc%7u$@L5(c&u& z@rm#8790}cnv`%Z7&!D*I zoZ}JlII)OKC0w99-K2)bg78KIKMA%tkdE7`b?R4MD91bf;dKdN1hh~Q z7ujt?VIw<@FO8(xAj#O>_{4AtLKhG6i?W-r#G@h$r)KB9Zet{z{cFz7B|??0N_ZV@ zb*UMXa>2gV6-qj9F%nc~RU-}V=@Pu?jOH`r!=T68wJhX(S#kQSr=oP{6!AskH4;8C z%NcHsGmMmz!m<@!)jV+p;eC;c-dd2aSqI?wJ4l6O(dS`ur}4%psfS^1ybe70!f=`H zDUC>felNb5_(22)>^yQ zX9R1YOGy*9aB}P1b2+a{;*KCXw$SZp7Qst2q1kW=Xqc$A%A_WHtIS^LHJJs4TKeOx zV(^-EMl!w$?ahqjgZgl-rKV`nIp`&fRrF3bp`U38m$&$qRWAkc8eHsotxe53qbTou zj-^6yVT9}ip2_m%F$a*eM-*0-M{Xh}_!h%{owJ%i+A4GAFxj+O?9s3nVo|9IIopLz zdcw51>UHhpDEA=JuYAnY82YP2cl)R>*4`=ij41={N6H1MDpMR}7qHccDWT#%xmu{& z9Gd-!kw#ZM(cnb5hjUAa$&qrW!kw=21n{e&p~Sgqr$(4)={obWV8LE+(fE(rw(C(R z_+$b?B?e^Pmi)2I<*Fv-`jZCEmW3>>aay;nf^6Tt-YETIBZo+7vQ%;Nys!D@_0ikt z<^`zJ7U}U#!F|nry#o<}LA3EVgUAMbRt%6%D9zZw*_r+?K6{1q8>Rv9Spnoe#lQ9c z;4|u+{9w`zaV2Ra01- zO19q0C1LYFHPT<9-J@^VxpbR>nQWpCZKY6LaF;V6{pYtZ(eKY==JD1psAm zfH)4b-v}yjY#Qg%@_;xF_+85_X#~Tn+K`jR{}IQb_<(w^C$w_~I*Y}+UV0c>ED!NL zzc{b`rYLjGMv~SF6||b2X(EtnYAN>14XQ=W=f5;PW2>H`K=uX zjIY2kKfq*%v6oJ`h|o(IV=huXHEYAim)LS`d-UM-G6)Fc*d;ewB2(5lC|V0bUyIHH z(?ddLN6wDMZT9e0+C3m|!pSy=b30>~^hocl^1zY}Ej)Zv-nw)3HoGuP6XU#n8rvsKeAu&S;x9BwlXQ~qV?Q!1_-I4trG^E*F!3hXkoA>Mec>;m zQ=e2Z3B#cI!`qNfI!Yf((NkSTGc7SM-oe5xLnw_W2nQ9cZ}H)(lMEa@^~b_7w63f3 zDL=0wFxXNmes$7?PS*;4*L}&@PhLioKNfNF``~Q=8=GPJgKnjsI!S!lwUcB*JmAw> z-7a^}Z@QhtjoyG{1k*f;T#@b_Zy#Z&57l2dtH@kIz(A{nS zi7ERJ|6AA}{~N*KhyRU`Qo;rcOm$fbq5A8U#$z-Gx#+p0J!><$MqJo9?u78T-((sp zspdp_CA(o@F*DNV8DQnGnC5GKziW@nV|vj?nhhPgO6;^)D2i;ogo*UZUS9k+pa^k7qECdihA&%lMVKbP8cc<2nNzA+ozM6MAKYWTU z7rD+p?Jc#2{G2N_W&Rqc{RW#h%C)caeLsy*cNn^cgB&22Jp7r0!287Eb#n+5_4ycA z$s!f9ovml`HkJdn_KlMlcF^yIHyhGE($fVa3<|7W4zbVJl7f@>X3(!k39^}RWQ*k> z->xGggyQJ*2L<*bW_(b&-+4iGru|gbl*(snqyajKHhCy9m(J9^UZ3n}uWYo_LBB=< zct}gp48oT!2uMCpp&oj8uZz$|vejE+ghz2%F(&Z%pLkgkuR4(5xybZ_MJ;@$G4fv{ zB6Ccg?^kJV1HukW4=8PDPofP>_~(k?Mq4tQkf2=)CqF*o|FA#{%dmoJ!vC6q&Bp~% zS-AXuwr(Msk^OXK_|4tta4lwP4&cABmkr5u1NcR!={M5s=x^Ie4jWzV8eOMzIV57H zPR}kQa%W+-(Tf_Ko`Fo`!q&?Z4kv%E!#w@gYNSg{^^2*>ipd2|OY@h5I9(`CY{`6Y#%3;y51ya;l^#kNJ5*xC~m!42&?*_GZ32xoK zIpt0MjCi!`sY77>l>xt}p{85O_idfazlT&zd_${+0p!fV{&|z`PYLv|O}c|WkO~^| zg#Q;p5CBr4D(BtD@34l^OSsQFe!%PEy28rJ#Sj(0NeN%!7KTwsRQd3we*^FQiy1>q z$VJ+$Pb~@`bFgLY{dt2X{IL8PkTDP)a=Vmuv0|!IMjokYP}^tPZI?#GNyPlbs(Tv( zQMk2Pt%K)<3^VU&jfAjb%}!rjyk?+sLe#sRbjE7@S;?B8*wJ)kYMW zvY#Fn+T#~gury#!UI?S@bVhL%&r;Y-K4_#bq~1!Od_B`(Qqd_(eio#55l<)Y5hx6x z(FotdZ`aeX0#2S0`AzZKA$Xms-1ODcj38C!EWRMfpg`r7z5gNT{?3J3VR3(-R!VmR z`x+tP70|`_-??jI_C}=-%E|xb0s4~#`PT#V7v+TJKPV>&Kz~CyQJ3RmNYP~fmx6o4 zt_`5zE=eHjo%$Jsgsg~Gzi9AqczJfpDk6a_A0+~v@Eg70LfYffU=20uYz{o?6CxGs zD6FztqcI|d8l%i?zF9KE)Hh)AWg1;<6#hH#6m|ie0A+6z?`XmAVTS+(Hy%L2Js?hN zbtY4Af0e-c%T<6Hs=X9+*fn&73}M0hv@3(rtpn=?#RV>ojvJqstHA- zRfO}QuPG!kH8RA@+=rX?azZhDoomOx;HE%%h1|jEjfDAA&y0^~CIBYx7>Ebzp42nb z5w@5wwq$k_E;?{iB9i$T14ks7YK_^x|u@R9A9?OHu$Mgloa1FSVR>aIVnbx>Q zCO;8{J2&@-E-AS^b3=BAR``mEw8j#{m40_!7_apeO3fL4OMuA6gMJ5*Cqb5{c?;EZ z>vMH(PoP-6zQ)S2IuC#K-R1k+^o;J{VnYjS#l{RsSoFG8d#EtC(MA9`C!`~~be^U`IB#9r zz6InXoUryRQK*LjC#8nIHuQuwd7-egEki5_Xe_QW3s`QH>y_nD)1350v@B6p#B0);^iw>JH*DG&tOMjD zlRVL@saPqwbi3ZeWMf3Hxa#Sa~GSLqYe<@>4`9DxCi#-0$1mUFjU!}5^PLspxp>R z9Xf4B%0z4DBd5*nCb>iAJJf%cHR-MYz1ZC~Y@|V3>rQ=>qtf1)v28qRHUYLX=`2iv zTbtVzvT1ObQH%VtP2$+;EXp2}DAw^LPSPo`V=Vw6neNr`Y@Qf)HYms3l(Lh#>-bxv z0AH?8T1ttF7~Q75myD}0u{D9 zuyFJjkJY&Jq|ICa?S%FxfOb+)YLhNdH`zYSx~h}ip5>`KpESHA-l=FYa?C`omH=Ha zCE(JyX0v*y)Nr&yfUq|IJLiQBf?OW9L23F0+L7;o>f- z-7*o*H$N3EL<_U5pA9)(ESo|=;02#DF^;tBV3sw;d62JYZJv;vlpq-@W#oC28 zH>{v|DODaLF#Gg{Hs5@mqzm8Vl?J z4q75jfp|ijcp_nxht#viPT@0E9eNsy3P0$Pz1G}3ESj=T3l{IDo{76Q{=|FDK#c+? zHTL)JK1y&*4y1c#LjLpHCcok_ysL|Mi~4!T_A)!y z3sfL`x_evONWRVVO1)KdxjulfVTV5na{@`9B7GG}$w|Eu9-IVilTd%D zY9#?&$%>3a5a*ulz%ECrt@2d%t8dilr_2)$RwzDNJQ>&eo??)!f*H8a)+csMmygx# zBs=yAJoI7Q*#|Eea8Lj_rT#M{SjB$bL{@rM{kF3=zJr6JpLU3V5sBN1z1(7o75G|6 z*jL0Aj{}F#A=TsHpzSNtuX;mrQvAvw_X-2ppIyV)d7m*lpP&Zl5r!Y;kQhAdbp}AV z9ehcS*g#G{jar>ALdxB^>hLc0oE2dldG`}I;5;x+5QGLeBSQ6pMo2uvsvQQ&Zc)Iy zJOl^@_931xMWeg4=fhr^28njy26}I}?83JE404P`Ou&bgXiImxIZ?87xkK`Nr4Hun zz3~`7xPmEGz2vXacAwD(u(RDa6%d!{CX;X24(toN>|cJboloAo;e+|&2SnThRU9v# zPH}@){>F`=+!3ClCV@S(Qg@YsbdGigUt?-=9SXRfipAg|0Cz+gV6&0i|6bP`(YSra z+uMG)6!`pW;`f*2)$+#zv_MG%3z98%2$2_t;jnQp7j|s7FK_pH_vVArTUsNVXPoBX z_JemZ9FV@6aV-ea5F8==&Jfx>B)UNN(dl2gRac|@KCRvpb6*7TNmW9$^}Vd`Y)e=8 zG%h5|ltp8M2BT+_@tz`ly)X^~wqNkAPR{|dIa7$AzTZEmC{cg~z)v)pGlcweppngM z#|}l?;s;@G?#Q-I4V8{5Vca2wGuAy-P|Vp$)vX6_R7O-aGuh%HY+Tm`{fP&~a$zh% zp93#Mq&iM#QTlT@Xq6Q2^LFf;F|N*k z=mG^>mA5=GJ9nJP8uOLDWAD_W=VRFW56_xVAyc?71r+WG$()OW?_MH6I2xMXgDS(N zBVC3^k!1lW(zJpXDM_KKnX&&(<^r+-_%xhW2Q}@ z)S;TdcM6MOL?As{;dxM@pDx`PC}C89M$>-oFI?9@alU^Q-GANM%1Hj_*7kuG z{2#9C$$xQOKVVK#-hF?e1%J7w{pPW;pJQcRX8Ln$8~?9c8(IIo7}l>D!3oy5w4f|s4@u41aE4@r*z3`s z-u%hcbx0CP^-b19t9psavsm-CIBK~SxG%%!Zk1&uKed+vP2HfO+V55JD0nlP`>=VS z38Q5OPGK;@V}n+YZmpY-bvq_y5wmq~$^4KA0|gntF+RCi`vM+fX6?rw^lAhx-3PJ; zE~c(@gC*80Cqxm1(z*pv<_Tyg+kA;#YNrk>G}G2e}nD6XiK`vGdk;` ze#)b#>}A0r{#8DUuHU5w@0T)pw4Xin)VerCr9W28>f5i*aSdeQNf1SHSa4b>3Qb<= zvR#p|jIbRbEtbocr;wWZECjgi5`0&{rA^K1uS;9O=>;)@lx>@D$o)da%CqP({Kuuu zc+!f9`I>J#rC>Jfht*eP`KC6Gf{@)dDxZ9OZ7#xgeso<96AP43%)^6k$erIXVto*T zHu%w}02Gujc8pp(?LF&~=y%{Q2%3x3gKc{M@VHc_vsSYLUW6_GP;dV=lk4-%;kf`G3$o-iuo!F0Hon zT{Yt!A>lkx7GZSIvB%j1jALR0)9{DCHBOerdm9V~D+e)5v-~?GP?Ik#ThK}Z`hz#~ zkPb+bIamh5f)oTK7g!#*7VdgAGN<>(sGe9pF)nBJjP18ItVA(KqEQFE%4jtB9)4c(Sh#+Ek`&`|`E*F*G2!RSsW!z+-{dz@CsrDk*j ziy;J>O&a)C+V;Fz;tc^0FRmO2-DCJqg4x?0J};1j8)w!$P90oVzcB*ZQNTGZ&YZ~W zaxgeGr+u+#9|e9qvaQmb?%(iqWV`@hYtAIkHgbOPkU542kTNN#OHz|+{TGEhxX3q8 z@^~ex+R9T>nIFfa#2Ads6id6Ez}wa4a*Q*DMLM_i%^S&sA|=V{h9oj&l1W?XOWV>5 zb|~R^YW=@B7|vBmMWYR5qV_ zq8PwA)BFzw_n!gCSkfQ4f!8_AiRT6MCzlYZc2Z)8Xs7G1t^e}0h!BCrIHUvMf+0C1 zQ5-WA9sQFwCLQIdc_8tD{+oSbnO?C@UNvuk~A-h5wh{z=Ri3 zw>|UqZAF|*d?9Ic1-kGDs6No9Ha@_T#XCMA3wvgR;sGGRR&F$`m}6nn5$Xp(#mi zV`1>Rv#c7cEd4wC_wF4)!CmR5gRMG&&Pi&E3K)TGnJIM3)T7}7#;L+SR`nzk4lZbi zx&zET_aZwovM0JZWKvzjwYwX#IMH7c3giX=2Vk|~E39WN=C)ySiapNB%6U8%#A)Mx z+Euoca@HZYp5=qz2T}`g`@uaFyhX?Xd(5A;|D4s)euO?^Ylkr%jZ07%sla5^Zv~H! z>bIKZ>wqZW+iVI^ecW~5tr3ZM@@-A$tj}Td?C0s^I^3R2)ME6)=(VU;JD%5PtV}WfmlJ%s#9Fpd>Q5QIt@IJj@84lZCUa-NhS2^fpi84ILU?bFtAnMAO(T!_?oLR$9C&S+W6(}c|E z3E!$vHO~X4A_OO2v?;3SJ9a zw5CIO)PPg9{NLvqZaMuz=RCUN3;TkO6qbLEu%iV~p-@0+%@SchM%6}ykt7=|{f z^{2Ep4Zt=9eRX=J{^f_JESmvlt)zA%R0c7>l9(y;VU3 zmF^W)lR;{0O8u2JRp$7IS=Y9|6ENRs8ta zc=1s%9h}oFq1K^{K+D(?5dhz^m717-8sE)ZmBJJM()hNNLHAx&5VMFu4e@=vkF)UW zshb}6RN20&cp?*3Ovi6CW$!|XkPt%I_^cG0W>U#}6I4njfNcr_r8@E1q?>QL9;Y0= zQt7c5r?t|9yP3;+ydWmefd&dxGlIJMN9#8V&WIR|U?_@I^&;yt>*|+_Lpnl1Ie4^n zL$LT5JoGC|u=!qu12fVRaeW}zcwt1$;*wQVZPu}LZ+mM)T(jdc1r?Q=IJycWan9jX zb2ve)qe!AoD`Y@s1&k@x|ecTP_+!PErxel0J z5E=Nc1MgEWb6$g9T7&1Dd?xYWJOhgk=~_yx!nd42=N|I2>LZL@8$9Hj^kR=yKX#vp z%T?dRX7=fB*qNX#v^b!MutjBNLAtAtOk%qH1ja|N49ill$s;7;&QB+#dep-Fwbt zR3blXC?>G6w2e5?j{YoJO_HPZ___gvvkHw1w^>A8^#}Q)c3h*Gjl-n+PU#EJMqEe& zWJMmx?W>EX!dhTf!Js#XT}7{}HHNP@D(`|#Lo)%ZIjZC9^gRTb@*e7rKWX6y(KgwBizWU2{($7_fiD;jk(g7NtETCbJ(*@qe#$3)t0O+=6rL0WfLM+!Heaj z?t?6_d~ASmiprV!C_+6G%vfC!s@I}2?!r*1lm5DJ-FXCgn$`?#ExRdK5D3wf(nAu; zuE-L@cc400RhaDt_jhVp89B8P{pn;Q*(6JER|-G(`U`Y>*5WR#I-((^ur!xymYFi8 zGQ>!|<}W@Rw0FQ1_kWM)j;;=7s6I}!zmK>5i5}M}OIvQUA-1C*c*^+$xzSWF99)_q+u`9R87&H$E0is%cKw#?YM7I&-+9t`$^aWvg|c9QE01q;%o z%@zb2gjL64=agqaP1PkDaZ0AiDhHk~18g{g~eDq9o$I^n?Z+>2+T3#Eh*5>{#|5Ip6!#r?g) zkTRD!9hG(ivV)r7+jjqs9LYd*o?vO-aCaFI@g7n3mRJtFnk?02e8sI1?`zcv9y#9I z#Y%r6%)JSqs?irAICRk{!rWl|`O-9Cn@zK5rkBgq^lI1OrUKm}gK%#wQ=j;%_>Sg< z^QO}|lA0Ws+B=!$U3Or;NXG{Stqojb?F&1L^&fe~C%f^^I=%(@287V=zd^slSKJ8wV{x>fqmxzf%C zwxM%uXN|tEJ9h4M?(34DfM1PmxxMXw<(cP|uuqHaJr6c!>yhW^k+(O-5AKD0ev9i6S>FM$eXN;H&0ojm0d!&DQa^-8P*-nmsd@0RLH;F-v2%y zgvndvYB8_Zo!?gR!~mHk0=<@@l^suBqwqpcMMccZeN5~#!-$yg15!0_Q1s%Rf@DBr)g>x|uw)e?>_hbFUh=+tWJ3c5EqhnK!=ro8>_ zd_`yPF>Vjgg#`WQ?8qNx&wq9yIpY5x*^z&vF#oFy`GefbYhEl$`hRBsn(#S7>fkBV zssK0nK#+Ke2^~A>n~Ur18SjVTBvwXTY0I_X^5^Z{FO#C|#Y0vC7y^6UL{f|J!%&Lp zx}QD=%yZ1fx#7S9VVXT;CEh1Abq6^t5sA@f-qC7}Vac}`-Jc@U{uoZK$eFosqwUuq z#MQxaOFxl{gqBo}qi#msZm5hu^= z7LQM#Y-sEe{i_z%SFDvx>+yRrl_5S@#Cx*|{~r&wOcH zwqnF>T(sby3CAIPM_S0plK~w@_EFKInJHX+q$v?YXTCKo=71w{PCG@t9w`~ZE7rTz zM!m-Va&(-p_2Gm-8ncfe_<;A$Kg6p4OLM4h3y4*FY$U%Tl#w&L z_lRA^OJ&8%4V_4}>MsAlzmr}BZd;5I{Tz2vxj{UNu0tr@3Wq>|n|__XV)$rv?V-KX zjvGsb?6#wW&dED~9BtsnYLkIy__}yVDeWv_+r2d8Y%t=?_ja3@y2GWaP}~{EqRR{P z`-lsZ)3ki_xSOBUR7q|(B*vOU9a>cmPQ4i`FUAAxf(RecuSWoEYZmAR0thgXcd1b|CChwVsgFFdjD>;rdi$#RvWqa6$_qZzF(y32G4_KI^aLq z*)~W);rTWzL~2t<*lEleqfH^r7Qw;Q%ykmLkfUoZdAFse+rE`H$%_LH znF)xTp$O!fLJg2USVQuq4i1g3}?l$5x zm8{W{7YqZmT!P>4$MccwHm4?G=m-PDv8)3-AP^L;n;~)P(W<#&Wds9K9DdcL7My<{ z6i}?ai)&*Ygdb9u)FL>)@r=6nc)MZnbVGgvd?`9-8uuzCA%mxcZCF`ED$u>O7NhTwXl2wtFL19}$2eS<*Lg>Nr1wtou!oDinCqaT=_L?5&W6YA6@6(sjN&R7W; z<^a`t+Q_57+wOwv-Bx?ya3A(OTUWaRb~oe;?r@B=B=e!?VhjU(ppR+x00OWFnOA$R0-ouWBm^NfQ*#`gabj97>x{G;JUVI?~H8 zY7FF^0dl{SRrUqC0YYnE3w*%!JirLiR~ zK4{kjvFurM-cJI+(s>Mq$;e8?Q{`x$jH=klYg0zC^j9lx{pB=M(|W!)y3LAZn7(qX zx&m;^b-(i+$lmD!eCuqgTXI9@ELK!cpF?9-Hld#yqwbj`rG*q(IeSVSl@7GniNe!d zM0_sQ&3^WLF>veshB4LIS*59{b0-&>unU22N>HZ}_~k%{19+%f|@kw4b-)>8)#O$E|vfMT$njF>W9qDQJ(Evs0; zRN(@XWVS|O<-$ea(0X{&Msq(cTtbIejvl934;cjO%qVRbFbkhp z(0ZofL)}Sol)3~T0tvP3hnvZBWp&)&0#vkdfacLH z!|RXxG5lW*FA?o_z&UV)b#0v>Z+jMWB32n7s~1`;30G`c7quiCoD1=L>$|zEaBG?XNQYv#508{(3}07=w`&$qN%mWdTmLh4k%dYJ8%JE4$=Uw(EBLM-CFedeS$39 z42RcI@5;>Gi)#zXRzq>}@s^Jn9kcrDQwBn3II$n?0XGcco%@dxNrnc6CLvunvyrp( zS509iQiAGPQU-KRJEQ{VDz?!ycM@!Kxdq`l7Z)ueX|1n79+EbnAOh3i}183&i;g-hn(D zvxRi;#o$}C>;$PFuKdK*u-EUH=w+pJ*Qkd(HPS+PW(J9*4Z*+;RExpQpBb!=-aaiW zbf8hbM%AjP2*5mF?Kni<1@ENfBbRi@tC8DT;MWO@X{9q`oYn=}iK`aKQ;r$AuKrSu zYHF#tSX#vLyR<< zHr$Ql!uPOi1wJ|1(}l@jYS!_+R;T|~0g$S$M|-XU%;)Fh;p``zwi6!>yKerTW*Me> z;rRrpj9&oiiGNW4{h5XQt1|uw#UhKp)dBn@kbk_Z|LG_BS7l7Hg6Px5Uo!#_uR5)} zJ>b=M7z>9xz_KVa^$vdzLyA>I)v#cG*?wNzW&J3O0Sc>x-vIR=AQY>+h8A^p3;v=0 zs~?0_9$+yBsQ(xRZsJXn>ZZN{)PH!o*r@z>Xlk=lG!c9vEuTc7H5m=qG(jPFY`{|p zG`3b94epMZOOMsXsln1qK}oqnOSmEzqEr7A#v3C{+O&qDgj+aYAWbsK3452?nE}Zm zSnNwoJf|{)Svf*k1CWPJw~iCNFJAcZxUHC`ckvDUDIowAzu(3Wt(K8AQ|{O?Y__m}5e>|a(WSnYhnbsl^NEPCobnr!0VRdd*S6TZuyEs#aW z)q_usi@R0^9>og2pLKSxB7Ug~N$!i_?pGu~nO#++c(W-IYLPY`TIyv=!Q-&SDP%vdC{<|+lQ_|iIgg-!iQW*il9V=yLe-prC3aThppn}ob&l#fwQ}pWC*ZvF zUSru@&;tNPyMI48>-7^K{iv3c|GrD}C(H5QQgDEiE9+xyR@S&f;1J#9AeUtx#&(#^ zu9ow2?lWan-7%n8&KBcM3$L+gnCd1AWZvL;h{n?F;EVIG9t_)$lad;&kp_wNE8rX^ z+k0Kx;$efMLll=6fXYZhNJ>2~^lWdIqKkpk+fw)cYz74R8-2KM6;fU^OoV4?^RsoR zQc~A)O3i1eq#Cn0rW=^9QN5jI2aXZ}DL5}JAi110mb()a*A}gPs5kyt&EQ};Kpd9f z6Y6Ga+twaR3^n3#7%vss-bU*661pvkzdaxhi>j!>7#6n&vKH*pJ_MC9Tq5Q?a8ThR zg&I;qD2e(Jhut|8C|{P4QVm&({)&?ah{IBAjZV2dS$sDmmw7llT?%C0w;>Uh&=1Hu zu+a#{TtmvX!tvLD0!fpWYL&6zOQ$_ggsr^p)zs1puE`wPfF=s$yC|HR&2xYkLWNTl zZXs;F8qQjU2{Kz10XH%K5dy)SvULbYmXw{NRJtTDi7$%Jhucp3t0m~?DiyD2A$!rL z3D^QOVDWHoe;qMy!K6hOqv_h#QEbLCO|1euyj@3(-Ux}8Jm}=Cxbd>su4ka9UT-2I zz$;gYwq_uk84_0b*g;4-CB6I}~>)S&)~%4`{w?yt2YPddoGmp%umAm zl~PmHs4W$1D=_Dn;PrL&JXII%-MA-7>xfGYFJ0C>7of2LQxUt)9vGQAQw%HhX+!w; z8BReVaD#;I6OpgugPX-h9C)otC{7nLMj7vUt|xv8mV@n85xFj$YipjGgKAeeYv`Uq zXA41QhH+(`dNQHBGVeEV?NK+{ZA`BqqhJzuL2g*DYm4x0cr)P6^SPb84!#@No#vnD zc{$QfnVsJ69{BEht)h(As?8qoRjpALsemv9G7UJ}%Z;d^iUa|F14%|Nqqn0+M?)0k@?t@c0&B8Q}5 zT9S}U5TaIc)EO~Mj-5EjvUUwQsLInQE}O+fb*_1>qf%q+hOic$Q|qDFS8nOIAqZzv zGuqw3npZI*KR_6(OHRXOmF+KSqGHPV5v&7^o7Z^EIw+kxH=hmg{FTVgWuXhx*ZUYO z1nHkPnL~PZGWbJRe&wTV)pe$7^j|ooYr2=b#ZIahkA>PrIq>3y$RNT zYJSSxU!GV|{or3scOW61CrS_vP=80bn5F5)NR*)5{u#wbopqDKscQyb@a{L}IUSsm z7umV?jtJu=91D&D?8_ut03wx7U(FMrPf7RPTo0 zDV>?C9tT^MSWqEh*AC$MTi3)HoG(I4GHB9?lPVYerhabF`R))SGtIiU)zCwU>HS$v zcDlG8Ehi;x&>k>|uP|3ag4UNDkfhm+uN=S}+~9Xs4b$}Nss8pch==)W5bqhuqfU1n z_tHHrEnYMU*hSR&gjBTP{9T<{MZ^2h1_&YRA#r#Oc4yZFinT|I@;V1 zi{3_JkBO?rvwx`ou-8n>6~HqSV$!o4$Fp)IpVx{x0eOD=@5$FzK=M`Q;qKSeZnP~^ zf~Ly=j?m=K*3Y&L*@DN`b$b$FU5m4VCe{eGkNcI4lxRLy7Z-H&!Lv6Ev7XcycphB_ zx$>P95Lv~%K0{(3bNJ&V?oa12>5dHM`~R7IZSC}Vctjz8e~VBCS!452XXSJ3Um%D+;twy*fIOJ{E4~DyZ5nZt8mN^Ge79v$ zf->U5^|Fu~0|M-3hatDjL^>fhxYY*CDTe(QXV>8OSFgmRjZ|(YTfXj<qVFo*;MlVK@F;VHz4aHEFkUa z!A{S~b1ox-?r&-bBv3(~_%tBio1YNdrj>H6PaaV@h;qV@Bp{Mmh8g`J4WHeBq# zR#V7?qRzP&Z>n+Mc&O2t>Y2dj9bsNLHrh2|e+d_u2pyk>Zs6)=HHJMm$;C&I%3hIDU-e-9G}aq=3L{V`DvpPieq-(1VB=@}$2< zG#a6k^jOv$HT!VLq*TtgqIrwfUs{_5IAEN-LUVp{Rd2})nMRk3OR~*E1sdg9pHUjW zLIPDM5SjX9W3}tRhI~b>g{~Ze7*pmf7lol9J*B7;ks}c5#S%Cy_wdJ@+`9TzOECT| z)ME*-rkE;0Z$Hs=Dfw!vd-wNiiW$r?zjOfmXSDxJ(fPA9l%`@1kb5A$ttY=h1e`~c zA_DU?Skg6Ij%s`mhu!zgmJHwlc8}(r1xNobNHABoT_L?=)zi`1QsLv_Lj3(%m*2w4 zL1re%e&(Tu+|{x*+p}-al?Q}X?ym}a;5c}9Bk23i z9>;1WC@Gz=ls7${C9bA3q)lKvQPBLlJtHSJZ)5G^7IT+=WOjYQEc<@ZJNqUzup(En z;QA*1U|Q_nu`iXR>D#ePUKa41+CC@~w8O+@Wbvs_<1dGurwnuV?VqTmHihP=8KV+u*te|GiLZ4_ z2=A${eFCsjYo$OWrVuKHK<{x79=Sko?|cYZKBfp>jY{1U&in*(_27@$->uPMh~vY# zrb3%S3}Xe`)Z`BMU6Kc{7NZ!A{QZXKCaPdDqgh{=dSjX}_$Dx8- zOu6-DYBW~vTlQb<*K+7GPk9Gi@_M;Vg6u4#?CRAZOshOlRasIHp|T_r5kyK=s1}8- z4vI74JKl;=^=IKK13SOz!G4;s8f@XY@m;e&M!Oagskmx3?gyS=ziRo8C>-~c;Q-{m zrW9$?1P&|z3%ppVw4~xmW2{mL226J+ilIq9c2opRAWXizC)^}NO)>~kkHlgwMNf;L zqE*g3;r&SuIwEGC4wj7_U198XQd~`e4s`cPX!z(7wlSYD<5eJI&ZIY9;0DfY%18)a zF+eFwBvOT{x=Ub^roO=BD?TM-Cc(zkHnZB1M6f){H=p_&tXI=DBU?oHpgSton$|-% zn41|r5|2}h@EWZ` z@weD;c9hB+R#Pm8Qx9^c4tk_;FzDz}2R3n`7rHz~i2R*(fx>RG<9#S#5OcvBEE@eW z{0D{?>-H#;m}{VBO565&c%gJ<*TGW@Tb-%zW~+(~qFz9A-QrJlns|cBVq#0%+&Xs| zC=K=KYWD7dIA!)Oqug<`oQ(~0yiq@;GB>x?ab9Ma5 z;_mOOOzGDHUU@*9!wC>QKmvmN^utnL&c@QpmR{e|<|mz#xfwwY=qHLSAi%HufBVNR zMqaj;0L6bp`UL^*oObe1Lu`fA(|P`>Qmc`aQ*&DHU^9Bn zZZ%adh)SVJI&lgjiZ@!lqDx}#!Y?YVtRd$LQhHN3PLuBJpVsd2P`zKxe=p60mnm=N z@J~lT#pbr(2k4&k;ZdE%Y)%TS&Npm*iE{{qL(g=B;2`}GL4BcNGhq!8_Dk$5c^(S! zwA)e*gqU}>A9S5#5*f<76Yt{4t8tlw)a$x6;h>8sf!v-t4HEdMO zj-tGSR;)8%O#d{qgQK#xUwC-z)QWk21&qmo^@MT1ASKVgv`C@vx32bz*h}2;U6K`$ zklDq4c4*aMNk^RUBD~zyrtAPbO8-0LlBOZ|#te7{*#H#+;=ezKx>i>I|6}++U%fO% zNsAQ*#5ME-T+mZ}_Y@3o0ZNLr0NFjAvB^95S%$?O@>1`*(AK_G?pXM>%RDMna%Tvx73b z2b6b@m4E2Pfp2pxLaK|Y_et*G5Qx*L_k=9tO@B^SOGdTFND|D+AVXqC=efD@=C|f^=%? z09l+P)uZ<)0Sge5fF_L4txc&1esuBXcZxM4n^FxiP=1lN5`Mg+2t6h|1h56^6OB1|2hDvHgWPS-!Eg+m>JRKiur^)}UOcw!3r9tE_cV?C!k|ufFu{ z$_#rwaA(q!Aj6K0kq-QV<{}^BZw!C= z5f<@L{dk>5Iq8>Pu=^OV=J>khr7KJlM`#*xP7k-)m-_qPkztIHV70+jk@0Z=h%2 zifQI2)qk7iWStAOGkF*gl$uZ{>pe!*w5eQlG$Ki!{L7$2w=5#EPak|@>NIGmi}RAY z?VDG4xH#qqw%eI1s#o^!nwk|n=kc4T$C`eKdzSIaV7X0R$ybiT=&{0J-=J)Huuw>z)YFVya;?0m>S57xN5UKpM< zeUEo#T)B-Z$5J)P#93u`;?Y7i%;0LGr8i0hqFQUiRnG&t#gc z*8Iv2+tIA{H#weyD-=Wc=v2sBivrCQZFRW9Sr8IB73u~KOx~i#70!nH=5;D$w_BnL z+p|kTCW33rd9IBaOaobWuqfKKNexBk{2@NW`U?a$9M@iQVkya~^A=}9FP-`3%Q}EY z+$Oj!G)VuJ#IuX0ej?^-vXwWW`!WZcn(J~?0RHcOscp@JRI!F z)ofoSltee4JwEJY8I5q0TgEc)!^`oCH<26xSf4?9xI)_qJqmR@Wn74PeW4Mw zTfBPOI+6uyx5GZHFuJwq+Nd1u2Z*D*)}mR&N3gk1Bi8;XE$jo`<({)bNUb7_<>r&7 zl1E!VQLdGR3SyxiIs~9a>nVt1DYDFEX%ZRMXJ2F(6Jrlag&=}C6_-3Y@!d(GA0kPK z%3RF!4l%-S74CzX0V&h;a(lcN=b|Q{ossh|N)LHMUgm**u8wz!EQR9^Oh;(Vrv2MA z4&@ROB|HM!>n9Zb|mWiO3$sI<8r?HgGR7tvI1p5`u zie8mZ_H}|gtY9?6?aP?TSO7hQ@If)FrwTVBEpQe4K@2(9Q^b_cIBQjuj{d zjdS`^QbPG|6-q%19DPw9p+v7kDQI_-zM_dxJ}f{fXqlt04k45)TTu$yTci)TCzOSS zEJZu)(eVU*tTiE2@8J>9AuD~#EFom162_mAFhd{%e-i$^F#b;Oo9b#xDLT_6l6czM_U>KuaSWp7JPk|8LzsMt? z0ZK0(CWQJ%)DqMw^pa9Sd3lAW0}U>E4H+Q>-{29@3XER0K?t_DSOTw^!CGDJEF)nE z1oCEqfH&tz_kXyPL)ytlwP+>ROcl|oxo^C+Qx~bVQj_=JkzvNrGQ-Ok_;?PTgxh~I Gzx@lCx6Yyf literal 0 HcmV?d00001 diff --git a/README.md b/README.md index 6e02afa..6c1e6f9 100644 --- a/README.md +++ b/README.md @@ -1,133 +1,8 @@ -Project-2 -========= +Questions and answers: +##You can use the performance.py to execute the program with ease. +1, Please see the enclosed performance.xlsx to see all the result and chart; + Based on chart in tab:GPU_Naive, we can see that the performance of this naive version is very bad. Much worse than the serial version. Because the computation threads of this naive version is very large and data transfer from the global memory to the SMs can consume large quantity of time. That is why the performance is so bad. -A Study in Parallel Algorithms : Stream Compaction +2, Also in the tab:GPU_Naive, we can see that the curve of optimized version is better than the naive version. That is because the latency of the shared memory is much shorter than global memory. But still, it does not exceeded the performance of CPU with little test data set. -# INTRODUCTION -Many of the algorithms you have learned thus far in your career have typically -been developed from a serial standpoint. When it comes to GPUs, we are mainly -looking at massively parallel work. Thus, it is necessary to reorient our -thinking. In this project, we will be implementing a couple different versions -of prefix sum. We will start with a simple single thread serial CPU version, -and then move to a naive GPU version. Each part of this homework is meant to -follow the logic of the previous parts, so please do not do this homework out of -order. - -This project will serve as a stream compaction library that you may use (and -will want to use) in your -future projects. For that reason, we suggest you create proper header and CUDA -files so that you can reuse this code later. You may want to create a separate -cpp file that contains your main function so that you can test the code you -write. - -# OVERVIEW -Stream compaction is broken down into two parts: (1) scan, and (2) scatter. - -## SCAN -Scan or prefix sum is the summation of the elements in an array such that the -resulting array is the summation of the terms before it. Prefix sum can either -be inclusive, meaning the current term is a summation of all the elements before -it and itself, or exclusive, meaning the current term is a summation of all -elements before it excluding itself. - -Inclusive: - -In : [ 3 4 6 7 9 10 ] - -Out : [ 3 7 13 20 29 39 ] - -Exclusive - -In : [ 3 4 6 7 9 10 ] - -Out : [ 0 3 7 13 20 29 ] - -Note that the resulting prefix sum will always be n + 1 elements if the input -array is of length n. Similarly, the first element of the exclusive prefix sum -will always be 0. In the following sections, all references to prefix sum will -be to the exclusive version of prefix sum. - -## SCATTER -The scatter section of stream compaction takes the results of the previous scan -in order to reorder the elements to form a compact array. - -For example, let's say we have the following array: -[ 0 0 3 4 0 6 6 7 0 1 ] - -We would only like to consider the non-zero elements in this zero, so we would -like to compact it into the following array: -[ 3 4 6 6 7 1 ] - -We can perform a transform on input array to transform it into a boolean array: - -In : [ 0 0 3 4 0 6 6 7 0 1 ] - -Out : [ 0 0 1 1 0 1 1 1 0 1 ] - -Performing a scan on the output, we get the following array : - -In : [ 0 0 1 1 0 1 1 1 0 1 ] - -Out : [ 0 0 0 1 2 2 3 4 5 5 ] - -Notice that the output array produces a corresponding index array that we can -use to create the resulting array for stream compaction. - -# PART 1 : REVIEW OF PREFIX SUM -Given the definition of exclusive prefix sum, please write a serial CPU version -of prefix sum. You may write this in the cpp file to separate this from the -CUDA code you will be writing in your .cu file. - -# PART 2 : NAIVE PREFIX SUM -We will now parallelize this the previous section's code. Recall from lecture -that we can parallelize this using a series of kernel calls. In this portion, -you are NOT allowed to use shared memory. - -### Questions -* Compare this version to the serial version of exclusive prefix scan. Please - include a table of how the runtimes compare on different lengths of arrays. -* Plot a graph of the comparison and write a short explanation of the phenomenon you - see here. - -# PART 3 : OPTIMIZING PREFIX SUM -In the previous section we did not take into account shared memory. In the -previous section, we kept everything in global memory, which is much slower than -shared memory. - -## PART 3a : Write prefix sum for a single block -Shared memory is accessible to threads of a block. Please write a version of -prefix sum that works on a single block. - -## PART 3b : Generalizing to arrays of any length. -Taking the previous portion, please write a version that generalizes prefix sum -to arbitrary length arrays, this includes arrays that will not fit on one block. - -### Questions -* Compare this version to the parallel prefix sum using global memory. -* Plot a graph of the comparison and write a short explanation of the phenomenon - you see here. - -# PART 4 : ADDING SCATTER -First create a serial version of scatter by expanding the serial version of -prefix sum. Then create a GPU version of scatter. Combine the function call -such that, given an array, you can call stream compact and it will compact the -array for you. Finally, write a version using thrust. - -### Questions -* Compare your version of stream compact to your version using thrust. How do - they compare? How might you optimize yours more, or how might thrust's stream - compact be optimized. - -# EXTRA CREDIT (+10) -For extra credit, please optimize your prefix sum for work parallelism and to -deal with bank conflicts. Information on this can be found in the GPU Gems -chapter listed in the references. - -# SUBMISSION -Please answer all the questions in each of the subsections above and write your -answers in the README by overwriting the README file. In future projects, we -expect your analysis to be similar to the one we have led you through in this -project. Like other projects, please open a pull request and email Harmony. - -# REFERENCES -"Parallel Prefix Sum (Scan) with CUDA." GPU Gems 3. +3, I don't really sure what is thrust do. But one thing, according to my performance statistics, the thrust performance is highly similar to my CPU version. You can see here I plot 10 plus chart and the thrust version and the CPU version is always almost the same line. And my GPU version exceeded them with block size=128 and data size larger than about 1MB. However, the thrust seems allocate memory on device with no time. Because If I count the time used for data transfer, it even exceeded the time thrust required for complete the whole computation. \ No newline at end of file diff --git a/performance.py b/performance.py new file mode 100644 index 0000000..29ab152 --- /dev/null +++ b/performance.py @@ -0,0 +1,20 @@ +from os import* +import os +#os.chdir("D:\workspace\GitHub\CIS565CUDA\Project-2\Project2-StreamCompaction\CIS565_2014_Fall_StreamCompaction\x64\Release"); +from subprocess import call +blkSize=range(1,11) +dataSize=range(1,9) +for i in range(10): + blkSize[i]=2**blkSize[i] +for i in range(8): + dataSize[i]=10**dataSize[i]; +#print blkSize +#print dataSize +#ss=' '.join([".\CIS565_2014_Fall_StreamCompaction.exe", "-b", str(blkSize[0]),"-p",str(dataSize[0]),"-s", str(dataSize[0])]) +#print ss +#os.system(' '.join([".\CIS565_2014_Fall_StreamCompaction.exe"])); +for blk in blkSize: + for ds in dataSize: + print "Current: "+str(blk)+" "+str(ds) + os.system(' '.join([".\CIS565_2014_Fall_StreamCompaction.exe", "-b", str(blk),"-p",str(ds),"-s", str(ds)])) + \ No newline at end of file From 414eaa0c8b16e1304cf264fa97668bb6d0aad46c Mon Sep 17 00:00:00 2001 From: chiwsy Date: Wed, 1 Oct 2014 17:19:28 -0400 Subject: [PATCH 6/6] GPU naive is deactivited --- .../CIS565_2014_Fall_StreamCompaction.v12.suo | Bin 50176 -> 48640 bytes .../kernel.cu | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction.v12.suo b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction.v12.suo index 3db652b824c1b6af7304038a07df70537a3c0d41..736e9e55ab1bd20236d87015811869af0830abc4 100644 GIT binary patch delta 1153 zcmbW0T}YEr7{_yO4DI5v9*oEwcx(K6Cy7kVxK)mU$Bt}=}`k&fN&>(u?_nzmR^StMI-v4>t(MkR2 zCSM41b>aHwz~ZMBR-p~@ZbmLo8P)Tg+1cP5}?_(U_rx`SGl_1u&QKL5#UzD;BWZF2LypP2eK9Ov8ppu`8Ci@s|1f`W}){ z%H*)w5cvZ~WlJbR0z0*stq-#@NC}Lf5ByC6DWwxgDZ7CiC}2M*0RN&$4dQCS5l{y; z-#XO)6{u;aKp~tbzb+r66fJ9e=%q@%k~v<3%Bm18?~>*crRX1%7Rg^>yO-x9HpVKo z!2-X5mJiokUdSXsnsD)WyI1oT&FNjU@2);5z5R4^xXt}?maWF*ezuTllW_B-+pDNd zUZ$)wo0TGse)X4{vURhoTB^~)L_4R+d8=lv++fKTA2Z8VmbaOWqR6?Zs8Qk_URvS} zTBIYD+);Us5gkF2?~n=7&BIh5Q$A2H7i?{mB@-I?BHnSG#kFt2stLo~pDx2uf zP7r0ERYVULU0e)tY8}Hjhc@(dZ6d6NN0MBd?hBj9ePcu$85n1@)@#u$_ii%L&*|I5 z6irPPP;jDITOFuU#eJiAqKKh9F~jMM+pW2yYj*LRQ^OnhVvSMkSuNd}`JjDC`k3t_ iM@rJzDA&ksy?is?uApQd2hd zSTM&zP~_HvuxRI(N@#_!un@0=6^V3bBZ!UXIH$eCL?W!5$(M8PIp?1L|Ihj7hQIP~ zho!rL+kK}4jeV~kGrwrzbxc+yNrDU4^1#QFhIQeC4I#+_Cg;=^phYRENDL(j^>E}n z+RIUoG5r*Cnsz4Ybkp{rE;en-fhIX)EE#c`h`|YFg2U7)sK>*5IcMZp(=!)!2~vuz z#farY4)-Az6XGmB{LZ zSm2ipUt~L52RoF*5$iVFg~C6z6h0_!9_BcF2x-6p?x|7`n>&>!%?Hdh& znr=nneWL;8A++aw)c><@%gl9YZ91D5pZ~FM_QO$!s=&SMG7%nrkC6f4w2!Pzw2Y`x zY*dk^62F<&PFhM5pVO^8nmu5FfvX-PlSgw^*)CBc-N|?|+YK%L8lEeGJ5+%8+F@e;&#q&jN4Nvy=exEv18l5tD=LZK(?KzcXgycw-HrOkB zLLs47)v`hFhy-s+4L;DoNQ^l=I^+IOQevA^hT3cm|O5}k%qQpd%E z+8~Rp&_D|=6es32F=Lr*Eaogru%&ej<0`kq*u;UaYZJB&hnTU|CXOC=ad6$d z0!un)L+qSzSa2Eo*J?d*d!J_Pv>N+UjXfMHJh>wG`0I3-cD`Tq?^78}yMZ3#H8al0 z(aQrC=kIM1AFifa;bdzOY(IKeyuGBcv<^OW2ubqZ>1SvQ)xdM~^60g&LMuc2n6gaz z&`^+MyVNMH88Riv>nv2>sn}IedS=o$j|ArS00ptE@|Cdo8iM>YlRvJ vY^>k3zNJB5t8Z%xR%*d1dVRCrT(9d}8aFq?nqvhDtz;3Iy&)#p&6wo}>z(dG diff --git a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/kernel.cu b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/kernel.cu index 2ec85cb..a7576d3 100644 --- a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/kernel.cu +++ b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/kernel.cu @@ -730,7 +730,7 @@ int main(int argc, char** argv) ///////////////////////////////////////////// //////////////Naive calling////////////////// ///////////////////////////////////////////// - { + /*{ int *src, *res; int *host_res = new int[arraySize + 1]; memset(host_res, 0, (arraySize + 1)*sizeof(int)); @@ -749,7 +749,7 @@ int main(int argc, char** argv) cudaFree(res); cudaFree(src); cudaDeviceReset(); - } + }*/