diff --git a/README.md b/README.md
index 98dd9a8..7fb2ae3 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,61 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking**
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Xiang Deng
+* Tested on: Windows 7, i7-3610QM @ 2.3GHz 8GB, GTX 660M 1024MB
 
-### (TODO: Your README)
+### DEMO
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+* With 5000 particles:
+![](images/5000.gif)
+
+* With 50000 particles:
+![](images/50000.gif)
+
+### Performance Analysis
+Note: visualization has been disabled so that the performance analysis measures the CUDA simulation only.
+* Performance comparison with the block size fixed at 128.
+
+![](images/performance-1.JPG)
+
+| | 5000 particles | 7500 particles | 10000 particles | 15000 particles | 20000 particles | 30000 particles |
+|------------------------|----------------|-----------------|-----------------|------------------|------------------|------------------|
+| Naive search (avg CUDA time per frame, ms) | 0.0185 | 0.0388 | 0.0633 | 0.122 | 0.188 | 0.28 |
+| Scattered uniform grid (avg CUDA time per frame, ms) | 0.00192 | 0.00269 | 0.00355 | 0.0067 | 0.00752 | 0.0121 |
+| Coherent uniform grid (avg CUDA time per frame, ms) | 0.00126 | 0.0014 | 0.00169 | 0.00232 | 0.00278 | 0.00407 |
+
+As expected, the average CUDA time per frame of every implementation increases as the number of particles grows. However, the scattered uniform grid achieves a significant speedup over the naive search, and the coherent uniform grid improves on that speedup further.
+* Possible reasons for the speedup from naive search to scattered uniform grid (two major factors to consider):
+1. Searching: with the naive search, each thread does O(N) work, where N is the number of particles; in comparison, the average per-thread work for the scattered uniform grid is O(8N/G), where G is the number of grid cells, so increasing G theoretically reduces the search cost.
+2. However, the sorting required by the scattered uniform grid adds its own cost; the per-frame complexity of the thrust sort-by-key call was not measured here.
+* Why does the coherent uniform grid significantly outperform the scattered uniform grid? We reshuffle the boid data so that the velocities and positions of the boids in each cell are contiguous in memory; this gives direct access to the vel/pos data without going through the intermediate index buffer. Since a GPU has a very limited amount of cache per thread/ALU, fetching data through an extra level of indirection causes frequent jumps around memory, which is quite inefficient on a GPU architecture.
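+
+As a minimal sketch of the difference (the kernel names here are illustrative, not the project's actual kernels), the scattered path gathers each neighbor through the `particleArrayIndices` table, while the coherent path reads the reshuffled, cell-ordered buffers directly:
+
+```cuda
+// Sketch only: assumes glm and a 1D launch over n boids.
+#include <glm/glm.hpp>
+
+// Scattered: one extra indirection per read; consecutive threads hit
+// scattered addresses, so loads are rarely coalesced.
+__global__ void readScattered(int n, const int *particleArrayIndices,
+                              const glm::vec3 *pos, glm::vec3 *out) {
+  int j = blockIdx.x * blockDim.x + threadIdx.x;
+  if (j >= n) return;
+  out[j] = pos[particleArrayIndices[j]];
+}
+
+// Coherent: pos was reshuffled into cell order beforehand, so consecutive
+// threads read consecutive memory and the loads coalesce.
+__global__ void readCoherent(int n, const glm::vec3 *pos, glm::vec3 *out) {
+  int j = blockIdx.x * blockDim.x + threadIdx.x;
+  if (j >= n) return;
+  out[j] = pos[j];
+}
+```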
+* Performance comparison with the particle count fixed at 5000.
+
+![](images/performance-2.JPG)
+
+| | Blocksize=64 | Blocksize=128 | Blocksize=192 | Blocksize=256 |
+|------------------------|----------------|-----------------|-----------------|------------------|
+| Naive search (avg CUDA time per frame, ms) | 0.02012 | 0.01861 | 0.02178 | 0.01863 |
+| Scattered uniform grid (avg CUDA time per frame, ms) | 0.00197 | 0.0022 | 0.0022 | 0.00191 |
+| Coherent uniform grid (avg CUDA time per frame, ms) | 0.00119 | 0.00115 | 0.00118 | 0.00125 |
+
+Changing the block size only slightly influences the performance of the three implementations, and the uniform grid methods are the least sensitive to the change. This is most likely because, regardless of the block size, no thread depends on or waits for any other thread to finish.
+
+Three more plots show the performance analysis (using Nsight; 5000 particles, block size 128):
+* Performance analysis of Naive search
+![](images/5000-naive.JPG)
+* Performance analysis of Scattered Uniform Grid
+![](images/5000-scattered.JPG)
+* Performance analysis of Coherent Uniform Grid
+![](images/5000-coherent.JPG)
+
+### Build
+Please follow the instructions here: https://github.com/CIS565-Fall-2016/Project0-CUDA-Getting-Started/blob/master/INSTRUCTION.md
+
+Note: src/CMakeLists.txt has been modified: `-arch=sm_20` --> `-arch=sm_30`
\ No newline at end of file
diff --git a/images/5000-coherent.JPG b/images/5000-coherent.JPG
new file mode 100644
index 0000000..13f46ea
Binary files /dev/null and b/images/5000-coherent.JPG differ
diff --git a/images/5000-naive.JPG b/images/5000-naive.JPG
new file mode 100644
index 0000000..c242dec
Binary files /dev/null and b/images/5000-naive.JPG differ
diff --git a/images/5000-scattered.JPG b/images/5000-scattered.JPG
new file mode 100644
index 0000000..3dea6b5
Binary files /dev/null and b/images/5000-scattered.JPG differ
diff --git a/images/5000.gif b/images/5000.gif
new file mode 100644
index 0000000..58acad5
Binary files /dev/null and b/images/5000.gif differ
diff --git a/images/50000.gif b/images/50000.gif
new file mode 100644
index 0000000..2ecd4d1
Binary files /dev/null and b/images/50000.gif differ
diff --git a/images/performance-1.JPG b/images/performance-1.JPG
new file mode 100644
index 0000000..ac7eea9
Binary files /dev/null and b/images/performance-1.JPG differ
diff --git a/images/performance-2.JPG b/images/performance-2.JPG
new file mode 100644
index 0000000..036d931
Binary files /dev/null and b/images/performance-2.JPG differ
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index fdd636d..750f0cb 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -10,5 +10,5 @@ set(SOURCE_FILES
 cuda_add_library(src
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_30
     )
diff --git a/src/kernel.cu b/src/kernel.cu
index aaf0fbf..f76c397 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -5,7 +5,7 @@
 #include <glm/glm.hpp>
 #include "utilityCore.hpp"
 #include "kernel.h"
-
+//#include
 // LOOK-2.1 potentially useful for doing grid-based neighbor search
 #ifndef imax
 #define imax( a, b ) ( ((a) > (b)) ? (a) : (b) )
@@ -21,14 +21,14 @@
 * Check for CUDA errors; print and exit if there was a problem.
 */
 void checkCUDAError(const char *msg, int line = -1) {
-  cudaError_t err = cudaGetLastError();
-  if (cudaSuccess != err) {
-    if (line >= 0) {
-      fprintf(stderr, "Line %d: ", line);
-    }
-    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
-    exit(EXIT_FAILURE);
-  }
+    cudaError_t err = cudaGetLastError();
+    if (cudaSuccess != err) {
+        if (line >= 0) {
+            fprintf(stderr, "Line %d: ", line);
+        }
+        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
 }
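+// Note: checkCUDAErrorWithLine(msg), used throughout this file, is the
+// project's wrapper macro defined near the top of kernel.cu, roughly
+//   #define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__)
+// so every call site reports its own source line on failure.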
 
@@ -70,6 +70,8 @@ glm::vec3 *dev_pos;
 glm::vec3 *dev_vel1;
 glm::vec3 *dev_vel2;
+//////////////////////////////////
+
 // LOOK-2.1 - these are NOT allocated for you. You'll have to set up the thrust
 // pointers on your own too.
@@ -85,6 +87,9 @@ int *dev_gridCellEndIndices;   // to this cell?
 
 // TODO-2.3 - consider what additional buffers you might need to reshuffle
 // the position and velocity data to be coherent within cells.
+glm::vec3 *dev_pos_buff;
+glm::vec3 *dev_vel1_buff;
+glm::vec3 *dev_vel2_buff;
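+// Cell-ordered copies of pos/vel1/vel2 for the coherent-grid path:
+// makeContiguous gathers boid data into these buffers in sorted-cell order,
+// and undo_makeContiguous scatters the results back after the update.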
 
 // LOOK-2.1 - Grid parameters based on simulation parameters.
 // These are automatically computed for you in Boids::initSimulation
@@ -99,13 +104,13 @@ glm::vec3 gridMinimum;
 ******************/
 
 __host__ __device__ unsigned int hash(unsigned int a) {
-  a = (a + 0x7ed55d16) + (a << 12);
-  a = (a ^ 0xc761c23c) ^ (a >> 19);
-  a = (a + 0x165667b1) + (a << 5);
-  a = (a + 0xd3a2646c) ^ (a << 9);
-  a = (a + 0xfd7046c5) + (a << 3);
-  a = (a ^ 0xb55a4f09) ^ (a >> 16);
-  return a;
+    a = (a + 0x7ed55d16) + (a << 12);
+    a = (a ^ 0xc761c23c) ^ (a >> 19);
+    a = (a + 0x165667b1) + (a << 5);
+    a = (a + 0xd3a2646c) ^ (a << 9);
+    a = (a + 0xfd7046c5) + (a << 3);
+    a = (a ^ 0xb55a4f09) ^ (a >> 16);
+    return a;
 }
 
 /**
@@ -113,10 +118,10 @@ __host__ __device__ unsigned int hash(unsigned int a) {
 * Function for generating a random vec3.
 */
 __host__ __device__ glm::vec3 generateRandomVec3(float time, int index) {
-  thrust::default_random_engine rng(hash((int)(index * time)));
-  thrust::uniform_real_distribution<float> unitDistrib(-1, 1);
+    thrust::default_random_engine rng(hash((int)(index * time)));
+    thrust::uniform_real_distribution<float> unitDistrib(-1, 1);
 
-  return glm::vec3((float)unitDistrib(rng), (float)unitDistrib(rng), (float)unitDistrib(rng));
+    return glm::vec3((float)unitDistrib(rng), (float)unitDistrib(rng), (float)unitDistrib(rng));
 }
 
 /**
@@ -124,52 +129,73 @@ __host__ __device__ glm::vec3 generateRandomVec3(float time, int index) {
 * CUDA kernel for generating boids with a specified mass randomly around the star.
 */
 __global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3 * arr, float scale) {
-  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
-  if (index < N) {
-    glm::vec3 rand = generateRandomVec3(time, index);
-    arr[index].x = scale * rand.x;
-    arr[index].y = scale * rand.y;
-    arr[index].z = scale * rand.z;
-  }
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index < N) {
+        glm::vec3 rand = generateRandomVec3(time, index);
+        arr[index].x = scale * rand.x;
+        arr[index].y = scale * rand.y;
+        arr[index].z = scale * rand.z;
+    }
 }
 
 /**
 * Initialize memory, update some globals
 */
 void Boids::initSimulation(int N) {
-  numObjects = N;
-  dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
-
-  // LOOK-1.2 - This is basic CUDA memory management and error checking.
-  // Don't forget to cudaFree in Boids::endSimulation.
-  cudaMalloc((void**)&dev_pos, N * sizeof(glm::vec3));
-  checkCUDAErrorWithLine("cudaMalloc dev_pos failed!");
-
-  cudaMalloc((void**)&dev_vel1, N * sizeof(glm::vec3));
-  checkCUDAErrorWithLine("cudaMalloc dev_vel1 failed!");
-
-  cudaMalloc((void**)&dev_vel2, N * sizeof(glm::vec3));
-  checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!");
-
-  // LOOK-1.2 - This is a typical CUDA kernel invocation.
-  kernGenerateRandomPosArray<<<fullBlocksPerGrid, blockSize>>>(1, numObjects,
-    dev_pos, scene_scale);
-  checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
-
-  // LOOK-2.1 computing grid params
-  gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
-  int halfSideCount = (int)(scene_scale / gridCellWidth) + 1;
-  gridSideCount = 2 * halfSideCount;
-
-  gridCellCount = gridSideCount * gridSideCount * gridSideCount;
-  gridInverseCellWidth = 1.0f / gridCellWidth;
-  float halfGridWidth = gridCellWidth * halfSideCount;
-  gridMinimum.x -= halfGridWidth;
-  gridMinimum.y -= halfGridWidth;
-  gridMinimum.z -= halfGridWidth;
-
-  // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
-  cudaThreadSynchronize();
+    numObjects = N;
+    dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
+
+    // LOOK-1.2 - This is basic CUDA memory management and error checking.
+    // Don't forget to cudaFree in Boids::endSimulation.
+    cudaMalloc((void**)&dev_pos, N * sizeof(glm::vec3));
+    checkCUDAErrorWithLine("cudaMalloc dev_pos failed!");
+
+    cudaMalloc((void**)&dev_vel1, N * sizeof(glm::vec3));
+    checkCUDAErrorWithLine("cudaMalloc dev_vel1 failed!");
+
+    cudaMalloc((void**)&dev_vel2, N * sizeof(glm::vec3));
+    checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!");
+
+    // LOOK-1.2 - This is a typical CUDA kernel invocation.
+    kernGenerateRandomPosArray<<<fullBlocksPerGrid, blockSize>>>(1, numObjects,
+        dev_pos, scene_scale);
+    checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
+
+    // LOOK-2.1 computing grid params
+    gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
+    int halfSideCount = (int)(scene_scale / gridCellWidth) + 1;
+    gridSideCount = 2 * halfSideCount;
+
+    gridCellCount = gridSideCount * gridSideCount * gridSideCount;
+    gridInverseCellWidth = 1.0f / gridCellWidth;
+    float halfGridWidth = gridCellWidth * halfSideCount;
+    gridMinimum.x -= halfGridWidth;
+    gridMinimum.y -= halfGridWidth;
+    gridMinimum.z -= halfGridWidth;
+
+    // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
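+    // The per-boid index tables below need N entries; the cell start/end
+    // tables need one entry per grid cell (gridCellCount), not per boid.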
+
+    cudaMalloc((void**)&dev_pos_buff, N * sizeof(glm::vec3));
+    checkCUDAErrorWithLine("cudaMalloc dev_pos_buff failed!");
+    cudaMalloc((void**)&dev_vel1_buff, N * sizeof(glm::vec3));
+    checkCUDAErrorWithLine("cudaMalloc dev_vel1_buff failed!");
+    cudaMalloc((void**)&dev_vel2_buff, N * sizeof(glm::vec3));
+    checkCUDAErrorWithLine("cudaMalloc dev_vel2_buff failed!");
+
+    cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc-dev_particleArrayIndices-failed!");
+    cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc-dev_particleGridIndices-failed!");
+    cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc-dev_gridCellStartIndices-failed!");
+    cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc-dev_gridCellEndIndices-failed!");
+    dev_thrust_particleArrayIndices = thrust::device_pointer_cast(dev_particleArrayIndices); // boid idx table
+    dev_thrust_particleGridIndices = thrust::device_pointer_cast(dev_particleGridIndices);   // boid grid idx table
+
+    cudaThreadSynchronize();
 }
 
@@ -181,41 +207,41 @@ void Boids::initSimulation(int N) {
 * Copy the boid positions into the VBO so that they can be drawn by OpenGL.
 */
 __global__ void kernCopyPositionsToVBO(int N, glm::vec3 *pos, float *vbo, float s_scale) {
-  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
 
-  float c_scale = -1.0f / s_scale;
+    float c_scale = -1.0f / s_scale;
 
-  if (index < N) {
-    vbo[4 * index + 0] = pos[index].x * c_scale;
-    vbo[4 * index + 1] = pos[index].y * c_scale;
-    vbo[4 * index + 2] = pos[index].z * c_scale;
-    vbo[4 * index + 3] = 1.0f;
-  }
+    if (index < N) {
+        vbo[4 * index + 0] = pos[index].x * c_scale;
+        vbo[4 * index + 1] = pos[index].y * c_scale;
+        vbo[4 * index + 2] = pos[index].z * c_scale;
+        vbo[4 * index + 3] = 1.0f;
+    }
 }
 
 __global__ void kernCopyVelocitiesToVBO(int N, glm::vec3 *vel, float *vbo, float s_scale) {
-  int index = threadIdx.x + (blockIdx.x * blockDim.x);
-
-  if (index < N) {
-    vbo[4 * index + 0] = vel[index].x + 0.3f;
-    vbo[4 * index + 1] = vel[index].y + 0.3f;
-    vbo[4 * index + 2] = vel[index].z + 0.3f;
-    vbo[4 * index + 3] = 1.0f;
-  }
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+    if (index < N) {
+        vbo[4 * index + 0] = vel[index].x + 0.3f;
+        vbo[4 * index + 1] = vel[index].y + 0.3f;
+        vbo[4 * index + 2] = vel[index].z + 0.3f;
+        vbo[4 * index + 3] = 1.0f;
+    }
 }
 
 /**
 * Wrapper for call to the kernCopyboidsToVBO CUDA kernel.
 */
 void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) {
-  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+    dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
 
-  kernCopyPositionsToVBO<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, vbodptr_positions, scene_scale);
-  kernCopyVelocitiesToVBO<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_vel1, vbodptr_velocities, scene_scale);
+    kernCopyPositionsToVBO<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, vbodptr_positions, scene_scale);
+    kernCopyVelocitiesToVBO<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_vel1, vbodptr_velocities, scene_scale);
 
-  checkCUDAErrorWithLine("copyBoidsToVBO failed!");
+    checkCUDAErrorWithLine("copyBoidsToVBO failed!");
 
-  cudaThreadSynchronize();
+    cudaThreadSynchronize();
 }
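+// Note: cudaThreadSynchronize() is deprecated in newer CUDA toolkits;
+// cudaDeviceSynchronize() is the drop-in replacement.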
 
@@ -230,10 +256,47 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities)
 * in the `pos` and `vel` arrays.
 */
 __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) {
-  // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
-  // Rule 2: boids try to stay a distance d away from each other
-  // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+
+    glm::vec3 pvj(0.0f, 0.0f, 0.0f); // perceived average velocity (rule 3)
+    glm::vec3 pcj(0.0f, 0.0f, 0.0f); // perceived center of mass (rule 1)
+    glm::vec3 v2(0.0f, 0.0f, 0.0f);
+    glm::vec3 v3(0.0f, 0.0f, 0.0f);
+    //int cnt1, cnt2, cnt3 = 0; // THIS IS A BUG!!!!! (only cnt3 is initialized)
+    float cnt1 = 0; float cnt2 = 0; float cnt3 = 0;
+    for (int i = 0; i < N; i++) {
+        if (i == iSelf) continue;
+        float dist = glm::distance(pos[i], pos[iSelf]);
+        // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+        if (dist < rule1Distance) { pcj += pos[i]; cnt1++; }
+        // Rule 2: boids try to stay a distance d away from each other
+        if (dist < rule2Distance) { v2 -= (pos[i] - pos[iSelf]); cnt2++; }
+        // Rule 3: boids try to match the velocity of surrounding boids
+        if (dist < rule3Distance) { pvj += vel[i]; cnt3++; }
+    }
+    glm::vec3 v1(0.0f, 0.0f, 0.0f);
+    if (cnt1 > 0) { v1 = (pcj / cnt1 - pos[iSelf]) * rule1Scale; }
+    v2 *= rule2Scale;
+    if (cnt3 > 0) { v3 = (pvj / cnt3) * rule3Scale; }
+    return v1 + v2 + v3;
 }
 
 /**
 * TODO-1.2 implement basic flocking
 * For each of the `N` bodies, update its velocity, i.e. compute a new
 * velocity based on its current velocity and the boids rules.
 */
 __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
   glm::vec3 *vel1, glm::vec3 *vel2) {
-  // Compute a new velocity based on pos and vel1
-  // Clamp the speed
-  // Record the new velocity into vel2. Question: why NOT vel1?
+    int id = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (id >= N) return;
+    glm::vec3 val(0.0, 0.0, 0.0);
+    val += computeVelocityChange(N, id, pos, vel1) + vel1[id];
+    // Clamp the speed
+    if (glm::length(val) > maxSpeed) {
+        val = glm::normalize(val) * maxSpeed;
+    }
+    vel2[id] = val;
 }
 
-/**
-* LOOK-1.2 Since this is pretty trivial, we implemented it for you.
-* For each of the `N` bodies, update its position based on its current velocity.
-*/
 __global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) {
-  // Update position by velocity
-  int index = threadIdx.x + (blockIdx.x * blockDim.x);
-  if (index >= N) {
-    return;
-  }
-  glm::vec3 thisPos = pos[index];
-  thisPos += vel[index] * dt;
-
-  // Wrap the boids around so we don't lose them
-  thisPos.x = thisPos.x < -scene_scale ? scene_scale : thisPos.x;
-  thisPos.y = thisPos.y < -scene_scale ? scene_scale : thisPos.y;
-  thisPos.z = thisPos.z < -scene_scale ? scene_scale : thisPos.z;
-
-  thisPos.x = thisPos.x > scene_scale ? -scene_scale : thisPos.x;
-  thisPos.y = thisPos.y > scene_scale ? -scene_scale : thisPos.y;
-  thisPos.z = thisPos.z > scene_scale ? -scene_scale : thisPos.z;
-
-  pos[index] = thisPos;
+    // Update position by velocity
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (index >= N) {
+        return;
+    }
+    glm::vec3 thisPos = pos[index];
+    thisPos += vel[index] * dt;
+
+    // Wrap the boids around so we don't lose them
+    thisPos.x = thisPos.x < -scene_scale ? scene_scale : thisPos.x;
+    thisPos.y = thisPos.y < -scene_scale ? scene_scale : thisPos.y;
+    thisPos.z = thisPos.z < -scene_scale ? scene_scale : thisPos.z;
+
+    thisPos.x = thisPos.x > scene_scale ? -scene_scale : thisPos.x;
+    thisPos.y = thisPos.y > scene_scale ? -scene_scale : thisPos.y;
+    thisPos.z = thisPos.z > scene_scale ? -scene_scale : thisPos.z;
+
+    pos[index] = thisPos;
 }
 
 // LOOK-2.1 Consider this method of computing a 1D index from a 3D grid index.
@@ -277,183 +345,426 @@ __global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) {
 // order for iterating over neighboring grid cells?
 // for(x)
 //   for(y)
-//     for(z)? Or some other order?
+//     for(z)? Or some other order? -----cool!
 __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) {
-  return x + y * gridResolution + z * gridResolution * gridResolution;
+    return x + y * gridResolution + z * gridResolution * gridResolution;
 }
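+// Example: with gridResolution = 10, cell (x, y, z) = (2, 3, 4) maps to
+// 2 + 3*10 + 4*100 = 432. x varies fastest, so cells adjacent in x are
+// also adjacent in the 1D (and sorted) ordering.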
 
 __global__ void kernComputeIndices(int N, int gridResolution,
-  glm::vec3 gridMin, float inverseCellWidth,
-  glm::vec3 *pos, int *indices, int *gridIndices) {
-    // TODO-2.1
-    // - Label each boid with the index of its grid cell.
-    // - Set up a parallel array of integer indices as pointers to the actual
-    //   boid data in pos and vel1/vel2
+    glm::vec3 gridMin, float inverseCellWidth,
+    glm::vec3 *pos, int *indices, int *gridIndices) {
+    // TODO-2.1
+    // - Label each boid with the index of its grid cell.
+    // - Set up a parallel array of integer indices as pointers to the actual
+    //   boid data in pos and vel1/vel2
+
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= N) {
+        return;
+    }
+    //int xi = std::floor((pos[index].x - gridMin.x) * inverseCellWidth);
+    //int yi = std::floor((pos[index].y - gridMin.y) * inverseCellWidth);
+    //int zi = std::floor((pos[index].z - gridMin.z) * inverseCellWidth);
+    glm::ivec3 xyzi = (pos[index] - gridMin) * inverseCellWidth;
+    int gid = gridIndex3Dto1D(xyzi.x, xyzi.y, xyzi.z, gridResolution);
+    gridIndices[index] = gid; // label each boid with the index of its grid cell
+    indices[index] = index;   // parallel array of indices into pos and vel1/vel2
 }
 
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
 // does not enclose any boids
 __global__ void kernResetIntBuffer(int N, int *intBuffer, int value) {
-  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
-  if (index < N) {
-    intBuffer[index] = value;
-  }
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index < N) {
+        intBuffer[index] = value;
+    }
 }
 
 __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
-  int *gridCellStartIndices, int *gridCellEndIndices) {
-  // TODO-2.1
-  // Identify the start point of each cell in the gridIndices array.
-  // This is basically a parallel unrolling of a loop that goes
-  // "this index doesn't match the one before it, must be a new cell!"
+    int *gridCellStartIndices, int *gridCellEndIndices) {
+    // TODO-2.1
+    // Identify the start point of each cell in the gridIndices array.
+    // This is basically a parallel unrolling of a loop that goes
+    // "this index doesn't match the one before it, must be a new cell!"
+    int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (idx >= N) return;
+    if (idx >= 1) {
+        if (particleGridIndices[idx - 1] != particleGridIndices[idx]) {
+            gridCellEndIndices[particleGridIndices[idx - 1]] = idx - 1; // by default -1
+            gridCellStartIndices[particleGridIndices[idx]] = idx;       // by default -1
+        }
+    } else {
+        gridCellStartIndices[particleGridIndices[idx]] = idx; // by default -1
+    }
+    if (idx == N - 1) {
+        gridCellEndIndices[particleGridIndices[idx]] = idx; // the last cell needs an explicit (inclusive) end
+    }
 }
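+// Example: for sorted particleGridIndices = [4, 4, 6, 6, 6, 9] this yields
+// start[4]=0, end[4]=1, start[6]=2, end[6]=4, start[9]=5, end[9]=5 (end is
+// inclusive); every other cell keeps the -1 written by kernResetIntBuffer.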
 
 __global__ void kernUpdateVelNeighborSearchScattered(
-  int N, int gridResolution, glm::vec3 gridMin,
-  float inverseCellWidth, float cellWidth,
-  int *gridCellStartIndices, int *gridCellEndIndices,
-  int *particleArrayIndices,
-  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
-  // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
-  // the number of boids that need to be checked.
-  // - Identify the grid cell that this particle is in
-  // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
-  // - Clamp the speed change before putting the new speed in vel2
+    int N, int gridResolution, glm::vec3 gridMin,
+    float inverseCellWidth, float cellWidth,
+    int *gridCellStartIndices, int *gridCellEndIndices,
+    int *particleArrayIndices,
+    glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
+    // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
+    // the number of boids that need to be checked.
+    // - Identify the grid cell that this particle is in
+    int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx >= N) return;
+
+    // - Identify which cells may contain neighbors. This isn't always 8.
+    int pi = particleArrayIndices[idx]; // index into the unsorted boid data
+
+    glm::ivec3 cid0 = (pos[pi] - gridMin) * inverseCellWidth;
+    cid0 = cid0 - 1;
+    cid0.x = imax(cid0.x, 0);
+    cid0.y = imax(cid0.y, 0);
+    cid0.z = imax(cid0.z, 0);
+
+    glm::vec3 pvj(0.0f, 0.0f, 0.0f); // perceived average velocity (rule 3)
+    glm::vec3 pcj(0.0f, 0.0f, 0.0f); // perceived center of mass (rule 1)
+    glm::vec3 v2(0.0f, 0.0f, 0.0f);
+    glm::vec3 v3(0.0f, 0.0f, 0.0f);
+    float cnt1 = 0; float cnt2 = 0; float cnt3 = 0;
+    // - For each cell, read the start/end indices in the boid pointer array.
+    //   PS: assume sequential (sorted already)
+    for (int i = 0; i < 2; i++) {
+        for (int j = 0; j < 2; j++) {
+            for (int k = 0; k < 2; k++) {
+                //if (i==1&&j==1&&k==1) continue;
+                if (cid0.x + i < gridResolution && cid0.y + j < gridResolution && cid0.z + k < gridResolution) {
+                    int cid = gridIndex3Dto1D(cid0.x + i, cid0.y + j, cid0.z + k, gridResolution);
+                    int start = gridCellStartIndices[cid];
+                    if (start < 0) continue; // empty cell
+                    int end = gridCellEndIndices[cid];
+                    // - Access each boid in the cell and compute velocity change from
+                    //   the boids rules, if this boid is within the neighborhood distance.
+                    for (int m = start; m <= end; m++) {
+                        int bi = particleArrayIndices[m];
+                        if (bi == pi) continue;
+                        float dist = glm::distance(pos[bi], pos[pi]);
+                        if (dist < rule1Distance) { pcj += pos[bi]; cnt1++; }
+                        if (dist < rule2Distance) { v2 -= (pos[bi] - pos[pi]); cnt2++; }
+                        if (dist < rule3Distance) { pvj += vel1[bi]; cnt3++; }
+                    }
+                }
+            }
+        }
+    }
+    glm::vec3 v1(0.0f, 0.0f, 0.0f);
+    if (cnt1 > 0) { v1 = (pcj / cnt1 - pos[pi]) * rule1Scale; }
+    v2 *= rule2Scale;
+    if (cnt3 > 0) { v3 = (pvj / cnt3) * rule3Scale; }
+    glm::vec3 val = vel1[pi] + v1 + v2 + v3;
+    if (glm::length(val) > maxSpeed) {
+        val = glm::normalize(val) * maxSpeed;
+    }
+    vel2[pi] = val;
+    // - Clamp the speed change before putting the new speed in vel2
 }
 
+__global__ void makeContiguous(int N, glm::vec3 *buff_mess, glm::vec3 *buff_good, int *particleArrayIndices) {
+    int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx >= N) return;
+    buff_good[idx] = buff_mess[particleArrayIndices[idx]];
+}
+__global__ void undo_makeContiguous(int N, glm::vec3 *buff_mess, glm::vec3 *buff_good, int *particleArrayIndices) {
+    int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx >= N) return;
+    buff_mess[particleArrayIndices[idx]] = buff_good[idx];
+}
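+// makeContiguous is a gather (buff_good[i] = buff_mess[indices[i]]) into
+// sorted-cell order; undo_makeContiguous is the matching scatter, so running
+// one after the other restores the original boid ordering.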
 
 __global__ void kernUpdateVelNeighborSearchCoherent(
-  int N, int gridResolution, glm::vec3 gridMin,
-  float inverseCellWidth, float cellWidth,
-  int *gridCellStartIndices, int *gridCellEndIndices,
-  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
-  // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered,
-  // except with one less level of indirection.
-  // This should expect gridCellStartIndices and gridCellEndIndices to refer
-  // directly to pos and vel1.
-  // - Identify the grid cell that this particle is in
-  // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  //   DIFFERENCE: For best results, consider what order the cells should be
-  //   checked in to maximize the memory benefits of reordering the boids data.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
-  // - Clamp the speed change before putting the new speed in vel2
+    int N, int gridResolution, glm::vec3 gridMin,
+    float inverseCellWidth, float cellWidth,
+    int *gridCellStartIndices, int *gridCellEndIndices,
+    glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
+    // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered,
+    // except with one less level of indirection.
+    // This should expect gridCellStartIndices and gridCellEndIndices to refer
+    // directly to pos and vel1.
+    int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx >= N) return;
+    // - Identify the grid cell that this particle is in
+    /*glm::ivec3 gid3 = (pos[idx] - gridMin) * inverseCellWidth;
+    int gid = gridIndex3Dto1D(gid3.x, gid3.y, gid3.z, gridResolution);*/
+    glm::ivec3 cid0 = (pos[idx] - gridMin) * inverseCellWidth;
+    cid0 = cid0 - 1;
+    cid0.x = imax(cid0.x, 0);
+    cid0.y = imax(cid0.y, 0);
+    cid0.z = imax(cid0.z, 0);
+
+    glm::vec3 pvj(0.0f, 0.0f, 0.0f); // perceived average velocity (rule 3)
+    glm::vec3 pcj(0.0f, 0.0f, 0.0f); // perceived center of mass (rule 1)
+    glm::vec3 v2(0.0f, 0.0f, 0.0f);
+    glm::vec3 v3(0.0f, 0.0f, 0.0f);
+    float cnt1 = 0; float cnt2 = 0; float cnt3 = 0;
+    // - For each cell, read the start/end indices and access the boid data
+    //   directly (pos/vel1 were reshuffled into sorted-cell order).
+    for (int i = 0; i < 2; i++) {
+        for (int j = 0; j < 2; j++) {
+            for (int k = 0; k < 2; k++) {
+                //if (i==1&&j==1&&k==1) continue;
+                if (cid0.x + i < gridResolution && cid0.y + j < gridResolution && cid0.z + k < gridResolution) {
+                    int cid = gridIndex3Dto1D(cid0.x + i, cid0.y + j, cid0.z + k, gridResolution);
+                    int start = gridCellStartIndices[cid];
+                    if (start < 0) continue; // empty cell
+                    int end = gridCellEndIndices[cid];
+                    for (int m = start; m <= end; m++) {
+                        if (m == idx) continue;
+                        float dist = glm::distance(pos[m], pos[idx]);
+                        if (dist < rule1Distance) { pcj += pos[m]; cnt1++; }
+                        if (dist < rule2Distance) { v2 -= (pos[m] - pos[idx]); cnt2++; }
+                        if (dist < rule3Distance) { pvj += vel1[m]; cnt3++; }
+                    }
+                }
+            }
+        }
+    }
+    glm::vec3 v1(0.0f, 0.0f, 0.0f);
+    if (cnt1 > 0) { v1 = (pcj / cnt1 - pos[idx]) * rule1Scale; }
+    v2 *= rule2Scale;
+    if (cnt3 > 0) { v3 = (pvj / cnt3) * rule3Scale; }
+    glm::vec3 val = vel1[idx] + v1 + v2 + v3;
+    if (glm::length(val) > maxSpeed) {
+        val = glm::normalize(val) * maxSpeed;
+    }
+    vel2[idx] = val;
+    // - Clamp the speed change before putting the new speed in vel2
 }
 
 /**
 * Step the entire N-body simulation by `dt` seconds.
 */
 void Boids::stepSimulationNaive(float dt) {
-  // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
-  // TODO-1.2 ping-pong the velocity buffers
+    dim3 N_blocks(std::ceil((float)numObjects / blockSize));
+    // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
+    kernUpdateVelocityBruteForce<<<N_blocks, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+    checkCUDAErrorWithLine("stepSimulationNaive-kernUpdateVelocityBruteForce failed!");
+    kernUpdatePos<<<N_blocks, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+    checkCUDAErrorWithLine("stepSimulationNaive-kernUpdatePos failed!");
+    // TODO-1.2 ping-pong the velocity buffers
+    // Swap, don't assign: dev_vel1 = dev_vel2 would make both names alias one buffer.
+    glm::vec3 *tmp = dev_vel1; dev_vel1 = dev_vel2; dev_vel2 = tmp;
 }
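+// Ping-pong note: exchanging the two device pointers is O(1); this frame's
+// vel2 becomes next frame's vel1, and the old vel1 buffer is reused as the
+// next frame's output.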
 
 void Boids::stepSimulationScatteredGrid(float dt) {
-  // TODO-2.1
-  // Uniform Grid Neighbor search using Thrust sort.
-  // In Parallel:
-  // - label each particle with its array index as well as its grid index.
-  //   Use 2x width grids.
-  // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
-  //   are welcome to do a performance comparison.
-  // - Naively unroll the loop for finding the start and end indices of each
-  //   cell's data pointers in the array of boid indices
-  // - Perform velocity updates using neighbor search
-  // - Update positions
-  // - Ping-pong buffers as needed
+    // TODO-2.1
+    // Uniform Grid Neighbor search using Thrust sort.
+    // In Parallel:
+    // - label each particle with its array index as well as its grid index.
+    dim3 N_blocks(std::ceil((float)numObjects / blockSize));
+    dim3 N_blocks_grids(std::ceil((float)gridCellCount / blockSize));
+
+    //kernComputeIndices(int N, int gridResolution, glm::vec3 gridMin, float inverseCellWidth, glm::vec3 *pos, int *indices, int *gridIndices)
+    kernComputeIndices<<<N_blocks, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+    checkCUDAErrorWithLine("kernComputeIndices-failed!");
+    // Use 2x width grids.
+    // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
+    //   are welcome to do a performance comparison.
+    // needed for use with thrust
+    // LOOK-2.1 Example for using thrust::sort_by_key
+    // sort the boid array indices by grid cell index
+    thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+    // - Naively unroll the loop for finding the start and end indices of each
+    //   cell's data pointers in the array of boid indices
+    //kernResetIntBuffer(int N, int *intBuffer, int value)
+    //kernResetIntBuffer<<<N_blocks, blockSize>>>(numObjects, dev_gridCellStartIndices, -1); // BUG: must reset one entry per cell, not per boid
+    kernResetIntBuffer<<<N_blocks_grids, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1); // use -1 to indicate "not occupied"
+    checkCUDAErrorWithLine("kernResetIntBuffer-dev_gridCellStartIndices-failed!");
+    kernResetIntBuffer<<<N_blocks_grids, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1); // use -1 to indicate "not occupied"
+    checkCUDAErrorWithLine("kernResetIntBuffer-dev_gridCellEndIndices-failed!");
+    kernIdentifyCellStartEnd<<<N_blocks, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+    checkCUDAErrorWithLine("kernIdentifyCellStartEnd-failed!");
+    // - Perform velocity updates using neighbor search
+    kernUpdateVelNeighborSearchScattered<<<N_blocks, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices,
+        dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2);
+    checkCUDAErrorWithLine("kernUpdateVelNeighborSearchScattered-failed!");
+    // - Update positions
+    kernUpdatePos<<<N_blocks, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+    // - Ping-pong buffers as needed
+    // Swap, don't assign (see stepSimulationNaive).
+    glm::vec3 *tmp = dev_vel1; dev_vel1 = dev_vel2; dev_vel2 = tmp;
 }
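+// Per-frame pipeline (scattered): label -> sort_by_key -> reset(-1) ->
+// identify cell start/end -> neighbor-search velocity update -> position
+// update -> ping-pong. The coherent path below additionally gathers
+// (makeContiguous) before the update and scatters back (undo_makeContiguous)
+// after it.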
 
 void Boids::stepSimulationCoherentGrid(float dt) {
-  // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid
-  // Uniform Grid Neighbor search using Thrust sort on cell-coherent data.
-  // In Parallel:
-  // - Label each particle with its array index as well as its grid index.
-  //   Use 2x width grids
-  // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
-  //   are welcome to do a performance comparison.
-  // - Naively unroll the loop for finding the start and end indices of each
-  //   cell's data pointers in the array of boid indices
-  // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all
-  //   the particle data in the simulation array.
-  //   CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED
-  // - Perform velocity updates using neighbor search
-  // - Update positions
-  // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+    // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid
+    // Uniform Grid Neighbor search using Thrust sort on cell-coherent data.
+    // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all
+    //   the particle data in the simulation array.
+    // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+
+    dim3 N_blocks(std::ceil((float)numObjects / blockSize));
+    dim3 N_blocks_grids(std::ceil((float)gridCellCount / blockSize));
+    kernComputeIndices<<<N_blocks, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+    checkCUDAErrorWithLine("kernComputeIndices-failed!");
+
+    thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+    kernResetIntBuffer<<<N_blocks_grids, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1); // use -1 to indicate "not occupied"
+    checkCUDAErrorWithLine("kernResetIntBuffer-dev_gridCellStartIndices-failed!");
+    kernResetIntBuffer<<<N_blocks_grids, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1); // use -1 to indicate "not occupied"
+    checkCUDAErrorWithLine("kernResetIntBuffer-dev_gridCellEndIndices-failed!");
+
+    kernIdentifyCellStartEnd<<<N_blocks, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+    checkCUDAErrorWithLine("kernIdentifyCellStartEnd-failed!");
+
+    //dev_pos_buff=dev_pos;
+    //dev_vel1_buff=dev_vel1;
+    //dev_vel2_buff=dev_vel2;
+    makeContiguous<<<N_blocks, blockSize>>>(numObjects, dev_pos, dev_pos_buff, dev_particleArrayIndices);
+    makeContiguous<<<N_blocks, blockSize>>>(numObjects, dev_vel1, dev_vel1_buff, dev_particleArrayIndices);
+    makeContiguous<<<N_blocks, blockSize>>>(numObjects, dev_vel2, dev_vel2_buff, dev_particleArrayIndices);
+
+    //kernUpdateVelNeighborSearchCoherent<<<N_blocks, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_pos, dev_vel1, dev_vel2);
+    kernUpdateVelNeighborSearchCoherent<<<N_blocks, blockSize>>>(numObjects, gridSideCount,
+        gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices,
+        dev_pos_buff, dev_vel1_buff, dev_vel2_buff);
+    checkCUDAErrorWithLine("kernUpdateVelNeighborSearchCoherent-failed!");
+
+    kernUpdatePos<<<N_blocks, blockSize>>>(numObjects, dt, dev_pos_buff, dev_vel2_buff);
+
+    //dev_vel1=dev_vel2_buff;
+    //dev_pos=dev_pos_buff;
+
+    //std::swap(dev_pos, dev_pos_buff);
+    //std::swap(dev_vel1, dev_vel2_buff);
+    undo_makeContiguous<<<N_blocks, blockSize>>>(numObjects, dev_pos, dev_pos_buff, dev_particleArrayIndices);   // scatter updated positions back
+    undo_makeContiguous<<<N_blocks, blockSize>>>(numObjects, dev_vel1, dev_vel2_buff, dev_particleArrayIndices); // new velocities become vel1 (ping-pong)
+    undo_makeContiguous<<<N_blocks, blockSize>>>(numObjects, dev_vel2, dev_vel2_buff, dev_particleArrayIndices);
 }
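+// Design note: a cheaper alternative (left commented out above) is to
+// std::swap(dev_pos, dev_pos_buff) and std::swap(dev_vel1, dev_vel2_buff)
+// instead of scattering everything back with undo_makeContiguous -- at the
+// cost of the arrays no longer staying in original boid order between frames.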
 
 void Boids::endSimulation() {
-  cudaFree(dev_vel1);
-  cudaFree(dev_vel2);
-  cudaFree(dev_pos);
-
-  // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+    cudaFree(dev_vel1);
+    cudaFree(dev_vel2);
+    cudaFree(dev_pos);
+
+    // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+    cudaFree(dev_particleArrayIndices);
+    cudaFree(dev_particleGridIndices);
+    cudaFree(dev_gridCellStartIndices);
+    cudaFree(dev_gridCellEndIndices);
+
+    cudaFree(dev_pos_buff);
+    cudaFree(dev_vel1_buff);
+    cudaFree(dev_vel2_buff);
 }
 
 void Boids::unitTest() {
-  // LOOK-1.2 Feel free to write additional tests here.
-
-  // test unstable sort
-  int *dev_intKeys;
-  int *dev_intValues;
-  int N = 10;
-
-  int *intKeys = new int[N];
-  int *intValues = new int[N];
-
-  intKeys[0] = 0; intValues[0] = 0;
-  intKeys[1] = 1; intValues[1] = 1;
-  intKeys[2] = 0; intValues[2] = 2;
-  intKeys[3] = 3; intValues[3] = 3;
-  intKeys[4] = 0; intValues[4] = 4;
-  intKeys[5] = 2; intValues[5] = 5;
-  intKeys[6] = 2; intValues[6] = 6;
-  intKeys[7] = 0; intValues[7] = 7;
-  intKeys[8] = 5; intValues[8] = 8;
-  intKeys[9] = 6; intValues[9] = 9;
-
-  cudaMalloc((void**)&dev_intKeys, N * sizeof(int));
-  checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!");
-
-  cudaMalloc((void**)&dev_intValues, N * sizeof(int));
-  checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!");
-
-  dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
-
-  std::cout << "before unstable sort: " << std::endl;
-  for (int i = 0; i < N; i++) {
-    std::cout << " key: " << intKeys[i];
-    std::cout << " value: " << intValues[i] << std::endl;
-  }
-
-  // How to copy data to the GPU
-  cudaMemcpy(dev_intKeys, intKeys, sizeof(int) * N, cudaMemcpyHostToDevice);
-  cudaMemcpy(dev_intValues, intValues, sizeof(int) * N, cudaMemcpyHostToDevice);
-
-  // Wrap device vectors in thrust iterators for use with thrust.
-  thrust::device_ptr<int> dev_thrust_keys(dev_intKeys);
-  thrust::device_ptr<int> dev_thrust_values(dev_intValues);
-  // LOOK-2.1 Example for using thrust::sort_by_key
-  thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values);
-
-  // How to copy data back to the CPU side from the GPU
-  cudaMemcpy(intKeys, dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost);
-  cudaMemcpy(intValues, dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost);
-  checkCUDAErrorWithLine("memcpy back failed!");
-
-  std::cout << "after unstable sort: " << std::endl;
-  for (int i = 0; i < N; i++) {
-    std::cout << " key: " << intKeys[i];
-    std::cout << " value: " << intValues[i] << std::endl;
-  }
-
-  // cleanup
-  delete[] intKeys;
-  delete[] intValues;
-  cudaFree(dev_intKeys);
-  cudaFree(dev_intValues);
-  checkCUDAErrorWithLine("cudaFree failed!");
-  return;
+    // LOOK-1.2 Feel free to write additional tests here.
+
+    // test unstable sort
+    int *dev_intKeys;
+    int *dev_intValues;
+    int N = 10;
+
+    int *intKeys = new int[N];
+    int *intValues = new int[N];
+
+    intKeys[0] = 0; intValues[0] = 0;
+    intKeys[1] = 1; intValues[1] = 1;
+    intKeys[2] = 0; intValues[2] = 2;
+    intKeys[3] = 3; intValues[3] = 3;
+    intKeys[4] = 0; intValues[4] = 4;
+    intKeys[5] = 2; intValues[5] = 5;
+    intKeys[6] = 2; intValues[6] = 6;
+    intKeys[7] = 0; intValues[7] = 7;
+    intKeys[8] = 5; intValues[8] = 8;
+    intKeys[9] = 6; intValues[9] = 9;
+
+    cudaMalloc((void**)&dev_intKeys, N * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!");
+
+    cudaMalloc((void**)&dev_intValues, N * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!");
+
+    dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
+
+    std::cout << "before unstable sort: " << std::endl;
+    for (int i = 0; i < N; i++) {
+        std::cout << " key: " << intKeys[i];
+        std::cout << " value: " << intValues[i] << std::endl;
+    }
+
+    // How to copy data to the GPU
+    cudaMemcpy(dev_intKeys, intKeys, sizeof(int) * N, cudaMemcpyHostToDevice);
+    cudaMemcpy(dev_intValues, intValues, sizeof(int) * N, cudaMemcpyHostToDevice);
+
+    // Wrap device vectors in thrust iterators for use with thrust.
+    thrust::device_ptr<int> dev_thrust_keys(dev_intKeys);
+    thrust::device_ptr<int> dev_thrust_values(dev_intValues);
+    // LOOK-2.1 Example for using thrust::sort_by_key
+    thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values);
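+    // With the keys above, the sorted keys come out as {0,0,0,0,1,2,2,3,5,6};
+    // the values paired with key 0 are {0,2,4,7} in some order --
+    // sort_by_key is unstable, so ties may be permuted arbitrarily.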
+
+    // How to copy data back to the CPU side from the GPU
+    cudaMemcpy(intKeys, dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost);
+    cudaMemcpy(intValues, dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost);
+    checkCUDAErrorWithLine("memcpy back failed!");
+
+    std::cout << "after unstable sort: " << std::endl;
+    for (int i = 0; i < N; i++) {
+        std::cout << " key: " << intKeys[i];
+        std::cout << " value: " << intValues[i] << std::endl;
+    }
+
+    // cleanup
+    delete[] intKeys;
+    delete[] intValues;
+    cudaFree(dev_intKeys);
+    cudaFree(dev_intValues);
+    checkCUDAErrorWithLine("cudaFree failed!");
+    return;
 }
diff --git a/src/main.cpp b/src/main.cpp
index e416836..87bdb35 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -5,17 +5,18 @@
 * @date 2013-2016
 * @copyright University of Pennsylvania
 */
-
+//#include "stdafx.h"
 #include "main.hpp"
-
+#include <iostream>
+#include
 // ================
 // Configuration
 // ================
 
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
 #define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
 const int N_FOR_VIS = 5000;
@@ -32,7 +33,7 @@ int main(int argc, char* argv[]) {
     Boids::endSimulation();
     return 0;
   } else {
-    return 1;
+      return 1;
   }
 }
@@ -219,14 +220,15 @@ void initShaders(GLuint * program) {
   double fps = 0;
   double timebase = 0;
   int frame = 0;
-
+  int totalframe = 0;
+  double time0 = glfwGetTime();
   Boids::unitTest(); // LOOK-1.2 We run some basic example code to make sure
-  // your CUDA development setup is ready to go.
-
+  // your CUDA development setup is ready to go.
   while (!glfwWindowShouldClose(window)) {
     glfwPollEvents();
     frame++;
+    totalframe++;
     double time = glfwGetTime();
 
     if (time - timebase > 1.0) {
@@ -235,6 +237,10 @@
       frame = 0;
     }
 
+    //if ((time - time0) > 10) {
+    //  std::cout << (double)(time - time0) / totalframe << std::endl;
+    //}
+
diff --git a/src/main.hpp b/src/main.hpp
--- a/src/main.hpp
+++ b/src/main.hpp
@@ ... @@
 #include "utilityCore.hpp"
 #include "glslUtility.hpp"
+//#include "stdafx.h"
 #include "kernel.h"
+
 //====================================
 // GL Stuff
 //====================================
diff --git a/src/stdafx.cpp b/src/stdafx.cpp
new file mode 100644
index 0000000..1577c4e
--- /dev/null
+++ b/src/stdafx.cpp
@@ -0,0 +1 @@
+#include "stdafx.h"
\ No newline at end of file
diff --git a/src/stdafx.h b/src/stdafx.h
new file mode 100644
index 0000000..331ffd8
--- /dev/null
+++ b/src/stdafx.h
@@ -0,0 +1,13 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "utilityCore.hpp"
+#include "glslUtility.hpp"
\ No newline at end of file