diff --git a/Common/include/CConfig.hpp b/Common/include/CConfig.hpp index cb5a607c9df0..eb96b5a973a7 100644 --- a/Common/include/CConfig.hpp +++ b/Common/include/CConfig.hpp @@ -521,6 +521,7 @@ class CConfig { Kind_Gradient_Method_Recon, /*!< \brief Numerical method for computation of spatial gradients used for upwind reconstruction. */ Kind_Deform_Linear_Solver, /*!< Numerical method to deform the grid */ Kind_Deform_Linear_Solver_Prec, /*!< \brief Preconditioner of the linear solver. */ + Kind_Graph_Part_Algo, /*!< \brief Algorithm for parallel partitioning of the matrix graph. */ Kind_Linear_Solver, /*!< \brief Numerical solver for the implicit scheme. */ Kind_Linear_Solver_Prec, /*!< \brief Preconditioner of the linear solver. */ Kind_DiscAdj_Linear_Solver, /*!< \brief Linear solver for the discrete adjoint system. */ @@ -634,6 +635,8 @@ class CConfig { unsigned long Linear_Solver_Restart_Frequency; /*!< \brief Restart frequency of the linear solver for the implicit formulation. */ unsigned long Linear_Solver_Prec_Threads; /*!< \brief Number of threads per rank for ILU and LU_SGS preconditioners. */ unsigned short Linear_Solver_ILU_n; /*!< \brief ILU fill=in level. */ + unsigned short Cuda_Block_Size; /*!< \brief User-specified value for the X-Axis dimension of thread blocks + that are deployed by the CUDA Kernels. */ su2double SemiSpan; /*!< \brief Wing Semi span. */ su2double Roe_Kappa; /*!< \brief Relaxation of the Roe scheme. */ su2double Relaxation_Factor_Adjoint; /*!< \brief Relaxation coefficient for variable updates of adjoint solvers. */ @@ -4160,6 +4163,12 @@ class CConfig { */ bool GetLeastSquaresRequired(void) const { return LeastSquaresRequired; } + /*! + * \brief Get the type of algorithm used for partitioning the matrix graph. + * \return Algorithm that divides the matrix into partitions that are executed parallely. + */ + unsigned short GetKind_Graph_Part_Algo(void) const { return Kind_Graph_Part_Algo; } + /*! * \brief Get the kind of solver for the implicit solver. * \return Numerical solver for implicit formulation (solving the linear system). @@ -4221,6 +4230,18 @@ class CConfig { */ su2double GetLinear_Solver_Smoother_Relaxation(void) const { return Linear_Solver_Smoother_Relaxation; } + /*! + * \brief Get thread block dimensions (X-axis) being used by the CUDA Kernels. + * \return Thread block dimensions (X-axis) being used by the CUDA Kernels. + */ + unsigned short GetCuda_Block_Size(void) const { return Cuda_Block_Size; } + + /*! + * \brief Get the number of matrix rows assigned per CUDA Block. + * \return The number of matrix rows assigned per CUDA Block. + */ + unsigned short GetRows_Per_Cuda_Block(void) const { return cudaKernelParameters::rounded_up_division(cudaKernelParameters::CUDA_WARP_SIZE, Cuda_Block_Size); } + /*! * \brief Get the relaxation factor for solution updates of adjoint solvers. */ diff --git a/Common/include/geometry/CGeometry.hpp b/Common/include/geometry/CGeometry.hpp index a345f72c046f..100e9beeb3f5 100644 --- a/Common/include/geometry/CGeometry.hpp +++ b/Common/include/geometry/CGeometry.hpp @@ -260,6 +260,13 @@ class CGeometry { unsigned long* nPointCumulative{nullptr}; /*!< \brief Cumulative storage array containing the total number of points on all prior ranks in the linear partitioning. */ + unsigned long nPartition; /*!< \brief Number of divisions of the matrix graph during execution of parallel + partitioning algorithms. */ + unsigned long maxPartitionSize; /*!< \brief Size of the level with the maximum number of elements. 
*/ + vector + partitionOffsets; /*!< \brief Vector array containing the indices at which different parallel partitions begin. */ + vector chainPtr; /*!< \brief Vector array containing distribution of levels into chains. */ + /*--- Data structures for point-to-point MPI communications. ---*/ int maxCountPerPoint{0}; /*!< \brief Maximum number of pieces of data sent per vertex in point-to-point comms. */ diff --git a/Common/include/geometry/CPhysicalGeometry.hpp b/Common/include/geometry/CPhysicalGeometry.hpp index 1148471f1c3e..5ac21523ee25 100644 --- a/Common/include/geometry/CPhysicalGeometry.hpp +++ b/Common/include/geometry/CPhysicalGeometry.hpp @@ -148,6 +148,14 @@ class CPhysicalGeometry final : public CGeometry { */ void DistributeColoring(const CConfig* config, CGeometry* geometry); + /*! + * \brief Divide the graph produced by the matrix into parallel partitions. + * \param[in] config - Definition of the particular problem. + * \param[in] pointList - Ordered list of points in the mesh. + */ + template + void PartitionGraph(const CConfig* config, vector& pointList); + /*! * \brief Distribute the grid points, including ghost points, across all ranks based on a ParMETIS coloring. * \param[in] config - Definition of the particular problem. diff --git a/Common/include/linear_algebra/CGraphPartitioning.hpp b/Common/include/linear_algebra/CGraphPartitioning.hpp new file mode 100644 index 000000000000..44b7d74928c9 --- /dev/null +++ b/Common/include/linear_algebra/CGraphPartitioning.hpp @@ -0,0 +1,182 @@ +/*! + * \file CGraphPartitioning.hpp + * \brief Headers for the classes realted to the algorithms that are used + to divide the matrix acyclic graph into parallel partitions. + * \author A. Raj + * \version 8.2.0 "Harrier" + * + * SU2 Project Website: https://su2code.github.io + * + * The SU2 Project is maintained by the SU2 Foundation + * (http://su2foundation.org) + * + * Copyright 2012-2025, SU2 Contributors (cf. AUTHORS.md) + * + * SU2 is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * SU2 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with SU2. If not, see . + */ + +#pragma once + +#include "../geometry/CGeometry.hpp" + +/*! + * \class CGraphPartitioning + * \brief Abstract base class for defining graph partitioning algorithms. + * \author A. Raj + * + * In order to use certain parallel algorithms in the solution process - + * whether with linear solvers or preconditioners - we require the matrix + * to be partitioned into certain parallel divisions. These maybe in the form + * of levels, blocks, colors and so on. Since a number of different algorithms + * can be used to split the graph, we've introduced a base class containing the + * "Partition" member function from which child classes of the specific + * algorithm can be derived. Currently, we are only using direct declarations + * of the derived classes in the code. 
However, this method was chosen as it + * allows us to pass different child class algorithms to a single implementation + * of the function that requires it - similar to the CMatrixVectorProduct class. + */ + +template + +class CGraphPartitioning { + public: + virtual ~CGraphPartitioning() = 0; + virtual void Partition(vector& pointList, vector& partitionOffsets, + vector& chainPtr, unsigned short chainLimit) = 0; +}; +template +CGraphPartitioning::~CGraphPartitioning() {} + +template + +class CLevelScheduling final : public CGraphPartitioning { + private: + ScalarType nPointDomain; + CPoint* nodes; + + public: + ScalarType nLevels; + ScalarType maxLevelWidth; + vector levels; + + /*! + * \brief constructor of the class + * \param[in] nPointDomain_ref - number of points associated with the problem + * \param[in] nodes_ref - represents the relationships between the points + */ + inline CLevelScheduling(ScalarType nPointDomain_ref, CPoint* nodes_ref) + : nPointDomain(nPointDomain_ref), nodes(nodes_ref) { + nLevels = 0ul; + maxLevelWidth = 0ul; + } + + CLevelScheduling() = delete; // Removing default constructor + + /*! + * \brief Divides the levels into groups of chains depending on the preset GPU block and warp size. + * \param[in] levelOffsets - Represents the vector array containing the ordered list of starting rows of each level. + * \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain. + * \param[in] rowsPerBlock - Represents the maximum number of rows that can be accomodated per CUDA block. + */ + void CalculateChain(vector levelOffsets, vector& chainPtr, unsigned short rowsPerBlock) { + ScalarType levelWidth = 0; + + /*This is not a magic number. We are simply initializing + the point array with its first element that is always zero.*/ + chainPtr.push_back(0); + + for (ScalarType iLevel = 0ul; iLevel < nLevels; iLevel++) { + levelWidth = levelOffsets[iLevel + 1] - levelOffsets[iLevel]; + maxLevelWidth = std::max(levelWidth, maxLevelWidth); + + if (levelWidth > rowsPerBlock) { + if (chainPtr.back() != iLevel) { + chainPtr.push_back(iLevel); + } + + chainPtr.push_back(iLevel + 1); + } + } + + chainPtr.push_back(nLevels); + } + + /*! + * \brief Reorders the points according to the levels + * \param[in] pointList - Ordered array that contains the list of all mesh points. + * \param[in] inversePointList - Array utilized to access the index of each point in pointList. + * \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level. + */ + void Reorder(vector& pointList, vector& inversePointList, vector levelOffsets) { + for (auto localPoint = 0ul; localPoint < nPointDomain; ++localPoint) { + const auto globalPoint = pointList[localPoint]; + inversePointList[levelOffsets[levels[localPoint]]++] = globalPoint; + } + + pointList = std::move(inversePointList); + } + + /*! + * \brief Reorders the points according to the levels + * \param[in] pointList - Ordered array that contains the list of all mesh points. + * \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level. + * \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain. + * \param[in] rowsPerBlock - Represents the maximum number of rows that can be accomodated per CUDA block. 
+ */ + void Partition(vector& pointList, vector& levelOffsets, vector& chainPtr, + unsigned short rowsPerBlock) override { + vector inversePointList; + inversePointList.reserve(nPointDomain); + levels.reserve(nPointDomain); + + for (auto point = 0ul; point < nPointDomain; point++) { + inversePointList[pointList[point]] = point; + levels[point] = 0; + } + + // Local Point - Ordering of the points post the RCM ordering + // Global Point - Original order of the points before the RCM ordering + + for (auto localPoint = 0ul; localPoint < nPointDomain; ++localPoint) { + const auto globalPoint = pointList[localPoint]; + + for (auto adjPoints = 0u; adjPoints < nodes->GetnPoint(globalPoint); adjPoints++) { + const auto adjGlobalPoint = nodes->GetPoint(globalPoint, adjPoints); + + if (adjGlobalPoint < nPointDomain) { + const auto adjLocalPoint = inversePointList[adjGlobalPoint]; + + if (adjLocalPoint < localPoint) { + levels[localPoint] = std::max(levels[localPoint], levels[adjLocalPoint] + 1); + } + } + } + + nLevels = std::max(nLevels, levels[localPoint] + 1); + } + + levelOffsets.resize(nLevels + 1); + for (auto iPoint = 0ul; iPoint < nPointDomain; iPoint++) { + ++levelOffsets[levels[iPoint] + 1]; + } + + for (auto iLevel = 2ul; iLevel <= nLevels; ++iLevel) { + levelOffsets[iLevel] += levelOffsets[iLevel - 1]; + } + + Reorder(pointList, inversePointList, levelOffsets); + + CalculateChain(levelOffsets, chainPtr, rowsPerBlock); + } +}; diff --git a/Common/include/linear_algebra/CPreconditioner.hpp b/Common/include/linear_algebra/CPreconditioner.hpp index 9310225179cc..7f5e601e323a 100644 --- a/Common/include/linear_algebra/CPreconditioner.hpp +++ b/Common/include/linear_algebra/CPreconditioner.hpp @@ -205,7 +205,15 @@ class CLU_SGSPreconditioner final : public CPreconditioner { * \param[out] v - CSysVector that is the result of the preconditioning. */ inline void operator()(const CSysVector& u, CSysVector& v) const override { +#ifdef HAVE_CUDA + if (config->GetCUDA()) { + sparse_matrix.GPUComputeLU_SGSPreconditioner(u, v, geometry, config); + } else { + sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config); + } +#else sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config); +#endif } }; diff --git a/Common/include/linear_algebra/CSysMatrix.hpp b/Common/include/linear_algebra/CSysMatrix.hpp index d3212e7a9f5f..284afeca0411 100644 --- a/Common/include/linear_algebra/CSysMatrix.hpp +++ b/Common/include/linear_algebra/CSysMatrix.hpp @@ -148,8 +148,10 @@ class CSysMatrix { ScalarType* d_matrix; /*!< \brief Device Pointer to store the matrix values on the GPU. */ const unsigned long* d_row_ptr; /*!< \brief Device Pointers to the first element in each row. */ const unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */ - bool useCuda; /*!< \brief Boolean that indicates whether user has enabled CUDA or not. - Mainly used to conditionally free GPU memory in the class destructor. */ + const unsigned long* d_dia_ptr; /*!< \brief Device Column index for each of the elements in val(). */ + unsigned long* d_partition_offsets; + bool useCuda; /*!< \brief Boolean that indicates whether user has enabled CUDA or not. + Mainly used to conditionally free GPU memory in the class destructor. */ ScalarType* ILU_matrix; /*!< \brief Entries of the ILU sparse matrix. */ unsigned long nnz_ilu; /*!< \brief Number of possible nonzero entries in the matrix (ILU). 
*/ @@ -904,8 +906,8 @@ class CSysMatrix { * \param[in] vec - CSysVector to be multiplied by the preconditioner. * \param[out] prod - Result of the product A*vec. */ - void GPUComputeLU_SGSPreconditioner(ScalarType& vec, ScalarType& prod, CGeometry* geometry, - const CConfig* config) const; + void GPUComputeLU_SGSPreconditioner(const CSysVector& vec, CSysVector& prod, + CGeometry* geometry, const CConfig* config) const; /*! * \brief Build the Jacobi preconditioner. diff --git a/Common/include/linear_algebra/GPUComms.cuh b/Common/include/linear_algebra/GPUComms.cuh index 05f40ee4d9b0..7e1109a4ee81 100644 --- a/Common/include/linear_algebra/GPUComms.cuh +++ b/Common/include/linear_algebra/GPUComms.cuh @@ -25,16 +25,72 @@ * License along with SU2. If not, see . */ +#pragma once + #include #include +#include "../option_structure.hpp" + +/*! + * \struct matrixParameters + * \brief Structure containing information related to the Jacobian Matrix which is utilized by any launched Kernel. + * + * This implementation alleviates the need to pass an excessive number of arguments + * to a Kernel and, instead, packages it into a single structure. While this leads + * to data duplication for a short period of time, this is a much cleaner and resuable approach. + * \author A. Raj + */ +struct matrixParameters{ -namespace KernelParameters{ + public: + unsigned long totalRows; /*!< \brief Contains the total number of rows of the Jacbian Matrix. */ + unsigned short blockRowSize; /*!< \brief Contains the row dimensions of the blocks of the Jacobian Matrix. */ + unsigned short blockColSize; /*!< \brief Contains the column dimensions of the blocks of the Jacobian Matrix. */ + unsigned int nChainStart; /*!< \brief Starting partition of the current chain. */ + unsigned int nChainEnd; /*!< \brief Ending partition of the current chain. */ + unsigned short blockSize; /*!< \brief Contains the total number of elements in each block of the Jacbian Matrix. */ + unsigned short activeThreads; /*!< \brief Cotains the number of active threads per iteration during MVP - depending on the + dimensions of the Jacbian Matrix. */ + unsigned short rowsPerBlock; /*!< \brief Number of rows being processed by each thread block. This is equal to the number + of warps present in the block as each row gets assigned a warp. */ -inline constexpr int round_up_division(const int multiple, int x) { return ((x + multiple - 1) / multiple); } + matrixParameters(unsigned long nPointDomain, unsigned long nEqn, unsigned long nVar, unsigned long nPartitions, unsigned short rowsPrBlck){ + totalRows = nPointDomain; + blockRowSize = nEqn; + blockColSize = nVar; + nChainStart = 0; + nChainEnd = 0; + blockSize = nVar * nEqn; + activeThreads = nVar * (cudaKernelParameters::CUDA_WARP_SIZE/nVar); + rowsPerBlock = rowsPrBlck; + } + + /*! + * \brief Returns the memory index in the shared memory array used by the Symmetric Iteration Kernels. + */ + __device__ __forceinline__ unsigned short shrdMemIndex(unsigned short localRow, unsigned short threadNo){ + return (localRow * blockSize + threadNo); + } + + /*! + * \brief Returns a boolean value to check whether the row is under the total number of rows and if the + * thread number is within a user-specified thread limit. This is to avoid illegal memory accesses. + */ + __device__ __forceinline__ bool validAccess(unsigned long row, unsigned short threadNo, unsigned short threadLimit){ + return (row Blending_Map = { MakePair("BEZIER", BEZIER) }; +/*! 
+ * \brief Types of graph partitioning algorithms for parallel computing + */ +enum ENUM_GRAPH_PART_ALGORITHM { + LEVEL_SCHEDULING, /*!< \brief Partitions the graphs according to level-set algorithm. */ +}; +static const MapType Graph_Part_Map = { + MakePair("LEVEL_SCHEDULING", LEVEL_SCHEDULING) +}; + /*! * \brief Types of solvers for solving linear systems */ diff --git a/Common/src/CConfig.cpp b/Common/src/CConfig.cpp index af17bd4e6f5c..4418a8917291 100644 --- a/Common/src/CConfig.cpp +++ b/Common/src/CConfig.cpp @@ -1860,6 +1860,9 @@ void CConfig::SetConfig_Options() { /*!\par CONFIG_CATEGORY: Linear solver definition \ingroup Config*/ /*--- Options related to the linear solvers ---*/ + /*!\brief GRAPH_PARTIONING + * \n DESCRIPTION: Algorithm for partioning the matrix graph to facilitate parallel execution of linear algebra subroutines\n OPTIONS: see \link Graph_Part_Map \endlink \n DEFAULT: LEVEL_SCHEDULING \ingroup Config*/ + addEnumOption("GRAPH_PART_ALGORITHM", Kind_Graph_Part_Algo, Graph_Part_Map, LEVEL_SCHEDULING); /*!\brief LINEAR_SOLVER * \n DESCRIPTION: Linear solver for the implicit, mesh deformation, or discrete adjoint systems \n OPTIONS: see \link Linear_Solver_Map \endlink \n DEFAULT: FGMRES \ingroup Config*/ addEnumOption("LINEAR_SOLVER", Kind_Linear_Solver, Linear_Solver_Map, FGMRES); @@ -1898,7 +1901,8 @@ void CConfig::SetConfig_Options() { addEnumOption("DISCADJ_LIN_SOLVER", Kind_DiscAdj_Linear_Solver, Linear_Solver_Map, FGMRES); /* DESCRIPTION: Preconditioner for the discrete adjoint Krylov linear solvers */ addEnumOption("DISCADJ_LIN_PREC", Kind_DiscAdj_Linear_Prec, Linear_Solver_Prec_Map, ILU); - /* DESCRIPTION: Linear solver for the discete adjoint systems */ + /* DESCRIPTION: Thread block size for CUDA GPUs */ + addUnsignedShortOption("CUDA_BLOCK_SIZE", Cuda_Block_Size, 1024); /*!\par CONFIG_CATEGORY: Convergence\ingroup Config*/ /*--- Options related to convergence ---*/ diff --git a/Common/src/geometry/CPhysicalGeometry.cpp b/Common/src/geometry/CPhysicalGeometry.cpp index c97d2453541e..bc6cd5258270 100644 --- a/Common/src/geometry/CPhysicalGeometry.cpp +++ b/Common/src/geometry/CPhysicalGeometry.cpp @@ -26,6 +26,7 @@ */ #include "../../include/geometry/CPhysicalGeometry.hpp" +#include "../../include/linear_algebra/CGraphPartitioning.hpp" #include "../../include/adt/CADTPointsOnlyClass.hpp" #include "../../include/toolboxes/printing_toolbox.hpp" #include "../../include/toolboxes/CLinearPartitioner.hpp" @@ -699,6 +700,21 @@ void CPhysicalGeometry::DistributeColoring(const CConfig* config, CGeometry* geo delete[] nPoint_Flag; } +template +void CPhysicalGeometry::PartitionGraph(const CConfig* config, vector& pointList) { + unsigned short KindAlgorithm = config->GetKind_Graph_Part_Algo(); + partitionOffsets.reserve(nPointDomain); + + switch (KindAlgorithm) { + case LEVEL_SCHEDULING: + auto levelSchedule = CLevelScheduling(nPointDomain, nodes); + levelSchedule.Partition(pointList, partitionOffsets, chainPtr, config->GetRows_Per_Cuda_Block()); + nPartition = levelSchedule.nLevels; + maxPartitionSize = levelSchedule.maxLevelWidth; + break; + } +} + void CPhysicalGeometry::DistributeVolumeConnectivity(const CConfig* config, CGeometry* geometry, unsigned short Elem_Type) { unsigned short NODES_PER_ELEMENT = 0; @@ -4542,6 +4558,9 @@ void CPhysicalGeometry::SetRCM_Ordering(CConfig* config) { if (!status) SU2_MPI::Error("RCM ordering failed", CURRENT_FUNCTION); } + /*Partition graph into parallel constituents for GPU Operations*/ + if (config->GetCUDA()) 
PartitionGraph(config, Result); + /*--- Add the MPI points ---*/ for (auto iPoint = nPointDomain; iPoint < nPoint; iPoint++) { Result.push_back(iPoint); diff --git a/Common/src/linear_algebra/CSysMatrix.cpp b/Common/src/linear_algebra/CSysMatrix.cpp index c0a6aa81ee04..8764bd11bba9 100644 --- a/Common/src/linear_algebra/CSysMatrix.cpp +++ b/Common/src/linear_algebra/CSysMatrix.cpp @@ -72,6 +72,7 @@ CSysMatrix::~CSysMatrix() { GPUMemoryAllocation::gpu_free(d_matrix); GPUMemoryAllocation::gpu_free(d_row_ptr); GPUMemoryAllocation::gpu_free(d_col_ind); + GPUMemoryAllocation::gpu_free(d_partition_offsets); } #ifdef USE_MKL @@ -156,9 +157,15 @@ void CSysMatrix::Initialize(unsigned long npoint, unsigned long npoi ptr = GPUMemoryAllocation::gpu_alloc_cpy(src_ptr, num * sizeof(const unsigned long)); }; + auto GPUVectorAllocAndCopy = [](unsigned long*& ptr, vector& src_ptr, unsigned long num) { + ptr = GPUMemoryAllocation::gpu_alloc_cpy(&src_ptr[0], num * sizeof(unsigned long)); + }; + GPUAllocAndInit(d_matrix, nnz * nVar * nEqn); GPUAllocAndCopy(d_row_ptr, row_ptr, (nPointDomain + 1.0)); GPUAllocAndCopy(d_col_ind, col_ind, nnz); + GPUAllocAndCopy(d_dia_ptr, dia_ptr, nPointDomain); + GPUVectorAllocAndCopy(d_partition_offsets, geometry->partitionOffsets, geometry->nPartition + 1); } if (needTranspPtr) col_ptr = geometry->GetTransposeSparsePatternMap(type).data(); diff --git a/Common/src/linear_algebra/CSysMatrixGPU.cu b/Common/src/linear_algebra/CSysMatrixGPU.cu index d63430c96f96..d306d6e3d31f 100644 --- a/Common/src/linear_algebra/CSysMatrixGPU.cu +++ b/Common/src/linear_algebra/CSysMatrixGPU.cu @@ -1,6 +1,15 @@ /*! * \file CSysMatrixGPU.cu * \brief Implementations of Kernels and Functions for Matrix Operations on the GPU + * + * The kernel implementations will feature a lot of if-statements. + * The reason for such heavy usage of conditionals is to do a check + * whether the memory locations being accessed by the threads are + * within bounds or not. Usually the entire kernel is "wrapped" in + * a single conditional for these checks. But, in our case, it is + * necessary for us to use intermittent synchronization barriers like + * __syncthreads() which will lead to thread divergence issues if used + * inside a conditional. * \author A. 
Raj * \version 8.2.0 "Harrier" * @@ -26,34 +35,251 @@ */ #include "../../include/linear_algebra/CSysMatrix.hpp" +#include "../../include/geometry/CGeometry.hpp" #include "../../include/linear_algebra/GPUComms.cuh" +using namespace cudaKernelParameters; + +template +__device__ void DeviceGaussElimination(matrixType* matrixCopy, vectorType* prod, unsigned long row, unsigned int threadNo, bool rowInPartition, matrixParameters matrixParam) +{ + auto matrixIndex = [=](unsigned short x, unsigned short y){ + return (x * matrixParam.blockColSize + y); + }; + + auto vectorIndex = [=](unsigned short row, unsigned short elemNo){ + return (row * matrixParam.blockRowSize + elemNo); + }; + + unsigned short x = threadNo/matrixParam.blockRowSize; + unsigned short y = threadNo % matrixParam.blockColSize; + + matrixType pivot, weight = 0.0; + + __syncthreads(); + + for(unsigned short currentRow=0; currentRow < matrixParam.blockRowSize; currentRow++) + { + if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockSize)){ + pivot = matrixCopy[matrixIndex(currentRow, currentRow)]; + } + __syncthreads(); + + if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockSize)){ + + if(x==currentRow) + { + matrixCopy[matrixIndex(currentRow, y)] /= pivot; + if(y==0) prod[vectorIndex(row, currentRow)] /= pivot; + } + + weight = matrixCopy[matrixIndex(x, currentRow)]; + + if(x!=currentRow) + { + matrixCopy[matrixIndex(x, y)] -= weight * matrixCopy[matrixIndex(currentRow, y)]; + if(y==0) prod[vectorIndex(row, x)] -= weight * prod[vectorIndex(row, currentRow)]; + } + } + + __syncthreads(); + } + +} + +template +__global__ void FirstSymmetricIterationKernel(matrixType* matrix, vectorType* vec, vectorType* prod, const unsigned long* d_partition_offsets, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, const unsigned long* d_dia_ptr, matrixParameters matrixParam) +{ + /*--- First part of the symmetric iteration: (D+L).x* = b ---*/ + + auto matrixIndex = [=](unsigned long row, unsigned short threadNo, const unsigned long* ptr_array){ + return (ptr_array[row] * matrixParam.blockSize + threadNo); + }; + + auto vectorIndex = [=](unsigned long row, unsigned short elemNo){ + return (row * matrixParam.blockRowSize + elemNo); + }; + + auto vectorMultIndex = [=](unsigned short blockNo, unsigned short blockCol){ + return (d_col_ind[blockNo] * matrixParam.blockColSize + blockCol); + }; + + unsigned long origRow = (blockIdx.x * blockDim.x + threadIdx.x)/CUDA_WARP_SIZE; + unsigned short localRow = origRow % matrixParam.rowsPerBlock; + unsigned short threadNo = threadIdx.x % CUDA_WARP_SIZE; + + unsigned short blockCol = threadNo % matrixParam.blockColSize; + + extern __shared__ matrixType tempBuffer[]; + + /* Forward Substitution Algorithm with Gaussian Elimination. */ + for(auto iPartition = matrixParam.nChainStart; iPartition < matrixParam.nChainEnd; iPartition++) + { + + /* We move to the set of rows present in the current partition. 
*/ + unsigned long row = origRow + d_partition_offsets[iPartition]; + bool rowInPartition = (row < d_partition_offsets[iPartition + 1]); + + if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockRowSize)) { + + tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)] = 0.0; + } + + __syncthreads(); + + if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.activeThreads)) { + + for(unsigned long index = matrixIndex(row, threadNo, d_row_ptr); index < matrixIndex(row, 0, d_dia_ptr); index+=matrixParam.activeThreads) + { + + unsigned short blockNo = index/matrixParam.blockSize; + unsigned short blockRow = (index/matrixParam.blockColSize) % matrixParam.blockRowSize; + + atomicAdd(&tempBuffer[matrixParam.shrdMemIndex(localRow, blockRow)], matrix[index] * prod[vectorMultIndex(blockNo, blockCol)]); // Compute L.x* + } + } + + __syncthreads(); + + if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockSize)) { + + if(threadNo < matrixParam.blockRowSize) prod[vectorIndex(row, threadNo)] = vec[vectorIndex(row, threadNo)] - tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)]; // Compute y = b - L.x* + + /* Reinitialize the shared memory to hold the matrix diagonal elements now. */ + tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)] = matrix[matrixIndex(row, threadNo, d_dia_ptr)]; + } + + DeviceGaussElimination(&tempBuffer[matrixParam.shrdMemIndex(localRow, 0)], prod, row, threadNo, rowInPartition, matrixParam); // Solve D.x* = y + + __syncthreads(); + + } +} + + +template +__global__ void SecondSymmetricIterationKernel(matrixType* matrix, vectorType* prod, const unsigned long* d_partition_offsets, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, const unsigned long* d_dia_ptr, matrixParameters matrixParam) +{ + + /*--- Second part of the symmetric iteration: (D+U).x_(1) = D.x* ---*/ + + auto matrixIndex = [=](unsigned long row, unsigned short threadNo, const unsigned long* ptr_array){ + return (ptr_array[row] * matrixParam.blockSize + threadNo); + }; + + auto vectorIndex = [=](unsigned long row, unsigned short elemNo){ + return (row * matrixParam.blockRowSize + elemNo); + }; + + auto vectorMultIndex = [=](unsigned short blockNo, unsigned short blockCol){ + return (d_col_ind[blockNo] * matrixParam.blockColSize + blockCol); + }; + + unsigned long origRow = (blockIdx.x * blockDim.x + threadIdx.x)/CUDA_WARP_SIZE; + unsigned short localRow = origRow % matrixParam.rowsPerBlock; + unsigned short threadNo = threadIdx.x % CUDA_WARP_SIZE; + + unsigned short blockCol = threadNo % matrixParam.blockColSize; + + extern __shared__ matrixType tempBuffer[]; + + /* Backward Substitution Algorithm with Gaussian Elimination. */ + for(auto iPartition = matrixParam.nChainStart; iPartition > matrixParam.nChainEnd; iPartition--) + { + + /* We move to the set of rows present in the current partition. 
*/ + unsigned long row = origRow + d_partition_offsets[iPartition - 1]; + bool rowInPartition = (row < d_partition_offsets[iPartition]); + + if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockRowSize)) { + + tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)] = 0.0; + } + + __syncthreads(); + + if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.activeThreads)) { + + for(unsigned long index = matrixIndex(row, threadNo, d_dia_ptr); index < matrixIndex(row, matrixParam.blockSize, d_dia_ptr); index+=matrixParam.activeThreads) + { + + unsigned short blockNo = index/matrixParam.blockSize; + unsigned short blockRow = (index/matrixParam.blockColSize) % matrixParam.blockRowSize; + + atomicAdd(&tempBuffer[matrixParam.shrdMemIndex(localRow, blockRow)], matrix[index] * prod[vectorMultIndex(blockNo, blockCol)]); // Compute D.x* + } + + for(unsigned long index = matrixIndex(row, threadNo + matrixParam.blockSize, d_dia_ptr); index < matrixIndex(row + 1, 0, d_row_ptr); index+=matrixParam.activeThreads) + { + + unsigned short blockNo = index/matrixParam.blockSize; + unsigned short blockRow = (index/matrixParam.blockColSize) % matrixParam.blockRowSize; + + atomicAdd(&tempBuffer[matrixParam.shrdMemIndex(localRow, blockRow)], -(matrix[index] * prod[vectorMultIndex(blockNo, blockCol)])); // Compute y = D.x*-U.x_(n+1) + } + } + + __syncthreads(); + + if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockSize)) { + + if(threadNo < matrixParam.blockRowSize) prod[vectorIndex(row, threadNo)] = tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)]; + + /* Reinitialize the memory to hold the matrix diagonal elements now. */ + tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)] = matrix[matrixIndex(row, threadNo, d_dia_ptr)]; + } + + DeviceGaussElimination(&tempBuffer[matrixParam.shrdMemIndex(localRow, 0)], prod, row, threadNo, rowInPartition, matrixParam); // Solve D.x* = y + + __syncthreads(); + } +} + template -__global__ void GPUMatrixVectorProductAdd(matrixType* matrix, vectorType* vec, vectorType* prod, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, unsigned long nPointDomain, unsigned long nVar, unsigned long nEqn) +__global__ void MatrixVectorProductKernel(matrixType* matrix, vectorType* vec, vectorType* prod, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, matrixParameters matrixParam) { - int row = (blockIdx.x * blockDim.x + threadIdx.x)/32; - int threadNo = threadIdx.x%32; - int activeThreads = nVar * (32/nVar); + auto matrixIndex = [=](unsigned long row, unsigned short threadNo) { + return (d_row_ptr[row] * matrixParam.blockSize + threadNo); + }; + + auto vectorIndex = [=](unsigned long row, unsigned short elemNo) { + return (row * matrixParam.blockRowSize + elemNo); + }; + + unsigned long row = (blockIdx.x * blockDim.x + threadIdx.x)/CUDA_WARP_SIZE; + unsigned short threadNo = threadIdx.x % CUDA_WARP_SIZE; + unsigned short localRow = row % matrixParam.rowsPerBlock; - int blockRow = (threadNo/nVar)%nVar; + unsigned short blockCol = threadNo % matrixParam.blockColSize; - if(row @@ -74,15 +300,55 @@ void CSysMatrix::GPUMatrixVectorProduct(const CSysVector vec.HtDTransfer(); prod.GPUSetVal(0.0); - dim3 blockDim(KernelParameters::MVP_BLOCK_SIZE,1,1); - int gridx = KernelParameters::round_up_division(KernelParameters::MVP_WARP_SIZE, nPointDomain); + matrixParameters matrixParam(nPointDomain, nEqn, nVar, geometry->nPartition, config->GetRows_Per_Cuda_Block()); + + dim3 
blockDim(config->GetCuda_Block_Size(),1,1); + unsigned int gridx = rounded_up_division(config->GetCuda_Block_Size(), matrixParam.totalRows * CUDA_WARP_SIZE); dim3 gridDim(gridx, 1, 1); - GPUMatrixVectorProductAdd<<>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, nPointDomain, nVar, nEqn); + MatrixVectorProductKernel<<>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, matrixParam); gpuErrChk( cudaPeekAtLastError() ); prod.DtHTransfer(); } -template class CSysMatrix; //This is a temporary fix for invalid instantiations due to separating the member function from the header file the class is defined in. Will try to rectify it in coming commits. +template +void CSysMatrix::GPUComputeLU_SGSPreconditioner(const CSysVector& vec, CSysVector& prod, + CGeometry* geometry, const CConfig* config) const + { + ScalarType* d_vec = vec.GetDevicePointer(); + ScalarType* d_prod = prod.GetDevicePointer(); + + HtDTransfer(); + vec.HtDTransfer(); + prod.HtDTransfer(); + + matrixParameters matrixParam(nPointDomain, nEqn, nVar, geometry->nPartition, config->GetRows_Per_Cuda_Block()); + + dim3 blockDim(matrixParam.rowsPerBlock * CUDA_WARP_SIZE,1,1); + unsigned int gridx = rounded_up_division(matrixParam.rowsPerBlock, geometry->maxPartitionSize); + dim3 gridDim(gridx, 1, 1); + + for(auto elem = geometry->chainPtr.begin(); elem != geometry->chainPtr.end() - 1; elem++) + { + matrixParam.nChainStart = *(elem); + matrixParam.nChainEnd = *(elem + 1); + + FirstSymmetricIterationKernel<<>>(d_matrix, d_vec, d_prod, d_partition_offsets, d_row_ptr, d_col_ind, d_dia_ptr, matrixParam); + gpuErrChk( cudaPeekAtLastError() ); + } + + for(auto elem = geometry->chainPtr.rbegin(); elem != geometry->chainPtr.rend() - 1; elem++) + { + matrixParam.nChainStart = *(elem); + matrixParam.nChainEnd = *(elem + 1); + + SecondSymmetricIterationKernel<<>>(d_matrix, d_prod, d_partition_offsets, d_row_ptr, d_col_ind, d_dia_ptr, matrixParam); + gpuErrChk( cudaPeekAtLastError() ); + } + + prod.DtHTransfer(); + } + +template class CSysMatrix; // This is a temporary fix for invalid instantiations due to separating the member function from the header file the class is defined in. Will try to rectify it in coming commits. diff --git a/meson.build b/meson.build index e550d57893b7..66c90c64a4f8 100644 --- a/meson.build +++ b/meson.build @@ -19,7 +19,7 @@ python = pymod.find_installation() if get_option('enable-cuda') add_languages('cuda') - add_global_arguments('-arch=sm_86', language : 'cuda') + add_global_arguments('-arch=native', language : 'cuda') endif su2_cpp_args = []
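
Note on the partitioning scheme introduced by this patch: `CLevelScheduling::Partition` assigns every matrix row to the lowest level compatible with its lower-triangular (already-numbered) neighbours, builds `levelOffsets` with a counting sort, reorders the rows level by level, and `CalculateChain` then groups consecutive narrow levels into chains whose rows fit within a single thread block of `rowsPerBlock` rows. The host-only sketch below reproduces the level assignment, the offsets, and the chain grouping on a toy dependency pattern (it omits the reordering step); the adjacency data and the `rowsPerBlock` value are invented for illustration and are not part of the patch.

```cpp
// Minimal host-side sketch of the level-scheduling idea used by CLevelScheduling:
// each row goes to the smallest level strictly greater than the levels of all its
// lower-indexed neighbours, so every row inside one level can be solved in parallel.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // Toy lower-triangular dependency pattern (row -> lower-indexed rows it depends on).
  const std::vector<std::vector<int>> deps = {{}, {0}, {0}, {1, 2}, {}, {3, 4}};
  const int nRows = static_cast<int>(deps.size());
  const int rowsPerBlock = 1;  // stand-in for CUDA_BLOCK_SIZE / warp size, kept tiny
                               // so the chain split below is visible

  // Level assignment: level(i) = 1 + max(level(j)) over dependencies j < i.
  std::vector<int> level(nRows, 0);
  int nLevels = 0;
  for (int i = 0; i < nRows; ++i) {
    for (int j : deps[i]) level[i] = std::max(level[i], level[j] + 1);
    nLevels = std::max(nLevels, level[i] + 1);
  }

  // Counting sort of rows into levelOffsets (CSR-style prefix sum), as in Partition().
  std::vector<int> levelOffsets(nLevels + 1, 0);
  for (int i = 0; i < nRows; ++i) ++levelOffsets[level[i] + 1];
  for (int l = 1; l <= nLevels; ++l) levelOffsets[l] += levelOffsets[l - 1];

  // Chain grouping, mirroring CalculateChain(): a level wider than rowsPerBlock is
  // isolated into its own chain; runs of narrow levels stay merged together.
  std::vector<int> chainPtr{0};
  for (int l = 0; l < nLevels; ++l) {
    const int width = levelOffsets[l + 1] - levelOffsets[l];
    if (width > rowsPerBlock) {
      if (chainPtr.back() != l) chainPtr.push_back(l);
      chainPtr.push_back(l + 1);
    }
  }
  chainPtr.push_back(nLevels);

  for (int i = 0; i < nRows; ++i) std::printf("row %d -> level %d\n", i, level[i]);
  for (int p : chainPtr) std::printf("chainPtr entry: %d\n", p);
  return 0;
}
```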
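
A second quick check that may help reviewers: with the defaults registered in CConfig.cpp (GRAPH_PART_ALGORITHM= LEVEL_SCHEDULING, CUDA_BLOCK_SIZE= 1024), the launch geometry used by `GPUMatrixVectorProduct` and `GPUComputeLU_SGSPreconditioner` follows from assigning one warp per matrix row. The arithmetic below is a sketch only: it assumes `rounded_up_division` keeps the `(multiple, x)` argument order of the old `round_up_division` helper, that `CUDA_WARP_SIZE` is 32, and the matrix and level sizes are invented.

```cpp
// Back-of-the-envelope check of the launch configuration implied by the patch.
#include <cstdio>

constexpr unsigned long roundedUpDivision(unsigned long multiple, unsigned long x) {
  return (x + multiple - 1) / multiple;  // ceil(x / multiple), as in the old helper
}

int main() {
  constexpr unsigned long warpSize = 32;           // assumed CUDA_WARP_SIZE
  constexpr unsigned long cudaBlockSize = 1024;    // CUDA_BLOCK_SIZE default
  constexpr unsigned long totalRows = 100000;      // nPointDomain (example value)
  constexpr unsigned long maxPartitionSize = 5000; // widest level (example value)

  // One warp per matrix row => rows handled by each thread block (GetRows_Per_Cuda_Block).
  const unsigned long rowsPerBlock = roundedUpDivision(warpSize, cudaBlockSize);  // 32

  // Matrix-vector product: enough blocks so every row gets its own warp.
  const unsigned long mvpGrid = roundedUpDivision(cudaBlockSize, totalRows * warpSize);

  // LU-SGS kernels: the grid only needs to cover the widest level of a chain.
  const unsigned long sgsBlockDim = rowsPerBlock * warpSize;
  const unsigned long sgsGrid = roundedUpDivision(rowsPerBlock, maxPartitionSize);

  std::printf("rowsPerBlock=%lu mvpGrid=%lu sgsBlockDim=%lu sgsGrid=%lu\n",
              rowsPerBlock, mvpGrid, sgsBlockDim, sgsGrid);
  return 0;
}
```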