From 79c9e525a1677dbac161ccec3d83445b3c2e063f Mon Sep 17 00:00:00 2001 From: Areen Raj Date: Tue, 10 Jun 2025 09:07:37 +0530 Subject: [PATCH 01/15] updating branch --- .../linear_algebra/CMatrixVectorProduct.hpp | 61 ++---- .../linear_algebra/CPreconditioner.hpp | 10 + Common/include/linear_algebra/CSysMatrix.hpp | 33 +--- Common/include/linear_algebra/GPUComms.cuh | 6 +- Common/src/linear_algebra/CSysMatrix.cpp | 3 +- Common/src/linear_algebra/CSysMatrixGPU.cu | 177 +++++++++++++++++- 6 files changed, 205 insertions(+), 85 deletions(-) diff --git a/Common/include/linear_algebra/CMatrixVectorProduct.hpp b/Common/include/linear_algebra/CMatrixVectorProduct.hpp index c688ecb540be..a27a0db199af 100644 --- a/Common/include/linear_algebra/CMatrixVectorProduct.hpp +++ b/Common/include/linear_algebra/CMatrixVectorProduct.hpp @@ -57,20 +57,11 @@ * functions from its derived classes to map the suitable path of * execution - CPU or GPU. */ -template -class CMatrixVectorProduct { - public: - virtual ~CMatrixVectorProduct() = 0; - virtual void operator()(const CSysVector& u, CSysVector& v) const = 0; -}; -template -CMatrixVectorProduct::~CMatrixVectorProduct() {} -/*! + /*! * \class CExecutionPath * \brief Dummy super class that holds the correct member functions in its child classes */ - template class CExecutionPath { public: @@ -78,38 +69,14 @@ class CExecutionPath { const CConfig* config, const CSysMatrix& matrix) = 0; }; -/*! - * \class CCpuExecution - * \brief Derived class containing the CPU Matrix Vector Product Function - */ template -class CCpuExecution : public CExecutionPath { +class CMatrixVectorProduct { public: - void mat_vec_prod(const CSysVector& u, CSysVector& v, CGeometry* geometry, - const CConfig* config, const CSysMatrix& matrix) override { - matrix.MatrixVectorProduct(u, v, geometry, config); - } + virtual ~CMatrixVectorProduct() = 0; + virtual void operator()(const CSysVector& u, CSysVector& v) const = 0; }; - -/*! - * \class CGpuExecution - * \brief Derived class containing the GPU Matrix Vector Product Function - */ template -class CGpuExecution : public CExecutionPath { - public: - void mat_vec_prod(const CSysVector& u, CSysVector& v, CGeometry* geometry, - const CConfig* config, const CSysMatrix& matrix) override { -#ifdef HAVE_CUDA - matrix.GPUMatrixVectorProduct(u, v, geometry, config); -#else - SU2_MPI::Error( - "\nError in launching Matrix-Vector Product Function\nENABLE_CUDA is set to YES\nPlease compile with CUDA " - "options enabled in Meson to access GPU Functions", - CURRENT_FUNCTION); -#endif - } -}; +CMatrixVectorProduct::~CMatrixVectorProduct() {} /*! * \class CSysMatrixVectorProduct @@ -134,11 +101,7 @@ class CSysMatrixVectorProduct final : public CMatrixVectorProduct { inline CSysMatrixVectorProduct(const CSysMatrix& matrix_ref, CGeometry* geometry_ref, const CConfig* config_ref) : matrix(matrix_ref), geometry(geometry_ref), config(config_ref) { - if (config->GetCUDA()) { - exec = new CGpuExecution; - } else { - exec = new CCpuExecution; - } + } /*! 
@@ -152,6 +115,16 @@ class CSysMatrixVectorProduct final : public CMatrixVectorProduct { * \param[out] v - CSysVector that is the result of the product */ inline void operator()(const CSysVector& u, CSysVector& v) const override { - exec->mat_vec_prod(u, v, geometry, config, matrix); + #ifdef HAVE_CUDA + if(config->GetCUDA()) + { + matrix.GPUMatrixVectorProduct(u, v, geometry, config); + } + else { + matrix.MatrixVectorProduct(u, v, geometry, config); + } + #else + matrix.MatrixVectorProduct(u, v, geometry, config) + #endif } }; diff --git a/Common/include/linear_algebra/CPreconditioner.hpp b/Common/include/linear_algebra/CPreconditioner.hpp index 9310225179cc..b2399a1384f1 100644 --- a/Common/include/linear_algebra/CPreconditioner.hpp +++ b/Common/include/linear_algebra/CPreconditioner.hpp @@ -205,7 +205,17 @@ class CLU_SGSPreconditioner final : public CPreconditioner { * \param[out] v - CSysVector that is the result of the preconditioning. */ inline void operator()(const CSysVector& u, CSysVector& v) const override { + #ifdef HAVE_CUDA + if(config->GetCUDA()) + { + sparse_matrix.GPUComputeLU_SGSPreconditioner(u, v, geometry, config); + } + else { + sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config); + } + #else sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config); + #endif } }; diff --git a/Common/include/linear_algebra/CSysMatrix.hpp b/Common/include/linear_algebra/CSysMatrix.hpp index d91010636874..64dec51ebfa0 100644 --- a/Common/include/linear_algebra/CSysMatrix.hpp +++ b/Common/include/linear_algebra/CSysMatrix.hpp @@ -148,6 +148,7 @@ class CSysMatrix { ScalarType* d_matrix; /*!< \brief Device Pointer to store the matrix values on the GPU. */ const unsigned long* d_row_ptr; /*!< \brief Device Pointers to the first element in each row. */ const unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */ + const unsigned long* d_dia_ptr; /*!< \brief Device Column index for each of the elements in val(). */ ScalarType* ILU_matrix; /*!< \brief Entries of the ILU sparse matrix. */ unsigned long nnz_ilu; /*!< \brief Number of possible nonzero entries in the matrix (ILU). */ @@ -859,41 +860,13 @@ class CSysMatrix { void GPUMatrixVectorProduct(const CSysVector& vec, CSysVector& prod, CGeometry* geometry, const CConfig* config) const; - /*! - * \brief Performs first step of the LU_SGS Preconditioner building - * \param[in] vec - CSysVector to be multiplied by the sparse matrix A. - * \param[in] geometry - Geometrical definition of the problem. - * \param[in] config - Definition of the particular problem. - * \param[out] prod - Result of the product. - */ - - void GPUFirstSymmetricIteration(ScalarType& vec, ScalarType& prod, CGeometry* geometry, const CConfig* config) const; - - /*! - * \brief Performs second step of the LU_SGS Preconditioner building - * \param[in] geometry - Geometrical definition of the problem. - * \param[in] config - Definition of the particular problem. - * \param[out] prod - Result of the product. - */ - - void GPUSecondSymmetricIteration(ScalarType& prod, CGeometry* geometry, const CConfig* config) const; - - /*! - * \brief Performs Gaussian Elimination between diagional blocks of the matrix and the prod vector - * \param[in] geometry - Geometrical definition of the problem. - * \param[in] config - Definition of the particular problem. - * \param[out] prod - Result of the product. - */ - - void GPUGaussElimination(ScalarType& prod, CGeometry* geometry, const CConfig* config) const; - /*! 
* \brief Multiply CSysVector by the preconditioner all of which are stored on the device * \param[in] vec - CSysVector to be multiplied by the preconditioner. * \param[out] prod - Result of the product A*vec. */ - void GPUComputeLU_SGSPreconditioner(ScalarType& vec, ScalarType& prod, CGeometry* geometry, - const CConfig* config) const; + void GPUComputeLU_SGSPreconditioner(const CSysVector& vec, CSysVector& prod, CGeometry* geometry, + const CConfig* config) const; /*! * \brief Build the Jacobi preconditioner. diff --git a/Common/include/linear_algebra/GPUComms.cuh b/Common/include/linear_algebra/GPUComms.cuh index 582f9eb46652..dba0a280e420 100644 --- a/Common/include/linear_algebra/GPUComms.cuh +++ b/Common/include/linear_algebra/GPUComms.cuh @@ -30,7 +30,11 @@ namespace KernelParameters{ - inline constexpr int round_up_division(const int multiple, int x) { return ((x + multiple - 1) / multiple); } + /*Returns the rounded up value of the decimal quotient to the next integer (in all cases)*/ + inline constexpr int rounded_up_division(const int divisor, int dividend) { return ((dividend + divisor - 1) / divisor); } + + /*Returns the rounded down value of the decimal quotient to the previous integer (in all cases)*/ + inline constexpr int rounded_down_division(const int divisor, int dividend) { return ((dividend - divisor + 1) / divisor); } const int MVP_BLOCK_SIZE = 1024; const int MVP_WARP_SIZE = 32; diff --git a/Common/src/linear_algebra/CSysMatrix.cpp b/Common/src/linear_algebra/CSysMatrix.cpp index 4f853e42fa17..86dee081da83 100644 --- a/Common/src/linear_algebra/CSysMatrix.cpp +++ b/Common/src/linear_algebra/CSysMatrix.cpp @@ -153,7 +153,8 @@ void CSysMatrix::Initialize(unsigned long npoint, unsigned long npoi GPUAllocAndInit(d_matrix, nnz * nVar * nEqn); GPUAllocAndCopy(d_row_ptr, row_ptr, (nPointDomain + 1.0)); GPUAllocAndCopy(d_col_ind, col_ind, nnz); - + GPUAllocAndCopy(d_dia_ptr, dia_ptr, nPointDomain); + if (needTranspPtr) col_ptr = geometry->GetTransposeSparsePatternMap(type).data(); if (type == ConnectivityType::FiniteVolume) { diff --git a/Common/src/linear_algebra/CSysMatrixGPU.cu b/Common/src/linear_algebra/CSysMatrixGPU.cu index 19e6a8e0b06f..edc1e1b5b561 100644 --- a/Common/src/linear_algebra/CSysMatrixGPU.cu +++ b/Common/src/linear_algebra/CSysMatrixGPU.cu @@ -28,14 +28,139 @@ #include "../../include/linear_algebra/CSysMatrix.hpp" #include "../../include/linear_algebra/GPUComms.cuh" + +template +__global__ void GaussEliminationKernel(matrixType* matrix, vectorType* prod, const unsigned long* d_dia_ptr, unsigned long nPointDomain, unsigned long nVar, unsigned long nEqn, unsigned long size) +{ + #define A(I, J) matrixCopy[localRowNo * size + (I)*nVar + (J)] + #define validRow row +__global__ void FirstSymmetricIterationKernel(matrixType* matrix, vectorType* vec, vectorType* prod, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, const unsigned long* d_dia_ptr, unsigned long nPointDomain, unsigned long nVar, unsigned long nEqn) +{ + int row = (blockIdx.x * blockDim.x + threadIdx.x)/KernelParameters::MVP_WARP_SIZE; + int threadNo = threadIdx.x%KernelParameters::MVP_WARP_SIZE; + int activeThreads = nEqn * (KernelParameters::MVP_WARP_SIZE/nEqn); + + int blockRow = (threadNo/nEqn)%nVar; + + vectorType res = 0.0; + + if(row -__global__ void GPUMatrixVectorProductAdd(matrixType* matrix, vectorType* vec, vectorType* prod, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, unsigned long nPointDomain, unsigned long nVar, unsigned long nEqn) +__global__ 
void SecondSymmetricIterationKernel(matrixType* matrix, vectorType* prod, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, const unsigned long* d_dia_ptr, unsigned long nPointDomain, unsigned long nVar, unsigned long nEqn) { - int row = (blockIdx.x * blockDim.x + threadIdx.x)/32; - int threadNo = threadIdx.x%32; - int activeThreads = nVar * (32/nVar); + int row = (blockIdx.x * blockDim.x + threadIdx.x)/KernelParameters::MVP_WARP_SIZE; + int threadNo = threadIdx.x%KernelParameters::MVP_WARP_SIZE; + int activeThreads = nEqn * (KernelParameters::MVP_WARP_SIZE/nEqn); - int blockRow = (threadNo/nVar)%nVar; + int blockRow = (threadNo/nEqn)%nVar; + + vectorType res = 0.0; + + if(row==nPointDomain-3) + { + int hi = 1; + } + + if(row +__global__ void MatrixVectorProductKernel(matrixType* matrix, vectorType* vec, vectorType* prod, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, unsigned long nPointDomain, unsigned long nVar, unsigned long nEqn) +{ + int row = (blockIdx.x * blockDim.x + threadIdx.x)/KernelParameters::MVP_WARP_SIZE; + int threadNo = threadIdx.x%KernelParameters::MVP_WARP_SIZE; + int activeThreads = nEqn * (KernelParameters::MVP_WARP_SIZE/nEqn); + + int blockRow = (threadNo/nEqn)%nVar; // The remainder step is necessary as the blockRow sequnce should reset when we move to the next block if(row::GPUMatrixVectorProduct(const CSysVector prod.GPUSetVal(0.0); dim3 blockDim(KernelParameters::MVP_BLOCK_SIZE,1,1); - int gridx = KernelParameters::round_up_division(KernelParameters::MVP_WARP_SIZE, nPointDomain); + int gridx = KernelParameters::rounded_up_division(KernelParameters::MVP_BLOCK_SIZE, nPointDomain * KernelParameters::MVP_WARP_SIZE); dim3 gridDim(gridx, 1, 1); - GPUMatrixVectorProductAdd<<>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, nPointDomain, nVar, nEqn); + MatrixVectorProductKernel<<>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, nPointDomain, nVar, nEqn); gpuErrChk( cudaPeekAtLastError() ); prod.DtHTransfer(); } +template +void CSysMatrix::GPUComputeLU_SGSPreconditioner(const CSysVector& vec, CSysVector& prod, + CGeometry* geometry, const CConfig* config) const + { + ScalarType* d_vec = vec.GetDevicePointer(); + ScalarType* d_prod = prod.GetDevicePointer(); + + HtDTransfer(); + vec.HtDTransfer(); + prod.HtDTransfer(); + + unsigned long size = nVar * nEqn; + + dim3 blockDim(KernelParameters::MVP_BLOCK_SIZE,1,1); + int gridx = KernelParameters::rounded_up_division(KernelParameters::MVP_BLOCK_SIZE, nPointDomain * KernelParameters::MVP_WARP_SIZE); + dim3 gridDim(gridx, 1, 1); + + int geBlockx = size * (KernelParameters::MVP_BLOCK_SIZE/size); + dim3 gaussElimBlockDim(geBlockx, 1, 1); + int geGridx = KernelParameters::rounded_up_division(geBlockx, size * nPointDomain); + dim3 gaussElimGridDim(geGridx,1,1); + + FirstSymmetricIterationKernel<<>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, d_dia_ptr, nPointDomain, nVar, nEqn); + gpuErrChk( cudaPeekAtLastError() ); + GaussEliminationKernel<<>>(d_matrix, d_prod, d_dia_ptr, nPointDomain, nVar, nEqn, size); + gpuErrChk( cudaPeekAtLastError() ); + SecondSymmetricIterationKernel<<>>(d_matrix, d_prod, d_row_ptr, d_col_ind, d_dia_ptr, nPointDomain, nVar, nEqn); + gpuErrChk( cudaPeekAtLastError() ); + GaussEliminationKernel<<>>(d_matrix, d_prod, d_dia_ptr, nPointDomain, nVar, nEqn, size); + gpuErrChk( cudaPeekAtLastError() ); + + prod.DtHTransfer(); + } + template class CSysMatrix; //This is a temporary fix for invalid instantiations due to separating the member function from the header file the class is 
defined in. Will try to rectify it in coming commits. From b7591f46788ecc13d7103030e2608322c467f19e Mon Sep 17 00:00:00 2001 From: Areen Raj Date: Wed, 25 Jun 2025 19:15:39 +0530 Subject: [PATCH 02/15] Major commit: graph partitioning algorithms, level scheduling method, GPU Preconditioner framework --- .../linear_algebra/CGraphPartitioning.hpp | 139 ++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 Common/include/linear_algebra/CGraphPartitioning.hpp diff --git a/Common/include/linear_algebra/CGraphPartitioning.hpp b/Common/include/linear_algebra/CGraphPartitioning.hpp new file mode 100644 index 000000000000..80d5e30e05c3 --- /dev/null +++ b/Common/include/linear_algebra/CGraphPartitioning.hpp @@ -0,0 +1,139 @@ +/*! + * \file CGraphPartitioning.hpp + * \brief Headers for the classes realted to the algorithms that are used + to divide the matrix acyclic graph into parallel partitions. + * \author A. Raj + * \version 8.2.0 "Harrier" + * + * SU2 Project Website: https://su2code.github.io + * + * The SU2 Project is maintained by the SU2 Foundation + * (http://su2foundation.org) + * + * Copyright 2012-2025, SU2 Contributors (cf. AUTHORS.md) + * + * SU2 is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * SU2 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with SU2. If not, see . + */ + +#include "../CConfig.hpp" +#include "../geometry/CGeometry.hpp" +#include "../geometry/dual_grid/CPoint.hpp" + +/*! + * \class CGraphPartitioning + * \brief Abstract base class for defining graph partitioning algorithms + * \author A. Raj + * + * In order to use certain parallel algorithms in the solution process - + * whether with linear solvers or preconditioners - we require the matrix + * to be partitioned into certain parallel divisions. These maybe in the form + * of levels, blocks, colors and so on. Since a number of different algorithms + * can be used to split the graph, we've introduced a base class containing the + * "Partition" member function from which child classes of the specific + * algorithm can be derived. Currently, we are only using direct declarations + * of the derived classes in the code. However, this method was chosen as it + * allows us to pass different child class algorithms to a single implementation + * of the function that requires it - similar to the CMatrixVectorProduct class. + */ + +template + +class CGraphPartitioning { + + public: + virtual ~CGraphPartitioning() = 0; + virtual void Partition(vector& pointList, vector& partitionOffsets) = 0; +}; +template +CGraphPartitioning::~CGraphPartitioning() {} + +template + +class CLevelScheduling final : public CGraphPartitioning { + + private: + ScalarType nPointDomain; + CPoint* nodes; + + public: + ScalarType nLevels; + ScalarType maxLevelWidth; + vector levels; + + /*! 
+ * \brief constructor of the class + * \param[in] nPointDomain_ref - number of points associated with the problem + * \param[in] nodes - represents the relationships between the points + */ + inline CLevelScheduling(ScalarType nPointDomain_ref, CPoint* nodes_ref) + : nPointDomain(nPointDomain_ref), nodes(nodes_ref) + { nLevels = 0ul; maxLevelWidth = 0ul; } + + CLevelScheduling() = delete; // Removing default constructor + + void Reorder(vector& pointList, vector& inversePointList, vector& levelOffsets) + { + for (auto localPoint = 0ul; localPoint < nPointDomain; ++localPoint) { + const auto globalPoint = pointList[localPoint]; + inversePointList[levelOffsets[levels[localPoint]]++] = globalPoint; + } + + pointList = std::move(inversePointList); + } + + void Partition(vector& pointList, vector& levelOffsets) override + { + vector inversePointList; + inversePointList.reserve(nPointDomain); + levels.reserve(nPointDomain); + + for (auto point = 0ul; point < nPointDomain; point++) { + inversePointList[pointList[point]] = point; + levels[point] = 0; + } + + // Local Point - Ordering of the points post the RCM ordering + // Global Point - Original order of the points before the RCM ordering + + for (auto localPoint = 0ul; localPoint < nPointDomain; ++localPoint) { + const auto globalPoint = pointList[localPoint]; + + for (auto adjPoints = 0u; adjPoints < nodes->GetnPoint(globalPoint); adjPoints++) { + const auto adjGlobalPoint = nodes->GetPoint(globalPoint, adjPoints); + + if (adjGlobalPoint < nPointDomain) { + const auto adjLocalPoint = inversePointList[adjGlobalPoint]; + + if (adjLocalPoint < localPoint) { + levels[localPoint] = std::max(levels[localPoint], levels[adjLocalPoint] + 1); + } + } + } + + nLevels = std::max(nLevels, levels[localPoint] + 1); + } + + levelOffsets.resize(nLevels + 1); + for (auto iPoint = 0ul; iPoint < nPointDomain; iPoint++) ++levelOffsets[levels[iPoint] + 1]; + + for (auto iLevel = 2ul; iLevel <= nLevels; ++iLevel) { + levelOffsets[iLevel] += levelOffsets[iLevel - 1]; + } + + for(auto elem = levelOffsets.begin(); elem != (levelOffsets.end() - 1); elem++) maxLevelWidth = std::max(*(elem+1) - *elem, maxLevelWidth); + + Reorder(pointList, inversePointList, levelOffsets); + } +}; + From 6f51f456a2e6904c3cad0d9bc7e8ff1d11a9978b Mon Sep 17 00:00:00 2001 From: Areen Raj Date: Wed, 25 Jun 2025 19:16:28 +0530 Subject: [PATCH 03/15] Major commit: graph partitioning algorithms, level scheduling method, GPU Preconditioner framework --- Common/include/CConfig.hpp | 7 + Common/include/geometry/CGeometry.hpp | 6 + Common/include/geometry/CPhysicalGeometry.hpp | 11 ++ .../linear_algebra/CMatrixVectorProduct.hpp | 11 +- .../linear_algebra/CPreconditioner.hpp | 12 +- Common/include/linear_algebra/CSysMatrix.hpp | 5 +- Common/include/linear_algebra/GPUComms.cuh | 46 ++++- Common/include/option_structure.hpp | 10 + Common/src/CConfig.cpp | 3 + Common/src/geometry/CPhysicalGeometry.cpp | 20 ++ Common/src/linear_algebra/CSysMatrix.cpp | 8 +- Common/src/linear_algebra/CSysMatrixGPU.cu | 173 +++++++++--------- Common/src/linear_algebra/CSysVectorGPU.cu | 2 +- 13 files changed, 210 insertions(+), 104 deletions(-) diff --git a/Common/include/CConfig.hpp b/Common/include/CConfig.hpp index 1d98b92534e7..721627ebdf20 100644 --- a/Common/include/CConfig.hpp +++ b/Common/include/CConfig.hpp @@ -521,6 +521,7 @@ class CConfig { Kind_Gradient_Method_Recon, /*!< \brief Numerical method for computation of spatial gradients used for upwind reconstruction. 
*/ Kind_Deform_Linear_Solver, /*!< Numerical method to deform the grid */ Kind_Deform_Linear_Solver_Prec, /*!< \brief Preconditioner of the linear solver. */ + Kind_Graph_Part_Algo, /*!< \brief Algorithm for parallel partitioning of the matrix graph. */ Kind_Linear_Solver, /*!< \brief Numerical solver for the implicit scheme. */ Kind_Linear_Solver_Prec, /*!< \brief Preconditioner of the linear solver. */ Kind_DiscAdj_Linear_Solver, /*!< \brief Linear solver for the discrete adjoint system. */ @@ -4136,6 +4137,12 @@ class CConfig { */ bool GetLeastSquaresRequired(void) const { return LeastSquaresRequired; } + /*! + * \brief Get the type of algorithm used for partitioning the matrix graph. + * \return Algorithm that divides the matrix into partitions that are executed parallely. + */ + unsigned short GetKind_Graph_Part_Algo(void) const { return Kind_Graph_Part_Algo; } + /*! * \brief Get the kind of solver for the implicit solver. * \return Numerical solver for implicit formulation (solving the linear system). diff --git a/Common/include/geometry/CGeometry.hpp b/Common/include/geometry/CGeometry.hpp index a345f72c046f..f89ac7b910a3 100644 --- a/Common/include/geometry/CGeometry.hpp +++ b/Common/include/geometry/CGeometry.hpp @@ -260,6 +260,12 @@ class CGeometry { unsigned long* nPointCumulative{nullptr}; /*!< \brief Cumulative storage array containing the total number of points on all prior ranks in the linear partitioning. */ + unsigned long nPartition; /*!< \brief Number of divisions of the matrix graph during execution of parallel + partitioning algorithms. */ + unsigned long maxPartitionSize; /*!< \brief Size of the level with the maximum number of elements. */ + vector + partitionOffsets; /*!< \brief Vector array containing the indices at which different parallel partitions begin. */ + /*--- Data structures for point-to-point MPI communications. ---*/ int maxCountPerPoint{0}; /*!< \brief Maximum number of pieces of data sent per vertex in point-to-point comms. */ diff --git a/Common/include/geometry/CPhysicalGeometry.hpp b/Common/include/geometry/CPhysicalGeometry.hpp index 1148471f1c3e..70fe1f7d4c4c 100644 --- a/Common/include/geometry/CPhysicalGeometry.hpp +++ b/Common/include/geometry/CPhysicalGeometry.hpp @@ -148,6 +148,17 @@ class CPhysicalGeometry final : public CGeometry { */ void DistributeColoring(const CConfig* config, CGeometry* geometry); + /*! + * \brief Divide the graph produced by the matrix into parallel partitions. + * \param[in] config - Definition of the particular problem. + * \param[in] pointList - Ordered list of points in the mesh. + * \param[in] numPartitions - Returns the number of parallel partitions created by the algorithm. + * \param[in] indexOffsets - Vector array that represents the starting index of each partition in the reordered point + * list. + */ + template + void PartitionGraph(const CConfig* config, vector& pointList); + /*! * \brief Distribute the grid points, including ghost points, across all ranks based on a ParMETIS coloring. * \param[in] config - Definition of the particular problem. 
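[Editor's note on the level-scheduling construction used by PartitionGraph above.] The CLevelScheduling::Partition routine builds wavefront levels: a row's level is one more than the largest level among the already-ordered neighbours it depends on, and rows that share a level have no mutual dependencies, so they can be processed in parallel. A minimal standalone sketch of the same construction on a toy lower-triangular pattern follows; the names (deps, buildLevels-style locals) are illustrative only and not part of the SU2 API.

// Standalone sketch (not SU2 code): level scheduling for a lower-triangular
// dependency pattern. Row i depends on every listed column j < i, so
//   level[i] = 1 + max(level[j]) over those j, and level[i] = 0 if none exist.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // Toy 6-row pattern: row -> columns j < row that it depends on.
  const std::vector<std::vector<int>> deps = {{}, {0}, {0}, {1, 2}, {2}, {3, 4}};
  const int n = static_cast<int>(deps.size());

  std::vector<int> level(n, 0);
  int nLevels = 0;
  for (int i = 0; i < n; ++i) {
    for (int j : deps[i]) level[i] = std::max(level[i], level[j] + 1);
    nLevels = std::max(nLevels, level[i] + 1);
  }

  // Counting sort of rows by level: same idea as the levelOffsets prefix sum above.
  std::vector<int> offsets(nLevels + 1, 0);
  for (int i = 0; i < n; ++i) ++offsets[level[i] + 1];
  for (int l = 1; l <= nLevels; ++l) offsets[l] += offsets[l - 1];

  std::vector<int> order(n);
  std::vector<int> next = offsets;
  for (int i = 0; i < n; ++i) order[next[level[i]]++] = i;

  for (int l = 0; l < nLevels; ++l) {
    std::printf("level %d:", l);
    for (int k = offsets[l]; k < offsets[l + 1]; ++k) std::printf(" %d", order[k]);
    std::printf("\n");
  }
}

For this toy pattern the rows come out grouped as {0}, {1,2}, {3,4}, {5}; those groups are the partitions that the GPU preconditioner kernels later sweep one at a time, synchronizing between levels.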
diff --git a/Common/include/linear_algebra/CMatrixVectorProduct.hpp b/Common/include/linear_algebra/CMatrixVectorProduct.hpp index ca92673db7f1..d2a7b79f2727 100644 --- a/Common/include/linear_algebra/CMatrixVectorProduct.hpp +++ b/Common/include/linear_algebra/CMatrixVectorProduct.hpp @@ -101,14 +101,17 @@ class CSysMatrixVectorProduct final : public CMatrixVectorProduct { * \param[out] v - CSysVector that is the result of the product */ inline void operator()(const CSysVector& u, CSysVector& v) const override { -#ifdef HAVE_CUDA if (config->GetCUDA()) { +#ifdef HAVE_CUDA matrix.GPUMatrixVectorProduct(u, v, geometry, config); +#else + SU2_MPI::Error( + "\nError in launching Matrix-Vector Product Function\nENABLE_CUDA is set to YES\nPlease compile with CUDA " + "options enabled in Meson to access GPU Functions", + CURRENT_FUNCTION); +#endif } else { matrix.MatrixVectorProduct(u, v, geometry, config); } -#else - matrix.MatrixVectorProduct(u, v, geometry, config); -#endif } }; diff --git a/Common/include/linear_algebra/CPreconditioner.hpp b/Common/include/linear_algebra/CPreconditioner.hpp index b2399a1384f1..7f5e601e323a 100644 --- a/Common/include/linear_algebra/CPreconditioner.hpp +++ b/Common/include/linear_algebra/CPreconditioner.hpp @@ -205,17 +205,15 @@ class CLU_SGSPreconditioner final : public CPreconditioner { * \param[out] v - CSysVector that is the result of the preconditioning. */ inline void operator()(const CSysVector& u, CSysVector& v) const override { - #ifdef HAVE_CUDA - if(config->GetCUDA()) - { +#ifdef HAVE_CUDA + if (config->GetCUDA()) { sparse_matrix.GPUComputeLU_SGSPreconditioner(u, v, geometry, config); - } - else { + } else { sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config); } - #else +#else sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config); - #endif +#endif } }; diff --git a/Common/include/linear_algebra/CSysMatrix.hpp b/Common/include/linear_algebra/CSysMatrix.hpp index 64dec51ebfa0..1fe9be7b9120 100644 --- a/Common/include/linear_algebra/CSysMatrix.hpp +++ b/Common/include/linear_algebra/CSysMatrix.hpp @@ -149,6 +149,7 @@ class CSysMatrix { const unsigned long* d_row_ptr; /*!< \brief Device Pointers to the first element in each row. */ const unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */ const unsigned long* d_dia_ptr; /*!< \brief Device Column index for each of the elements in val(). */ + unsigned long* d_partition_offsets; ScalarType* ILU_matrix; /*!< \brief Entries of the ILU sparse matrix. */ unsigned long nnz_ilu; /*!< \brief Number of possible nonzero entries in the matrix (ILU). */ @@ -865,8 +866,8 @@ class CSysMatrix { * \param[in] vec - CSysVector to be multiplied by the preconditioner. * \param[out] prod - Result of the product A*vec. */ - void GPUComputeLU_SGSPreconditioner(const CSysVector& vec, CSysVector& prod, CGeometry* geometry, - const CConfig* config) const; + void GPUComputeLU_SGSPreconditioner(const CSysVector& vec, CSysVector& prod, + CGeometry* geometry, const CConfig* config) const; /*! * \brief Build the Jacobi preconditioner. 
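[Editor's note on the kernel launch arithmetic.] GPUMatrixVectorProduct, and the preconditioner kernels that follow, map one 32-thread warp to each block row and size the grid with the ceil-division helper added to GPUComms.cuh below. The sketch here shows that mapping for plain scalar CSR; it is a simplified illustration with hypothetical names (csr_mvp_warp_per_row, ceil_div), not the SU2 kernel itself.

// Simplified sketch (not the SU2 kernel): one warp per matrix row, scalar entries.
// The grid needs ceil(nRows * WARP_SIZE / BLOCK_SIZE) thread blocks, which is the
// same rounding performed by rounded_up_division (note the divisor-first argument order).
#include <cuda_runtime.h>

constexpr int BLOCK_SIZE = 256;
constexpr int WARP_SIZE = 32;

constexpr int ceil_div(int divisor, int dividend) { return (dividend + divisor - 1) / divisor; }

__global__ void csr_mvp_warp_per_row(const double* val, const int* row_ptr, const int* col_ind,
                                     const double* x, double* y, int nRows) {
  const int row = (blockIdx.x * blockDim.x + threadIdx.x) / WARP_SIZE;
  const int lane = threadIdx.x % WARP_SIZE;
  if (row >= nRows) return;  // whole warp shares one row, so all lanes exit together

  // Each lane strides over the nonzeros of its row, then the warp reduces the partial sums.
  double sum = 0.0;
  for (int k = row_ptr[row] + lane; k < row_ptr[row + 1]; k += WARP_SIZE)
    sum += val[k] * x[col_ind[k]];
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)
    sum += __shfl_down_sync(0xffffffff, sum, offset);
  if (lane == 0) y[row] = sum;
}

void launch(const double* val, const int* row_ptr, const int* col_ind,
            const double* x, double* y, int nRows) {
  dim3 blockDim(BLOCK_SIZE, 1, 1);
  dim3 gridDim(ceil_div(BLOCK_SIZE, nRows * WARP_SIZE), 1, 1);
  csr_mvp_warp_per_row<<<gridDim, blockDim>>>(val, row_ptr, col_ind, x, y, nRows);
}

The SU2 kernels follow the same row-per-warp layout but additionally spread the warp's lanes over the nVar x nEqn entries of each dense sub-block, which is what the activeThreads and blockRow bookkeeping in the hunks below handles.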
diff --git a/Common/include/linear_algebra/GPUComms.cuh b/Common/include/linear_algebra/GPUComms.cuh index dba0a280e420..c4cf61cd9c35 100644 --- a/Common/include/linear_algebra/GPUComms.cuh +++ b/Common/include/linear_algebra/GPUComms.cuh @@ -28,17 +28,51 @@ #include #include"iostream" -namespace KernelParameters{ +namespace kernelParameters{ /*Returns the rounded up value of the decimal quotient to the next integer (in all cases)*/ - inline constexpr int rounded_up_division(const int divisor, int dividend) { return ((dividend + divisor - 1) / divisor); } + inline constexpr int rounded_up_division(const int divisor, int dividend) { return ((dividend + divisor - 1) / divisor); } /*Returns the rounded down value of the decimal quotient to the previous integer (in all cases)*/ - inline constexpr int rounded_down_division(const int divisor, int dividend) { return ((dividend - divisor + 1) / divisor); } + inline constexpr int rounded_down_division(const int divisor, int dividend) { return ((dividend - divisor + 1) / divisor); } + + const unsigned int MVP_BLOCK_SIZE = 1024; + const unsigned int MVP_WARP_SIZE = 32; + +}; + +struct matrixParameters{ + + public: + unsigned long totalRows; + unsigned long blockRowSize; + unsigned long blockColSize; + unsigned long nPartition; + unsigned long blockSize; + + matrixParameters(unsigned long nPointDomain, unsigned long nEqn, unsigned long nVar, unsigned long nPartitions) + { + totalRows = nPointDomain; + blockRowSize = nEqn; + blockColSize = nVar; + nPartition = nPartitions; + blockSize = nVar * nEqn; + } +}; +struct precondParameters{ + + public: + dim3 gaussElimBlockDim; + dim3 gaussElimGridDim; + + precondParameters(matrixParameters matrixParam) + { + unsigned int geBlockx = matrixParam.blockSize; + gaussElimBlockDim = {geBlockx, 1, 1}; + gaussElimGridDim = {1,1,1}; + } +}; - const int MVP_BLOCK_SIZE = 1024; - const int MVP_WARP_SIZE = 32; -} /*! * \brief assert style function that reads return codes after intercepting CUDA API calls. * It returns the result code and its location if the call is unsuccessful. diff --git a/Common/include/option_structure.hpp b/Common/include/option_structure.hpp index 223a53da742b..ad0dbdc24090 100644 --- a/Common/include/option_structure.hpp +++ b/Common/include/option_structure.hpp @@ -2345,6 +2345,16 @@ static const MapType Blending_Map = { MakePair("BEZIER", BEZIER) }; +/*! + * \brief Types of graph partitioning algorithms for parallel computing + */ +enum ENUM_GRAPH_PART_ALGORITHM { + LEVEL_SCHEDULING, /*!< \brief Partitions the graphs according to level-set algorithm. */ +}; +static const MapType Graph_Part_Map = { + MakePair("LEVEL_SCHEDULING", LEVEL_SCHEDULING) +}; + /*! 
* \brief Types of solvers for solving linear systems */ diff --git a/Common/src/CConfig.cpp b/Common/src/CConfig.cpp index 9222ed7f8928..0ddef7dfd249 100644 --- a/Common/src/CConfig.cpp +++ b/Common/src/CConfig.cpp @@ -1849,6 +1849,9 @@ void CConfig::SetConfig_Options() { /*!\par CONFIG_CATEGORY: Linear solver definition \ingroup Config*/ /*--- Options related to the linear solvers ---*/ + /*!\brief GRAPH_PARTIONING + * \n DESCRIPTION: Algorithm for partioning the matrix graph to facilitate parallel execution of inear algebra subroutines\n OPTIONS: see \link Graph_Part_Map \endlink \n DEFAULT: LEVEL_SCHEDULING \ingroup Config*/ + addEnumOption("GRAPH_PART_ALGORITHM", Kind_Graph_Part_Algo, Graph_Part_Map, LEVEL_SCHEDULING); /*!\brief LINEAR_SOLVER * \n DESCRIPTION: Linear solver for the implicit, mesh deformation, or discrete adjoint systems \n OPTIONS: see \link Linear_Solver_Map \endlink \n DEFAULT: FGMRES \ingroup Config*/ addEnumOption("LINEAR_SOLVER", Kind_Linear_Solver, Linear_Solver_Map, FGMRES); diff --git a/Common/src/geometry/CPhysicalGeometry.cpp b/Common/src/geometry/CPhysicalGeometry.cpp index c97d2453541e..ff3f2dbfaf01 100644 --- a/Common/src/geometry/CPhysicalGeometry.cpp +++ b/Common/src/geometry/CPhysicalGeometry.cpp @@ -26,6 +26,7 @@ */ #include "../../include/geometry/CPhysicalGeometry.hpp" +#include "../../include/linear_algebra/CGraphPartitioning.hpp" #include "../../include/adt/CADTPointsOnlyClass.hpp" #include "../../include/toolboxes/printing_toolbox.hpp" #include "../../include/toolboxes/CLinearPartitioner.hpp" @@ -49,6 +50,8 @@ #include "../../include/geometry/primal_grid/CPyramid.hpp" #include "../../include/geometry/primal_grid/CPrism.hpp" #include "../../include/geometry/primal_grid/CVertexMPI.hpp" +#include "boost/integer_fwd.hpp" +#include "cgnslib.h" #include #include @@ -699,6 +702,21 @@ void CPhysicalGeometry::DistributeColoring(const CConfig* config, CGeometry* geo delete[] nPoint_Flag; } +template +void CPhysicalGeometry::PartitionGraph(const CConfig* config, vector& pointList) { + unsigned short KindAlgorithm = config->GetKind_Graph_Part_Algo(); + partitionOffsets.reserve(nPointDomain); + + switch (KindAlgorithm) { + case LEVEL_SCHEDULING: + auto levelSchedule = CLevelScheduling(nPointDomain, nodes); + levelSchedule.Partition(pointList, partitionOffsets); + nPartition = levelSchedule.nLevels; + maxPartitionSize = levelSchedule.maxLevelWidth; + break; + } +} + void CPhysicalGeometry::DistributeVolumeConnectivity(const CConfig* config, CGeometry* geometry, unsigned short Elem_Type) { unsigned short NODES_PER_ELEMENT = 0; @@ -4542,6 +4560,8 @@ void CPhysicalGeometry::SetRCM_Ordering(CConfig* config) { if (!status) SU2_MPI::Error("RCM ordering failed", CURRENT_FUNCTION); } + if (config->GetCUDA()) PartitionGraph(config, Result); + /*--- Add the MPI points ---*/ for (auto iPoint = nPointDomain; iPoint < nPoint; iPoint++) { Result.push_back(iPoint); diff --git a/Common/src/linear_algebra/CSysMatrix.cpp b/Common/src/linear_algebra/CSysMatrix.cpp index 86dee081da83..5e2fd56a33e2 100644 --- a/Common/src/linear_algebra/CSysMatrix.cpp +++ b/Common/src/linear_algebra/CSysMatrix.cpp @@ -71,6 +71,7 @@ CSysMatrix::~CSysMatrix() { GPUMemoryAllocation::gpu_free(d_matrix); GPUMemoryAllocation::gpu_free(d_row_ptr); GPUMemoryAllocation::gpu_free(d_col_ind); + GPUMemoryAllocation::gpu_free(d_partition_offsets); #ifdef USE_MKL mkl_jit_destroy(MatrixMatrixProductJitter); @@ -150,11 +151,16 @@ void CSysMatrix::Initialize(unsigned long npoint, unsigned long npoi ptr = 
GPUMemoryAllocation::gpu_alloc_cpy(src_ptr, num * sizeof(const unsigned long)); }; + auto GPUVectorAllocAndCopy = [](unsigned long*& ptr, vector& src_ptr, unsigned long num) { + ptr = GPUMemoryAllocation::gpu_alloc_cpy(&src_ptr[0], num * sizeof(unsigned long)); + }; + GPUAllocAndInit(d_matrix, nnz * nVar * nEqn); GPUAllocAndCopy(d_row_ptr, row_ptr, (nPointDomain + 1.0)); GPUAllocAndCopy(d_col_ind, col_ind, nnz); GPUAllocAndCopy(d_dia_ptr, dia_ptr, nPointDomain); - + GPUVectorAllocAndCopy(d_partition_offsets, geometry->partitionOffsets, geometry->nPartition); + if (needTranspPtr) col_ptr = geometry->GetTransposeSparsePatternMap(type).data(); if (type == ConnectivityType::FiniteVolume) { diff --git a/Common/src/linear_algebra/CSysMatrixGPU.cu b/Common/src/linear_algebra/CSysMatrixGPU.cu index edc1e1b5b561..640b77d6f71a 100644 --- a/Common/src/linear_algebra/CSysMatrixGPU.cu +++ b/Common/src/linear_algebra/CSysMatrixGPU.cu @@ -2,7 +2,7 @@ * \file CSysMatrixGPU.cu * \brief Implementations of Kernels and Functions for Matrix Operations on the GPU * \author A. Raj - * \version 8.1.0 "Harrier" + * \version 8.2.0 "Harrier" * * SU2 Project Website: https://su2code.github.io * @@ -26,105 +26,113 @@ */ #include "../../include/linear_algebra/CSysMatrix.hpp" +#include "../../include/geometry/CGeometry.hpp" #include "../../include/linear_algebra/GPUComms.cuh" +using namespace kernelParameters; + template -__global__ void GaussEliminationKernel(matrixType* matrix, vectorType* prod, const unsigned long* d_dia_ptr, unsigned long nPointDomain, unsigned long nVar, unsigned long nEqn, unsigned long size) +__global__ void GaussEliminationKernel(matrixType* matrix, vectorType* prod, const unsigned long* d_dia_ptr, matrixParameters matrixParam) { - #define A(I, J) matrixCopy[localRowNo * size + (I)*nVar + (J)] - #define validRow row -__global__ void FirstSymmetricIterationKernel(matrixType* matrix, vectorType* vec, vectorType* prod, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, const unsigned long* d_dia_ptr, unsigned long nPointDomain, unsigned long nVar, unsigned long nEqn) +__global__ void FirstSymmetricIterationKernel(matrixType* matrix, vectorType* vec, vectorType* prod, unsigned long* d_partition_offsets, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, const unsigned long* d_dia_ptr, matrixParameters matrixParam, precondParameters precondParam) { - int row = (blockIdx.x * blockDim.x + threadIdx.x)/KernelParameters::MVP_WARP_SIZE; - int threadNo = threadIdx.x%KernelParameters::MVP_WARP_SIZE; - int activeThreads = nEqn * (KernelParameters::MVP_WARP_SIZE/nEqn); + for(auto iPartition = 1ul; iPartition < matrixParam.nPartition - 1; iPartition++) + { + #define matrixIndex(row, threadNo) d_row_ptr[row] * matrixParam.blockSize + threadNo + #define vectorIndex(row, elemNo) row * matrixParam.blockColSize + elemNo - int blockRow = (threadNo/nEqn)%nVar; + int row = (blockIdx.x * blockDim.x + threadIdx.x)/MVP_WARP_SIZE + d_partition_offsets[iPartition-1]; + bool rowInLevel = (row < d_partition_offsets[iPartition]); - vectorType res = 0.0; + int threadNo = threadIdx.x % MVP_WARP_SIZE; + int activeThreads = matrixParam.blockColSize * (MVP_WARP_SIZE/matrixParam.blockRowSize); - if(row>>(&matrix[d_dia_ptr[row]], &prod[vectorIndex(row, 0)], d_dia_ptr, matrixParam); + + __syncthreads(); + + #undef matrixIndex + #undef vectorIndex + } } + template __global__ void SecondSymmetricIterationKernel(matrixType* matrix, vectorType* prod, const unsigned long* d_row_ptr, const unsigned long* 
d_col_ind, const unsigned long* d_dia_ptr, unsigned long nPointDomain, unsigned long nVar, unsigned long nEqn) { - int row = (blockIdx.x * blockDim.x + threadIdx.x)/KernelParameters::MVP_WARP_SIZE; - int threadNo = threadIdx.x%KernelParameters::MVP_WARP_SIZE; - int activeThreads = nEqn * (KernelParameters::MVP_WARP_SIZE/nEqn); + int row = (blockIdx.x * blockDim.x + threadIdx.x)/MVP_WARP_SIZE; + int threadNo = threadIdx.x%MVP_WARP_SIZE; + int activeThreads = nEqn * (MVP_WARP_SIZE/nEqn); int blockRow = (threadNo/nEqn)%nVar; vectorType res = 0.0; - if(row==nPointDomain-3) - { - int hi = 1; - } - if(row -__global__ void MatrixVectorProductKernel(matrixType* matrix, vectorType* vec, vectorType* prod, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, unsigned long nPointDomain, unsigned long nVar, unsigned long nEqn) +__global__ void MatrixVectorProductKernel(matrixType* matrix, vectorType* vec, vectorType* prod, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, matrixParameters matrixParam) { - int row = (blockIdx.x * blockDim.x + threadIdx.x)/KernelParameters::MVP_WARP_SIZE; - int threadNo = threadIdx.x%KernelParameters::MVP_WARP_SIZE; - int activeThreads = nEqn * (KernelParameters::MVP_WARP_SIZE/nEqn); + #define matrixIndex(row, threadNo) d_row_ptr[row] * matrixParam.blockSize + threadNo + #define vectorIndex(row, elemNo) row * matrixParam.blockColSize + elemNo - int blockRow = (threadNo/nEqn)%nVar; // The remainder step is necessary as the blockRow sequnce should reset when we move to the next block + int row = (blockIdx.x * blockDim.x + threadIdx.x)/MVP_WARP_SIZE; + int threadNo = threadIdx.x % MVP_WARP_SIZE; + int activeThreads = matrixParam.blockRowSize * (MVP_WARP_SIZE/matrixParam.blockRowSize); - if(row @@ -199,11 +213,13 @@ void CSysMatrix::GPUMatrixVectorProduct(const CSysVector vec.HtDTransfer(); prod.GPUSetVal(0.0); - dim3 blockDim(KernelParameters::MVP_BLOCK_SIZE,1,1); - int gridx = KernelParameters::rounded_up_division(KernelParameters::MVP_BLOCK_SIZE, nPointDomain * KernelParameters::MVP_WARP_SIZE); + matrixParameters matrixParam(nPointDomain, nEqn, nVar, geometry->nPartition); + + dim3 blockDim(MVP_BLOCK_SIZE,1,1); + int gridx = rounded_up_division(MVP_BLOCK_SIZE, matrixParam.totalRows * MVP_WARP_SIZE); dim3 gridDim(gridx, 1, 1); - MatrixVectorProductKernel<<>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, nPointDomain, nVar, nEqn); + MatrixVectorProductKernel<<>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, matrixParam); gpuErrChk( cudaPeekAtLastError() ); prod.DtHTransfer(); @@ -221,26 +237,17 @@ void CSysMatrix::GPUComputeLU_SGSPreconditioner(const CSysVectornPartition); + precondParameters precondParam(matrixParam); - dim3 blockDim(KernelParameters::MVP_BLOCK_SIZE,1,1); - int gridx = KernelParameters::rounded_up_division(KernelParameters::MVP_BLOCK_SIZE, nPointDomain * KernelParameters::MVP_WARP_SIZE); + dim3 blockDim(MVP_BLOCK_SIZE,1,1); + int gridx = rounded_up_division(MVP_BLOCK_SIZE, geometry->maxPartitionSize * MVP_WARP_SIZE); dim3 gridDim(gridx, 1, 1); - int geBlockx = size * (KernelParameters::MVP_BLOCK_SIZE/size); - dim3 gaussElimBlockDim(geBlockx, 1, 1); - int geGridx = KernelParameters::rounded_up_division(geBlockx, size * nPointDomain); - dim3 gaussElimGridDim(geGridx,1,1); - - FirstSymmetricIterationKernel<<>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, d_dia_ptr, nPointDomain, nVar, nEqn); - gpuErrChk( cudaPeekAtLastError() ); - GaussEliminationKernel<<>>(d_matrix, d_prod, d_dia_ptr, nPointDomain, nVar, nEqn, size); + 
FirstSymmetricIterationKernel<<maxPartitionSize * MVP_WARP_SIZE * sizeof(ScalarType)>>>(d_matrix, d_vec, d_prod, d_partition_offsets, d_row_ptr, d_col_ind, d_dia_ptr, matrixParam, precondParam); gpuErrChk( cudaPeekAtLastError() ); - SecondSymmetricIterationKernel<<>>(d_matrix, d_prod, d_row_ptr, d_col_ind, d_dia_ptr, nPointDomain, nVar, nEqn); + SecondSymmetricIterationKernel<<>>(d_matrix, d_prod, d_row_ptr, d_col_ind, d_dia_ptr, nPointDomain, nVar, nEqn); gpuErrChk( cudaPeekAtLastError() ); - GaussEliminationKernel<<>>(d_matrix, d_prod, d_dia_ptr, nPointDomain, nVar, nEqn, size); - gpuErrChk( cudaPeekAtLastError() ); - prod.DtHTransfer(); } diff --git a/Common/src/linear_algebra/CSysVectorGPU.cu b/Common/src/linear_algebra/CSysVectorGPU.cu index 84945b3f1c06..c07f84c3544f 100644 --- a/Common/src/linear_algebra/CSysVectorGPU.cu +++ b/Common/src/linear_algebra/CSysVectorGPU.cu @@ -2,7 +2,7 @@ * \file CSysVectorGPU.cu * \brief Implementations of Kernels and Functions for Vector Operations on the GPU * \author A. Raj - * \version 8.1.0 "Harrier" + * \version 8.2.0 "Harrier" * * SU2 Project Website: https://su2code.github.io * From 2495edd376ff286e623e6322f73ac6847f6df002 Mon Sep 17 00:00:00 2001 From: Areen Raj Date: Wed, 25 Jun 2025 19:37:00 +0530 Subject: [PATCH 04/15] resolving some more conflicts --- Common/include/linear_algebra/CSysMatrix.hpp | 3 --- Common/include/linear_algebra/GPUComms.cuh | 4 ++-- Common/src/linear_algebra/CSysMatrix.cpp | 21 ++++++++------------ 3 files changed, 10 insertions(+), 18 deletions(-) diff --git a/Common/include/linear_algebra/CSysMatrix.hpp b/Common/include/linear_algebra/CSysMatrix.hpp index 6afbfc47d1b8..b39231d59838 100644 --- a/Common/include/linear_algebra/CSysMatrix.hpp +++ b/Common/include/linear_algebra/CSysMatrix.hpp @@ -148,13 +148,10 @@ class CSysMatrix { ScalarType* d_matrix; /*!< \brief Device Pointer to store the matrix values on the GPU. */ const unsigned long* d_row_ptr; /*!< \brief Device Pointers to the first element in each row. */ const unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */ -<<<<<<< HEAD const unsigned long* d_dia_ptr; /*!< \brief Device Column index for each of the elements in val(). */ unsigned long* d_partition_offsets; -======= bool useCuda; /*!< \brief Boolean that indicates whether user has enabled CUDA or not. Mainly used to conditionally free GPU memory in the class destructor. */ ->>>>>>> upstream/develop ScalarType* ILU_matrix; /*!< \brief Entries of the ILU sparse matrix. */ unsigned long nnz_ilu; /*!< \brief Number of possible nonzero entries in the matrix (ILU). 
*/ diff --git a/Common/include/linear_algebra/GPUComms.cuh b/Common/include/linear_algebra/GPUComms.cuh index 808772e52a8e..4042934db241 100644 --- a/Common/include/linear_algebra/GPUComms.cuh +++ b/Common/include/linear_algebra/GPUComms.cuh @@ -36,8 +36,8 @@ namespace kernelParameters{ /*Returns the rounded down value of the decimal quotient to the previous integer (in all cases)*/ inline constexpr int rounded_down_division(const int divisor, int dividend) { return ((dividend - divisor + 1) / divisor); } - const unsigned int MVP_BLOCK_SIZE = 1024; - const unsigned int MVP_WARP_SIZE = 32; + static constexpr int MVP_BLOCK_SIZE = 1024; + static constexpr int MVP_WARP_SIZE = 32; }; diff --git a/Common/src/linear_algebra/CSysMatrix.cpp b/Common/src/linear_algebra/CSysMatrix.cpp index 2cc1ace58248..8bbb4efabc91 100644 --- a/Common/src/linear_algebra/CSysMatrix.cpp +++ b/Common/src/linear_algebra/CSysMatrix.cpp @@ -159,26 +159,21 @@ void CSysMatrix::Initialize(unsigned long npoint, unsigned long npoi ptr = GPUMemoryAllocation::gpu_alloc(num * sizeof(ScalarType)); }; -<<<<<<< HEAD - auto GPUVectorAllocAndCopy = [](unsigned long*& ptr, vector& src_ptr, unsigned long num) { - ptr = GPUMemoryAllocation::gpu_alloc_cpy(&src_ptr[0], num * sizeof(unsigned long)); - }; - - GPUAllocAndInit(d_matrix, nnz * nVar * nEqn); - GPUAllocAndCopy(d_row_ptr, row_ptr, (nPointDomain + 1.0)); - GPUAllocAndCopy(d_col_ind, col_ind, nnz); - GPUAllocAndCopy(d_dia_ptr, dia_ptr, nPointDomain); - GPUVectorAllocAndCopy(d_partition_offsets, geometry->partitionOffsets, geometry->nPartition); -======= - auto GPUAllocAndCopy = [](const unsigned long*& ptr, const unsigned long*& src_ptr, unsigned long num) { + auto GPUAllocAndCopy = [](const unsigned long*& ptr, const unsigned long*& src_ptr, unsigned long num) { ptr = GPUMemoryAllocation::gpu_alloc_cpy(src_ptr, num * sizeof(const unsigned long)); }; + auto GPUVectorAllocAndCopy = [](unsigned long*& ptr, vector& src_ptr, unsigned long num) { + ptr = GPUMemoryAllocation::gpu_alloc_cpy(&src_ptr[0], num * sizeof(unsigned long)); + }; + GPUAllocAndInit(d_matrix, nnz * nVar * nEqn); GPUAllocAndCopy(d_row_ptr, row_ptr, (nPointDomain + 1.0)); GPUAllocAndCopy(d_col_ind, col_ind, nnz); + GPUAllocAndCopy(d_dia_ptr, dia_ptr, nPointDomain); + GPUVectorAllocAndCopy(d_partition_offsets, geometry->partitionOffsets, geometry->nPartition); + } ->>>>>>> upstream/develop if (needTranspPtr) col_ptr = geometry->GetTransposeSparsePatternMap(type).data(); From f68358e4545d43def9e1f9ebb4338cb0665b1854 Mon Sep 17 00:00:00 2001 From: Areen Raj Date: Wed, 25 Jun 2025 19:41:46 +0530 Subject: [PATCH 05/15] cleaning up --- Common/include/linear_algebra/CSysMatrix.hpp | 3 --- Common/src/geometry/CPhysicalGeometry.cpp | 2 -- 2 files changed, 5 deletions(-) diff --git a/Common/include/linear_algebra/CSysMatrix.hpp b/Common/include/linear_algebra/CSysMatrix.hpp index b39231d59838..43cd3d8bb239 100644 --- a/Common/include/linear_algebra/CSysMatrix.hpp +++ b/Common/include/linear_algebra/CSysMatrix.hpp @@ -863,8 +863,6 @@ class CSysMatrix { const CConfig* config) const; /*! -<<<<<<< HEAD -======= * \brief Performs first step of the LU_SGS Preconditioner building * \param[in] vec - CSysVector to be multiplied by the sparse matrix A. * \param[in] geometry - Geometrical definition of the problem. @@ -890,7 +888,6 @@ class CSysMatrix { void GPUGaussElimination(ScalarType& prod, CGeometry* geometry, const CConfig* config) const; /*! 
->>>>>>> upstream/develop * \brief Multiply CSysVector by the preconditioner all of which are stored on the device * \param[in] vec - CSysVector to be multiplied by the preconditioner. * \param[out] prod - Result of the product A*vec. diff --git a/Common/src/geometry/CPhysicalGeometry.cpp b/Common/src/geometry/CPhysicalGeometry.cpp index ff3f2dbfaf01..5ce18cc87f90 100644 --- a/Common/src/geometry/CPhysicalGeometry.cpp +++ b/Common/src/geometry/CPhysicalGeometry.cpp @@ -50,8 +50,6 @@ #include "../../include/geometry/primal_grid/CPyramid.hpp" #include "../../include/geometry/primal_grid/CPrism.hpp" #include "../../include/geometry/primal_grid/CVertexMPI.hpp" -#include "boost/integer_fwd.hpp" -#include "cgnslib.h" #include #include From bf561b61bd93e9e0ef61c2e5de8ee46cc717e296 Mon Sep 17 00:00:00 2001 From: Areen Raj Date: Wed, 25 Jun 2025 19:43:59 +0530 Subject: [PATCH 06/15] apologies for repeated commits, just cleaning --- Common/src/linear_algebra/CSysMatrix.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/Common/src/linear_algebra/CSysMatrix.cpp b/Common/src/linear_algebra/CSysMatrix.cpp index 8bbb4efabc91..909b2eb25787 100644 --- a/Common/src/linear_algebra/CSysMatrix.cpp +++ b/Common/src/linear_algebra/CSysMatrix.cpp @@ -68,18 +68,12 @@ CSysMatrix::~CSysMatrix() { MemoryAllocation::aligned_free(matrix); MemoryAllocation::aligned_free(invM); -<<<<<<< HEAD - GPUMemoryAllocation::gpu_free(d_matrix); - GPUMemoryAllocation::gpu_free(d_row_ptr); - GPUMemoryAllocation::gpu_free(d_col_ind); - GPUMemoryAllocation::gpu_free(d_partition_offsets); -======= if (useCuda) { GPUMemoryAllocation::gpu_free(d_matrix); GPUMemoryAllocation::gpu_free(d_row_ptr); GPUMemoryAllocation::gpu_free(d_col_ind); + GPUMemoryAllocation::gpu_free(d_partition_offsets); } ->>>>>>> upstream/develop #ifdef USE_MKL mkl_jit_destroy(MatrixMatrixProductJitter); From 9121e25da7930971584bdeeca845be79412f1beb Mon Sep 17 00:00:00 2001 From: Areen Raj Date: Mon, 30 Jun 2025 00:20:19 +0530 Subject: [PATCH 07/15] coalesced memory access for MVP, shared memory addition and lamda functions for indexing --- Common/include/linear_algebra/GPUComms.cuh | 33 ++- Common/src/linear_algebra/CSysMatrixGPU.cu | 244 ++++++++++++++------- 2 files changed, 175 insertions(+), 102 deletions(-) diff --git a/Common/include/linear_algebra/GPUComms.cuh b/Common/include/linear_algebra/GPUComms.cuh index 4042934db241..ccd44622b365 100644 --- a/Common/include/linear_algebra/GPUComms.cuh +++ b/Common/include/linear_algebra/GPUComms.cuh @@ -36,8 +36,8 @@ namespace kernelParameters{ /*Returns the rounded down value of the decimal quotient to the previous integer (in all cases)*/ inline constexpr int rounded_down_division(const int divisor, int dividend) { return ((dividend - divisor + 1) / divisor); } - static constexpr int MVP_BLOCK_SIZE = 1024; - static constexpr int MVP_WARP_SIZE = 32; + static constexpr short MVP_BLOCK_SIZE = 1024; + static constexpr short MVP_WARP_SIZE = 32; }; @@ -45,32 +45,31 @@ struct matrixParameters{ public: unsigned long totalRows; - unsigned long blockRowSize; - unsigned long blockColSize; + unsigned short blockRowSize; + unsigned short blockColSize; unsigned long nPartition; - unsigned long blockSize; + unsigned short blockSize; + unsigned short rowsPerBlock; + unsigned short activeThreads; - matrixParameters(unsigned long nPointDomain, unsigned long nEqn, unsigned long nVar, unsigned long nPartitions) - { + matrixParameters(unsigned long nPointDomain, unsigned long nEqn, unsigned long nVar, unsigned 
long nPartitions){ totalRows = nPointDomain; blockRowSize = nEqn; blockColSize = nVar; nPartition = nPartitions; blockSize = nVar * nEqn; + rowsPerBlock = kernelParameters::rounded_up_division(kernelParameters::MVP_WARP_SIZE, kernelParameters::MVP_BLOCK_SIZE); + activeThreads = nVar * (kernelParameters::MVP_WARP_SIZE/nVar); } -}; -struct precondParameters{ - public: - dim3 gaussElimBlockDim; - dim3 gaussElimGridDim; + __device__ bool validAccess(unsigned long row, unsigned short threadNo, unsigned short threadLimit){ + return (row -__global__ void GaussEliminationKernel(matrixType* matrix, vectorType* prod, const unsigned long* d_dia_ptr, matrixParameters matrixParam) +__device__ void GaussEliminationKernel(matrixType* matrixCopy, vectorType* prod, unsigned int threadNo, matrixParameters matrixParam) { #define A(I, J) matrixCopy[(I) * matrixParam.blockColSize + (J)] - int x = threadIdx.x/matrixParam.blockRowSize; // A single block is in use so no remainder step necessary like in MVP - int y = threadIdx.x % matrixParam.blockColSize; + int x = threadNo/matrixParam.blockRowSize; + int y = threadNo % matrixParam.blockColSize; matrixType pivot, weight = 0.0; - - extern __shared__ matrixType matrixCopy[]; - - A(x, y) = matrix[x * matrixParam.blockColSize + y]; - + __syncthreads(); - for(int currentRow=0; currentRow< matrixParam.blockRowSize; currentRow++) + for(int currentRow=0; currentRow < matrixParam.blockRowSize; currentRow++) { pivot = A(currentRow, currentRow); @@ -57,8 +53,6 @@ __global__ void GaussEliminationKernel(matrixType* matrix, vectorType* prod, con A(currentRow, y) /= pivot; if(y==0) prod[currentRow] /= pivot; - __syncthreads(); - weight = A(x, currentRow); if(x!=currentRow) @@ -71,128 +65,209 @@ __global__ void GaussEliminationKernel(matrixType* matrix, vectorType* prod, con } #undef A - #undef validRow } template -__global__ void FirstSymmetricIterationKernel(matrixType* matrix, vectorType* vec, vectorType* prod, unsigned long* d_partition_offsets, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, const unsigned long* d_dia_ptr, matrixParameters matrixParam, precondParameters precondParam) +__global__ void FirstSymmetricIterationKernel(matrixType* matrix, vectorType* vec, vectorType* prod, const unsigned long* d_partition_offsets, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, const unsigned long* d_dia_ptr, matrixParameters matrixParam) { - for(auto iPartition = 1ul; iPartition < matrixParam.nPartition - 1; iPartition++) + + auto matrixIndex = [=](unsigned long row, unsigned short threadNo){ + return (d_row_ptr[row] * matrixParam.blockSize + threadNo); + }; + + auto matrixDiagonalIndex = [=](unsigned long row, unsigned short threadNo){ + return (d_row_ptr[row] * matrixParam.blockSize + threadNo); + }; + + auto vectorIndex = [=](unsigned long row, unsigned short elemNo){ + return (row * matrixParam.blockRowSize + elemNo); + }; + + unsigned long row = (blockIdx.x * blockDim.x + threadIdx.x)/MVP_WARP_SIZE; + unsigned short localRow = row % matrixParam.rowsPerBlock; + unsigned short threadNo = threadIdx.x % MVP_WARP_SIZE; + + unsigned short blockCol = threadNo % matrixParam.blockColSize; + + extern __shared__ vectorType tempBuffer[]; + + for(auto iPartition = 1ul; iPartition < matrixParam.nPartition; iPartition++) { - #define matrixIndex(row, threadNo) d_row_ptr[row] * matrixParam.blockSize + threadNo - #define vectorIndex(row, elemNo) row * matrixParam.blockColSize + elemNo - int row = (blockIdx.x * blockDim.x + threadIdx.x)/MVP_WARP_SIZE + 
d_partition_offsets[iPartition-1]; - bool rowInLevel = (row < d_partition_offsets[iPartition]); - - int threadNo = threadIdx.x % MVP_WARP_SIZE; - int activeThreads = matrixParam.blockColSize * (MVP_WARP_SIZE/matrixParam.blockRowSize); + /*We move to the set of rows present in the current partition.*/ + row += d_partition_offsets[iPartition-1]; + + bool rowInPartition = (row < d_partition_offsets[iPartition]); - int blockRow = (threadNo/matrixParam.blockRowSize) % matrixParam.blockColSize; + if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockRowSize)) { - extern __shared__ vectorType lowProd[]; + prod[vectorIndex(row, threadNo)] = 0.0; + tempBuffer[vectorIndex(localRow, threadNo)] = 0.0; + } + + __syncthreads(); - vectorType res = 0.0; + if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.activeThreads)) { - if(rowInLevel && threadNo>>(&matrix[d_dia_ptr[row]], &prod[vectorIndex(row, 0)], d_dia_ptr, matrixParam); + if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockSize)) { + + tempBuffer[vectorIndex(localRow, threadNo)] = matrix[matrixDiagonalIndex(row, threadNo)]; + GaussEliminationKernel(&tempBuffer[vectorIndex(localRow, threadNo)], &prod[vectorIndex(row, threadNo/matrixParam.blockColSize)], threadNo, matrixParam); + } __syncthreads(); - - #undef matrixIndex - #undef vectorIndex + } } template -__global__ void SecondSymmetricIterationKernel(matrixType* matrix, vectorType* prod, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, const unsigned long* d_dia_ptr, unsigned long nPointDomain, unsigned long nVar, unsigned long nEqn) +__global__ void SecondSymmetricIterationKernel(matrixType* matrix, vectorType* prod, const unsigned long* d_partition_offsets, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, const unsigned long* d_dia_ptr, matrixParameters matrixParam) { - int row = (blockIdx.x * blockDim.x + threadIdx.x)/MVP_WARP_SIZE; - int threadNo = threadIdx.x%MVP_WARP_SIZE; - int activeThreads = nEqn * (MVP_WARP_SIZE/nEqn); - int blockRow = (threadNo/nEqn)%nVar; + auto matrixIndex = [=](unsigned long row, unsigned short threadNo){ + return (d_row_ptr[row] * matrixParam.blockSize + threadNo); + }; - vectorType res = 0.0; + auto matrixDiagonalIndex = [=](unsigned long row, unsigned short threadNo){ + return (d_dia_ptr[row] * matrixParam.blockSize + threadNo); + }; - if(row __global__ void MatrixVectorProductKernel(matrixType* matrix, vectorType* vec, vectorType* prod, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, matrixParameters matrixParam) { - #define matrixIndex(row, threadNo) d_row_ptr[row] * matrixParam.blockSize + threadNo - #define vectorIndex(row, elemNo) row * matrixParam.blockColSize + elemNo + auto matrixIndex = [=](unsigned long row, unsigned short threadNo) { + return (d_row_ptr[row] * matrixParam.blockSize + threadNo); + }; - int row = (blockIdx.x * blockDim.x + threadIdx.x)/MVP_WARP_SIZE; - int threadNo = threadIdx.x % MVP_WARP_SIZE; - int activeThreads = matrixParam.blockRowSize * (MVP_WARP_SIZE/matrixParam.blockRowSize); + auto vectorIndex = [=](unsigned long row, unsigned short elemNo) { + return (row * matrixParam.blockRowSize + elemNo); + }; - int blockRow = (threadNo/matrixParam.blockRowSize) % matrixParam.blockColSize; // The remainder step is necessary as the blockRow sequnce should reset when we move to the next block + unsigned long row = (blockIdx.x * blockDim.x + threadIdx.x)/MVP_WARP_SIZE; + unsigned short threadNo = threadIdx.x % MVP_WARP_SIZE; 
+ unsigned short localRow = row % matrixParam.rowsPerBlock; - if(row @@ -216,7 +291,7 @@ void CSysMatrix::GPUMatrixVectorProduct(const CSysVector matrixParameters matrixParam(nPointDomain, nEqn, nVar, geometry->nPartition); dim3 blockDim(MVP_BLOCK_SIZE,1,1); - int gridx = rounded_up_division(MVP_BLOCK_SIZE, matrixParam.totalRows * MVP_WARP_SIZE); + unsigned int gridx = rounded_up_division(MVP_BLOCK_SIZE, matrixParam.totalRows * MVP_WARP_SIZE); dim3 gridDim(gridx, 1, 1); MatrixVectorProductKernel<<>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, matrixParam); @@ -238,15 +313,14 @@ void CSysMatrix::GPUComputeLU_SGSPreconditioner(const CSysVectornPartition); - precondParameters precondParam(matrixParam); - dim3 blockDim(MVP_BLOCK_SIZE,1,1); - int gridx = rounded_up_division(MVP_BLOCK_SIZE, geometry->maxPartitionSize * MVP_WARP_SIZE); + dim3 blockDim(matrixParam.rowsPerBlock * MVP_WARP_SIZE,1,1); + unsigned int gridx = rounded_up_division(matrixParam.rowsPerBlock, geometry->maxPartitionSize); dim3 gridDim(gridx, 1, 1); - FirstSymmetricIterationKernel<<maxPartitionSize * MVP_WARP_SIZE * sizeof(ScalarType)>>>(d_matrix, d_vec, d_prod, d_partition_offsets, d_row_ptr, d_col_ind, d_dia_ptr, matrixParam, precondParam); + FirstSymmetricIterationKernel<<>>(d_matrix, d_vec, d_prod, d_partition_offsets, d_row_ptr, d_col_ind, d_dia_ptr, matrixParam); gpuErrChk( cudaPeekAtLastError() ); - SecondSymmetricIterationKernel<<>>(d_matrix, d_prod, d_row_ptr, d_col_ind, d_dia_ptr, nPointDomain, nVar, nEqn); + SecondSymmetricIterationKernel<<>>(d_matrix, d_prod, d_partition_offsets, d_row_ptr, d_col_ind, d_dia_ptr, matrixParam); gpuErrChk( cudaPeekAtLastError() ); prod.DtHTransfer(); } From 991e29f0b61f5551351617a5ba1d6e9b499cbed3 Mon Sep 17 00:00:00 2001 From: Areen Raj Date: Tue, 1 Jul 2025 16:49:59 +0530 Subject: [PATCH 08/15] bug fixes --- Common/src/linear_algebra/CSysMatrixGPU.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Common/src/linear_algebra/CSysMatrixGPU.cu b/Common/src/linear_algebra/CSysMatrixGPU.cu index d8dd2448db28..45e38eb6bd7a 100644 --- a/Common/src/linear_algebra/CSysMatrixGPU.cu +++ b/Common/src/linear_algebra/CSysMatrixGPU.cu @@ -76,7 +76,7 @@ __global__ void FirstSymmetricIterationKernel(matrixType* matrix, vectorType* ve }; auto matrixDiagonalIndex = [=](unsigned long row, unsigned short threadNo){ - return (d_row_ptr[row] * matrixParam.blockSize + threadNo); + return (d_dia_ptr[row] * matrixParam.blockSize + threadNo); }; auto vectorIndex = [=](unsigned long row, unsigned short elemNo){ From 9bfeff9252e04bbf86920d8d59fcb13676b7a12d Mon Sep 17 00:00:00 2001 From: Areen Raj Date: Thu, 3 Jul 2025 22:43:31 +0530 Subject: [PATCH 09/15] Merge remote-tracking branch 'upstream/master' --- .../linear_algebra/CGraphPartitioning.hpp | 2 +- Common/include/linear_algebra/GPUComms.cuh | 8 +- Common/src/linear_algebra/CSysMatrixGPU.cu | 131 ++++++++++-------- 3 files changed, 77 insertions(+), 64 deletions(-) diff --git a/Common/include/linear_algebra/CGraphPartitioning.hpp b/Common/include/linear_algebra/CGraphPartitioning.hpp index 80d5e30e05c3..71837e22b07e 100644 --- a/Common/include/linear_algebra/CGraphPartitioning.hpp +++ b/Common/include/linear_algebra/CGraphPartitioning.hpp @@ -82,7 +82,7 @@ class CLevelScheduling final : public CGraphPartitioning { CLevelScheduling() = delete; // Removing default constructor - void Reorder(vector& pointList, vector& inversePointList, vector& levelOffsets) + void Reorder(vector& pointList, vector& inversePointList, vector levelOffsets) { 
for (auto localPoint = 0ul; localPoint < nPointDomain; ++localPoint) { const auto globalPoint = pointList[localPoint]; diff --git a/Common/include/linear_algebra/GPUComms.cuh b/Common/include/linear_algebra/GPUComms.cuh index ccd44622b365..dbe90baf07c9 100644 --- a/Common/include/linear_algebra/GPUComms.cuh +++ b/Common/include/linear_algebra/GPUComms.cuh @@ -36,7 +36,7 @@ namespace kernelParameters{ /*Returns the rounded down value of the decimal quotient to the previous integer (in all cases)*/ inline constexpr int rounded_down_division(const int divisor, int dividend) { return ((dividend - divisor + 1) / divisor); } - static constexpr short MVP_BLOCK_SIZE = 1024; + static constexpr short MVP_BLOCK_SIZE = 256; static constexpr short MVP_WARP_SIZE = 32; }; @@ -62,12 +62,16 @@ struct matrixParameters{ activeThreads = nVar * (kernelParameters::MVP_WARP_SIZE/nVar); } + __device__ unsigned short shrdMemIndex(unsigned short localRow, unsigned short threadNo){ + return (localRow * blockSize + threadNo); + } + __device__ bool validAccess(unsigned long row, unsigned short threadNo, unsigned short threadLimit){ return (row -__device__ void GaussEliminationKernel(matrixType* matrixCopy, vectorType* prod, unsigned int threadNo, matrixParameters matrixParam) +__device__ void GaussEliminationKernel(matrixType* matrixCopy, vectorType* prod, unsigned long row, unsigned int threadNo, bool rowInPartition, matrixParameters matrixParam) { - #define A(I, J) matrixCopy[(I) * matrixParam.blockColSize + (J)] + auto matrixIndex = [=](unsigned short x, unsigned short y){ + return (x * matrixParam.blockColSize + y); + }; - int x = threadNo/matrixParam.blockRowSize; - int y = threadNo % matrixParam.blockColSize; + auto vectorIndex = [=](unsigned short row, unsigned short elemNo){ + return (row * matrixParam.blockRowSize + elemNo); + }; + + unsigned short x = threadNo/matrixParam.blockRowSize; + unsigned short y = threadNo % matrixParam.blockColSize; matrixType pivot, weight = 0.0; __syncthreads(); - for(int currentRow=0; currentRow < matrixParam.blockRowSize; currentRow++) + for(unsigned short currentRow=0; currentRow < matrixParam.blockRowSize; currentRow++) { - pivot = A(currentRow, currentRow); - + if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockColSize)){ + pivot = matrixCopy[matrixIndex(currentRow, currentRow)]; + } __syncthreads(); - A(currentRow, y) /= pivot; - if(y==0) prod[currentRow] /= pivot; + if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockColSize)){ + + matrixCopy[matrixIndex(currentRow, y)] /= pivot; + if(y==0) prod[vectorIndex(row, currentRow)] /= pivot; - weight = A(x, currentRow); + weight = matrixCopy[matrixIndex(x, currentRow)]; - if(x!=currentRow) - { - A(x, y) -= weight * A(currentRow, y); - if(y==0) prod[x] -= weight * prod[currentRow]; + if(x!=currentRow) + { + matrixCopy[matrixIndex(x, y)] -= weight * matrixCopy[matrixIndex(currentRow, y)]; + if(y==0) prod[vectorIndex(row, x)] -= weight * prod[vectorIndex(row, x)]; + } } - + __syncthreads(); } @@ -71,69 +81,66 @@ template __global__ void FirstSymmetricIterationKernel(matrixType* matrix, vectorType* vec, vectorType* prod, const unsigned long* d_partition_offsets, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, const unsigned long* d_dia_ptr, matrixParameters matrixParam) { - auto matrixIndex = [=](unsigned long row, unsigned short threadNo){ - return (d_row_ptr[row] * matrixParam.blockSize + threadNo); - }; - - auto matrixDiagonalIndex = [=](unsigned long row, 
unsigned short threadNo){ - return (d_dia_ptr[row] * matrixParam.blockSize + threadNo); + auto matrixIndex = [=](unsigned long row, unsigned short threadNo, const unsigned long* ptr_array){ + return (ptr_array[row] * matrixParam.blockSize + threadNo); }; auto vectorIndex = [=](unsigned long row, unsigned short elemNo){ return (row * matrixParam.blockRowSize + elemNo); }; - unsigned long row = (blockIdx.x * blockDim.x + threadIdx.x)/MVP_WARP_SIZE; - unsigned short localRow = row % matrixParam.rowsPerBlock; + auto vectorMultIndex = [=](unsigned short blockNo, unsigned short blockCol){ + return (d_col_ind[blockNo] * matrixParam.blockColSize + blockCol); + }; + + unsigned long origRow = (blockIdx.x * blockDim.x + threadIdx.x)/MVP_WARP_SIZE; + unsigned short localRow = origRow % matrixParam.rowsPerBlock; unsigned short threadNo = threadIdx.x % MVP_WARP_SIZE; unsigned short blockCol = threadNo % matrixParam.blockColSize; - extern __shared__ vectorType tempBuffer[]; + extern __shared__ matrixType tempBuffer[]; - for(auto iPartition = 1ul; iPartition < matrixParam.nPartition; iPartition++) + for(auto iPartition = 1ul; iPartition < matrixParam.nPartition + 1; iPartition++) { /*We move to the set of rows present in the current partition.*/ - row += d_partition_offsets[iPartition-1]; - + unsigned long row = origRow + d_partition_offsets[iPartition-1]; bool rowInPartition = (row < d_partition_offsets[iPartition]); if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockRowSize)) { prod[vectorIndex(row, threadNo)] = 0.0; - tempBuffer[vectorIndex(localRow, threadNo)] = 0.0; + tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)] = 0.0; } __syncthreads(); if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.activeThreads)) { - for(unsigned long index = matrixIndex(row, threadNo); index < matrixDiagonalIndex(row, 0); index+=matrixParam.activeThreads) + for(unsigned long index = matrixIndex(row, threadNo, d_row_ptr); index < matrixIndex(row, 0, d_dia_ptr); index+=matrixParam.activeThreads) { - + unsigned short blockNo = index/matrixParam.blockSize; unsigned short blockRow = (index/matrixParam.blockColSize) % matrixParam.blockRowSize; - atomicAdd(&tempBuffer[vectorIndex(localRow, blockRow)], matrix[index] * vec[(d_col_ind[blockNo]) * matrixParam.blockColSize + blockCol]); + atomicAdd(&tempBuffer[matrixParam.shrdMemIndex(localRow, blockRow)], matrix[index] * vec[vectorMultIndex(blockNo, blockCol)]); } } __syncthreads(); - if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockRowSize)) { - - prod[vectorIndex(row, threadNo)] = vec[vectorIndex(row, threadNo)] - tempBuffer[vectorIndex(localRow, threadNo)]; - } - - __syncthreads(); - if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockSize)) { - tempBuffer[vectorIndex(localRow, threadNo)] = matrix[matrixDiagonalIndex(row, threadNo)]; - GaussEliminationKernel(&tempBuffer[vectorIndex(localRow, threadNo)], &prod[vectorIndex(row, threadNo/matrixParam.blockColSize)], threadNo, matrixParam); + /*Write the result to the product vector.*/ + if(threadNo < matrixParam.blockRowSize) prod[vectorIndex(row, threadNo)] = vec[vectorIndex(row, threadNo)] - tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)]; + + /*Reinitialize the memory to hold the matrix diagonal elements now.*/ + tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)] = matrix[matrixIndex(row, threadNo, d_dia_ptr)]; } + GaussEliminationKernel(&tempBuffer[matrixParam.shrdMemIndex(localRow, 0)], prod, row, 
threadNo, rowInPartition, matrixParam); + __syncthreads(); } @@ -144,18 +151,18 @@ template __global__ void SecondSymmetricIterationKernel(matrixType* matrix, vectorType* prod, const unsigned long* d_partition_offsets, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, const unsigned long* d_dia_ptr, matrixParameters matrixParam) { - auto matrixIndex = [=](unsigned long row, unsigned short threadNo){ - return (d_row_ptr[row] * matrixParam.blockSize + threadNo); - }; - - auto matrixDiagonalIndex = [=](unsigned long row, unsigned short threadNo){ - return (d_dia_ptr[row] * matrixParam.blockSize + threadNo); + auto matrixIndex = [=](unsigned long row, unsigned short threadNo, const unsigned long* ptr_array){ + return (ptr_array[row] * matrixParam.blockSize + threadNo); }; auto vectorIndex = [=](unsigned long row, unsigned short elemNo){ return (row * matrixParam.blockRowSize + elemNo); }; + auto vectorMultIndex = [=](unsigned short blockNo, unsigned short blockCol){ + return (d_col_ind[blockNo] * matrixParam.blockColSize + blockCol); + }; + unsigned long row = (blockIdx.x * blockDim.x + threadIdx.x)/MVP_WARP_SIZE; unsigned short localRow = row % matrixParam.rowsPerBlock; unsigned short threadNo = threadIdx.x % MVP_WARP_SIZE; @@ -164,58 +171,60 @@ __global__ void SecondSymmetricIterationKernel(matrixType* matrix, vectorType* p extern __shared__ vectorType tempBuffer[]; - for(auto iPartition = 1ul; iPartition < matrixParam.nPartition; iPartition++) + for(auto iPartition = 1ul; iPartition < matrixParam.nPartition + 1; iPartition++) { /*We move to the set of rows present in the current partition.*/ + row = (blockIdx.x * blockDim.x + threadIdx.x)/MVP_WARP_SIZE; row += d_partition_offsets[iPartition-1]; bool rowInPartition = (row < d_partition_offsets[iPartition]); if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockRowSize)) { - tempBuffer[vectorIndex(localRow, threadNo)] = 0.0; + tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)] = 0.0; } __syncthreads(); if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.activeThreads)) { - for(unsigned long index = matrixDiagonalIndex(row, threadNo); index < matrixDiagonalIndex(row, 0) + matrixParam.blockSize; index+=matrixParam.activeThreads) + for(unsigned long index = matrixIndex(row, threadNo, d_dia_ptr); index < matrixIndex(row, matrixParam.blockSize, d_dia_ptr); index+=matrixParam.activeThreads) { unsigned short blockNo = index/matrixParam.blockSize; unsigned short blockRow = (index/matrixParam.blockColSize) % matrixParam.blockRowSize; - atomicAdd(&tempBuffer[vectorIndex(localRow, blockRow)], matrix[index] * prod[(d_col_ind[blockNo]) * matrixParam.blockColSize + blockCol]); + atomicAdd(&tempBuffer[matrixParam.shrdMemIndex(localRow, blockRow)], matrix[index] * prod[vectorMultIndex(blockNo, blockCol)]); } - } - if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.activeThreads)) { - - for(unsigned long index = matrixDiagonalIndex(row, threadNo); index < matrixIndex(row+1, 0); index+=matrixParam.activeThreads) + for(unsigned long index = matrixIndex(row, threadNo, d_dia_ptr); index < matrixIndex(row+1, 0, d_row_ptr); index+=matrixParam.activeThreads) { unsigned short blockNo = index/matrixParam.blockSize; unsigned short blockRow = (index/matrixParam.blockColSize) % matrixParam.blockRowSize; - atomicAdd(&tempBuffer[vectorIndex(localRow, blockRow)], -(matrix[index] * prod[(d_col_ind[blockNo]) * matrixParam.blockColSize + blockCol])); + 
atomicAdd(&tempBuffer[matrixParam.shrdMemIndex(localRow, blockRow)], -(matrix[index] * prod[vectorMultIndex(blockNo, blockCol)])); } } __syncthreads(); - if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockRowSize)) { - - prod[vectorIndex(row, threadNo)] = tempBuffer[vectorIndex(localRow, threadNo)]; + if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockSize)) { + + /*Write the result to the product vector.*/ + if(threadNo < matrixParam.blockRowSize) prod[vectorIndex(row, threadNo)] = tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)]; + + /*Reinitialize the memory to hold the matrix diagonal elements now.*/ + tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)] = matrix[matrixIndex(row, threadNo, d_dia_ptr)]; } __syncthreads(); if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockSize)) { - tempBuffer[vectorIndex(localRow, threadNo)] = matrix[(d_dia_ptr[row] * matrixParam.blockSize) + threadNo]; - GaussEliminationKernel(&tempBuffer[vectorIndex(localRow, threadNo)], &prod[vectorIndex(row, threadNo/matrixParam.blockColSize)], threadNo, matrixParam); + tempBuffer[localRow * matrixParam.blockSize + threadNo] = matrix[(d_dia_ptr[row] * matrixParam.blockSize) + threadNo]; + GaussEliminationKernel(&tempBuffer[localRow * matrixParam.blockSize], prod, row, threadNo, rowInPartition, matrixParam); } __syncthreads(); @@ -294,7 +303,7 @@ void CSysMatrix::GPUMatrixVectorProduct(const CSysVector unsigned int gridx = rounded_up_division(MVP_BLOCK_SIZE, matrixParam.totalRows * MVP_WARP_SIZE); dim3 gridDim(gridx, 1, 1); - MatrixVectorProductKernel<<>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, matrixParam); + MatrixVectorProductKernel<<>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, matrixParam); gpuErrChk( cudaPeekAtLastError() ); prod.DtHTransfer(); From 0372099a1ce064c54b118faabbebd0ecf42b1200 Mon Sep 17 00:00:00 2001 From: Areen Raj Date: Tue, 15 Jul 2025 21:23:55 +0530 Subject: [PATCH 10/15] Working GPU LU_SGS Preconditioner Port --- Common/include/geometry/CGeometry.hpp | 5 +- Common/include/geometry/CPhysicalGeometry.hpp | 3 - .../linear_algebra/CGraphPartitioning.hpp | 119 +++++++++---- Common/include/linear_algebra/CSysMatrix.hpp | 4 +- Common/include/linear_algebra/GPUComms.cuh | 52 ++++-- Common/src/geometry/CPhysicalGeometry.cpp | 3 +- Common/src/linear_algebra/CSysMatrix.cpp | 5 +- Common/src/linear_algebra/CSysMatrixGPU.cu | 163 +++++++++--------- 8 files changed, 217 insertions(+), 137 deletions(-) diff --git a/Common/include/geometry/CGeometry.hpp b/Common/include/geometry/CGeometry.hpp index f89ac7b910a3..100e9beeb3f5 100644 --- a/Common/include/geometry/CGeometry.hpp +++ b/Common/include/geometry/CGeometry.hpp @@ -260,11 +260,12 @@ class CGeometry { unsigned long* nPointCumulative{nullptr}; /*!< \brief Cumulative storage array containing the total number of points on all prior ranks in the linear partitioning. */ - unsigned long nPartition; /*!< \brief Number of divisions of the matrix graph during execution of parallel - partitioning algorithms. */ + unsigned long nPartition; /*!< \brief Number of divisions of the matrix graph during execution of parallel + partitioning algorithms. */ unsigned long maxPartitionSize; /*!< \brief Size of the level with the maximum number of elements. */ vector partitionOffsets; /*!< \brief Vector array containing the indices at which different parallel partitions begin. */ + vector chainPtr; /*!< \brief Vector array containing distribution of levels into chains. 
*/

 /*--- Data structures for point-to-point MPI communications. ---*/

diff --git a/Common/include/geometry/CPhysicalGeometry.hpp b/Common/include/geometry/CPhysicalGeometry.hpp
index 70fe1f7d4c4c..5ac21523ee25 100644
--- a/Common/include/geometry/CPhysicalGeometry.hpp
+++ b/Common/include/geometry/CPhysicalGeometry.hpp
@@ -152,9 +152,6 @@ class CPhysicalGeometry final : public CGeometry {
 * \brief Divide the graph produced by the matrix into parallel partitions.
 * \param[in] config - Definition of the particular problem.
 * \param[in] pointList - Ordered list of points in the mesh.
- * \param[in] numPartitions - Returns the number of parallel partitions created by the algorithm.
- * \param[in] indexOffsets - Vector array that represents the starting index of each partition in the reordered point
- * list.
 */
 template 
 void PartitionGraph(const CConfig* config, vector& pointList);
diff --git a/Common/include/linear_algebra/CGraphPartitioning.hpp b/Common/include/linear_algebra/CGraphPartitioning.hpp
index 71837e22b07e..209745301241 100644
--- a/Common/include/linear_algebra/CGraphPartitioning.hpp
+++ b/Common/include/linear_algebra/CGraphPartitioning.hpp
@@ -1,6 +1,6 @@
 /*!
 * \file CGraphPartitioning.hpp
- * \brief Headers for the classes realted to the algorithms that are used
+ * \brief Headers for the classes related to the algorithms that are used to divide the matrix acyclic graph into parallel partitions.
 * \author A. Raj
 * \version 8.2.0 "Harrier"
@@ -26,6 +26,8 @@
 * License along with SU2. If not, see .
 */
+#pragma once
+
 #include "../CConfig.hpp"
 #include "../geometry/CGeometry.hpp"
 #include "../geometry/dual_grid/CPoint.hpp"
@@ -35,65 +37,107 @@
 * \brief Abstract base class for defining graph partitioning algorithms
 * \author A. Raj
 *
- * In order to use certain parallel algorithms in the solution process -
- * whether with linear solvers or preconditioners - we require the matrix
- * to be partitioned into certain parallel divisions. These maybe in the form
- * of levels, blocks, colors and so on. Since a number of different algorithms
- * can be used to split the graph, we've introduced a base class containing the
- * "Partition" member function from which child classes of the specific
- * algorithm can be derived. Currently, we are only using direct declarations
+ * In order to use certain parallel algorithms in the solution process -
+ * whether with linear solvers or preconditioners - we require the matrix
+ * to be partitioned into certain parallel divisions. These may be in the form
+ * of levels, blocks, colors and so on. Since a number of different algorithms
+ * can be used to split the graph, we've introduced a base class containing the
+ * "Partition" member function from which child classes of the specific
+ * algorithm can be derived. Currently, we are only using direct declarations
 * of the derived classes in the code. However, this method was chosen as it
- * allows us to pass different child class algorithms to a single implementation
+ * allows us to pass different child class algorithms to a single implementation
 * of the function that requires it - similar to the CMatrixVectorProduct class.
*/ template class CGraphPartitioning { - public: virtual ~CGraphPartitioning() = 0; - virtual void Partition(vector& pointList, vector& partitionOffsets) = 0; + virtual void Partition(vector& pointList, vector& partitionOffsets, + vector& chainPtr) = 0; }; template -CGraphPartitioning::~CGraphPartitioning() {} +CGraphPartitioning::~CGraphPartitioning() {} template -class CLevelScheduling final : public CGraphPartitioning { - +class CLevelScheduling final : public CGraphPartitioning { private: ScalarType nPointDomain; CPoint* nodes; - + public: ScalarType nLevels; ScalarType maxLevelWidth; vector levels; - /*! + /*! * \brief constructor of the class * \param[in] nPointDomain_ref - number of points associated with the problem - * \param[in] nodes - represents the relationships between the points + * \param[in] nodes_ref - represents the relationships between the points */ - inline CLevelScheduling(ScalarType nPointDomain_ref, CPoint* nodes_ref) - : nPointDomain(nPointDomain_ref), nodes(nodes_ref) - { nLevels = 0ul; maxLevelWidth = 0ul; } + inline CLevelScheduling(ScalarType nPointDomain_ref, CPoint* nodes_ref) + : nPointDomain(nPointDomain_ref), nodes(nodes_ref) { + nLevels = 0ul; + maxLevelWidth = 0ul; + } + + CLevelScheduling() = delete; // Removing default constructor - CLevelScheduling() = delete; // Removing default constructor + /*! + * \brief Divides the levels into groups of chains depending on the preset GPU block and warp size. + * \param[in] levelOffsets - Represents the vector array containing the ordered list of starting rows of each level. + * \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain. + * \param[in] rowsPerBlock - Represents the maximum number of rows that can be accomodated per block. + */ + void CalculateChain(vector levelOffsets, vector& chainPtr, int rowsPerBlock) { + ScalarType levelWidth = 0; + unsigned short chainLength = chainPtr.capacity(); - void Reorder(vector& pointList, vector& inversePointList, vector levelOffsets) - { + /*This is not a magic number. We are simply initializing + the point array with its first element that is always zero.*/ + chainPtr.push_back(0); + + for (ScalarType iLevel = 0ul; iLevel < nLevels; iLevel++) { + levelWidth = levelOffsets[iLevel + 1] - levelOffsets[iLevel]; + maxLevelWidth = std::max(levelWidth, maxLevelWidth); + + if (levelWidth > rowsPerBlock) { + if (chainPtr.back() != iLevel) { + chainPtr.push_back(iLevel); + } + + chainPtr.push_back(iLevel + 1); + } + } + + chainPtr.push_back(nLevels); + } + + /*! + * \brief Reorders the points according to the levels + * \param[in] pointList - Ordered array that contains the list of all mesh points. + * \param[in] inversePointList - Array utilized to access the index of each point in pointList. + * \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level. + */ + void Reorder(vector& pointList, vector& inversePointList, vector levelOffsets) { for (auto localPoint = 0ul; localPoint < nPointDomain; ++localPoint) { const auto globalPoint = pointList[localPoint]; inversePointList[levelOffsets[levels[localPoint]]++] = globalPoint; } - + pointList = std::move(inversePointList); } - void Partition(vector& pointList, vector& levelOffsets) override - { + /*! + * \brief Reorders the points according to the levels + * \param[in] pointList - Ordered array that contains the list of all mesh points. 
+ * \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level. + * \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain. + */ + void Partition(vector& pointList, vector& levelOffsets, + vector& chainPtr) override { vector inversePointList; inversePointList.reserve(nPointDomain); levels.reserve(nPointDomain); @@ -111,29 +155,34 @@ class CLevelScheduling final : public CGraphPartitioning { for (auto adjPoints = 0u; adjPoints < nodes->GetnPoint(globalPoint); adjPoints++) { const auto adjGlobalPoint = nodes->GetPoint(globalPoint, adjPoints); - + if (adjGlobalPoint < nPointDomain) { const auto adjLocalPoint = inversePointList[adjGlobalPoint]; - + if (adjLocalPoint < localPoint) { - levels[localPoint] = std::max(levels[localPoint], levels[adjLocalPoint] + 1); + levels[localPoint] = std::max(levels[localPoint], levels[adjLocalPoint] + 1); + } } - } } nLevels = std::max(nLevels, levels[localPoint] + 1); - } + } levelOffsets.resize(nLevels + 1); - for (auto iPoint = 0ul; iPoint < nPointDomain; iPoint++) ++levelOffsets[levels[iPoint] + 1]; + for (auto iPoint = 0ul; iPoint < nPointDomain; iPoint++) { + ++levelOffsets[levels[iPoint] + 1]; + } for (auto iLevel = 2ul; iLevel <= nLevels; ++iLevel) { levelOffsets[iLevel] += levelOffsets[iLevel - 1]; } - for(auto elem = levelOffsets.begin(); elem != (levelOffsets.end() - 1); elem++) maxLevelWidth = std::max(*(elem+1) - *elem, maxLevelWidth); - Reorder(pointList, inversePointList, levelOffsets); + +#ifdef HAVE_CUDA + CalculateChain(levelOffsets, chainPtr, 20); +#elif + chainPtr = NULL; +#endif } }; - diff --git a/Common/include/linear_algebra/CSysMatrix.hpp b/Common/include/linear_algebra/CSysMatrix.hpp index 43cd3d8bb239..7ea3e881d055 100644 --- a/Common/include/linear_algebra/CSysMatrix.hpp +++ b/Common/include/linear_algebra/CSysMatrix.hpp @@ -150,8 +150,8 @@ class CSysMatrix { const unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */ const unsigned long* d_dia_ptr; /*!< \brief Device Column index for each of the elements in val(). */ unsigned long* d_partition_offsets; - bool useCuda; /*!< \brief Boolean that indicates whether user has enabled CUDA or not. - Mainly used to conditionally free GPU memory in the class destructor. */ + bool useCuda; /*!< \brief Boolean that indicates whether user has enabled CUDA or not. + Mainly used to conditionally free GPU memory in the class destructor. */ ScalarType* ILU_matrix; /*!< \brief Entries of the ILU sparse matrix. */ unsigned long nnz_ilu; /*!< \brief Number of possible nonzero entries in the matrix (ILU). */ diff --git a/Common/include/linear_algebra/GPUComms.cuh b/Common/include/linear_algebra/GPUComms.cuh index dbe90baf07c9..ccff27b79c07 100644 --- a/Common/include/linear_algebra/GPUComms.cuh +++ b/Common/include/linear_algebra/GPUComms.cuh @@ -28,48 +28,72 @@ #include #include +/*!< \brief Namespace that contains variables and helper functions that are + utilized to launch CUDA Kernels. */ namespace kernelParameters{ - /*Returns the rounded up value of the decimal quotient to the next integer (in all cases)*/ + + /*! + * \brief Returns the rounded up value of the decimal quotient to the next integer (in all cases). + */ inline constexpr int rounded_up_division(const int divisor, int dividend) { return ((dividend + divisor - 1) / divisor); } - /*Returns the rounded down value of the decimal quotient to the previous integer (in all cases)*/ + /*! 
+ * \brief Returns the rounded down value of the decimal quotient to the previous integer (in all cases). + */ inline constexpr int rounded_down_division(const int divisor, int dividend) { return ((dividend - divisor + 1) / divisor); } - static constexpr short MVP_BLOCK_SIZE = 256; - static constexpr short MVP_WARP_SIZE = 32; + static constexpr short BLOCK_SIZE = 640; + static constexpr short WARP_SIZE = 32; + static constexpr short ROWS_PER_BLOCK = rounded_up_division(WARP_SIZE, BLOCK_SIZE); + }; +/*!< \brief Structure containing information related to the Jacobian Matrix + which is utilized by any launched Kernel. */ struct matrixParameters{ public: - unsigned long totalRows; - unsigned short blockRowSize; - unsigned short blockColSize; - unsigned long nPartition; - unsigned short blockSize; - unsigned short rowsPerBlock; - unsigned short activeThreads; + unsigned long totalRows; /*!< \brief Contains the total number of rows of the Jacbian Matrix. */ + unsigned short blockRowSize; /*!< \brief Contains the row dimensions of the blocks of the Jacobian Matrix. */ + unsigned short blockColSize; /*!< \brief Contains the column dimensions of the blocks of the Jacobian Matrix. */ + unsigned int nChainStart; /*!< \brief Starting partition of the current chain. */ + unsigned int nChainEnd; /*!< \brief Ending partition of the current chain. */ + unsigned short blockSize; /*!< \brief Contains the total number of elements in each block of the Jacbian Matrix. */ + unsigned short activeThreads; /*!< \brief Cotains the number of active threads per iteration during MVP - depending on the + dimensions of the Jacbian Matrix. */ matrixParameters(unsigned long nPointDomain, unsigned long nEqn, unsigned long nVar, unsigned long nPartitions){ totalRows = nPointDomain; blockRowSize = nEqn; blockColSize = nVar; - nPartition = nPartitions; + nChainStart = 0; + nChainEnd = 0; blockSize = nVar * nEqn; - rowsPerBlock = kernelParameters::rounded_up_division(kernelParameters::MVP_WARP_SIZE, kernelParameters::MVP_BLOCK_SIZE); - activeThreads = nVar * (kernelParameters::MVP_WARP_SIZE/nVar); + activeThreads = nVar * (kernelParameters::WARP_SIZE/nVar); } + /*! + * \brief Returns the memory index in the shared memory array used by the Symmetric Iteration Kernels. + */ __device__ unsigned short shrdMemIndex(unsigned short localRow, unsigned short threadNo){ return (localRow * blockSize + threadNo); } + /*! + * \brief Returns a boolean value to check whether the row is under the total number of rows and if the + * thread number is within a user-specified thread limit. This is to avoid illegal memory accesses. 
+ */ __device__ bool validAccess(unsigned long row, unsigned short threadNo, unsigned short threadLimit){ return (row switch (KindAlgorithm) { case LEVEL_SCHEDULING: auto levelSchedule = CLevelScheduling(nPointDomain, nodes); - levelSchedule.Partition(pointList, partitionOffsets); + levelSchedule.Partition(pointList, partitionOffsets, chainPtr); nPartition = levelSchedule.nLevels; maxPartitionSize = levelSchedule.maxLevelWidth; break; @@ -4558,6 +4558,7 @@ void CPhysicalGeometry::SetRCM_Ordering(CConfig* config) { if (!status) SU2_MPI::Error("RCM ordering failed", CURRENT_FUNCTION); } + /*Partition graph into parallel constituents for GPU Operations*/ if (config->GetCUDA()) PartitionGraph(config, Result); /*--- Add the MPI points ---*/ diff --git a/Common/src/linear_algebra/CSysMatrix.cpp b/Common/src/linear_algebra/CSysMatrix.cpp index 909b2eb25787..8764bd11bba9 100644 --- a/Common/src/linear_algebra/CSysMatrix.cpp +++ b/Common/src/linear_algebra/CSysMatrix.cpp @@ -153,7 +153,7 @@ void CSysMatrix::Initialize(unsigned long npoint, unsigned long npoi ptr = GPUMemoryAllocation::gpu_alloc(num * sizeof(ScalarType)); }; - auto GPUAllocAndCopy = [](const unsigned long*& ptr, const unsigned long*& src_ptr, unsigned long num) { + auto GPUAllocAndCopy = [](const unsigned long*& ptr, const unsigned long*& src_ptr, unsigned long num) { ptr = GPUMemoryAllocation::gpu_alloc_cpy(src_ptr, num * sizeof(const unsigned long)); }; @@ -165,8 +165,7 @@ void CSysMatrix::Initialize(unsigned long npoint, unsigned long npoi GPUAllocAndCopy(d_row_ptr, row_ptr, (nPointDomain + 1.0)); GPUAllocAndCopy(d_col_ind, col_ind, nnz); GPUAllocAndCopy(d_dia_ptr, dia_ptr, nPointDomain); - GPUVectorAllocAndCopy(d_partition_offsets, geometry->partitionOffsets, geometry->nPartition); - + GPUVectorAllocAndCopy(d_partition_offsets, geometry->partitionOffsets, geometry->nPartition + 1); } if (needTranspPtr) col_ptr = geometry->GetTransposeSparsePatternMap(type).data(); diff --git a/Common/src/linear_algebra/CSysMatrixGPU.cu b/Common/src/linear_algebra/CSysMatrixGPU.cu index 770d628ff1be..8d8dae3e59c1 100644 --- a/Common/src/linear_algebra/CSysMatrixGPU.cu +++ b/Common/src/linear_algebra/CSysMatrixGPU.cu @@ -33,7 +33,7 @@ using namespace kernelParameters; template -__device__ void GaussEliminationKernel(matrixType* matrixCopy, vectorType* prod, unsigned long row, unsigned int threadNo, bool rowInPartition, matrixParameters matrixParam) +__device__ void DeviceGaussElimination(matrixType* matrixCopy, vectorType* prod, unsigned long row, unsigned int threadNo, bool rowInPartition, matrixParameters matrixParam) { auto matrixIndex = [=](unsigned short x, unsigned short y){ return (x * matrixParam.blockColSize + y); @@ -43,43 +43,46 @@ __device__ void GaussEliminationKernel(matrixType* matrixCopy, vectorType* prod, return (row * matrixParam.blockRowSize + elemNo); }; - unsigned short x = threadNo/matrixParam.blockRowSize; + unsigned short x = threadNo/matrixParam.blockRowSize; unsigned short y = threadNo % matrixParam.blockColSize; matrixType pivot, weight = 0.0; - + __syncthreads(); for(unsigned short currentRow=0; currentRow < matrixParam.blockRowSize; currentRow++) { - if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockColSize)){ + if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockSize)){ pivot = matrixCopy[matrixIndex(currentRow, currentRow)]; } __syncthreads(); - if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockColSize)){ - - 
matrixCopy[matrixIndex(currentRow, y)] /= pivot; - if(y==0) prod[vectorIndex(row, currentRow)] /= pivot; + if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockSize)){ + + if(x==currentRow) + { + matrixCopy[matrixIndex(currentRow, y)] /= pivot; + if(y==0) prod[vectorIndex(row, currentRow)] /= pivot; + } weight = matrixCopy[matrixIndex(x, currentRow)]; if(x!=currentRow) { matrixCopy[matrixIndex(x, y)] -= weight * matrixCopy[matrixIndex(currentRow, y)]; - if(y==0) prod[vectorIndex(row, x)] -= weight * prod[vectorIndex(row, x)]; + if(y==0) prod[vectorIndex(row, x)] -= weight * prod[vectorIndex(row, currentRow)]; } } - + __syncthreads(); } - #undef A } template __global__ void FirstSymmetricIterationKernel(matrixType* matrix, vectorType* vec, vectorType* prod, const unsigned long* d_partition_offsets, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, const unsigned long* d_dia_ptr, matrixParameters matrixParam) { + /*--- First part of the symmetric iteration: (D+L).x* = b ---*/ auto matrixIndex = [=](unsigned long row, unsigned short threadNo, const unsigned long* ptr_array){ return (ptr_array[row] * matrixParam.blockSize + threadNo); @@ -93,38 +96,38 @@ __global__ void FirstSymmetricIterationKernel(matrixType* matrix, vectorType* ve return (d_col_ind[blockNo] * matrixParam.blockColSize + blockCol); }; - unsigned long origRow = (blockIdx.x * blockDim.x + threadIdx.x)/MVP_WARP_SIZE; - unsigned short localRow = origRow % matrixParam.rowsPerBlock; - unsigned short threadNo = threadIdx.x % MVP_WARP_SIZE; + unsigned long origRow = (blockIdx.x * blockDim.x + threadIdx.x)/WARP_SIZE; + unsigned short localRow = origRow % ROWS_PER_BLOCK; + unsigned short threadNo = threadIdx.x % WARP_SIZE; unsigned short blockCol = threadNo % matrixParam.blockColSize; extern __shared__ matrixType tempBuffer[]; - for(auto iPartition = 1ul; iPartition < matrixParam.nPartition + 1; iPartition++) - { + /* Forward Substitution Algorithm with Gaussian Elimination. */ + for(auto iPartition = matrixParam.nChainStart; iPartition < matrixParam.nChainEnd; iPartition++) + { - /*We move to the set of rows present in the current partition.*/ - unsigned long row = origRow + d_partition_offsets[iPartition-1]; - bool rowInPartition = (row < d_partition_offsets[iPartition]); + /* We move to the set of rows present in the current partition. 
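     Rows that fall outside this partition are masked out through the rowInPartition flag used by the access checks below.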
*/ + unsigned long row = origRow + d_partition_offsets[iPartition]; + bool rowInPartition = (row < d_partition_offsets[iPartition + 1]); if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockRowSize)) { - prod[vectorIndex(row, threadNo)] = 0.0; tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)] = 0.0; } - + __syncthreads(); if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.activeThreads)) { for(unsigned long index = matrixIndex(row, threadNo, d_row_ptr); index < matrixIndex(row, 0, d_dia_ptr); index+=matrixParam.activeThreads) { - + unsigned short blockNo = index/matrixParam.blockSize; - unsigned short blockRow = (index/matrixParam.blockColSize) % matrixParam.blockRowSize; + unsigned short blockRow = (index/matrixParam.blockColSize) % matrixParam.blockRowSize; - atomicAdd(&tempBuffer[matrixParam.shrdMemIndex(localRow, blockRow)], matrix[index] * vec[vectorMultIndex(blockNo, blockCol)]); + atomicAdd(&tempBuffer[matrixParam.shrdMemIndex(localRow, blockRow)], matrix[index] * prod[vectorMultIndex(blockNo, blockCol)]); // Compute L.x* } } @@ -132,18 +135,17 @@ __global__ void FirstSymmetricIterationKernel(matrixType* matrix, vectorType* ve if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockSize)) { - /*Write the result to the product vector.*/ - if(threadNo < matrixParam.blockRowSize) prod[vectorIndex(row, threadNo)] = vec[vectorIndex(row, threadNo)] - tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)]; - - /*Reinitialize the memory to hold the matrix diagonal elements now.*/ + if(threadNo < matrixParam.blockRowSize) prod[vectorIndex(row, threadNo)] = vec[vectorIndex(row, threadNo)] - tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)]; // Compute y = b - L.x* + + /* Reinitialize the shared memory to hold the matrix diagonal elements now. 
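     The diagonal block is staged in shared memory so that DeviceGaussElimination below can factor it in place without writing back to the global matrix.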
*/ tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)] = matrix[matrixIndex(row, threadNo, d_dia_ptr)]; - } - - GaussEliminationKernel(&tempBuffer[matrixParam.shrdMemIndex(localRow, 0)], prod, row, threadNo, rowInPartition, matrixParam); + } + + DeviceGaussElimination(&tempBuffer[matrixParam.shrdMemIndex(localRow, 0)], prod, row, threadNo, rowInPartition, matrixParam); // Solve D.x* = y __syncthreads(); - } + } } @@ -151,6 +153,8 @@ template __global__ void SecondSymmetricIterationKernel(matrixType* matrix, vectorType* prod, const unsigned long* d_partition_offsets, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, const unsigned long* d_dia_ptr, matrixParameters matrixParam) { + /*--- Second part of the symmetric iteration: (D+U).x_(1) = D.x* ---*/ + auto matrixIndex = [=](unsigned long row, unsigned short threadNo, const unsigned long* ptr_array){ return (ptr_array[row] * matrixParam.blockSize + threadNo); }; @@ -163,28 +167,27 @@ __global__ void SecondSymmetricIterationKernel(matrixType* matrix, vectorType* p return (d_col_ind[blockNo] * matrixParam.blockColSize + blockCol); }; - unsigned long row = (blockIdx.x * blockDim.x + threadIdx.x)/MVP_WARP_SIZE; - unsigned short localRow = row % matrixParam.rowsPerBlock; - unsigned short threadNo = threadIdx.x % MVP_WARP_SIZE; + unsigned long origRow = (blockIdx.x * blockDim.x + threadIdx.x)/WARP_SIZE; + unsigned short localRow = origRow % ROWS_PER_BLOCK; + unsigned short threadNo = threadIdx.x % WARP_SIZE; unsigned short blockCol = threadNo % matrixParam.blockColSize; - extern __shared__ vectorType tempBuffer[]; + extern __shared__ matrixType tempBuffer[]; - for(auto iPartition = 1ul; iPartition < matrixParam.nPartition + 1; iPartition++) - { + /* Backward Substitution Algorithm with Gaussian Elimination. */ + for(auto iPartition = matrixParam.nChainStart; iPartition > matrixParam.nChainEnd; iPartition--) + { - /*We move to the set of rows present in the current partition.*/ - row = (blockIdx.x * blockDim.x + threadIdx.x)/MVP_WARP_SIZE; - row += d_partition_offsets[iPartition-1]; - + /* We move to the set of rows present in the current partition. 
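     The chain is walked in reverse here (iPartition decreases), since the backward substitution sweeps from the last partition of the chain to the first.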
*/ + unsigned long row = origRow + d_partition_offsets[iPartition - 1]; bool rowInPartition = (row < d_partition_offsets[iPartition]); if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockRowSize)) { tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)] = 0.0; } - + __syncthreads(); if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.activeThreads)) { @@ -193,43 +196,35 @@ __global__ void SecondSymmetricIterationKernel(matrixType* matrix, vectorType* p { unsigned short blockNo = index/matrixParam.blockSize; - unsigned short blockRow = (index/matrixParam.blockColSize) % matrixParam.blockRowSize; + unsigned short blockRow = (index/matrixParam.blockColSize) % matrixParam.blockRowSize; - atomicAdd(&tempBuffer[matrixParam.shrdMemIndex(localRow, blockRow)], matrix[index] * prod[vectorMultIndex(blockNo, blockCol)]); + atomicAdd(&tempBuffer[matrixParam.shrdMemIndex(localRow, blockRow)], matrix[index] * prod[vectorMultIndex(blockNo, blockCol)]); // Compute D.x* } - for(unsigned long index = matrixIndex(row, threadNo, d_dia_ptr); index < matrixIndex(row+1, 0, d_row_ptr); index+=matrixParam.activeThreads) + for(unsigned long index = matrixIndex(row, threadNo + matrixParam.blockSize, d_dia_ptr); index < matrixIndex(row + 1, 0, d_row_ptr); index+=matrixParam.activeThreads) { unsigned short blockNo = index/matrixParam.blockSize; - unsigned short blockRow = (index/matrixParam.blockColSize) % matrixParam.blockRowSize; + unsigned short blockRow = (index/matrixParam.blockColSize) % matrixParam.blockRowSize; - atomicAdd(&tempBuffer[matrixParam.shrdMemIndex(localRow, blockRow)], -(matrix[index] * prod[vectorMultIndex(blockNo, blockCol)])); + atomicAdd(&tempBuffer[matrixParam.shrdMemIndex(localRow, blockRow)], -(matrix[index] * prod[vectorMultIndex(blockNo, blockCol)])); // Compute y = D.x*-U.x_(n+1) } } __syncthreads(); if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockSize)) { - - /*Write the result to the product vector.*/ + if(threadNo < matrixParam.blockRowSize) prod[vectorIndex(row, threadNo)] = tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)]; - - /*Reinitialize the memory to hold the matrix diagonal elements now.*/ + + /* Reinitialize the memory to hold the matrix diagonal elements now. 
*/ tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)] = matrix[matrixIndex(row, threadNo, d_dia_ptr)]; - } - - __syncthreads(); + } - if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockSize)) { + DeviceGaussElimination(&tempBuffer[matrixParam.shrdMemIndex(localRow, 0)], prod, row, threadNo, rowInPartition, matrixParam); // Solve D.x* = y - tempBuffer[localRow * matrixParam.blockSize + threadNo] = matrix[(d_dia_ptr[row] * matrixParam.blockSize) + threadNo]; - GaussEliminationKernel(&tempBuffer[localRow * matrixParam.blockSize], prod, row, threadNo, rowInPartition, matrixParam); - } - __syncthreads(); - - } + } } template @@ -243,15 +238,15 @@ __global__ void MatrixVectorProductKernel(matrixType* matrix, vectorType* vec, v return (row * matrixParam.blockRowSize + elemNo); }; - unsigned long row = (blockIdx.x * blockDim.x + threadIdx.x)/MVP_WARP_SIZE; - unsigned short threadNo = threadIdx.x % MVP_WARP_SIZE; - unsigned short localRow = row % matrixParam.rowsPerBlock; + unsigned long row = (blockIdx.x * blockDim.x + threadIdx.x)/WARP_SIZE; + unsigned short threadNo = threadIdx.x % WARP_SIZE; + unsigned short localRow = row % ROWS_PER_BLOCK; unsigned short blockCol = threadNo % matrixParam.blockColSize; extern __shared__ vectorType res[]; - if(matrixParam.validAccess(row, threadNo, matrixParam.blockRowSize)) + if(matrixParam.validAccess(row, threadNo, matrixParam.blockRowSize)) { prod[vectorIndex(row, threadNo)] = 0.0; res[vectorIndex(localRow, threadNo)] = 0.0; @@ -266,7 +261,7 @@ __global__ void MatrixVectorProductKernel(matrixType* matrix, vectorType* vec, v { /*Calculating the position of the new element the thread has shifted to.*/ unsigned short blockNo = index/matrixParam.blockSize; - unsigned short blockRow = (index/matrixParam.blockColSize) % matrixParam.blockRowSize; + unsigned short blockRow = (index/matrixParam.blockColSize) % matrixParam.blockRowSize; atomicAdd(&res[vectorIndex(localRow, blockRow)], matrix[index] * vec[(d_col_ind[blockNo]) * matrixParam.blockColSize + blockCol]); } @@ -299,11 +294,11 @@ void CSysMatrix::GPUMatrixVectorProduct(const CSysVector matrixParameters matrixParam(nPointDomain, nEqn, nVar, geometry->nPartition); - dim3 blockDim(MVP_BLOCK_SIZE,1,1); - unsigned int gridx = rounded_up_division(MVP_BLOCK_SIZE, matrixParam.totalRows * MVP_WARP_SIZE); + dim3 blockDim(BLOCK_SIZE,1,1); + unsigned int gridx = rounded_up_division(BLOCK_SIZE, matrixParam.totalRows * WARP_SIZE); dim3 gridDim(gridx, 1, 1); - MatrixVectorProductKernel<<>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, matrixParam); + MatrixVectorProductKernel<<>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, matrixParam); gpuErrChk( cudaPeekAtLastError() ); prod.DtHTransfer(); @@ -323,15 +318,29 @@ void CSysMatrix::GPUComputeLU_SGSPreconditioner(const CSysVectornPartition); - dim3 blockDim(matrixParam.rowsPerBlock * MVP_WARP_SIZE,1,1); - unsigned int gridx = rounded_up_division(matrixParam.rowsPerBlock, geometry->maxPartitionSize); + dim3 blockDim(ROWS_PER_BLOCK * WARP_SIZE,1,1); + unsigned int gridx = rounded_up_division(ROWS_PER_BLOCK, geometry->maxPartitionSize); dim3 gridDim(gridx, 1, 1); - FirstSymmetricIterationKernel<<>>(d_matrix, d_vec, d_prod, d_partition_offsets, d_row_ptr, d_col_ind, d_dia_ptr, matrixParam); - gpuErrChk( cudaPeekAtLastError() ); - SecondSymmetricIterationKernel<<>>(d_matrix, d_prod, d_partition_offsets, d_row_ptr, d_col_ind, d_dia_ptr, matrixParam); - gpuErrChk( cudaPeekAtLastError() ); + for(auto elem = geometry->chainPtr.begin(); elem != 
geometry->chainPtr.end() - 1; elem++) + { + matrixParam.nChainStart = *(elem); + matrixParam.nChainEnd = *(elem + 1); + + FirstSymmetricIterationKernel<<>>(d_matrix, d_vec, d_prod, d_partition_offsets, d_row_ptr, d_col_ind, d_dia_ptr, matrixParam); + gpuErrChk( cudaPeekAtLastError() ); + } + + for(auto elem = geometry->chainPtr.rbegin(); elem != geometry->chainPtr.rend() - 1; elem++) + { + matrixParam.nChainStart = *(elem); + matrixParam.nChainEnd = *(elem + 1); + + SecondSymmetricIterationKernel<<>>(d_matrix, d_prod, d_partition_offsets, d_row_ptr, d_col_ind, d_dia_ptr, matrixParam); + gpuErrChk( cudaPeekAtLastError() ); + } + prod.DtHTransfer(); } -template class CSysMatrix; //This is a temporary fix for invalid instantiations due to separating the member function from the header file the class is defined in. Will try to rectify it in coming commits. +template class CSysMatrix; // This is a temporary fix for invalid instantiations due to separating the member function from the header file the class is defined in. Will try to rectify it in coming commits. From 52b90b6787bdc7ab01a6f04b651562a27400f6b0 Mon Sep 17 00:00:00 2001 From: Areen Raj Date: Thu, 17 Jul 2025 19:26:28 +0530 Subject: [PATCH 11/15] Fixed the issue with the visibility of the rowsPerBlock variable. Also edited the nvcc flags to match machine-specific architecture --- Common/include/CConfig.hpp | 14 ++++++ .../linear_algebra/CGraphPartitioning.hpp | 21 +++----- Common/include/linear_algebra/GPUComms.cuh | 49 ++++++++----------- Common/include/option_structure.hpp | 20 ++++++++ Common/src/CConfig.cpp | 5 +- Common/src/geometry/CPhysicalGeometry.cpp | 2 +- Common/src/linear_algebra/CSysMatrixGPU.cu | 48 ++++++++++-------- meson.build | 2 +- 8 files changed, 95 insertions(+), 66 deletions(-) diff --git a/Common/include/CConfig.hpp b/Common/include/CConfig.hpp index 721627ebdf20..b0194b4d59aa 100644 --- a/Common/include/CConfig.hpp +++ b/Common/include/CConfig.hpp @@ -632,6 +632,8 @@ class CConfig { unsigned long Linear_Solver_Restart_Frequency; /*!< \brief Restart frequency of the linear solver for the implicit formulation. */ unsigned long Linear_Solver_Prec_Threads; /*!< \brief Number of threads per rank for ILU and LU_SGS preconditioners. */ unsigned short Linear_Solver_ILU_n; /*!< \brief ILU fill=in level. */ + unsigned short Cuda_Block_Size; /*!< \brief User-specified value for the X-Axis dimension of thread blocks + that are deployed by the CUDA Kernels. */ su2double SemiSpan; /*!< \brief Wing Semi span. */ su2double Roe_Kappa; /*!< \brief Relaxation of the Roe scheme. */ su2double Relaxation_Factor_Adjoint; /*!< \brief Relaxation coefficient for variable updates of adjoint solvers. */ @@ -4204,6 +4206,18 @@ class CConfig { */ su2double GetLinear_Solver_Smoother_Relaxation(void) const { return Linear_Solver_Smoother_Relaxation; } + /*! + * \brief Get thread block dimensions (X-axis) being used by the CUDA Kernels. + * \return Thread block dimensions (X-axis) being used by the CUDA Kernels. + */ + unsigned short GetCuda_Block_Size(void) const { return Cuda_Block_Size; } + + /*! + * \brief Get the number of matrix rows assigned per CUDA Block. + * \return The number of matrix rows assigned per CUDA Block. + */ + unsigned short GetRows_Per_Cuda_Block(void) const { return cudaKernelParameters::rounded_up_division(cudaKernelParameters::CUDA_WARP_SIZE, Cuda_Block_Size); } + /*! * \brief Get the relaxation factor for solution updates of adjoint solvers. 
*/ diff --git a/Common/include/linear_algebra/CGraphPartitioning.hpp b/Common/include/linear_algebra/CGraphPartitioning.hpp index 209745301241..50486d545836 100644 --- a/Common/include/linear_algebra/CGraphPartitioning.hpp +++ b/Common/include/linear_algebra/CGraphPartitioning.hpp @@ -28,13 +28,11 @@ #pragma once -#include "../CConfig.hpp" #include "../geometry/CGeometry.hpp" -#include "../geometry/dual_grid/CPoint.hpp" /*! * \class CGraphPartitioning - * \brief Abstract base class for defining graph partitioning algorithms + * \brief Abstract base class for defining graph partitioning algorithms. * \author A. Raj * * In order to use certain parallel algorithms in the solution process - @@ -55,7 +53,7 @@ class CGraphPartitioning { public: virtual ~CGraphPartitioning() = 0; virtual void Partition(vector& pointList, vector& partitionOffsets, - vector& chainPtr) = 0; + vector& chainPtr, unsigned short chainLimit) = 0; }; template CGraphPartitioning::~CGraphPartitioning() {} @@ -89,9 +87,9 @@ class CLevelScheduling final : public CGraphPartitioning { * \brief Divides the levels into groups of chains depending on the preset GPU block and warp size. * \param[in] levelOffsets - Represents the vector array containing the ordered list of starting rows of each level. * \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain. - * \param[in] rowsPerBlock - Represents the maximum number of rows that can be accomodated per block. + * \param[in] rowsPerBlock - Represents the maximum number of rows that can be accomodated per CUDA block. */ - void CalculateChain(vector levelOffsets, vector& chainPtr, int rowsPerBlock) { + void CalculateChain(vector levelOffsets, vector& chainPtr, unsigned short rowsPerBlock) { ScalarType levelWidth = 0; unsigned short chainLength = chainPtr.capacity(); @@ -135,9 +133,10 @@ class CLevelScheduling final : public CGraphPartitioning { * \param[in] pointList - Ordered array that contains the list of all mesh points. * \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level. * \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain. + * \param[in] rowsPerBlock - Represents the maximum number of rows that can be accomodated per CUDA block. */ - void Partition(vector& pointList, vector& levelOffsets, - vector& chainPtr) override { + void Partition(vector& pointList, vector& levelOffsets, vector& chainPtr, + unsigned short rowsPerBlock) override { vector inversePointList; inversePointList.reserve(nPointDomain); levels.reserve(nPointDomain); @@ -179,10 +178,6 @@ class CLevelScheduling final : public CGraphPartitioning { Reorder(pointList, inversePointList, levelOffsets); -#ifdef HAVE_CUDA - CalculateChain(levelOffsets, chainPtr, 20); -#elif - chainPtr = NULL; -#endif + CalculateChain(levelOffsets, chainPtr, rowsPerBlock); } }; diff --git a/Common/include/linear_algebra/GPUComms.cuh b/Common/include/linear_algebra/GPUComms.cuh index ccff27b79c07..7e1109a4ee81 100644 --- a/Common/include/linear_algebra/GPUComms.cuh +++ b/Common/include/linear_algebra/GPUComms.cuh @@ -25,33 +25,21 @@ * License along with SU2. If not, see . */ +#pragma once + #include #include +#include "../option_structure.hpp" -/*!< \brief Namespace that contains variables and helper functions that are - utilized to launch CUDA Kernels. */ -namespace kernelParameters{ - - - /*! 
- * \brief Returns the rounded up value of the decimal quotient to the next integer (in all cases). - */ - inline constexpr int rounded_up_division(const int divisor, int dividend) { return ((dividend + divisor - 1) / divisor); } - - /*! - * \brief Returns the rounded down value of the decimal quotient to the previous integer (in all cases). - */ - inline constexpr int rounded_down_division(const int divisor, int dividend) { return ((dividend - divisor + 1) / divisor); } - - static constexpr short BLOCK_SIZE = 640; - static constexpr short WARP_SIZE = 32; - static constexpr short ROWS_PER_BLOCK = rounded_up_division(WARP_SIZE, BLOCK_SIZE); - - -}; - -/*!< \brief Structure containing information related to the Jacobian Matrix - which is utilized by any launched Kernel. */ +/*! + * \struct matrixParameters + * \brief Structure containing information related to the Jacobian Matrix which is utilized by any launched Kernel. + * + * This implementation alleviates the need to pass an excessive number of arguments + * to a Kernel and, instead, packages it into a single structure. While this leads + * to data duplication for a short period of time, this is a much cleaner and resuable approach. + * \author A. Raj + */ struct matrixParameters{ public: @@ -63,21 +51,24 @@ struct matrixParameters{ unsigned short blockSize; /*!< \brief Contains the total number of elements in each block of the Jacbian Matrix. */ unsigned short activeThreads; /*!< \brief Cotains the number of active threads per iteration during MVP - depending on the dimensions of the Jacbian Matrix. */ + unsigned short rowsPerBlock; /*!< \brief Number of rows being processed by each thread block. This is equal to the number + of warps present in the block as each row gets assigned a warp. */ - matrixParameters(unsigned long nPointDomain, unsigned long nEqn, unsigned long nVar, unsigned long nPartitions){ + matrixParameters(unsigned long nPointDomain, unsigned long nEqn, unsigned long nVar, unsigned long nPartitions, unsigned short rowsPrBlck){ totalRows = nPointDomain; blockRowSize = nEqn; blockColSize = nVar; nChainStart = 0; nChainEnd = 0; blockSize = nVar * nEqn; - activeThreads = nVar * (kernelParameters::WARP_SIZE/nVar); + activeThreads = nVar * (cudaKernelParameters::CUDA_WARP_SIZE/nVar); + rowsPerBlock = rowsPrBlck; } /*! * \brief Returns the memory index in the shared memory array used by the Symmetric Iteration Kernels. */ - __device__ unsigned short shrdMemIndex(unsigned short localRow, unsigned short threadNo){ + __device__ __forceinline__ unsigned short shrdMemIndex(unsigned short localRow, unsigned short threadNo){ return (localRow * blockSize + threadNo); } @@ -85,7 +76,7 @@ struct matrixParameters{ * \brief Returns a boolean value to check whether the row is under the total number of rows and if the * thread number is within a user-specified thread limit. This is to avoid illegal memory accesses. 
*/ - __device__ bool validAccess(unsigned long row, unsigned short threadNo, unsigned short threadLimit){ + __device__ __forceinline__ bool validAccess(unsigned long row, unsigned short threadNo, unsigned short threadLimit){ return (row switch (KindAlgorithm) { case LEVEL_SCHEDULING: auto levelSchedule = CLevelScheduling(nPointDomain, nodes); - levelSchedule.Partition(pointList, partitionOffsets, chainPtr); + levelSchedule.Partition(pointList, partitionOffsets, chainPtr, config->GetRows_Per_Cuda_Block()); nPartition = levelSchedule.nLevels; maxPartitionSize = levelSchedule.maxLevelWidth; break; diff --git a/Common/src/linear_algebra/CSysMatrixGPU.cu b/Common/src/linear_algebra/CSysMatrixGPU.cu index 8d8dae3e59c1..d306d6e3d31f 100644 --- a/Common/src/linear_algebra/CSysMatrixGPU.cu +++ b/Common/src/linear_algebra/CSysMatrixGPU.cu @@ -1,6 +1,15 @@ /*! * \file CSysMatrixGPU.cu * \brief Implementations of Kernels and Functions for Matrix Operations on the GPU + * + * The kernel implementations will feature a lot of if-statements. + * The reason for such heavy usage of conditionals is to do a check + * whether the memory locations being accessed by the threads are + * within bounds or not. Usually the entire kernel is "wrapped" in + * a single conditional for these checks. But, in our case, it is + * necessary for us to use intermittent synchronization barriers like + * __syncthreads() which will lead to thread divergence issues if used + * inside a conditional. * \author A. Raj * \version 8.2.0 "Harrier" * @@ -29,8 +38,7 @@ #include "../../include/geometry/CGeometry.hpp" #include "../../include/linear_algebra/GPUComms.cuh" -using namespace kernelParameters; - +using namespace cudaKernelParameters; template __device__ void DeviceGaussElimination(matrixType* matrixCopy, vectorType* prod, unsigned long row, unsigned int threadNo, bool rowInPartition, matrixParameters matrixParam) @@ -96,9 +104,9 @@ __global__ void FirstSymmetricIterationKernel(matrixType* matrix, vectorType* ve return (d_col_ind[blockNo] * matrixParam.blockColSize + blockCol); }; - unsigned long origRow = (blockIdx.x * blockDim.x + threadIdx.x)/WARP_SIZE; - unsigned short localRow = origRow % ROWS_PER_BLOCK; - unsigned short threadNo = threadIdx.x % WARP_SIZE; + unsigned long origRow = (blockIdx.x * blockDim.x + threadIdx.x)/CUDA_WARP_SIZE; + unsigned short localRow = origRow % matrixParam.rowsPerBlock; + unsigned short threadNo = threadIdx.x % CUDA_WARP_SIZE; unsigned short blockCol = threadNo % matrixParam.blockColSize; @@ -167,9 +175,9 @@ __global__ void SecondSymmetricIterationKernel(matrixType* matrix, vectorType* p return (d_col_ind[blockNo] * matrixParam.blockColSize + blockCol); }; - unsigned long origRow = (blockIdx.x * blockDim.x + threadIdx.x)/WARP_SIZE; - unsigned short localRow = origRow % ROWS_PER_BLOCK; - unsigned short threadNo = threadIdx.x % WARP_SIZE; + unsigned long origRow = (blockIdx.x * blockDim.x + threadIdx.x)/CUDA_WARP_SIZE; + unsigned short localRow = origRow % matrixParam.rowsPerBlock; + unsigned short threadNo = threadIdx.x % CUDA_WARP_SIZE; unsigned short blockCol = threadNo % matrixParam.blockColSize; @@ -238,9 +246,9 @@ __global__ void MatrixVectorProductKernel(matrixType* matrix, vectorType* vec, v return (row * matrixParam.blockRowSize + elemNo); }; - unsigned long row = (blockIdx.x * blockDim.x + threadIdx.x)/WARP_SIZE; - unsigned short threadNo = threadIdx.x % WARP_SIZE; - unsigned short localRow = row % ROWS_PER_BLOCK; + unsigned long row = (blockIdx.x * blockDim.x + 
threadIdx.x)/CUDA_WARP_SIZE; + unsigned short threadNo = threadIdx.x % CUDA_WARP_SIZE; + unsigned short localRow = row % matrixParam.rowsPerBlock; unsigned short blockCol = threadNo % matrixParam.blockColSize; @@ -292,13 +300,13 @@ void CSysMatrix::GPUMatrixVectorProduct(const CSysVector vec.HtDTransfer(); prod.GPUSetVal(0.0); - matrixParameters matrixParam(nPointDomain, nEqn, nVar, geometry->nPartition); + matrixParameters matrixParam(nPointDomain, nEqn, nVar, geometry->nPartition, config->GetRows_Per_Cuda_Block()); - dim3 blockDim(BLOCK_SIZE,1,1); - unsigned int gridx = rounded_up_division(BLOCK_SIZE, matrixParam.totalRows * WARP_SIZE); + dim3 blockDim(config->GetCuda_Block_Size(),1,1); + unsigned int gridx = rounded_up_division(config->GetCuda_Block_Size(), matrixParam.totalRows * CUDA_WARP_SIZE); dim3 gridDim(gridx, 1, 1); - MatrixVectorProductKernel<<>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, matrixParam); + MatrixVectorProductKernel<<>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, matrixParam); gpuErrChk( cudaPeekAtLastError() ); prod.DtHTransfer(); @@ -316,10 +324,10 @@ void CSysMatrix::GPUComputeLU_SGSPreconditioner(const CSysVectornPartition); + matrixParameters matrixParam(nPointDomain, nEqn, nVar, geometry->nPartition, config->GetRows_Per_Cuda_Block()); - dim3 blockDim(ROWS_PER_BLOCK * WARP_SIZE,1,1); - unsigned int gridx = rounded_up_division(ROWS_PER_BLOCK, geometry->maxPartitionSize); + dim3 blockDim(matrixParam.rowsPerBlock * CUDA_WARP_SIZE,1,1); + unsigned int gridx = rounded_up_division(matrixParam.rowsPerBlock, geometry->maxPartitionSize); dim3 gridDim(gridx, 1, 1); for(auto elem = geometry->chainPtr.begin(); elem != geometry->chainPtr.end() - 1; elem++) @@ -327,7 +335,7 @@ void CSysMatrix::GPUComputeLU_SGSPreconditioner(const CSysVector>>(d_matrix, d_vec, d_prod, d_partition_offsets, d_row_ptr, d_col_ind, d_dia_ptr, matrixParam); + FirstSymmetricIterationKernel<<>>(d_matrix, d_vec, d_prod, d_partition_offsets, d_row_ptr, d_col_ind, d_dia_ptr, matrixParam); gpuErrChk( cudaPeekAtLastError() ); } @@ -336,7 +344,7 @@ void CSysMatrix::GPUComputeLU_SGSPreconditioner(const CSysVector>>(d_matrix, d_prod, d_partition_offsets, d_row_ptr, d_col_ind, d_dia_ptr, matrixParam); + SecondSymmetricIterationKernel<<>>(d_matrix, d_prod, d_partition_offsets, d_row_ptr, d_col_ind, d_dia_ptr, matrixParam); gpuErrChk( cudaPeekAtLastError() ); } diff --git a/meson.build b/meson.build index 443cb1f4e543..c368dbf50b50 100644 --- a/meson.build +++ b/meson.build @@ -18,7 +18,7 @@ python = pymod.find_installation() if get_option('enable-cuda') add_languages('cuda') - add_global_arguments('-arch=sm_86', language : 'cuda') + add_global_arguments('-arch=native', language : 'cuda') endif su2_cpp_args = [] From c4dbe5ce6aad451cbf621d9a817fe2b453c84780 Mon Sep 17 00:00:00 2001 From: Areen Raj Date: Thu, 17 Jul 2025 19:49:02 +0530 Subject: [PATCH 12/15] Fixing warnings --- Common/include/linear_algebra/CSysMatrix.hpp | 14 -------------- Common/include/option_structure.hpp | 2 +- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/Common/include/linear_algebra/CSysMatrix.hpp b/Common/include/linear_algebra/CSysMatrix.hpp index df2e8267c3fb..7ea3e881d055 100644 --- a/Common/include/linear_algebra/CSysMatrix.hpp +++ b/Common/include/linear_algebra/CSysMatrix.hpp @@ -148,15 +148,10 @@ class CSysMatrix { ScalarType* d_matrix; /*!< \brief Device Pointer to store the matrix values on the GPU. */ const unsigned long* d_row_ptr; /*!< \brief Device Pointers to the first element in each row. 
*/ const unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */ -<<<<<<< HEAD - bool useCuda; /*!< \brief Boolean that indicates whether user has enabled CUDA or not. - Mainly used to conditionally free GPU memory in the class destructor. */ -======= const unsigned long* d_dia_ptr; /*!< \brief Device Column index for each of the elements in val(). */ unsigned long* d_partition_offsets; bool useCuda; /*!< \brief Boolean that indicates whether user has enabled CUDA or not. Mainly used to conditionally free GPU memory in the class destructor. */ ->>>>>>> precond_port ScalarType* ILU_matrix; /*!< \brief Entries of the ILU sparse matrix. */ unsigned long nnz_ilu; /*!< \brief Number of possible nonzero entries in the matrix (ILU). */ @@ -868,11 +863,6 @@ class CSysMatrix { const CConfig* config) const; /*! -<<<<<<< HEAD -<<<<<<< HEAD -======= -======= ->>>>>>> precond_port * \brief Performs first step of the LU_SGS Preconditioner building * \param[in] vec - CSysVector to be multiplied by the sparse matrix A. * \param[in] geometry - Geometrical definition of the problem. @@ -898,10 +888,6 @@ class CSysMatrix { void GPUGaussElimination(ScalarType& prod, CGeometry* geometry, const CConfig* config) const; /*! -<<<<<<< HEAD ->>>>>>> upstream/develop -======= ->>>>>>> precond_port * \brief Multiply CSysVector by the preconditioner all of which are stored on the device * \param[in] vec - CSysVector to be multiplied by the preconditioner. * \param[out] prod - Result of the product A*vec. diff --git a/Common/include/option_structure.hpp b/Common/include/option_structure.hpp index be70124b1205..bdb836b9c654 100644 --- a/Common/include/option_structure.hpp +++ b/Common/include/option_structure.hpp @@ -89,7 +89,7 @@ namespace cudaKernelParameters{ inline unsigned int rounded_down_division(int divisor, int dividend) { return ((dividend - divisor + 1) / divisor); } static constexpr short CUDA_WARP_SIZE = 32; /*!< \brief Outlines the numbers of threads per warp for a CUDA GPU. */ -}; +} const unsigned int EXIT_DIVERGENCE = 2; /*!< \brief Exit code (divergence). 
*/ From 1be1e2fec5bae2ec7fb5420a5c0791f11e492272 Mon Sep 17 00:00:00 2001 From: Areen Raj Date: Thu, 17 Jul 2025 19:56:16 +0530 Subject: [PATCH 13/15] Syncing repo to develop --- externals/meson | 2 +- externals/ninja | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/externals/meson b/externals/meson index 5a82ea050173..41c650a040d5 160000 --- a/externals/meson +++ b/externals/meson @@ -1 +1 @@ -Subproject commit 5a82ea0501736a666ca9cc003ea0774f8219fd65 +Subproject commit 41c650a040d50e0912d268af7a903a9ce1456dfa diff --git a/externals/ninja b/externals/ninja index b4d51f6ed5be..52649de2c56b 160000 --- a/externals/ninja +++ b/externals/ninja @@ -1 +1 @@ -Subproject commit b4d51f6ed5bed09dd2b70324df0d9cb4ecad2638 +Subproject commit 52649de2c56b63f42bc59513d51286531c595b44 From 7472bd1bd8aa5430ce72fca8dbb46ed6ff712b2d Mon Sep 17 00:00:00 2001 From: Areen Raj Date: Thu, 17 Jul 2025 21:11:31 +0530 Subject: [PATCH 14/15] updating submodule versions --- externals/meson | 2 +- externals/ninja | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/externals/meson b/externals/meson index 41c650a040d5..5a82ea050173 160000 --- a/externals/meson +++ b/externals/meson @@ -1 +1 @@ -Subproject commit 41c650a040d50e0912d268af7a903a9ce1456dfa +Subproject commit 5a82ea0501736a666ca9cc003ea0774f8219fd65 diff --git a/externals/ninja b/externals/ninja index 52649de2c56b..b4d51f6ed5be 160000 --- a/externals/ninja +++ b/externals/ninja @@ -1 +1 @@ -Subproject commit 52649de2c56b63f42bc59513d51286531c595b44 +Subproject commit b4d51f6ed5bed09dd2b70324df0d9cb4ecad2638 From 352e148f85da7e7529f20e7f24591a6af94fc294 Mon Sep 17 00:00:00 2001 From: Areen Raj Date: Thu, 17 Jul 2025 21:23:44 +0530 Subject: [PATCH 15/15] Fixing some more warnings --- Common/include/linear_algebra/CGraphPartitioning.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/Common/include/linear_algebra/CGraphPartitioning.hpp b/Common/include/linear_algebra/CGraphPartitioning.hpp index 50486d545836..44b7d74928c9 100644 --- a/Common/include/linear_algebra/CGraphPartitioning.hpp +++ b/Common/include/linear_algebra/CGraphPartitioning.hpp @@ -91,7 +91,6 @@ class CLevelScheduling final : public CGraphPartitioning { */ void CalculateChain(vector levelOffsets, vector& chainPtr, unsigned short rowsPerBlock) { ScalarType levelWidth = 0; - unsigned short chainLength = chainPtr.capacity(); /*This is not a magic number. We are simply initializing the point array with its first element that is always zero.*/
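
Note on the kernel structure in CSysMatrixGPU.cu above: the file-header comment explains why the kernels guard individual memory accesses instead of wrapping the whole body in one range check. Because the preconditioner kernels contain __syncthreads() barriers, an out-of-range thread cannot simply return early, or the rest of the block would wait on a barrier that is never reached by every thread. A minimal sketch of that pattern follows, using a made-up GuardedScaleKernel on a plain array; none of the names below are SU2 code.

#include <cuda_runtime.h>

/* "Guard the work, not the barrier": every thread of the block reaches
 * __syncthreads(), including threads mapped past the end of the data;
 * only in-range threads touch memory. */
__global__ void GuardedScaleKernel(const double* in, double* out, unsigned long n) {
  __shared__ double tile[256];

  unsigned long i = blockIdx.x * blockDim.x + threadIdx.x;
  bool inRange = (i < n);                          /* plays the role of a validAccess() check */

  if (inRange) tile[threadIdx.x] = in[i];          /* guarded load */

  __syncthreads();                                 /* reached by ALL threads of the block */

  if (inRange) out[i] = 2.0 * tile[threadIdx.x];   /* guarded store */
}

int main() {
  const unsigned long n = 1000;                    /* deliberately not a multiple of 256 */
  double *d_in, *d_out;
  cudaMalloc(&d_in, n * sizeof(double));
  cudaMalloc(&d_out, n * sizeof(double));
  cudaMemset(d_in, 0, n * sizeof(double));

  unsigned int block = 256;
  unsigned int grid = (n + block - 1) / block;     /* rounded-up division */
  GuardedScaleKernel<<<grid, block>>>(d_in, d_out, n);
  cudaDeviceSynchronize();

  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}

Exiting early instead (returning when i >= n) would leave the barrier unreached by part of the block, which is undefined behaviour; this is why the kernels above keep their barriers unconditional and wrap only the loads, stores, and per-row elimination steps in validAccess()-style checks.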
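
MatrixVectorProductKernel maps one warp of CUDA_WARP_SIZE threads to each block row (row = globalThreadId / CUDA_WARP_SIZE, threadNo = threadIdx.x % CUDA_WARP_SIZE) and lets the lanes of the warp cooperate on that row's entries. The sketch below shows the same warp-per-row mapping for an ordinary scalar CSR matrix, with a warp-shuffle reduction standing in for the per-block accumulation done in the real kernel; WarpPerRowSpMV and its arguments are illustrative names only, not SU2 API.

static constexpr int CUDA_WARP_SIZE = 32;

/* One warp per CSR row: lanes stride over the row's nonzeros and the
 * partial sums are reduced inside the warp, so no __syncthreads() is
 * needed and an early exit for out-of-range warps is safe here. */
__global__ void WarpPerRowSpMV(const double* val, const unsigned long* row_ptr,
                               const unsigned long* col_ind, const double* x,
                               double* y, unsigned long nRows) {
  unsigned long row = (blockIdx.x * blockDim.x + threadIdx.x) / CUDA_WARP_SIZE;
  unsigned int lane = threadIdx.x % CUDA_WARP_SIZE;
  if (row >= nRows) return;        /* all lanes of a warp share one row, so the warp exits together */

  double sum = 0.0;
  for (unsigned long k = row_ptr[row] + lane; k < row_ptr[row + 1]; k += CUDA_WARP_SIZE)
    sum += val[k] * x[col_ind[k]];

  for (int offset = CUDA_WARP_SIZE / 2; offset > 0; offset /= 2)
    sum += __shfl_down_sync(0xffffffff, sum, offset);   /* warp-level reduction into lane 0 */

  if (lane == 0) y[row] = sum;
}

With this mapping a launch needs totalRows * CUDA_WARP_SIZE threads in total, which is what the gridx = rounded_up_division(config->GetCuda_Block_Size(), matrixParam.totalRows * CUDA_WARP_SIZE) computation in GPUMatrixVectorProduct provides.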
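
GPUComputeLU_SGSPreconditioner walks geometry->chainPtr and issues one launch of the symmetric-iteration kernels per chain segment, relying on the in-order execution of consecutive launches on a stream to enforce the dependencies between levels. The sketch below applies an analogous level-scheduled launch pattern to a simple lower-triangular solve; LevelSweepKernel, levelRows and levelOffsets are illustrative stand-ins for the SU2 structures, the dense row loop is only for brevity, the level structure is assumed to come from the actual sparsity pattern, and d_x is assumed zero-initialized.

#include <vector>
#include <cuda_runtime.h>

/* Toy kernel: process every row of one level of a lower-triangular solve.
 * Rows inside a level do not depend on each other, so they run in parallel;
 * rows in later levels only read x-values written by earlier levels. */
__global__ void LevelSweepKernel(const double* L, const double* b, double* x,
                                 const unsigned long* levelRows,
                                 unsigned long levelStart, unsigned long levelEnd,
                                 unsigned long n) {
  unsigned long idx = levelStart + blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= levelEnd) return;

  unsigned long row = levelRows[idx];
  double sum = b[row];
  for (unsigned long col = 0; col < row; ++col)    /* dense storage only for brevity */
    sum -= L[row * n + col] * x[col];              /* x[col] is final for every nonzero L entry */
  x[row] = sum / L[row * n + row];
}

/* Host loop: one launch per level, analogous to the chainPtr loop above;
 * consecutive launches on the default stream execute in order, which is
 * the only synchronization needed between levels. */
void LevelScheduledForwardSolve(const double* d_L, const double* d_b, double* d_x,
                                const unsigned long* d_levelRows,
                                const std::vector<unsigned long>& levelOffsets,
                                unsigned long n) {
  const unsigned int block = 128;
  for (size_t lvl = 0; lvl + 1 < levelOffsets.size(); ++lvl) {
    const unsigned long start = levelOffsets[lvl];
    const unsigned long end   = levelOffsets[lvl + 1];
    const unsigned int  grid  = static_cast<unsigned int>((end - start + block - 1) / block);
    LevelSweepKernel<<<grid, block>>>(d_L, d_b, d_x, d_levelRows, start, end, n);
  }
  cudaDeviceSynchronize();
}

The maxPartitionSize-based grid in the preconditioner appears to serve the same purpose as the per-level grid here: it must be large enough to cover the widest level that a single launch may touch.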