diff --git a/Common/include/CConfig.hpp b/Common/include/CConfig.hpp index cb5a607c9df0..eb96b5a973a7 100644 --- a/Common/include/CConfig.hpp +++ b/Common/include/CConfig.hpp @@ -521,6 +521,7 @@ class CConfig { Kind_Gradient_Method_Recon, /*!< \brief Numerical method for computation of spatial gradients used for upwind reconstruction. */ Kind_Deform_Linear_Solver, /*!< Numerical method to deform the grid */ Kind_Deform_Linear_Solver_Prec, /*!< \brief Preconditioner of the linear solver. */ + Kind_Graph_Part_Algo, /*!< \brief Algorithm for parallel partitioning of the matrix graph. */ Kind_Linear_Solver, /*!< \brief Numerical solver for the implicit scheme. */ Kind_Linear_Solver_Prec, /*!< \brief Preconditioner of the linear solver. */ Kind_DiscAdj_Linear_Solver, /*!< \brief Linear solver for the discrete adjoint system. */ @@ -634,6 +635,8 @@ class CConfig { unsigned long Linear_Solver_Restart_Frequency; /*!< \brief Restart frequency of the linear solver for the implicit formulation. */ unsigned long Linear_Solver_Prec_Threads; /*!< \brief Number of threads per rank for ILU and LU_SGS preconditioners. */ unsigned short Linear_Solver_ILU_n; /*!< \brief ILU fill=in level. */ + unsigned short Cuda_Block_Size; /*!< \brief User-specified value for the X-Axis dimension of thread blocks + that are deployed by the CUDA Kernels. */ su2double SemiSpan; /*!< \brief Wing Semi span. */ su2double Roe_Kappa; /*!< \brief Relaxation of the Roe scheme. */ su2double Relaxation_Factor_Adjoint; /*!< \brief Relaxation coefficient for variable updates of adjoint solvers. */ @@ -4160,6 +4163,12 @@ class CConfig { */ bool GetLeastSquaresRequired(void) const { return LeastSquaresRequired; } + /*! + * \brief Get the type of algorithm used for partitioning the matrix graph. + * \return Algorithm that divides the matrix into partitions that are executed parallely. + */ + unsigned short GetKind_Graph_Part_Algo(void) const { return Kind_Graph_Part_Algo; } + /*! * \brief Get the kind of solver for the implicit solver. * \return Numerical solver for implicit formulation (solving the linear system). @@ -4221,6 +4230,18 @@ class CConfig { */ su2double GetLinear_Solver_Smoother_Relaxation(void) const { return Linear_Solver_Smoother_Relaxation; } + /*! + * \brief Get thread block dimensions (X-axis) being used by the CUDA Kernels. + * \return Thread block dimensions (X-axis) being used by the CUDA Kernels. + */ + unsigned short GetCuda_Block_Size(void) const { return Cuda_Block_Size; } + + /*! + * \brief Get the number of matrix rows assigned per CUDA Block. + * \return The number of matrix rows assigned per CUDA Block. + */ + unsigned short GetRows_Per_Cuda_Block(void) const { return cudaKernelParameters::rounded_up_division(cudaKernelParameters::CUDA_WARP_SIZE, Cuda_Block_Size); } + /*! * \brief Get the relaxation factor for solution updates of adjoint solvers. */ diff --git a/Common/include/geometry/CGeometry.hpp b/Common/include/geometry/CGeometry.hpp index a345f72c046f..100e9beeb3f5 100644 --- a/Common/include/geometry/CGeometry.hpp +++ b/Common/include/geometry/CGeometry.hpp @@ -260,6 +260,13 @@ class CGeometry { unsigned long* nPointCumulative{nullptr}; /*!< \brief Cumulative storage array containing the total number of points on all prior ranks in the linear partitioning. */ + unsigned long nPartition; /*!< \brief Number of divisions of the matrix graph during execution of parallel + partitioning algorithms. */ + unsigned long maxPartitionSize; /*!< \brief Size of the level with the maximum number of elements. 
*/ + vector + partitionOffsets; /*!< \brief Vector array containing the indices at which different parallel partitions begin. */ + vector chainPtr; /*!< \brief Vector array containing distribution of levels into chains. */ + /*--- Data structures for point-to-point MPI communications. ---*/ int maxCountPerPoint{0}; /*!< \brief Maximum number of pieces of data sent per vertex in point-to-point comms. */ diff --git a/Common/include/geometry/CPhysicalGeometry.hpp b/Common/include/geometry/CPhysicalGeometry.hpp index 1148471f1c3e..5ac21523ee25 100644 --- a/Common/include/geometry/CPhysicalGeometry.hpp +++ b/Common/include/geometry/CPhysicalGeometry.hpp @@ -148,6 +148,14 @@ class CPhysicalGeometry final : public CGeometry { */ void DistributeColoring(const CConfig* config, CGeometry* geometry); + /*! + * \brief Divide the graph produced by the matrix into parallel partitions. + * \param[in] config - Definition of the particular problem. + * \param[in] pointList - Ordered list of points in the mesh. + */ + template + void PartitionGraph(const CConfig* config, vector& pointList); + /*! * \brief Distribute the grid points, including ghost points, across all ranks based on a ParMETIS coloring. * \param[in] config - Definition of the particular problem. diff --git a/Common/include/linear_algebra/CGraphPartitioning.hpp b/Common/include/linear_algebra/CGraphPartitioning.hpp new file mode 100644 index 000000000000..44b7d74928c9 --- /dev/null +++ b/Common/include/linear_algebra/CGraphPartitioning.hpp @@ -0,0 +1,182 @@ +/*! + * \file CGraphPartitioning.hpp + * \brief Headers for the classes realted to the algorithms that are used + to divide the matrix acyclic graph into parallel partitions. + * \author A. Raj + * \version 8.2.0 "Harrier" + * + * SU2 Project Website: https://su2code.github.io + * + * The SU2 Project is maintained by the SU2 Foundation + * (http://su2foundation.org) + * + * Copyright 2012-2025, SU2 Contributors (cf. AUTHORS.md) + * + * SU2 is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * SU2 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with SU2. If not, see . + */ + +#pragma once + +#include "../geometry/CGeometry.hpp" + +/*! + * \class CGraphPartitioning + * \brief Abstract base class for defining graph partitioning algorithms. + * \author A. Raj + * + * In order to use certain parallel algorithms in the solution process - + * whether with linear solvers or preconditioners - we require the matrix + * to be partitioned into certain parallel divisions. These maybe in the form + * of levels, blocks, colors and so on. Since a number of different algorithms + * can be used to split the graph, we've introduced a base class containing the + * "Partition" member function from which child classes of the specific + * algorithm can be derived. Currently, we are only using direct declarations + * of the derived classes in the code. 
However, this method was chosen as it + * allows us to pass different child class algorithms to a single implementation + * of the function that requires it - similar to the CMatrixVectorProduct class. + */ + +template + +class CGraphPartitioning { + public: + virtual ~CGraphPartitioning() = 0; + virtual void Partition(vector& pointList, vector& partitionOffsets, + vector& chainPtr, unsigned short chainLimit) = 0; +}; +template +CGraphPartitioning::~CGraphPartitioning() {} + +template + +class CLevelScheduling final : public CGraphPartitioning { + private: + ScalarType nPointDomain; + CPoint* nodes; + + public: + ScalarType nLevels; + ScalarType maxLevelWidth; + vector levels; + + /*! + * \brief constructor of the class + * \param[in] nPointDomain_ref - number of points associated with the problem + * \param[in] nodes_ref - represents the relationships between the points + */ + inline CLevelScheduling(ScalarType nPointDomain_ref, CPoint* nodes_ref) + : nPointDomain(nPointDomain_ref), nodes(nodes_ref) { + nLevels = 0ul; + maxLevelWidth = 0ul; + } + + CLevelScheduling() = delete; // Removing default constructor + + /*! + * \brief Divides the levels into groups of chains depending on the preset GPU block and warp size. + * \param[in] levelOffsets - Represents the vector array containing the ordered list of starting rows of each level. + * \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain. + * \param[in] rowsPerBlock - Represents the maximum number of rows that can be accomodated per CUDA block. + */ + void CalculateChain(vector levelOffsets, vector& chainPtr, unsigned short rowsPerBlock) { + ScalarType levelWidth = 0; + + /*This is not a magic number. We are simply initializing + the point array with its first element that is always zero.*/ + chainPtr.push_back(0); + + for (ScalarType iLevel = 0ul; iLevel < nLevels; iLevel++) { + levelWidth = levelOffsets[iLevel + 1] - levelOffsets[iLevel]; + maxLevelWidth = std::max(levelWidth, maxLevelWidth); + + if (levelWidth > rowsPerBlock) { + if (chainPtr.back() != iLevel) { + chainPtr.push_back(iLevel); + } + + chainPtr.push_back(iLevel + 1); + } + } + + chainPtr.push_back(nLevels); + } + + /*! + * \brief Reorders the points according to the levels + * \param[in] pointList - Ordered array that contains the list of all mesh points. + * \param[in] inversePointList - Array utilized to access the index of each point in pointList. + * \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level. + */ + void Reorder(vector& pointList, vector& inversePointList, vector levelOffsets) { + for (auto localPoint = 0ul; localPoint < nPointDomain; ++localPoint) { + const auto globalPoint = pointList[localPoint]; + inversePointList[levelOffsets[levels[localPoint]]++] = globalPoint; + } + + pointList = std::move(inversePointList); + } + + /*! + * \brief Reorders the points according to the levels + * \param[in] pointList - Ordered array that contains the list of all mesh points. + * \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level. + * \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain. + * \param[in] rowsPerBlock - Represents the maximum number of rows that can be accomodated per CUDA block. 
+ */ + void Partition(vector& pointList, vector& levelOffsets, vector& chainPtr, + unsigned short rowsPerBlock) override { + vector inversePointList; + inversePointList.reserve(nPointDomain); + levels.reserve(nPointDomain); + + for (auto point = 0ul; point < nPointDomain; point++) { + inversePointList[pointList[point]] = point; + levels[point] = 0; + } + + // Local Point - Ordering of the points post the RCM ordering + // Global Point - Original order of the points before the RCM ordering + + for (auto localPoint = 0ul; localPoint < nPointDomain; ++localPoint) { + const auto globalPoint = pointList[localPoint]; + + for (auto adjPoints = 0u; adjPoints < nodes->GetnPoint(globalPoint); adjPoints++) { + const auto adjGlobalPoint = nodes->GetPoint(globalPoint, adjPoints); + + if (adjGlobalPoint < nPointDomain) { + const auto adjLocalPoint = inversePointList[adjGlobalPoint]; + + if (adjLocalPoint < localPoint) { + levels[localPoint] = std::max(levels[localPoint], levels[adjLocalPoint] + 1); + } + } + } + + nLevels = std::max(nLevels, levels[localPoint] + 1); + } + + levelOffsets.resize(nLevels + 1); + for (auto iPoint = 0ul; iPoint < nPointDomain; iPoint++) { + ++levelOffsets[levels[iPoint] + 1]; + } + + for (auto iLevel = 2ul; iLevel <= nLevels; ++iLevel) { + levelOffsets[iLevel] += levelOffsets[iLevel - 1]; + } + + Reorder(pointList, inversePointList, levelOffsets); + + CalculateChain(levelOffsets, chainPtr, rowsPerBlock); + } +}; diff --git a/Common/include/linear_algebra/CPreconditioner.hpp b/Common/include/linear_algebra/CPreconditioner.hpp index 9310225179cc..7f5e601e323a 100644 --- a/Common/include/linear_algebra/CPreconditioner.hpp +++ b/Common/include/linear_algebra/CPreconditioner.hpp @@ -205,7 +205,15 @@ class CLU_SGSPreconditioner final : public CPreconditioner { * \param[out] v - CSysVector that is the result of the preconditioning. */ inline void operator()(const CSysVector& u, CSysVector& v) const override { +#ifdef HAVE_CUDA + if (config->GetCUDA()) { + sparse_matrix.GPUComputeLU_SGSPreconditioner(u, v, geometry, config); + } else { + sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config); + } +#else sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config); +#endif } }; diff --git a/Common/include/linear_algebra/CSysMatrix.hpp b/Common/include/linear_algebra/CSysMatrix.hpp index d3212e7a9f5f..284afeca0411 100644 --- a/Common/include/linear_algebra/CSysMatrix.hpp +++ b/Common/include/linear_algebra/CSysMatrix.hpp @@ -148,8 +148,10 @@ class CSysMatrix { ScalarType* d_matrix; /*!< \brief Device Pointer to store the matrix values on the GPU. */ const unsigned long* d_row_ptr; /*!< \brief Device Pointers to the first element in each row. */ const unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */ - bool useCuda; /*!< \brief Boolean that indicates whether user has enabled CUDA or not. - Mainly used to conditionally free GPU memory in the class destructor. */ + const unsigned long* d_dia_ptr; /*!< \brief Device Column index for each of the elements in val(). */ + unsigned long* d_partition_offsets; + bool useCuda; /*!< \brief Boolean that indicates whether user has enabled CUDA or not. + Mainly used to conditionally free GPU memory in the class destructor. */ ScalarType* ILU_matrix; /*!< \brief Entries of the ILU sparse matrix. */ unsigned long nnz_ilu; /*!< \brief Number of possible nonzero entries in the matrix (ILU). 
*/ @@ -904,8 +906,8 @@ class CSysMatrix { * \param[in] vec - CSysVector to be multiplied by the preconditioner. * \param[out] prod - Result of the product A*vec. */ - void GPUComputeLU_SGSPreconditioner(ScalarType& vec, ScalarType& prod, CGeometry* geometry, - const CConfig* config) const; + void GPUComputeLU_SGSPreconditioner(const CSysVector& vec, CSysVector& prod, + CGeometry* geometry, const CConfig* config) const; /*! * \brief Build the Jacobi preconditioner. diff --git a/Common/include/linear_algebra/GPUComms.cuh b/Common/include/linear_algebra/GPUComms.cuh index 05f40ee4d9b0..7e1109a4ee81 100644 --- a/Common/include/linear_algebra/GPUComms.cuh +++ b/Common/include/linear_algebra/GPUComms.cuh @@ -25,16 +25,72 @@ * License along with SU2. If not, see . */ +#pragma once + #include #include +#include "../option_structure.hpp" + +/*! + * \struct matrixParameters + * \brief Structure containing information related to the Jacobian Matrix which is utilized by any launched Kernel. + * + * This implementation alleviates the need to pass an excessive number of arguments + * to a Kernel and, instead, packages it into a single structure. While this leads + * to data duplication for a short period of time, this is a much cleaner and resuable approach. + * \author A. Raj + */ +struct matrixParameters{ -namespace KernelParameters{ + public: + unsigned long totalRows; /*!< \brief Contains the total number of rows of the Jacbian Matrix. */ + unsigned short blockRowSize; /*!< \brief Contains the row dimensions of the blocks of the Jacobian Matrix. */ + unsigned short blockColSize; /*!< \brief Contains the column dimensions of the blocks of the Jacobian Matrix. */ + unsigned int nChainStart; /*!< \brief Starting partition of the current chain. */ + unsigned int nChainEnd; /*!< \brief Ending partition of the current chain. */ + unsigned short blockSize; /*!< \brief Contains the total number of elements in each block of the Jacbian Matrix. */ + unsigned short activeThreads; /*!< \brief Cotains the number of active threads per iteration during MVP - depending on the + dimensions of the Jacbian Matrix. */ + unsigned short rowsPerBlock; /*!< \brief Number of rows being processed by each thread block. This is equal to the number + of warps present in the block as each row gets assigned a warp. */ -inline constexpr int round_up_division(const int multiple, int x) { return ((x + multiple - 1) / multiple); } + matrixParameters(unsigned long nPointDomain, unsigned long nEqn, unsigned long nVar, unsigned long nPartitions, unsigned short rowsPrBlck){ + totalRows = nPointDomain; + blockRowSize = nEqn; + blockColSize = nVar; + nChainStart = 0; + nChainEnd = 0; + blockSize = nVar * nEqn; + activeThreads = nVar * (cudaKernelParameters::CUDA_WARP_SIZE/nVar); + rowsPerBlock = rowsPrBlck; + } + + /*! + * \brief Returns the memory index in the shared memory array used by the Symmetric Iteration Kernels. + */ + __device__ __forceinline__ unsigned short shrdMemIndex(unsigned short localRow, unsigned short threadNo){ + return (localRow * blockSize + threadNo); + } + + /*! + * \brief Returns a boolean value to check whether the row is under the total number of rows and if the + * thread number is within a user-specified thread limit. This is to avoid illegal memory accesses. + */ + __device__ __forceinline__ bool validAccess(unsigned long row, unsigned short threadNo, unsigned short threadLimit){ + return (row Blending_Map = { MakePair("BEZIER", BEZIER) }; +/*! 
+ * \brief Types of graph partitioning algorithms for parallel computing + */ +enum ENUM_GRAPH_PART_ALGORITHM { + LEVEL_SCHEDULING, /*!< \brief Partitions the graphs according to level-set algorithm. */ +}; +static const MapType Graph_Part_Map = { + MakePair("LEVEL_SCHEDULING", LEVEL_SCHEDULING) +}; + /*! * \brief Types of solvers for solving linear systems */ diff --git a/Common/src/CConfig.cpp b/Common/src/CConfig.cpp index af17bd4e6f5c..4418a8917291 100644 --- a/Common/src/CConfig.cpp +++ b/Common/src/CConfig.cpp @@ -1860,6 +1860,9 @@ void CConfig::SetConfig_Options() { /*!\par CONFIG_CATEGORY: Linear solver definition \ingroup Config*/ /*--- Options related to the linear solvers ---*/ + /*!\brief GRAPH_PARTIONING + * \n DESCRIPTION: Algorithm for partioning the matrix graph to facilitate parallel execution of linear algebra subroutines\n OPTIONS: see \link Graph_Part_Map \endlink \n DEFAULT: LEVEL_SCHEDULING \ingroup Config*/ + addEnumOption("GRAPH_PART_ALGORITHM", Kind_Graph_Part_Algo, Graph_Part_Map, LEVEL_SCHEDULING); /*!\brief LINEAR_SOLVER * \n DESCRIPTION: Linear solver for the implicit, mesh deformation, or discrete adjoint systems \n OPTIONS: see \link Linear_Solver_Map \endlink \n DEFAULT: FGMRES \ingroup Config*/ addEnumOption("LINEAR_SOLVER", Kind_Linear_Solver, Linear_Solver_Map, FGMRES); @@ -1898,7 +1901,8 @@ void CConfig::SetConfig_Options() { addEnumOption("DISCADJ_LIN_SOLVER", Kind_DiscAdj_Linear_Solver, Linear_Solver_Map, FGMRES); /* DESCRIPTION: Preconditioner for the discrete adjoint Krylov linear solvers */ addEnumOption("DISCADJ_LIN_PREC", Kind_DiscAdj_Linear_Prec, Linear_Solver_Prec_Map, ILU); - /* DESCRIPTION: Linear solver for the discete adjoint systems */ + /* DESCRIPTION: Thread block size for CUDA GPUs */ + addUnsignedShortOption("CUDA_BLOCK_SIZE", Cuda_Block_Size, 1024); /*!\par CONFIG_CATEGORY: Convergence\ingroup Config*/ /*--- Options related to convergence ---*/ diff --git a/Common/src/geometry/CPhysicalGeometry.cpp b/Common/src/geometry/CPhysicalGeometry.cpp index c97d2453541e..bc6cd5258270 100644 --- a/Common/src/geometry/CPhysicalGeometry.cpp +++ b/Common/src/geometry/CPhysicalGeometry.cpp @@ -26,6 +26,7 @@ */ #include "../../include/geometry/CPhysicalGeometry.hpp" +#include "../../include/linear_algebra/CGraphPartitioning.hpp" #include "../../include/adt/CADTPointsOnlyClass.hpp" #include "../../include/toolboxes/printing_toolbox.hpp" #include "../../include/toolboxes/CLinearPartitioner.hpp" @@ -699,6 +700,21 @@ void CPhysicalGeometry::DistributeColoring(const CConfig* config, CGeometry* geo delete[] nPoint_Flag; } +template +void CPhysicalGeometry::PartitionGraph(const CConfig* config, vector& pointList) { + unsigned short KindAlgorithm = config->GetKind_Graph_Part_Algo(); + partitionOffsets.reserve(nPointDomain); + + switch (KindAlgorithm) { + case LEVEL_SCHEDULING: + auto levelSchedule = CLevelScheduling(nPointDomain, nodes); + levelSchedule.Partition(pointList, partitionOffsets, chainPtr, config->GetRows_Per_Cuda_Block()); + nPartition = levelSchedule.nLevels; + maxPartitionSize = levelSchedule.maxLevelWidth; + break; + } +} + void CPhysicalGeometry::DistributeVolumeConnectivity(const CConfig* config, CGeometry* geometry, unsigned short Elem_Type) { unsigned short NODES_PER_ELEMENT = 0; @@ -4542,6 +4558,9 @@ void CPhysicalGeometry::SetRCM_Ordering(CConfig* config) { if (!status) SU2_MPI::Error("RCM ordering failed", CURRENT_FUNCTION); } + /*Partition graph into parallel constituents for GPU Operations*/ + if (config->GetCUDA()) 
PartitionGraph(config, Result); + /*--- Add the MPI points ---*/ for (auto iPoint = nPointDomain; iPoint < nPoint; iPoint++) { Result.push_back(iPoint); diff --git a/Common/src/linear_algebra/CSysMatrix.cpp b/Common/src/linear_algebra/CSysMatrix.cpp index c0a6aa81ee04..8764bd11bba9 100644 --- a/Common/src/linear_algebra/CSysMatrix.cpp +++ b/Common/src/linear_algebra/CSysMatrix.cpp @@ -72,6 +72,7 @@ CSysMatrix::~CSysMatrix() { GPUMemoryAllocation::gpu_free(d_matrix); GPUMemoryAllocation::gpu_free(d_row_ptr); GPUMemoryAllocation::gpu_free(d_col_ind); + GPUMemoryAllocation::gpu_free(d_partition_offsets); } #ifdef USE_MKL @@ -156,9 +157,15 @@ void CSysMatrix::Initialize(unsigned long npoint, unsigned long npoi ptr = GPUMemoryAllocation::gpu_alloc_cpy(src_ptr, num * sizeof(const unsigned long)); }; + auto GPUVectorAllocAndCopy = [](unsigned long*& ptr, vector& src_ptr, unsigned long num) { + ptr = GPUMemoryAllocation::gpu_alloc_cpy(&src_ptr[0], num * sizeof(unsigned long)); + }; + GPUAllocAndInit(d_matrix, nnz * nVar * nEqn); GPUAllocAndCopy(d_row_ptr, row_ptr, (nPointDomain + 1.0)); GPUAllocAndCopy(d_col_ind, col_ind, nnz); + GPUAllocAndCopy(d_dia_ptr, dia_ptr, nPointDomain); + GPUVectorAllocAndCopy(d_partition_offsets, geometry->partitionOffsets, geometry->nPartition + 1); } if (needTranspPtr) col_ptr = geometry->GetTransposeSparsePatternMap(type).data(); diff --git a/Common/src/linear_algebra/CSysMatrixGPU.cu b/Common/src/linear_algebra/CSysMatrixGPU.cu index d63430c96f96..d306d6e3d31f 100644 --- a/Common/src/linear_algebra/CSysMatrixGPU.cu +++ b/Common/src/linear_algebra/CSysMatrixGPU.cu @@ -1,6 +1,15 @@ /*! * \file CSysMatrixGPU.cu * \brief Implementations of Kernels and Functions for Matrix Operations on the GPU + * + * The kernel implementations will feature a lot of if-statements. + * The reason for such heavy usage of conditionals is to do a check + * whether the memory locations being accessed by the threads are + * within bounds or not. Usually the entire kernel is "wrapped" in + * a single conditional for these checks. But, in our case, it is + * necessary for us to use intermittent synchronization barriers like + * __syncthreads() which will lead to thread divergence issues if used + * inside a conditional. * \author A. 
Raj * \version 8.2.0 "Harrier" * @@ -26,34 +35,251 @@ */ #include "../../include/linear_algebra/CSysMatrix.hpp" +#include "../../include/geometry/CGeometry.hpp" #include "../../include/linear_algebra/GPUComms.cuh" +using namespace cudaKernelParameters; + +template +__device__ void DeviceGaussElimination(matrixType* matrixCopy, vectorType* prod, unsigned long row, unsigned int threadNo, bool rowInPartition, matrixParameters matrixParam) +{ + auto matrixIndex = [=](unsigned short x, unsigned short y){ + return (x * matrixParam.blockColSize + y); + }; + + auto vectorIndex = [=](unsigned short row, unsigned short elemNo){ + return (row * matrixParam.blockRowSize + elemNo); + }; + + unsigned short x = threadNo/matrixParam.blockRowSize; + unsigned short y = threadNo % matrixParam.blockColSize; + + matrixType pivot, weight = 0.0; + + __syncthreads(); + + for(unsigned short currentRow=0; currentRow < matrixParam.blockRowSize; currentRow++) + { + if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockSize)){ + pivot = matrixCopy[matrixIndex(currentRow, currentRow)]; + } + __syncthreads(); + + if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockSize)){ + + if(x==currentRow) + { + matrixCopy[matrixIndex(currentRow, y)] /= pivot; + if(y==0) prod[vectorIndex(row, currentRow)] /= pivot; + } + + weight = matrixCopy[matrixIndex(x, currentRow)]; + + if(x!=currentRow) + { + matrixCopy[matrixIndex(x, y)] -= weight * matrixCopy[matrixIndex(currentRow, y)]; + if(y==0) prod[vectorIndex(row, x)] -= weight * prod[vectorIndex(row, currentRow)]; + } + } + + __syncthreads(); + } + +} + +template +__global__ void FirstSymmetricIterationKernel(matrixType* matrix, vectorType* vec, vectorType* prod, const unsigned long* d_partition_offsets, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, const unsigned long* d_dia_ptr, matrixParameters matrixParam) +{ + /*--- First part of the symmetric iteration: (D+L).x* = b ---*/ + + auto matrixIndex = [=](unsigned long row, unsigned short threadNo, const unsigned long* ptr_array){ + return (ptr_array[row] * matrixParam.blockSize + threadNo); + }; + + auto vectorIndex = [=](unsigned long row, unsigned short elemNo){ + return (row * matrixParam.blockRowSize + elemNo); + }; + + auto vectorMultIndex = [=](unsigned short blockNo, unsigned short blockCol){ + return (d_col_ind[blockNo] * matrixParam.blockColSize + blockCol); + }; + + unsigned long origRow = (blockIdx.x * blockDim.x + threadIdx.x)/CUDA_WARP_SIZE; + unsigned short localRow = origRow % matrixParam.rowsPerBlock; + unsigned short threadNo = threadIdx.x % CUDA_WARP_SIZE; + + unsigned short blockCol = threadNo % matrixParam.blockColSize; + + extern __shared__ matrixType tempBuffer[]; + + /* Forward Substitution Algorithm with Gaussian Elimination. */ + for(auto iPartition = matrixParam.nChainStart; iPartition < matrixParam.nChainEnd; iPartition++) + { + + /* We move to the set of rows present in the current partition. 
*/ + unsigned long row = origRow + d_partition_offsets[iPartition]; + bool rowInPartition = (row < d_partition_offsets[iPartition + 1]); + + if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockRowSize)) { + + tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)] = 0.0; + } + + __syncthreads(); + + if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.activeThreads)) { + + for(unsigned long index = matrixIndex(row, threadNo, d_row_ptr); index < matrixIndex(row, 0, d_dia_ptr); index+=matrixParam.activeThreads) + { + + unsigned short blockNo = index/matrixParam.blockSize; + unsigned short blockRow = (index/matrixParam.blockColSize) % matrixParam.blockRowSize; + + atomicAdd(&tempBuffer[matrixParam.shrdMemIndex(localRow, blockRow)], matrix[index] * prod[vectorMultIndex(blockNo, blockCol)]); // Compute L.x* + } + } + + __syncthreads(); + + if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockSize)) { + + if(threadNo < matrixParam.blockRowSize) prod[vectorIndex(row, threadNo)] = vec[vectorIndex(row, threadNo)] - tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)]; // Compute y = b - L.x* + + /* Reinitialize the shared memory to hold the matrix diagonal elements now. */ + tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)] = matrix[matrixIndex(row, threadNo, d_dia_ptr)]; + } + + DeviceGaussElimination(&tempBuffer[matrixParam.shrdMemIndex(localRow, 0)], prod, row, threadNo, rowInPartition, matrixParam); // Solve D.x* = y + + __syncthreads(); + + } +} + + +template +__global__ void SecondSymmetricIterationKernel(matrixType* matrix, vectorType* prod, const unsigned long* d_partition_offsets, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, const unsigned long* d_dia_ptr, matrixParameters matrixParam) +{ + + /*--- Second part of the symmetric iteration: (D+U).x_(1) = D.x* ---*/ + + auto matrixIndex = [=](unsigned long row, unsigned short threadNo, const unsigned long* ptr_array){ + return (ptr_array[row] * matrixParam.blockSize + threadNo); + }; + + auto vectorIndex = [=](unsigned long row, unsigned short elemNo){ + return (row * matrixParam.blockRowSize + elemNo); + }; + + auto vectorMultIndex = [=](unsigned short blockNo, unsigned short blockCol){ + return (d_col_ind[blockNo] * matrixParam.blockColSize + blockCol); + }; + + unsigned long origRow = (blockIdx.x * blockDim.x + threadIdx.x)/CUDA_WARP_SIZE; + unsigned short localRow = origRow % matrixParam.rowsPerBlock; + unsigned short threadNo = threadIdx.x % CUDA_WARP_SIZE; + + unsigned short blockCol = threadNo % matrixParam.blockColSize; + + extern __shared__ matrixType tempBuffer[]; + + /* Backward Substitution Algorithm with Gaussian Elimination. */ + for(auto iPartition = matrixParam.nChainStart; iPartition > matrixParam.nChainEnd; iPartition--) + { + + /* We move to the set of rows present in the current partition. 
*/ + unsigned long row = origRow + d_partition_offsets[iPartition - 1]; + bool rowInPartition = (row < d_partition_offsets[iPartition]); + + if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockRowSize)) { + + tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)] = 0.0; + } + + __syncthreads(); + + if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.activeThreads)) { + + for(unsigned long index = matrixIndex(row, threadNo, d_dia_ptr); index < matrixIndex(row, matrixParam.blockSize, d_dia_ptr); index+=matrixParam.activeThreads) + { + + unsigned short blockNo = index/matrixParam.blockSize; + unsigned short blockRow = (index/matrixParam.blockColSize) % matrixParam.blockRowSize; + + atomicAdd(&tempBuffer[matrixParam.shrdMemIndex(localRow, blockRow)], matrix[index] * prod[vectorMultIndex(blockNo, blockCol)]); // Compute D.x* + } + + for(unsigned long index = matrixIndex(row, threadNo + matrixParam.blockSize, d_dia_ptr); index < matrixIndex(row + 1, 0, d_row_ptr); index+=matrixParam.activeThreads) + { + + unsigned short blockNo = index/matrixParam.blockSize; + unsigned short blockRow = (index/matrixParam.blockColSize) % matrixParam.blockRowSize; + + atomicAdd(&tempBuffer[matrixParam.shrdMemIndex(localRow, blockRow)], -(matrix[index] * prod[vectorMultIndex(blockNo, blockCol)])); // Compute y = D.x*-U.x_(n+1) + } + } + + __syncthreads(); + + if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockSize)) { + + if(threadNo < matrixParam.blockRowSize) prod[vectorIndex(row, threadNo)] = tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)]; + + /* Reinitialize the memory to hold the matrix diagonal elements now. */ + tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)] = matrix[matrixIndex(row, threadNo, d_dia_ptr)]; + } + + DeviceGaussElimination(&tempBuffer[matrixParam.shrdMemIndex(localRow, 0)], prod, row, threadNo, rowInPartition, matrixParam); // Solve D.x* = y + + __syncthreads(); + } +} + template -__global__ void GPUMatrixVectorProductAdd(matrixType* matrix, vectorType* vec, vectorType* prod, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, unsigned long nPointDomain, unsigned long nVar, unsigned long nEqn) +__global__ void MatrixVectorProductKernel(matrixType* matrix, vectorType* vec, vectorType* prod, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, matrixParameters matrixParam) { - int row = (blockIdx.x * blockDim.x + threadIdx.x)/32; - int threadNo = threadIdx.x%32; - int activeThreads = nVar * (32/nVar); + auto matrixIndex = [=](unsigned long row, unsigned short threadNo) { + return (d_row_ptr[row] * matrixParam.blockSize + threadNo); + }; + + auto vectorIndex = [=](unsigned long row, unsigned short elemNo) { + return (row * matrixParam.blockRowSize + elemNo); + }; + + unsigned long row = (blockIdx.x * blockDim.x + threadIdx.x)/CUDA_WARP_SIZE; + unsigned short threadNo = threadIdx.x % CUDA_WARP_SIZE; + unsigned short localRow = row % matrixParam.rowsPerBlock; - int blockRow = (threadNo/nVar)%nVar; + unsigned short blockCol = threadNo % matrixParam.blockColSize; - if(row @@ -74,15 +300,55 @@ void CSysMatrix::GPUMatrixVectorProduct(const CSysVector vec.HtDTransfer(); prod.GPUSetVal(0.0); - dim3 blockDim(KernelParameters::MVP_BLOCK_SIZE,1,1); - int gridx = KernelParameters::round_up_division(KernelParameters::MVP_WARP_SIZE, nPointDomain); + matrixParameters matrixParam(nPointDomain, nEqn, nVar, geometry->nPartition, config->GetRows_Per_Cuda_Block()); + + dim3 
blockDim(config->GetCuda_Block_Size(),1,1); + unsigned int gridx = rounded_up_division(config->GetCuda_Block_Size(), matrixParam.totalRows * CUDA_WARP_SIZE); dim3 gridDim(gridx, 1, 1); - GPUMatrixVectorProductAdd<<>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, nPointDomain, nVar, nEqn); + MatrixVectorProductKernel<<>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, matrixParam); gpuErrChk( cudaPeekAtLastError() ); prod.DtHTransfer(); } -template class CSysMatrix; //This is a temporary fix for invalid instantiations due to separating the member function from the header file the class is defined in. Will try to rectify it in coming commits. +template +void CSysMatrix::GPUComputeLU_SGSPreconditioner(const CSysVector& vec, CSysVector& prod, + CGeometry* geometry, const CConfig* config) const + { + ScalarType* d_vec = vec.GetDevicePointer(); + ScalarType* d_prod = prod.GetDevicePointer(); + + HtDTransfer(); + vec.HtDTransfer(); + prod.HtDTransfer(); + + matrixParameters matrixParam(nPointDomain, nEqn, nVar, geometry->nPartition, config->GetRows_Per_Cuda_Block()); + + dim3 blockDim(matrixParam.rowsPerBlock * CUDA_WARP_SIZE,1,1); + unsigned int gridx = rounded_up_division(matrixParam.rowsPerBlock, geometry->maxPartitionSize); + dim3 gridDim(gridx, 1, 1); + + for(auto elem = geometry->chainPtr.begin(); elem != geometry->chainPtr.end() - 1; elem++) + { + matrixParam.nChainStart = *(elem); + matrixParam.nChainEnd = *(elem + 1); + + FirstSymmetricIterationKernel<<>>(d_matrix, d_vec, d_prod, d_partition_offsets, d_row_ptr, d_col_ind, d_dia_ptr, matrixParam); + gpuErrChk( cudaPeekAtLastError() ); + } + + for(auto elem = geometry->chainPtr.rbegin(); elem != geometry->chainPtr.rend() - 1; elem++) + { + matrixParam.nChainStart = *(elem); + matrixParam.nChainEnd = *(elem + 1); + + SecondSymmetricIterationKernel<<>>(d_matrix, d_prod, d_partition_offsets, d_row_ptr, d_col_ind, d_dia_ptr, matrixParam); + gpuErrChk( cudaPeekAtLastError() ); + } + + prod.DtHTransfer(); + } + +template class CSysMatrix; // This is a temporary fix for invalid instantiations due to separating the member function from the header file the class is defined in. Will try to rectify it in coming commits. diff --git a/meson.build b/meson.build index e550d57893b7..66c90c64a4f8 100644 --- a/meson.build +++ b/meson.build @@ -19,7 +19,7 @@ python = pymod.find_installation() if get_option('enable-cuda') add_languages('cuda') - add_global_arguments('-arch=sm_86', language : 'cuda') + add_global_arguments('-arch=native', language : 'cuda') endif su2_cpp_args = []
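
Note on the partitioning scheme introduced by this patch: `CLevelScheduling::Partition` assigns every matrix row to the lowest level compatible with its lower-triangular (already-numbered) neighbours, builds `levelOffsets` with a counting sort, reorders the rows level by level, and `CalculateChain` then groups consecutive narrow levels into chains whose rows fit within a single thread block of `rowsPerBlock` rows. The host-only sketch below reproduces the level assignment, the offsets, and the chain grouping on a toy dependency pattern (it omits the reordering step); the adjacency data and the `rowsPerBlock` value are invented for illustration and are not part of the patch.

```cpp
// Minimal host-side sketch of the level-scheduling idea used by CLevelScheduling:
// each row goes to the smallest level strictly greater than the levels of all its
// lower-indexed neighbours, so every row inside one level can be solved in parallel.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // Toy lower-triangular dependency pattern (row -> lower-indexed rows it depends on).
  const std::vector<std::vector<int>> deps = {{}, {0}, {0}, {1, 2}, {}, {3, 4}};
  const int nRows = static_cast<int>(deps.size());
  const int rowsPerBlock = 1;  // stand-in for CUDA_BLOCK_SIZE / warp size, kept tiny
                               // so the chain split below is visible

  // Level assignment: level(i) = 1 + max(level(j)) over dependencies j < i.
  std::vector<int> level(nRows, 0);
  int nLevels = 0;
  for (int i = 0; i < nRows; ++i) {
    for (int j : deps[i]) level[i] = std::max(level[i], level[j] + 1);
    nLevels = std::max(nLevels, level[i] + 1);
  }

  // Counting sort of rows into levelOffsets (CSR-style prefix sum), as in Partition().
  std::vector<int> levelOffsets(nLevels + 1, 0);
  for (int i = 0; i < nRows; ++i) ++levelOffsets[level[i] + 1];
  for (int l = 1; l <= nLevels; ++l) levelOffsets[l] += levelOffsets[l - 1];

  // Chain grouping, mirroring CalculateChain(): a level wider than rowsPerBlock is
  // isolated into its own chain; runs of narrow levels stay merged together.
  std::vector<int> chainPtr{0};
  for (int l = 0; l < nLevels; ++l) {
    const int width = levelOffsets[l + 1] - levelOffsets[l];
    if (width > rowsPerBlock) {
      if (chainPtr.back() != l) chainPtr.push_back(l);
      chainPtr.push_back(l + 1);
    }
  }
  chainPtr.push_back(nLevels);

  for (int i = 0; i < nRows; ++i) std::printf("row %d -> level %d\n", i, level[i]);
  for (int p : chainPtr) std::printf("chainPtr entry: %d\n", p);
  return 0;
}
```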
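
A second quick check that may help reviewers: with the defaults registered in CConfig.cpp (GRAPH_PART_ALGORITHM= LEVEL_SCHEDULING, CUDA_BLOCK_SIZE= 1024), the launch geometry used by `GPUMatrixVectorProduct` and `GPUComputeLU_SGSPreconditioner` follows from assigning one warp per matrix row. The arithmetic below is a sketch only: it assumes `rounded_up_division` keeps the `(multiple, x)` argument order of the old `round_up_division` helper, that `CUDA_WARP_SIZE` is 32, and the matrix and level sizes are invented.

```cpp
// Back-of-the-envelope check of the launch configuration implied by the patch.
#include <cstdio>

constexpr unsigned long roundedUpDivision(unsigned long multiple, unsigned long x) {
  return (x + multiple - 1) / multiple;  // ceil(x / multiple), as in the old helper
}

int main() {
  constexpr unsigned long warpSize = 32;           // assumed CUDA_WARP_SIZE
  constexpr unsigned long cudaBlockSize = 1024;    // CUDA_BLOCK_SIZE default
  constexpr unsigned long totalRows = 100000;      // nPointDomain (example value)
  constexpr unsigned long maxPartitionSize = 5000; // widest level (example value)

  // One warp per matrix row => rows handled by each thread block (GetRows_Per_Cuda_Block).
  const unsigned long rowsPerBlock = roundedUpDivision(warpSize, cudaBlockSize);  // 32

  // Matrix-vector product: enough blocks so every row gets its own warp.
  const unsigned long mvpGrid = roundedUpDivision(cudaBlockSize, totalRows * warpSize);

  // LU-SGS kernels: the grid only needs to cover the widest level of a chain.
  const unsigned long sgsBlockDim = rowsPerBlock * warpSize;
  const unsigned long sgsGrid = roundedUpDivision(rowsPerBlock, maxPartitionSize);

  std::printf("rowsPerBlock=%lu mvpGrid=%lu sgsBlockDim=%lu sgsGrid=%lu\n",
              rowsPerBlock, mvpGrid, sgsBlockDim, sgsGrid);
  return 0;
}
```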