diff --git a/CMakeLists.txt b/CMakeLists.txt
index e1729bd2..709e6dd0 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,7 +7,7 @@ else()
 endif()
 
 # Check required CMake version
-set(REQUIRED_CMAKE_VERSION "3.18.0")
+set(REQUIRED_CMAKE_VERSION "3.24.0")
 if(POLYSOLVE_TOPLEVEL_PROJECT)
     cmake_minimum_required(VERSION ${REQUIRED_CMAKE_VERSION})
     SET(CMAKE_POLICY_VERSION_MINIMUM ${REQUIRED_CMAKE_VERSION})
@@ -75,6 +75,7 @@ endif()
 # Polysolve options
 option(POLYSOLVE_WITH_SANITIZERS "Enable sanitizers in compilation targets" OFF)
 option(POLYSOLVE_WITH_UNICODE "Use Unicode in logging messages" ON)
+option(POLYSOLVE_WITH_CUDA "Enable CUDA support" OFF)
 
 # Polysolve options for enabling/disabling optional libraries
 option(POLYSOLVE_WITH_ACCELERATE "Enable Apple Accelerate" ${POLYSOLVE_ON_APPLE_SILICON})
@@ -148,20 +149,79 @@ if(POLYSOLVE_WITH_MKL)
     set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDLL" CACHE STRING "Select the MSVC runtime library")
 endif()
 
+# Add empty library targets. Set them up later.
+add_library(polysolve)
+add_library(polysolve::polysolve ALIAS polysolve)
+add_library(polysolve_linear)
+add_library(polysolve::linear ALIAS polysolve_linear)
+
+################################################################################
+# CUDA
+################################################################################
+
+if (POLYSOLVE_WITH_CUDA)
+    # If CMAKE_CUDA_ARCHITECTURES was not specified, set it to native.
+    if(DEFINED CMAKE_CUDA_ARCHITECTURES)
+        message(STATUS "CMAKE_CUDA_ARCHITECTURES was specified, skipping auto-detection")
+    else()
+        message(STATUS "CMAKE_CUDA_ARCHITECTURES was not specified, setting it to native")
+        set(CMAKE_CUDA_ARCHITECTURES "native")
+    endif()
+    message(STATUS "Targeting CUDA_ARCHITECTURES \"${CMAKE_CUDA_ARCHITECTURES}\"")
+
+    # Enable CUDA support
+    enable_language(CUDA)
+    set(CMAKE_CUDA_STANDARD 17)
+    set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+
+    set_target_properties(polysolve PROPERTIES
+        CUDA_SEPARABLE_COMPILATION ON
+        CUDA_RESOLVE_DEVICE_SYMBOLS ON
+    )
+    set_target_properties(polysolve_linear PROPERTIES
+        CUDA_SEPARABLE_COMPILATION ON
+        CUDA_RESOLVE_DEVICE_SYMBOLS ON
+    )
+
+    # Enable __device__ lambda
+    target_compile_options(polysolve PRIVATE
+        $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>
+    )
+    target_compile_options(polysolve_linear PRIVATE
+        $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>
+    )
+    # Relax device constexpr usage
+    target_compile_options(polysolve PRIVATE
+        $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
+    )
+    target_compile_options(polysolve_linear PRIVATE
+        $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
+    )
+
+    if(APPLE)
+        # We need to add the path to the driver (libcuda.dylib) as an rpath,
+        # so that the static cuda runtime can find it at runtime.
+        set_property(TARGET polysolve
+            PROPERTY
+            BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
+    endif()
+endif()
+
 ################################################################################
 # PolySolve Library
 ################################################################################
 
-# Add an empty library and fill in the list of sources in `src/CMakeLists.txt`.
-add_library(polysolve_linear)
-add_library(polysolve::linear ALIAS polysolve_linear)
 add_subdirectory(src/polysolve)
 add_subdirectory(src/polysolve/linear)
 
-add_library(polysolve)
-add_library(polysolve::polysolve ALIAS polysolve)
 add_subdirectory(src/polysolve/nonlinear)
 
+# Ensure that changes in the CUDA linear library
+# trigger relinking of the aggregate polysolve
+# static library (and its CUDA device-link step).
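+# (LINK_DEPENDS adds a file-level dependency to the link step, so a rebuilt
+# polysolve_linear archive forces polysolve to relink rather than reuse a
+# stale device link.)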
+set_property(TARGET polysolve APPEND PROPERTY LINK_DEPENDS
+    "$<TARGET_FILE:polysolve_linear>")
+
 target_link_libraries(polysolve_linear PUBLIC polysolve_coverage_config)
 target_link_libraries(polysolve PUBLIC polysolve_coverage_config)
 
@@ -180,6 +240,11 @@ if(POLYSOLVE_LARGE_INDEX)
     target_compile_definitions(polysolve_linear PUBLIC POLYSOLVE_LARGE_INDEX)
 endif()
 
+if(POLYSOLVE_WITH_CUDA)
+    target_compile_definitions(polysolve PUBLIC POLYSOLVE_WITH_CUDA)
+    target_compile_definitions(polysolve_linear PUBLIC POLYSOLVE_WITH_CUDA)
+endif()
+
 target_compile_definitions(polysolve_linear PRIVATE POLYSOLVE_LINEAR_SPEC="${PROJECT_SOURCE_DIR}/linear-solver-spec.json")
 target_compile_definitions(polysolve PRIVATE POLYSOLVE_NON_LINEAR_SPEC="${PROJECT_SOURCE_DIR}/nonlinear-solver-spec.json")
 target_compile_definitions(polysolve_linear PUBLIC POLYSOLVE_JSON_SPEC_DIR="${PROJECT_SOURCE_DIR}")
@@ -189,6 +254,13 @@ target_compile_definitions(polysolve_linear PUBLIC POLYSOLVE_JSON_SPEC_DIR="${PR
 # Dependencies
 ################################################################################
 
+# CUDA runtime and cuSparse
+if (POLYSOLVE_WITH_CUDA)
+    find_package(CUDAToolkit REQUIRED)
+    target_link_libraries(polysolve PUBLIC CUDA::cudart CUDA::cusparse)
+    target_link_libraries(polysolve_linear PUBLIC CUDA::cudart CUDA::cusparse)
+endif()
+
 # ------
 # Linear
 # ------
@@ -229,8 +301,8 @@ if(POLYSOLVE_WITH_HYPRE)
     include(hypre)
     target_link_libraries(polysolve_linear PUBLIC HYPRE::HYPRE)
     target_compile_definitions(polysolve_linear PUBLIC POLYSOLVE_WITH_HYPRE)
-    if(HYPRE_WITH_MPI)
-        target_compile_definitions(polysolve_linear PUBLIC HYPRE_WITH_MPI)
+    if(HYPRE_ENABLE_MPI)
+        target_compile_definitions(polysolve_linear PUBLIC HYPRE_ENABLE_MPI)
     endif()
 endif()
 
@@ -304,6 +376,10 @@ endif()
 
 # cuSolver solvers
 if(POLYSOLVE_WITH_CUSOLVER)
+    if (NOT POLYSOLVE_WITH_CUDA)
+        message(FATAL_ERROR "POLYSOLVE_WITH_CUDA must be ON when POLYSOLVE_WITH_CUSOLVER is enabled")
+    endif()
+
     include(cusolverdn)
     if(TARGET CUDA::cusolver)
         target_link_libraries(polysolve_linear PUBLIC CUDA::cusolver)
@@ -331,6 +407,21 @@ if(POLYSOLVE_WITH_UNICODE)
     endif()
 endif()
 
+# CCCL
+if(POLYSOLVE_WITH_CUDA)
+    include(cccl)
+    target_link_libraries(polysolve_linear PUBLIC CCCL::CCCL)
+    target_link_libraries(polysolve PUBLIC CCCL::CCCL)
+endif()
+
+# KaMinPar (requires oneTBB)
+if(POLYSOLVE_WITH_CUDA)
+    include(kaminpar)
+    target_link_libraries(polysolve_linear PRIVATE KaMinPar::KaMinPar)
+endif()
+
+
 # ---------
 # Nonlinear
 # ---------
diff --git a/cmake/polysolve/polysolve_warnings.cmake b/cmake/polysolve/polysolve_warnings.cmake
index 20b16a61..cbad7bfa 100644
--- a/cmake/polysolve/polysolve_warnings.cmake
+++ b/cmake/polysolve/polysolve_warnings.cmake
@@ -10,7 +10,6 @@ endif()
 set(POLYSOLVE_WARNING_FLAGS
     -Wall
     -Wextra
-    -pedantic
 
     # -Wconversion
     #-Wunsafe-loop-optimizations # broken with C++11 loops
diff --git a/cmake/recipes/amgcl.cmake b/cmake/recipes/amgcl.cmake
index 300f7479..53849d5f 100644
--- a/cmake/recipes/amgcl.cmake
+++ b/cmake/recipes/amgcl.cmake
@@ -48,6 +48,18 @@ function(amgcl_import_target)
     )
 
     target_link_libraries(amgcl INTERFACE Boost::boost)
+
+    # AMGCL pollutes all downstream targets with its aggressive diagnostic flags,
+    # which makes CUDA code emit warnings on nearly every line, because nvcc
+    # generates gcc-specific intrinsics. Use a hacky regex replacement to purge
+    # -Wpedantic, -Wall, and -Wextra.
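+    # (The flags live on the imported target's INTERFACE_COMPILE_OPTIONS, so we
+    # read that property, strip the offending flags, and write it back below.)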
+    get_target_property(AMGCL_OPTIONS amgcl INTERFACE_COMPILE_OPTIONS)
+    if(AMGCL_OPTIONS)
+        string(REGEX REPLACE ";?-Wpedantic" "" AMGCL_OPTIONS_CLEAN "${AMGCL_OPTIONS}")
+        string(REGEX REPLACE ";?-Wextra" "" AMGCL_OPTIONS_CLEAN "${AMGCL_OPTIONS_CLEAN}")
+        string(REGEX REPLACE ";?-Wall" "" AMGCL_OPTIONS_CLEAN "${AMGCL_OPTIONS_CLEAN}")
+        set_target_properties(amgcl PROPERTIES INTERFACE_COMPILE_OPTIONS "${AMGCL_OPTIONS_CLEAN}")
+    endif()
+
 endfunction()
 
 amgcl_import_target()
diff --git a/cmake/recipes/cccl.cmake b/cmake/recipes/cccl.cmake
new file mode 100644
index 00000000..d335dacb
--- /dev/null
+++ b/cmake/recipes/cccl.cmake
@@ -0,0 +1,15 @@
+# cccl (https://github.com/NVIDIA/cccl)
+# License: Apache
+
+if(TARGET CCCL::CCCL)
+    return()
+endif()
+
+message(STATUS "Third-party: creating target 'CCCL::CCCL'")
+
+include(CPM)
+CPMAddPackage(
+    NAME CCCL
+    GITHUB_REPOSITORY NVIDIA/cccl
+    VERSION 3.3.0
+)
diff --git a/cmake/recipes/cusolverdn.cmake b/cmake/recipes/cusolverdn.cmake
index 43657ac0..a2215fb8 100644
--- a/cmake/recipes/cusolverdn.cmake
+++ b/cmake/recipes/cusolverdn.cmake
@@ -6,55 +6,11 @@ endif()
 
 message(STATUS "Third-party: creating targets 'CUDA::cusolver'")
 
-include(CheckLanguage)
-check_language(CUDA)
-if(CMAKE_CUDA_COMPILER)
-    enable_language(CUDA)
-
-    # We do not have a build recipe for this, so find it as a system installed library.
-    find_package(CUDAToolkit)
-    if(CUDAToolkit_FOUND)
-        set(CUDA_SEPARABLE_COMPILATION ON)
-        set(CUDA_PROPAGATE_HOST_FLAGS OFF)
-        set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -use_fast_math --expt-relaxed-constexpr -gencode arch=compute_86,code=sm_86")
-
-        target_compile_options(polysolve PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:
-
-        --generate-line-info
-        --use_fast_math
-        --relocatable-device-code=true
-
-        --ptxas-options=-v
-        --maxrregcount=7
-        --compiler-options
-        -fPIC # https://stackoverflow.com/questions/5311515/gcc-fpic-option
-
-        >
-
-        )
-
-        target_link_libraries(polysolve PUBLIC CUDA::toolkit)
-        set_target_properties(polysolve PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-        # Nvidia RTX8000 -> compute_75
-        # Nvidia V100 -> compute_70
-        # Nvidia 1080/1080Ti -> compute_61
-        # Nvidia 3080Ti -> compute_86
-        if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-            set(CMAKE_CUDA_ARCHITECTURES 70 75 86)
-        endif()
-        set_target_properties(polysolve PROPERTIES CUDA_ARCHITECTURES "70;75;86")
-
-        if(APPLE)
-            # We need to add the path to the driver (libcuda.dylib) as an rpath,
-            # so that the static cuda runtime can find it at runtime.
-            set_property(TARGET polysolve
-                PROPERTY
-                BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
-        endif()
-
-    else()
-        message(WARNING "cuSOLVER not found, solver will not be available.")
-    endif()
+# Use the cuSolver bundled with the CUDA Toolkit.
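+# (CUDA::toolkit is an interface target provided by FindCUDAToolkit that carries
+# the toolkit's include and link directories.)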
+find_package(CUDAToolkit)
+if (CUDAToolkit_FOUND)
+    target_link_libraries(polysolve PUBLIC CUDA::toolkit)
+    target_link_libraries(polysolve_linear PUBLIC CUDA::toolkit)
 else()
-    message(WARNING "CUDA not found, cuSOLVER will not be available.")
-endif()
\ No newline at end of file
+    message(WARNING "cuSOLVER not found, solver will not be available.")
+endif()
diff --git a/cmake/recipes/hypre.cmake b/cmake/recipes/hypre.cmake
index 7cfd0c2b..e3b19a6a 100644
--- a/cmake/recipes/hypre.cmake
+++ b/cmake/recipes/hypre.cmake
@@ -6,18 +6,28 @@ endif()
 
 message(STATUS "Third-party: creating target 'HYPRE::HYPRE'")
 
-set(HYPRE_WITH_MPI OFF CACHE INTERNAL "" FORCE)
-set(HYPRE_PRINT_ERRORS ON CACHE INTERNAL "" FORCE)
-set(HYPRE_BIGINT ON CACHE INTERNAL "" FORCE)
-set(HYPRE_USING_FEI OFF CACHE INTERNAL "" FORCE)
-set(HYPRE_USING_OPENMP OFF CACHE INTERNAL "" FORCE)
-set(HYPRE_SHARED OFF CACHE INTERNAL "" FORCE)
+set(HYPRE_ENABLE_MPI OFF CACHE INTERNAL "" FORCE)
+set(HYPRE_ENABLE_PRINT_ERRORS ON CACHE INTERNAL "" FORCE)
+set(HYPRE_ENABLE_BIGINT ON CACHE INTERNAL "" FORCE)
+set(HYPRE_ENABLE_FEI OFF CACHE INTERNAL "" FORCE)
+set(HYPRE_ENABLE_OPENMP OFF CACHE INTERNAL "" FORCE)
+set(HYPRE_SHARED OFF CACHE INTERNAL "" FORCE)
+
+# HYPRE unconditionally defines an "uninstall" target, which collides with the
+# identically named target defined by other libraries, since CMake requires
+# unique target names. This is a hacky workaround until upstream is fixed.
+macro(add_custom_target _target_name)
+    if("${_target_name}" STREQUAL "uninstall" AND TARGET uninstall)
+        # skip: HYPRE's uninstall target conflicts with an existing one
+    else()
+        _add_custom_target(${_target_name} ${ARGN})
+    endif()
+endmacro()
 
 include(CPM)
 CPMAddPackage(
     NAME hypre
     GITHUB_REPOSITORY hypre-space/hypre
-    GIT_TAG v2.28.0
+    GIT_TAG v3.1.0
     SOURCE_SUBDIR src
 )
 
 file(REMOVE "${hypre_SOURCE_DIR}/src/utilities/version")
diff --git a/cmake/recipes/kaminpar.cmake b/cmake/recipes/kaminpar.cmake
new file mode 100644
index 00000000..124398e3
--- /dev/null
+++ b/cmake/recipes/kaminpar.cmake
@@ -0,0 +1,29 @@
+# KaMinPar (https://github.com/KaHIP/KaMinPar)
+# License: MIT
+
+if(TARGET KaMinPar::KaMinPar)
+    return()
+endif()
+
+message(STATUS "Third-party: creating target 'KaMinPar::KaMinPar'")
+
+include(onetbb)
+
+include(CPM)
+CPMAddPackage(
+    NAME KaMinPar
+    VERSION 3.7.3
+    GITHUB_REPOSITORY KaHIP/KaMinPar
+    GIT_TAG v3.7.3
+    OPTIONS
+        "KAMINPAR_BUILD_APPS OFF"
+        "KAMINPAR_BUILD_IO OFF"
+        "KAMINPAR_BUILD_WITH_SPARSEHASH OFF"
+        "KAMINPAR_BUILD_WITH_DEBUG_SYMBOLS OFF"
+        "KAMINPAR_64BIT_WEIGHTS ON"
+        "KAMINPAR_ENABLE_TIMERS OFF"
+        "INSTALL_KAMINPAR OFF"
+        # The hack below forces KaMinPar to use our vendored TBB.
+        "KAMINPAR_DOWNLOAD_TBB ON"
+        "TBB_FOUND TRUE"
+)
diff --git a/linear-solver-spec.json b/linear-solver-spec.json
index 7f181bb6..ab585aea 100644
--- a/linear-solver-spec.json
+++ b/linear-solver-spec.json
@@ -15,7 +15,8 @@
             "Eigen::MINRES",
             "Pardiso",
             "Hypre",
-            "AMGCL"
+            "AMGCL",
+            "CUDA_PCG"
         ],
         "doc": "Settings for the linear solver."
     },
@@ -47,7 +48,8 @@
             "Eigen::ConjugateGradient",
             "Eigen::BiCGSTAB",
             "Eigen::GMRES",
-            "Eigen::MINRES"
+            "Eigen::MINRES",
+            "CUDA_PCG"
         ]
     },
     {
@@ -449,5 +451,60 @@
         "default": 0,
         "type": "float",
         "doc": "Aggregation epsilon strong."
+    },
+    {
+        "pointer": "/CUDA_PCG",
+        "default": null,
+        "type": "object",
+        "optional": [
+            "block_dim",
+            "max_iter",
+            "relative_tolerance",
+            "absolute_tolerance",
+            "lazy_partitioning",
+            "use_preconditioned_residual_norm"
+        ],
+        "doc": "Settings for the CUDA PCG solver."
+    },
+    {
+        "pointer": "/CUDA_PCG/block_dim",
+        "default": 1,
+        "type": "int",
+        "options": [
+            1,
+            2,
+            3
+        ],
+        "doc": "Block size of the BSR matrix the PCG solver uses internally. Choose 3 for 3D problems, 2 for 2D, etc."
+    },
+    {
+        "pointer": "/CUDA_PCG/max_iter",
+        "default": 10000,
+        "type": "int",
+        "doc": "Maximum number of iterations."
+    },
+    {
+        "pointer": "/CUDA_PCG/relative_tolerance",
+        "default": 0.0,
+        "type": "float",
+        "doc": "Relative convergence tolerance."
+    },
+    {
+        "pointer": "/CUDA_PCG/absolute_tolerance",
+        "default": 1e-8,
+        "type": "float",
+        "doc": "Absolute convergence tolerance."
+    },
+    {
+        "pointer": "/CUDA_PCG/lazy_partitioning",
+        "default": true,
+        "type": "bool",
+        "doc": "If true, reuse the first graph partition."
+    },
+    {
+        "pointer": "/CUDA_PCG/use_preconditioned_residual_norm",
+        "default": false,
+        "type": "bool",
+        "doc": "Use the preconditioned residual norm for the termination check."
+    }
-]
\ No newline at end of file
+]
diff --git a/src/polysolve/linear/CMakeLists.txt b/src/polysolve/linear/CMakeLists.txt
index a742a357..1eb58850 100644
--- a/src/polysolve/linear/CMakeLists.txt
+++ b/src/polysolve/linear/CMakeLists.txt
@@ -17,5 +17,25 @@ set(SOURCES
     SaddlePointSolver.hpp
 )
 
+if(POLYSOLVE_WITH_CUDA)
+    list(APPEND SOURCES
+        PCGSolver.cu
+        PCGSolver.hpp
+        mas_utils/BSRAdjacency.cpp
+        mas_utils/BSRAdjacency.hpp
+        mas_utils/BSRMatrix.cu
+        mas_utils/BSRMatrix.hpp
+        mas_utils/CuSparseWrapper.hpp
+        mas_utils/CudaUtils.cuh
+        mas_utils/InnerProduct.cu
+        mas_utils/InnerProduct.hpp
+        mas_utils/MASPreconditioner.cu
+        mas_utils/MASPreconditioner.hpp
+        mas_utils/GraphPartition.cpp
+        mas_utils/GraphPartition.hpp
+    )
+
+endif()
+
 source_group(TREE "${CMAKE_CURRENT_SOURCE_DIR}" PREFIX "Source Files" FILES ${SOURCES})
 target_sources(polysolve_linear PRIVATE ${SOURCES})
diff --git a/src/polysolve/linear/CuSolverDN.cu b/src/polysolve/linear/CuSolverDN.cu
index 29cf7570..e0d01d15 100644
--- a/src/polysolve/linear/CuSolverDN.cu
+++ b/src/polysolve/linear/CuSolverDN.cu
@@ -206,7 +206,7 @@ namespace polysolve::linear
 
 } // namespace polysolve::linear
 
-template polysolve::CuSolverDN<double>::CuSolverDN();
-template polysolve::CuSolverDN<float>::CuSolverDN();
+template polysolve::linear::CuSolverDN<double>::CuSolverDN();
+template polysolve::linear::CuSolverDN<float>::CuSolverDN();
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/polysolve/linear/HypreSolver.cpp b/src/polysolve/linear/HypreSolver.cpp
index 804eca1f..0901b880 100644
--- a/src/polysolve/linear/HypreSolver.cpp
+++ b/src/polysolve/linear/HypreSolver.cpp
@@ -15,7 +15,7 @@ namespace polysolve::linear
     HypreSolver::HypreSolver()
     {
         precond_num_ = 0;
-#ifdef HYPRE_WITH_MPI
+#ifdef HYPRE_ENABLE_MPI
         int done_already;
 
         MPI_Initialized(&done_already);
@@ -32,6 +32,10 @@ namespace polysolve::linear
             MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
         }
 #endif
+        if (!HYPRE_Initialized())
+        {
+            HYPRE_Initialize();
+        }
     }
 
     // Set solver parameters
@@ -91,7 +95,7 @@ namespace polysolve::linear
         has_matrix_ = true;
         const HYPRE_Int rows = Ain.rows();
         const HYPRE_Int cols = Ain.cols();
-#ifdef HYPRE_WITH_MPI
+#ifdef HYPRE_ENABLE_MPI
         HYPRE_IJMatrixCreate(MPI_COMM_WORLD, 0, rows - 1, 0, cols - 1, &A);
 #else
         HYPRE_IJMatrixCreate(0, 0, rows - 1, 0, cols - 1, &A);
@@ -285,7 +289,7 @@ namespace polysolve::linear
 
         /* Create solver */
         HYPRE_Solver solver, precond;
-#ifdef HYPRE_WITH_MPI
+#ifdef HYPRE_ENABLE_MPI
         HYPRE_ParCSRPCGCreate(MPI_COMM_WORLD, &solver);
 #else
         HYPRE_ParCSRPCGCreate(0, &solver);
diff --git a/src/polysolve/linear/PCGSolver.cu b/src/polysolve/linear/PCGSolver.cu
new
file mode 100644
index 00000000..de45c89e
--- /dev/null
+++ b/src/polysolve/linear/PCGSolver.cu
@@ -0,0 +1,649 @@
+#include "PCGSolver.hpp"
+
+// #ifndef SPDLOG_ACTIVE_LEVEL
+// #define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_TRACE
+// #endif
+
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace polysolve::linear
+{
+
+    using namespace mas;
+
+    namespace
+    {
+        using clock = std::chrono::steady_clock;
+
+        double elapsed_seconds(const std::chrono::time_point<clock> &begin)
+        {
+            return std::chrono::duration<double>(clock::now() - begin).count();
+        }
+
+        /// Device scalar division num / denom.
+        /// Defaults to zero when denom is small, because we only check the
+        /// termination condition every 10 PCG iterations and beta should be
+        /// zero once we have already arrived at the solution.
+        void scalar_division(
+            ctd::span<const double> num,
+            ctd::span<const double> denom,
+            ctd::span<double> out,
+            CudaRuntime rt)
+        {
+            auto op = [num, denom, out] __device__(int) {
+                out[0] = (ctd::abs(denom[0]) < 1e-20) ? 0.0 : (num[0] / denom[0]);
+            };
+            cub::DeviceFor::Bulk(1, op, rt.stream.get());
+        }
+
+        /// Compute alpha * x + beta * y.
+        /// @param h_alpha Host alpha.
+        /// @param d_alpha Device alpha. nullptr implies 1.0.
+        /// @param h_beta Host beta.
+        /// @param d_beta Device beta. nullptr implies 1.0.
+        /// @param x Device vector x.
+        /// @param y Device vector y and the output.
+        /// @param rt Cuda runtime.
+        void axpby(
+            double h_alpha,
+            const double *d_alpha,
+            double h_beta,
+            const double *d_beta,
+            ctd::span<const double> x,
+            ctd::span<double> y,
+            CudaRuntime rt)
+        {
+            auto op = [h_alpha, d_alpha, h_beta, d_beta, x, y] __device__(int idx) {
+                double alpha = h_alpha * ((d_alpha == nullptr) ? 1.0 : *d_alpha);
+                double beta = h_beta * ((d_beta == nullptr) ? 1.0 : *d_beta);
+                y[idx] = alpha * x[idx] + beta * y[idx];
+            };
+            cub::DeviceFor::Bulk(x.size(), op, rt.stream.get());
+        }
+
+        /// @brief Build a CSR-row-ptr-style partition offset array.
+        /// With all nodes sorted by partition id, the partition offset layout is:
+        /// | partition 0 start | partition 1 start | ... |
+        std::vector<int> build_part_offsets(ctd::span<const int> part_id, int part_num)
+        {
+            std::vector<int> offsets(part_num + 1, 0);
+            for (int old_id = 0; old_id < part_id.size(); ++old_id)
+            {
+                offsets[part_id[old_id] + 1] += 1;
+            }
+            for (int part = 0; part < part_num; ++part)
+            {
+                offsets[part + 1] += offsets[part];
+            }
+            return offsets;
+        }
+
+        /// @brief Build a permutation such that all nodes are sorted by partition.
+        std::vector<int> build_permutation(
+            const std::vector<int> &part_id,
+            const std::vector<int> &offsets)
+        {
+            std::vector<int> next = offsets;
+            std::vector<int> permutation(part_id.size(), 0);
+            for (int old_id = 0; old_id < part_id.size(); ++old_id)
+            {
+                int part = part_id[old_id];
+                permutation[old_id] = next[part];
+                next[part] += 1;
+            }
+            return permutation;
+        }
+
+        void permute_vector(
+            ctd::span<const double> input,
+            ctd::span<double> output,
+            ctd::span<const int> block_map,
+            int block_dim,
+            CudaRuntime rt)
+        {
+            auto op = [input, output, block_map, block_dim] __device__(int idx) {
+                int block_id = idx / block_dim;
+                int comp = idx % block_dim;
+                int mapped = block_map[block_id];
+                output[idx] = input[mapped * block_dim + comp];
+            };
+            cub::DeviceFor::Bulk(output.size(), op, rt.stream.get());
+        }
+
+    } // namespace
+
+    class CudaPCG::CudaPCGImpl
+    {
+    public:
+        int block_dim_ = 3; ///< BSR block dim.
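+        // (block_dim_ must be 1, 2, or 3; BSRAdjacency rejects other values.)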
+ int max_iter_ = 1e5; + int true_residual_period_ = 4; + double abs_tol_ = 1e-20; + double rel_tol_ = 1e-6; + bool lazy_partitioning_ = false; + bool use_preconditioned_residual_norm_ = true; + + int dim_ = 0; ///< Input matrix A dim. + int permuted_dim_ = 0; ///< Dim with block padding. + int iterations_ = 0; + double residual_norm_ = 0.0; + CudaPCGStatus status_ = CudaPCGStatus::Running; + + // == IMPORTANT == + // You must declare cu::stream and memory_pool before any device storage! + // Else stream and mem pool will be destroyed before cuda::buffer, causing segfault. + ctd::optional default_device_; + ctd::optional default_stream_; + ctd::optional default_mem_pool_; + CuSparseHandle cusparse_handle_; + + BSRMatrix A_; + MASPreconditioner mas_precond_; + std::vector permutation_; ///< Permutation from graph partition. + std::vector inv_permutation_; ///< Permutation from graph partition. + std::vector part_offsets_; ///< Partition offsets for sorted blocks. + CuSparseBSR sparse_A_; ///< CuSparse handle of A. + Buf d_permutation_; ///< Permutation from graph partition. + Buf d_inv_permutation_; ///< Permutation from graph partition. + Buf spmv_workspace_; ///< CuSparse temp. + + // PCG variables. + + Buf x_; + Buf b_; + Buf r_; + Buf p_; + Buf z_; + Buf Ap_; + Buf reduction_storage_; + Buf scalar_rz_; + Buf scalar_pAp_; + Buf scalar_alpha_; + Buf scalar_beta_; + Buf scalar_rz_old_; + Buf scalar_rr_; + + public: + CudaPCGImpl() + { + if (cu::devices.size() == 0) + { + throw std::runtime_error("No Nvidia GPU!!"); + } + + default_device_.emplace(cu::devices[0]); + default_stream_.emplace(*default_device_); + default_mem_pool_.emplace(*default_device_); + } + + void set_parameters(const json ¶ms) + { + if (params.contains("block_dim")) + block_dim_ = params["block_dim"]; + if (params.contains("max_iter")) + max_iter_ = params["max_iter"]; + if (params.contains("relative_tolerance")) + rel_tol_ = params["relative_tolerance"]; + if (params.contains("absolute_tolerance")) + abs_tol_ = params["absolute_tolerance"]; + if (params.contains("lazy_partitioning")) + lazy_partitioning_ = params["lazy_partitioning"]; + if (params.contains("use_preconditioned_residual_norm")) + use_preconditioned_residual_norm_ = params["use_preconditioned_residual_norm"]; + } + + void get_info(json ¶ms) const + { + params["solver_iter"] = iterations_; + params["solver_error"] = residual_norm_; + params["solver_status"] = pcg_status_to_string(status_); + } + + void analyze_pattern(const StiffnessMatrix &, const int) {} + + void build_partition_and_perm(BSRAdjacency &adj) + { + int part_num = 0; + std::vector part_id; + graph_partition( + adj.row_ptr, + adj.cols, + adj.weights, + 32, + part_num, + part_id); + + part_offsets_ = build_part_offsets(part_id, part_num); + permutation_ = build_permutation(part_id, part_offsets_); + inv_permutation_.resize(permutation_.size()); + for (int old_id = 0; old_id < permutation_.size(); ++old_id) + { + inv_permutation_[permutation_[old_id]] = old_id; + } + } + + // Allocate cusparse workspace buffer. 
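+        // (cusparseSpMV_bufferSize is queried once here; the resulting workspace
+        // is reused by every SpMV in the PCG loop.)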
+ void setup_cusparse(CudaRuntime rt) + { + sparse_A_ = CuSparseBSR(A_.view()); + + double alpha = 1.0; + double beta = 0.0; + CuSparseConstVec x_desc(*x_); + CuSparseVec y_desc(*Ap_); + size_t workspace_size = 0; + + cusparseSetStream(cusparse_handle_.raw, rt.stream.get()); + cusparseSpMV_bufferSize( + cusparse_handle_.raw, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + sparse_A_.raw, + x_desc.raw, + &beta, + y_desc.raw, + CUDA_R_64F, + CUSPARSE_SPMV_ALG_DEFAULT, + &workspace_size); + + spmv_workspace_ = cu::make_buffer( + rt.stream, + rt.mr, + workspace_size == 0 ? 1 : workspace_size, + cu::no_init); + } + + void spmv(ctd::span x, ctd::span y, CudaRuntime rt) + { + double alpha = 1.0; + double beta = 0.0; + CuSparseConstVec x_desc(x); + CuSparseVec y_desc(y); + + cusparseSetStream(cusparse_handle_.raw, rt.stream.get()); + cusparseSpMV( + cusparse_handle_.raw, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + sparse_A_.raw, + x_desc.raw, + &beta, + y_desc.raw, + CUDA_R_64F, + CUSPARSE_SPMV_ALG_DEFAULT, + spmv_workspace_->data()); + } + + void factorize(const StiffnessMatrix &A) + { + CudaRuntime rt{*default_stream_, default_mem_pool_->as_ref()}; + auto total_begin = clock::now(); + int block_n = div_round_up(A.rows(), block_dim_); + + // We do: + // 1. K-way graph parition. + // 2. Build permutation based on partition. + // 3. Initialize MAS preconditioner. + // 4. Allocates buffer for PCG loop. + + bool rebuild_partition = + !lazy_partitioning_ || permutation_.size() != block_n || part_offsets_.empty(); + if (rebuild_partition) + { + // Build adjacency for graph partition. + auto phase_begin = clock::now(); + BSRAdjacency adj{A, block_dim_}; + SPDLOG_TRACE("CUDA_PCG setup: topology_adj {:.6f}s", elapsed_seconds(phase_begin)); + + // Sort nodes based on parition. + phase_begin = clock::now(); + build_partition_and_perm(adj); + SPDLOG_TRACE("CUDA_PCG setup: graph_partition {:.6f}s", elapsed_seconds(phase_begin)); + } + else + { + SPDLOG_TRACE("CUDA_PCG setup: reuse_partition"); + } + + // Build new sorted BSR matrix for MAS initialization. + auto phase_begin = clock::now(); + A_ = BSRMatrix{ + A, + block_dim_, + ctd::span(permutation_.data(), permutation_.size()), + rt}; + SPDLOG_TRACE("CUDA_PCG setup: permuted_bsr {:.6f}s", elapsed_seconds(phase_begin)); + + BSRView view = A_.view(); + dim_ = A.rows(); + permuted_dim_ = view.block_dim * view.dim; + + // Initialize MAS. + phase_begin = clock::now(); + mas_precond_.factorize( + A_, + ctd::span(part_offsets_.data(), part_offsets_.size()), + rt); + rt.stream.sync(); + SPDLOG_TRACE("CUDA_PCG setup: mas_factorize {:.6f}s", elapsed_seconds(phase_begin)); + + // Copy permutation to device. + phase_begin = clock::now(); + d_permutation_ = cu::make_buffer(rt.stream, rt.mr, permutation_.size(), cu::no_init); + d_inv_permutation_ = + cu::make_buffer(rt.stream, rt.mr, inv_permutation_.size(), cu::no_init); + cu::copy_bytes(rt.stream, permutation_, *d_permutation_); + cu::copy_bytes(rt.stream, inv_permutation_, *d_inv_permutation_); + + // Allocates buffers for PCG loop. 
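+            // (All vectors are sized to permuted_dim_, i.e. padded to a whole
+            // number of BSR blocks; the entries past dim_ are kept at zero.)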
+ x_ = cu::make_buffer(rt.stream, rt.mr, permuted_dim_, cu::no_init); + b_ = cu::make_buffer(rt.stream, rt.mr, permuted_dim_, cu::no_init); + r_ = cu::make_buffer(rt.stream, rt.mr, permuted_dim_, cu::no_init); + p_ = cu::make_buffer(rt.stream, rt.mr, permuted_dim_, cu::no_init); + z_ = cu::make_buffer(rt.stream, rt.mr, permuted_dim_, cu::no_init); + Ap_ = cu::make_buffer(rt.stream, rt.mr, permuted_dim_, cu::no_init); + + scalar_rz_ = cu::make_buffer(rt.stream, rt.mr, 1, cu::no_init); + scalar_pAp_ = cu::make_buffer(rt.stream, rt.mr, 1, cu::no_init); + scalar_alpha_ = cu::make_buffer(rt.stream, rt.mr, 1, cu::no_init); + scalar_beta_ = cu::make_buffer(rt.stream, rt.mr, 1, cu::no_init); + scalar_rz_old_ = cu::make_buffer(rt.stream, rt.mr, 1, cu::no_init); + scalar_rr_ = cu::make_buffer(rt.stream, rt.mr, 1, cu::no_init); + rt.stream.sync(); + SPDLOG_TRACE("CUDA_PCG setup: device_buffers {:.6f}s", elapsed_seconds(phase_begin)); + + // Allocates buffers for CuSparse. + phase_begin = clock::now(); + setup_cusparse(rt); + rt.stream.sync(); + SPDLOG_TRACE("CUDA_PCG setup: cusparse {:.6f}s", elapsed_seconds(phase_begin)); + SPDLOG_TRACE("CUDA_PCG setup: total {:.6f}s", elapsed_seconds(total_begin)); + } + + void solve(const Eigen::Ref b, Eigen::Ref x) + { + status_ = CudaPCGStatus::Running; + + if (b.size() != x.size() || !check_buffer_size(b.size())) + { + throw std::runtime_error("[CudaPCG] Size mismatch. Did you forget to call factorize?"); + } + + CudaRuntime rt{*default_stream_, default_mem_pool_->as_ref()}; + + cu::copy_bytes( + rt.stream, + ctd::span(b.data(), dim_), + ctd::span(r_->data(), dim_)); + cu::fill_bytes( + rt.stream, ctd::span(r_->data() + dim_, permuted_dim_ - dim_), 0); + permute_vector(*r_, *b_, *d_inv_permutation_, A_.view().block_dim, rt); + + // The solver sometimes fails to converge if we use input x as initial value. + // Maybe the caller does not initialize x properly? + // Set initial x to zero to work around this issue for now. + cu::fill_bytes(rt.stream, *x_, 0); + + pcg_solve(rt); + + permute_vector(*x_, *r_, *d_permutation_, A_.view().block_dim, rt); + + cu::copy_bytes( + rt.stream, + ctd::span(r_->data(), dim_), + ctd::span(x.data(), dim_)); + rt.stream.sync(); + } + + bool check_buffer_size(int n) const + { + if (n <= 0 || mas_precond_.empty() || !x_ || !b_ || !r_ || !p_ || !z_ || !Ap_ + || !scalar_rz_ || !scalar_pAp_ || !scalar_alpha_ || !scalar_beta_ + || !scalar_rz_old_ || !scalar_rr_) + { + return false; + } + + BSRView view = A_.view(); + int block_n = (n + view.block_dim - 1) / view.block_dim; + int block_size = view.block_dim * view.block_dim; + + if (n != dim_ || block_n != view.dim) + { + return false; + } + + if (view.rows.size() != (view.dim + 1) + || view.cols.size() != view.non_zeros + || view.vals.size() != block_size * view.non_zeros) + { + return false; + } + + if (!d_permutation_ || !d_inv_permutation_ || sparse_A_.raw == nullptr || !spmv_workspace_ + || x_->size() != permuted_dim_ + || b_->size() != permuted_dim_ + || r_->size() != permuted_dim_ + || p_->size() != permuted_dim_ + || z_->size() != permuted_dim_ + || Ap_->size() != permuted_dim_) + { + return false; + } + + if (scalar_rz_->size() < 1 + || scalar_pAp_->size() < 1 + || scalar_alpha_->size() < 1 + || scalar_beta_->size() < 1 + || scalar_rz_old_->size() < 1 + || scalar_rr_->size() < 1) + { + return false; + } + + return true; + } + + /// https://www.cs.cmu.edu/~quake-papers/painless-conjugate-gradient.pdf + void pcg_solve(CudaRuntime rt) + { + // Compute initial residual r = b-Ax. 
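+            // (solve() zeroes x_ before calling us, so r starts out equal to b;
+            // the general form is kept in case a warm start is ever allowed.)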
+ spmv(*x_, *r_, rt); + axpby(1.0, nullptr, -1.0, nullptr, *b_, *r_, rt); + + // Compute z = M^-1 r. + mas_precond_.apply(*r_, *z_, rt); + // Initial search direction p = z; + cu::copy_bytes(rt.stream, *z_, *p_); + + // Compute rz = r^T M^-1 r. + inner_product(*r_, *z_, *scalar_rz_, rt); + const double rz0 = device2host(scalar_rz_->data(), rt); + if (ctd::isnan(rz0) || !ctd::isfinite(rz0)) + { + throw std::runtime_error("[CudaPCG] Invalid initial residual."); + } + + // Compute rr = r^T r. + double rr0 = 0.0; + if (!use_preconditioned_residual_norm_) + { + inner_product(*r_, *r_, *scalar_rr_, rt); + rr0 = device2host(scalar_rr_->data(), rt); + } + + // debug logging + auto iter_window_begin = clock::now(); + int iter_window_start = 1; + + for (int k = 1; k <= max_iter_; ++k) + { + // Compute Ap = A p. + spmv(*p_, *Ap_, rt); + // Compute pAp = p^T * A * p. + inner_product(*p_, *Ap_, *scalar_pAp_, rt); + // Compute alpha = (r M^-1 r) / (p^T A p). + scalar_division(*scalar_rz_, *scalar_pAp_, *scalar_alpha_, rt); + // Compute x = x + alpha A p. + axpby(1.0, scalar_alpha_->data(), 1.0, nullptr, *p_, *x_, rt); + + // Compute residual b-Ax directly. + if (k % true_residual_period_ == 0) + { + spmv(*x_, *r_, rt); + axpby(1.0, nullptr, -1.0, nullptr, *b_, *r_, rt); + } + // Compute residual update using r' = r - alpha A p. + // This saves one spmv but accumulates floating point error overtime. + else + { + axpby(-1.0, scalar_alpha_->data(), 1.0, nullptr, *Ap_, *r_, rt); + } + + // Compute z = M^-1 r. + mas_precond_.apply(*r_, *z_, rt); + + // Compute rz = r M^-1 r. + cu::copy_bytes(rt.stream, *scalar_rz_, *scalar_rz_old_); + inner_product(*r_, *z_, *scalar_rz_, rt); + + iterations_ = k; + bool converged = false; + + // Check convergence every 10 iterations. + if (k % 10 == 0) + { + if (use_preconditioned_residual_norm_) + { + double rz_new = device2host(scalar_rz_->data(), rt); + residual_norm_ = ctd::sqrt(rz_new); + if (rz_new <= rel_tol_ * rel_tol_ * rz0 || rz_new <= abs_tol_ * abs_tol_) + { + status_ = (rz_new <= abs_tol_ * abs_tol_) + ? CudaPCGStatus::ReachAbsoluteTolerance + : CudaPCGStatus::ReachRelativeTolerance; + converged = true; + } + } + else + { + inner_product(*r_, *r_, *scalar_rr_, rt); + double rr = device2host(scalar_rr_->data(), rt); + residual_norm_ = ctd::sqrt(rr); + if (rr <= rel_tol_ * rel_tol_ * rr0 || rr <= abs_tol_ * abs_tol_) + { + status_ = (rr <= abs_tol_ * abs_tol_) + ? CudaPCGStatus::ReachAbsoluteTolerance + : CudaPCGStatus::ReachRelativeTolerance; + converged = true; + } + } + } + + // debug logging + if (k % 100 == 0) + { + rt.stream.sync(); + SPDLOG_TRACE( + "CUDA_PCG iter {}-{}: {:.6f}s residual {:.6e}", + iter_window_start, + k, + elapsed_seconds(iter_window_begin), + residual_norm_); + iter_window_begin = clock::now(); + iter_window_start = k + 1; + } + + if (converged) + { + break; + } + + // Compute beta = rz / rz_old. + scalar_division(*scalar_rz_, *scalar_rz_old_, *scalar_beta_, rt); + // Compute direction update p' = M^-1 r + beta p. 
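+                // (axpby computes p = z + beta * p in place, with beta read from
+                // device memory.)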
+ axpby(1.0, nullptr, 1.0, scalar_beta_->data(), *z_, *p_, rt); + } + + if (iterations_ == max_iter_) + { + status_ = CudaPCGStatus::ReachMaxIterations; + } + + SPDLOG_TRACE( + "CUDA_PCG iter {}-{}: {:.6f}s residual {:.6e}", + iter_window_start, + iterations_, + elapsed_seconds(iter_window_begin), + residual_norm_); + } + }; + + CudaPCG::CudaPCG() + : impl_(std::make_unique()) + { + } + + CudaPCG::~CudaPCG() = default; + + void CudaPCG::set_parameters(const json ¶ms) + { + const std::string solver_name = name(); + if (!params.contains(solver_name)) + { + return; + } + + impl_->set_parameters(params[solver_name]); + } + + void CudaPCG::get_info(json ¶ms) const + { + impl_->get_info(params); + } + + void CudaPCG::analyze_pattern(const StiffnessMatrix &A, const int precond_num) + { + impl_->analyze_pattern(A, precond_num); + } + + void CudaPCG::factorize(const StiffnessMatrix &A) + { + impl_->factorize(A); + } + + void CudaPCG::solve(const Ref b, Ref x) + { + impl_->solve(b, x); + } + + std::string CudaPCG::name() const + { + return "CUDA_PCG"; + } + +} // namespace polysolve::linear diff --git a/src/polysolve/linear/PCGSolver.hpp b/src/polysolve/linear/PCGSolver.hpp new file mode 100644 index 00000000..556b696b --- /dev/null +++ b/src/polysolve/linear/PCGSolver.hpp @@ -0,0 +1,74 @@ +#pragma once + +#include "Solver.hpp" + +#include +#include + +namespace polysolve::linear +{ + enum class CudaPCGStatus + { + Running, + ReachRelativeTolerance, + ReachAbsoluteTolerance, + ReachMaxIterations, + }; + + inline std::string pcg_status_to_string(CudaPCGStatus stat) + { + switch (stat) + { + case CudaPCGStatus::Running: + return "Running"; + case CudaPCGStatus::ReachRelativeTolerance: + return "Reach relative tolerance"; + case CudaPCGStatus::ReachAbsoluteTolerance: + return "Reach absolute tolerance"; + case CudaPCGStatus::ReachMaxIterations: + return "Reach max iterations"; + default: + return "Unknown"; + } + } + + class CudaPCG : public Solver + { + + public: + CudaPCG(); + ~CudaPCG() override; + POLYSOLVE_DELETE_MOVE_COPY(CudaPCG) + + public: + ////////////////////// + // Public interface // + ////////////////////// + + // Set solver parameters + void set_parameters(const json ¶ms) override; + + // Retrieve information + void get_info(json ¶ms) const override; + + // Analyze sparsity pattern. This is a no-op + void analyze_pattern(const StiffnessMatrix &A, const int precond_num) override; + + // Factorize system matrix. 
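+        // (Builds the graph partition, the permuted BSR matrix, and the MAS
+        // preconditioner; must be called before solve().)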
+ void factorize(const StiffnessMatrix &A) override; + + // Solve the linear system Ax = b + void solve(const Ref b, Ref x) override; + + // Set solver tolerance + // void set_tolerance(const double tol) override; + + // Name of the solver type (for debugging purposes) + std::string name() const override; + + private: + class CudaPCGImpl; + std::unique_ptr impl_; + }; + +} // namespace polysolve::linear diff --git a/src/polysolve/linear/Solver.cpp b/src/polysolve/linear/Solver.cpp index e7c0c259..ad7b1d2b 100644 --- a/src/polysolve/linear/Solver.cpp +++ b/src/polysolve/linear/Solver.cpp @@ -63,6 +63,10 @@ namespace polysolve::linear { #ifdef POLYSOLVE_WITH_CUSOLVER #include "CuSolverDN.cuh" #endif +#ifdef POLYSOLVE_WITH_CUDA +#include "PCGSolver.hpp" +#endif + #include //////////////////////////////////////////////////////////////////////////////// @@ -399,6 +403,12 @@ namespace polysolve::linear { return std::make_unique>(); #endif +#ifdef POLYSOLVE_WITH_CUDA + } + else if (solver == "CUDA_PCG") + { + return std::make_unique(); +#endif #ifdef POLYSOLVE_WITH_HYPRE } else if (solver == "Hypre") @@ -531,6 +541,9 @@ namespace polysolve::linear "cuSolverDN", "cuSolverDN_float", #endif +#ifdef POLYSOLVE_WITH_CUDA + "CUDA_PCG", +#endif #ifdef POLYSOLVE_WITH_HYPRE "Hypre", #endif diff --git a/src/polysolve/linear/mas_utils/BSRAdjacency.cpp b/src/polysolve/linear/mas_utils/BSRAdjacency.cpp new file mode 100644 index 00000000..4c77ffdb --- /dev/null +++ b/src/polysolve/linear/mas_utils/BSRAdjacency.cpp @@ -0,0 +1,192 @@ +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace polysolve::linear::mas +{ + namespace + { + std::vector quantize_weight(cuda::std::span weight) + { + if (weight.empty()) + { + return {}; + } + + auto [min_it, max_it] = std::minmax_element(weight.begin(), weight.end()); + double min_w = *min_it; + double max_w = *max_it; + double w_range = max_w - min_w; + if (w_range <= 0.0) + { + return std::vector(weight.size(), 1); + } + + constexpr int MAX_QUANT = 1000000; + std::vector quant(weight.size()); + for (int i = 0; i < static_cast(weight.size()); ++i) + { + double scaled = (weight[i] - min_w) / w_range * MAX_QUANT; + quant[i] = static_cast(std::clamp(static_cast(scaled), 1, MAX_QUANT)); + } + return quant; + } + + /// Average upper and lower part of topology connection weight. 
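+        /// For an edge stored in both directions, (i,j) and (j,i) both receive
+        /// the mean of the two stored weights, symmetrizing the graph for the
+        /// partitioner.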
+ void avg_sym_weight(cuda::std::span row_ptr, + cuda::std::span cols, + cuda::std::span weights) + { + struct Sum + { + double val = 0.0; + int count = 0; + }; + + std::unordered_map pair_weight; + + auto ij_to_key = [](int i, int j) -> uint64_t { + int u = std::min(i, j); + int v = std::max(i, j); + return (static_cast(u) << 32) | static_cast(v); + }; + + for (int i = 0; i < static_cast(row_ptr.size()) - 1; ++i) + { + for (int n = row_ptr[i]; n < row_ptr[i + 1]; ++n) + { + int j = cols[n]; + Sum &stat = pair_weight[ij_to_key(i, j)]; + stat.val += weights[n]; + stat.count += 1; + } + } + + for (int i = 0; i < static_cast(row_ptr.size()) - 1; ++i) + { + for (int n = row_ptr[i]; n < row_ptr[i + 1]; ++n) + { + int j = cols[n]; + auto it = pair_weight.find(ij_to_key(i, j)); + if (it == pair_weight.end() || it->second.count == 0) + { + continue; + } + weights[n] = it->second.val / it->second.count; + } + } + } + + template + void build_host_adjacency(const StiffnessMatrix &A, + int dim, + std::vector &row_ptr, + std::vector &cols, + std::vector &weights) + { + using Iter = Eigen::SparseMatrix::InnerIterator; + + Eigen::SparseMatrix Arow = A; + Arow.makeCompressed(); + + row_ptr.assign(dim + 1, 0); + cols.clear(); + weights.clear(); + + std::vector norms; + + // One block row at a time. + for (int bi = 0; bi < dim; ++bi) + { + // Accumulate block Frobenius norm squared per block col. + std::unordered_map col2norm2; + + for (int li = 0; li < BLOCK_DIM; ++li) + { + int scalar_row = bi * BLOCK_DIM + li; + if (scalar_row >= Arow.rows()) + { + break; + } + + for (Iter it(Arow, scalar_row); it; ++it) + { + int bj = it.col() / BLOCK_DIM; + if (bj == bi) + { + continue; // self-excluding + } + + double v = it.value(); + col2norm2[bj] += v * v; + } + } + + std::vector row_cols; + row_cols.reserve(col2norm2.size()); + for (auto &kv : col2norm2) + { + row_cols.push_back(kv.first); + } + std::sort(row_cols.begin(), row_cols.end()); + + row_ptr[bi + 1] = static_cast(row_cols.size()); + for (int bj : row_cols) + { + cols.push_back(bj); + norms.push_back(std::sqrt(col2norm2[bj])); + } + } + + for (int i = 0; i < dim; ++i) + { + row_ptr[i + 1] += row_ptr[i]; + } + + // Stabilize weights for the graph partitioner: symmetrize, then quantize. + avg_sym_weight(row_ptr, cols, norms); + weights = quantize_weight(norms); + } + } // namespace + + BSRAdjacency::BSRAdjacency(const StiffnessMatrix &A, int block_dim) + { + if (A.cols() != A.rows() || A.cols() == 0 || A.rows() == 0 || A.nonZeros() == 0) + { + throw std::runtime_error("[CudaPcg] Factorization failed due to invalid A"); + } + if (A.cols() > std::numeric_limits::max() || A.rows() > std::numeric_limits::max()) + { + throw std::runtime_error("[CudaPcg] A is too large. 
Row/Col number exceeding int32 max."); + } + if (block_dim < 1 || block_dim > 3) + { + throw std::runtime_error("[CudaPcg] MAS only supports block size 1, 2, or 3."); + } + + int dim = (A.rows() + block_dim - 1) / block_dim; + if (block_dim == 1) + { + build_host_adjacency<1>(A, dim, row_ptr, cols, weights); + } + else if (block_dim == 2) + { + build_host_adjacency<2>(A, dim, row_ptr, cols, weights); + } + else + { + build_host_adjacency<3>(A, dim, row_ptr, cols, weights); + } + } + +} // namespace polysolve::linear::mas diff --git a/src/polysolve/linear/mas_utils/BSRAdjacency.hpp b/src/polysolve/linear/mas_utils/BSRAdjacency.hpp new file mode 100644 index 00000000..0a89eadb --- /dev/null +++ b/src/polysolve/linear/mas_utils/BSRAdjacency.hpp @@ -0,0 +1,23 @@ +#pragma once + +#include + +#include +#include + +namespace polysolve::linear::mas +{ + /// Host-only CSR adjacency for graph partitioning. + /// Self-excluding, with symmetrized + quantized positive integer weights. + class BSRAdjacency + { + public: + BSRAdjacency() = default; + BSRAdjacency(const StiffnessMatrix &A, int block_dim); + + std::vector row_ptr; + std::vector cols; + std::vector weights; + }; + +} // namespace polysolve::linear::mas diff --git a/src/polysolve/linear/mas_utils/BSRMatrix.cu b/src/polysolve/linear/mas_utils/BSRMatrix.cu new file mode 100644 index 00000000..01e95ef7 --- /dev/null +++ b/src/polysolve/linear/mas_utils/BSRMatrix.cu @@ -0,0 +1,445 @@ +#include + +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +namespace polysolve::linear::mas +{ + namespace + { + // Key bit layout: + // | block row idx (32) | block col idx (32) | + + __device__ uint64_t pack_key(int block_row, int block_col) + { + return (static_cast(static_cast(block_row)) << 32) | static_cast(block_col); + } + + __device__ int key_to_row(uint64_t key) + { + return static_cast(static_cast(key >> 32)); + } + + __device__ int key_to_col(uint64_t key) + { + return static_cast(static_cast(key)); + } + + // Payload bit layout: + // | is padding (1) | block local offset (31) | nnz index (32) | + + /// @brief Pack payload for radix sort. + /// @param is_padding If true, this is the diagonal padding of trailing block. + /// @param nnz_index Index of input CSC vals array. + /// @param local_offset Block local offset. Assumes Row major block. 
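+        /// e.g. with block_dim 3, the entry at block-local (row 1, col 2) gets
+        /// local_offset 1 * 3 + 2 = 5.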
+ __device__ uint64_t pack_payload(bool is_padding, int nnz_index, int local_offset) + { + uint64_t payload = 0; + payload |= static_cast(nnz_index); + payload |= (static_cast(static_cast(local_offset)) << 32); + if (is_padding) + { + payload |= (uint64_t{1} << 63); + } + return payload; + } + + __device__ bool payload_to_is_padding(uint64_t payload) + { + return (payload >> 63) != 0; + } + + __device__ int payload_to_local_offset(uint64_t payload) + { + return static_cast(static_cast(payload >> 32) & 0x7FFFFFFF); + } + + __device__ int payload_to_nnz_index(uint64_t payload) + { + return static_cast(static_cast(payload)); + } + + __global__ void build_col_of_nnz(int col_num, + ctd::span col_ptr, + ctd::span col_of_nnz) + { + int col = blockDim.x * blockIdx.x + threadIdx.x; + if (col >= col_num) + { + return; + } + + int begin = col_ptr[col]; + int end = col_ptr[col + 1]; + for (int k = begin; k < end; ++k) + { + col_of_nnz[k] = col; + } + } + + __global__ void build_keys_payloads(int nnz, + int padded, + int block_dim, + int dim_blocks, + const int *perm, // nullable + ctd::span col_of_nnz, + ctd::span row_idx, + ctd::span keys, + ctd::span payloads) + { + int tid = blockDim.x * blockIdx.x + threadIdx.x; + int nnz_total = nnz + padded; + if (tid >= nnz_total) + { + return; + } + + // Real node from input A. compute permuted block ij then pack. + if (tid < nnz) + { + int old_row = row_idx[tid]; + int old_col = col_of_nnz[tid]; + + int br_old = old_row / block_dim; + int bc_old = old_col / block_dim; + int br = (perm == nullptr) ? br_old : perm[br_old]; + int bc = (perm == nullptr) ? bc_old : perm[bc_old]; + + int offset = (old_row - br_old * block_dim) * block_dim + (old_col - bc_old * block_dim); + keys[tid] = pack_key(br, bc); + payloads[tid] = pack_payload(false, tid, offset); + } + // Padding for trailing block. Compute permuted block ij for tail block then pack. + else + { + int t = tid - nnz; + int tail_old = dim_blocks - 1; + int tail_new = (perm == nullptr) ? tail_old : perm[tail_old]; + int local = (block_dim - padded) + t; + int offset = local * block_dim + local; + + keys[tid] = pack_key(tail_new, tail_new); + payloads[tid] = pack_payload(true, 0, offset); + } + } + + __global__ void extract_block_rows(ctd::span unique_keys, + ctd::span block_rows) + { + int tid = blockDim.x * blockIdx.x + threadIdx.x; + if (tid >= unique_keys.size()) + { + return; + } + + block_rows[tid] = key_to_row(unique_keys[tid]); + } + + __global__ void fill_bsr_cols_vals(int block_dim, + ctd::span unique_keys, + ctd::span payload_offsets, + ctd::span payloads, + ctd::span csc_vals, + ctd::span bsr_cols, + ctd::span bsr_vals) + { + int bid = blockDim.x * blockIdx.x + threadIdx.x; + if (bid >= unique_keys.size()) + { + return; + } + + bsr_cols[bid] = key_to_col(unique_keys[bid]); + + int begin = payload_offsets[bid]; + int end = payload_offsets[bid + 1]; + int block_size = block_dim * block_dim; + int base = bid * block_size; + for (int i = begin; i < end; ++i) + { + uint64_t payload = payloads[i]; + int offset = payload_to_local_offset(payload); + // We pad diagonal entry to 1.0 for trailing blocks. + double val = payload_to_is_padding(payload) ? 
1.0 : csc_vals[payload_to_nnz_index(payload)]; + bsr_vals[base + offset] = val; + } + } + + int build_device_bsr_from_csc(ctd::span d_col_ptr, + ctd::span d_row_idx, + ctd::span d_vals, + int rows_num, + int cols_num, + int block_dim, + const int *d_perm, + Buf &out_rows, + Buf &out_cols, + Buf &out_vals, + CudaRuntime rt) + { + int nnz = d_row_idx.size(); + int dim_blocks = div_round_up(rows_num, block_dim); + int padded = block_dim * dim_blocks - rows_num; + int nnz_total = nnz + padded; + + // --------------------------------------------------------------------------- + // Map each nnz index to input col. + // --------------------------------------------------------------------------- + + auto col_of_nnz = cu::make_buffer(rt.stream, rt.mr, nnz, cu::no_init); + build_col_of_nnz<<>>( + cols_num, d_col_ptr, col_of_nnz); + + // --------------------------------------------------------------------------- + // Convert each non-zero to [key, payload] pair. + // --------------------------------------------------------------------------- + + auto keys_in = cu::make_buffer(rt.stream, rt.mr, nnz_total, cu::no_init); + auto payloads_in = cu::make_buffer(rt.stream, rt.mr, nnz_total, cu::no_init); + build_keys_payloads<<>>( + nnz, + padded, + block_dim, + dim_blocks, + d_perm, + col_of_nnz, + d_row_idx, + keys_in, + payloads_in); + + // --------------------------------------------------------------------------- + // Radix sort by key. Result should be in row major order. + // --------------------------------------------------------------------------- + + auto keys_alt = cu::make_buffer(rt.stream, rt.mr, nnz_total, cu::no_init); + auto payloads_alt = cu::make_buffer(rt.stream, rt.mr, nnz_total, cu::no_init); + cub::DoubleBuffer d_keys(keys_in.data(), keys_alt.data()); + cub::DoubleBuffer d_payloads(payloads_in.data(), payloads_alt.data()); + Buf cub_tmp; + auto make_cub_tmp = [&cub_tmp, rt](size_t required_size) { + if (!cub_tmp || cub_tmp->size() < required_size) + { + cub_tmp = cu::make_buffer(rt.stream, rt.mr, required_size, cu::no_init); + } + return cub_tmp->data(); + }; + + size_t sort_tmp_size = 0; + cub::DeviceRadixSort::SortPairs(nullptr, + sort_tmp_size, + d_keys, + d_payloads, + nnz_total, + 0, + 64, + rt.stream.get()); + cub::DeviceRadixSort::SortPairs(make_cub_tmp(sort_tmp_size), + sort_tmp_size, + d_keys, + d_payloads, + nnz_total, + 0, + 64, + rt.stream.get()); + + // --------------------------------------------------------------------------- + // Run-length encode. + // This step computes non-zero block num + payload count for each block. + // --------------------------------------------------------------------------- + + auto unique_keys = cu::make_buffer(rt.stream, rt.mr, nnz_total, cu::no_init); + auto counts = cu::make_buffer(rt.stream, rt.mr, nnz_total, cu::no_init); + auto num_runs = cu::make_buffer(rt.stream, rt.mr, 1, cu::no_init); + + size_t rle_tmp_size = 0; + cub::DeviceRunLengthEncode::Encode(nullptr, + rle_tmp_size, + d_keys.Current(), + unique_keys.data(), + counts.data(), + num_runs.data(), + nnz_total, + rt.stream.get()); + cub::DeviceRunLengthEncode::Encode(make_cub_tmp(rle_tmp_size), + rle_tmp_size, + d_keys.Current(), + unique_keys.data(), + counts.data(), + num_runs.data(), + nnz_total, + rt.stream.get()); + + // --------------------------------------------------------------------------- + // Histogram + ExclusiveSum + // This step count non-zero blocks per rows to compute row_ptr. 
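+            // (HistogramEven counts the non-zero blocks in each block row; the
+            // exclusive scan below turns those counts into BSR row offsets.)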
+ // --------------------------------------------------------------------------- + + int nnz_blocks = device2host(num_runs.data(), rt); + auto block_rows = cu::make_buffer(rt.stream, rt.mr, nnz_blocks, cu::no_init); + // Extract row index from packed key. + extract_block_rows<<>>( + ctd::span(unique_keys.data(), nnz_blocks), + block_rows); + // Histogram count nnz block per row. + auto hist = cu::make_buffer(rt.stream, rt.mr, dim_blocks + 1, 0); + size_t hist_tmp_size = 0; + cub::DeviceHistogram::HistogramEven(nullptr, + hist_tmp_size, + block_rows.data(), + hist.data(), + dim_blocks + 1, + 0, + dim_blocks, + nnz_blocks, + rt.stream.get()); + cub::DeviceHistogram::HistogramEven(make_cub_tmp(hist_tmp_size), + hist_tmp_size, + block_rows.data(), + hist.data(), + dim_blocks + 1, + 0, + dim_blocks, + nnz_blocks, + rt.stream.get()); + // Exclusive scan compute CSR row ptr. + out_rows = cu::make_buffer(rt.stream, rt.mr, dim_blocks + 1, cu::no_init); + size_t scan_tmp_size = 0; + cub::DeviceScan::ExclusiveSum(nullptr, + scan_tmp_size, + hist.data(), + out_rows->data(), + dim_blocks + 1, + rt.stream.get()); + cub::DeviceScan::ExclusiveSum(make_cub_tmp(scan_tmp_size), + scan_tmp_size, + hist.data(), + out_rows->data(), + dim_blocks + 1, + rt.stream.get()); + + // --------------------------------------------------------------------------- + // Fill cols and vals non-zero blocks. + // --------------------------------------------------------------------------- + + // Compute Payload offsets for each block. + auto payload_offsets = cu::make_buffer(rt.stream, rt.mr, nnz_blocks + 1, cu::no_init); + cudaMemsetAsync(counts.data() + nnz_blocks, 0, sizeof(int), rt.stream.get()); + size_t off_tmp_size = 0; + cub::DeviceScan::ExclusiveSum(nullptr, + off_tmp_size, + counts.data(), + payload_offsets.data(), + nnz_blocks + 1, + rt.stream.get()); + cub::DeviceScan::ExclusiveSum(make_cub_tmp(off_tmp_size), + off_tmp_size, + counts.data(), + payload_offsets.data(), + nnz_blocks + 1, + rt.stream.get()); + + // Fill cols and vals. + out_cols = cu::make_buffer(rt.stream, rt.mr, nnz_blocks, cu::no_init); + out_vals = cu::make_buffer(rt.stream, rt.mr, nnz_blocks * block_dim * block_dim, 0.0); + fill_bsr_cols_vals<<>>( + block_dim, + ctd::span(unique_keys.data(), nnz_blocks), + payload_offsets, + ctd::span(d_payloads.Current(), nnz_total), + d_vals, + *out_cols, + *out_vals); + + return nnz_blocks; + } + } // namespace + + BSRMatrix::BSRMatrix(const StiffnessMatrix &A, + int block_dim, + ctd::span permutation, + CudaRuntime rt) + { + static_assert(std::is_same_v, "MAS only support int32 index type."); + if (A.nonZeros() > std::numeric_limits::max()) + { + throw std::runtime_error("[CudaPcg] A is too large. Non-zero number exceeding int32 max."); + } + + // if A is not compressed. Copy and compress. + const StiffnessMatrix *A_csc = &A; + StiffnessMatrix compressed_A; + if (!A.isCompressed()) + { + compressed_A = A; + compressed_A.makeCompressed(); + A_csc = &compressed_A; + } + + block_dim_ = block_dim; + dim_ = div_round_up(A_csc->rows(), block_dim_); + + int rows_num = A_csc->rows(); + int cols_num = A_csc->cols(); + int nnz = A_csc->nonZeros(); + + // Upload input CSC to device. 
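+        // (Eigen's compressed CSC maps directly: outerIndexPtr() is the column
+        // pointer, innerIndexPtr() holds the row indices, valuePtr() the values.)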
+ auto d_col_ptr = cu::make_buffer(rt.stream, rt.mr, cols_num + 1, cu::no_init); + auto d_row_idx = cu::make_buffer(rt.stream, rt.mr, nnz, cu::no_init); + auto d_vals = cu::make_buffer(rt.stream, rt.mr, nnz, cu::no_init); + cudaMemcpyAsync( + d_col_ptr.data(), + A_csc->outerIndexPtr(), + static_cast(cols_num + 1) * sizeof(int), + cudaMemcpyHostToDevice, + rt.stream.get()); + cudaMemcpyAsync( + d_row_idx.data(), + A_csc->innerIndexPtr(), + static_cast(nnz) * sizeof(int), + cudaMemcpyHostToDevice, + rt.stream.get()); + cudaMemcpyAsync( + d_vals.data(), + A_csc->valuePtr(), + static_cast(nnz) * sizeof(double), + cudaMemcpyHostToDevice, + rt.stream.get()); + + // Upload permutation if non empty. + const int *d_perm_ptr = nullptr; + Buf d_perm; + if (!permutation.empty()) + { + d_perm = cu::make_buffer(rt.stream, rt.mr, permutation.size(), cu::no_init); + cu::copy_bytes(rt.stream, permutation, *d_perm); + d_perm_ptr = d_perm->data(); + } + + non_zeros_ = build_device_bsr_from_csc( + d_col_ptr, + d_row_idx, + d_vals, + rows_num, + cols_num, + block_dim_, + d_perm_ptr, + rows_, + cols_, + vals_, + rt); + + rt.stream.sync(); + } + +} // namespace polysolve::linear::mas diff --git a/src/polysolve/linear/mas_utils/BSRMatrix.hpp b/src/polysolve/linear/mas_utils/BSRMatrix.hpp new file mode 100644 index 00000000..ecf68f2a --- /dev/null +++ b/src/polysolve/linear/mas_utils/BSRMatrix.hpp @@ -0,0 +1,47 @@ +#pragma once + +#include +#include + +#include + +namespace polysolve::linear::mas +{ + struct BSRView + { + int dim; + int block_dim; + int non_zeros; + ctd::span rows; + ctd::span cols; + ctd::span vals; + }; + + /// @brief Block CSR matrix. + class BSRMatrix + { + public: + BSRMatrix() = default; + + /// @brief Build from host CSR matrix. Empty permutation implies identity. + /// Sync stream before return. 
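+        /// (All device work happens on rt.stream; the constructor syncs it, so
+        /// the buffers are ready to use on return.)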
+ BSRMatrix(const StiffnessMatrix &A, + int block_dim, + ctd::span permutation, + CudaRuntime rt); + + BSRView view() const + { + return BSRView{dim_, block_dim_, non_zeros_, *rows_, *cols_, *vals_}; + } + + private: + int dim_ = 0; + int block_dim_ = 0; + int non_zeros_ = 0; + Buf rows_; + Buf cols_; + Buf vals_; + }; + +} // namespace polysolve::linear::mas diff --git a/src/polysolve/linear/mas_utils/CuSparseWrapper.hpp b/src/polysolve/linear/mas_utils/CuSparseWrapper.hpp new file mode 100644 index 00000000..20c8065a --- /dev/null +++ b/src/polysolve/linear/mas_utils/CuSparseWrapper.hpp @@ -0,0 +1,182 @@ +#pragma once + +#include + +#include + +// -------------------------------------------------------------- +// RAII wrapper for varous cuSparse handle +// -------------------------------------------------------------- + +namespace polysolve::linear::mas +{ + class CuSparseHandle + { + public: + cusparseHandle_t raw = nullptr; + + CuSparseHandle() + { + cusparseCreate(&raw); + } + + ~CuSparseHandle() + { + if (raw != nullptr) + { + cusparseDestroy(raw); + } + } + + CuSparseHandle(const CuSparseHandle &) = delete; + CuSparseHandle(CuSparseHandle &&other) { swap(other); } + CuSparseHandle &operator=(const CuSparseHandle &) = delete; + CuSparseHandle &operator=(CuSparseHandle &&other) + { + swap(other); + return *this; + } + + private: + void swap(CuSparseHandle &other) + { + std::swap(raw, other.raw); + } + }; + + class CuSparseConstVec + { + public: + cusparseConstDnVecDescr_t raw = nullptr; + + CuSparseConstVec(ctd::span vec) + { + cusparseCreateConstDnVec(&raw, vec.size(), vec.data(), CUDA_R_64F); + } + + ~CuSparseConstVec() + { + if (raw != nullptr) + { + cusparseDestroyDnVec(raw); + } + } + + CuSparseConstVec(const CuSparseConstVec &) = delete; + CuSparseConstVec(CuSparseConstVec &&other) { swap(other); } + CuSparseConstVec &operator=(const CuSparseConstVec &) = delete; + CuSparseConstVec &operator=(CuSparseConstVec &&other) + { + swap(other); + return *this; + } + + private: + void swap(CuSparseConstVec &other) + { + std::swap(raw, other.raw); + } + }; + + /// @brief Minimal RAII wrapper for CuSparse dense vector. 
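+    /// Non-owning: the descriptor references the span's device memory, which
+    /// must outlive the wrapper.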
+ class CuSparseVec + { + public: + cusparseDnVecDescr_t raw = nullptr; + + CuSparseVec(ctd::span vec) + { + cusparseCreateDnVec(&raw, vec.size(), vec.data(), CUDA_R_64F); + } + + ~CuSparseVec() + { + if (raw != nullptr) + { + cusparseDestroyDnVec(raw); + } + } + + CuSparseVec(const CuSparseVec &) = delete; + CuSparseVec(CuSparseVec &&other) { swap(other); } + CuSparseVec &operator=(const CuSparseVec &) = delete; + CuSparseVec &operator=(CuSparseVec &&other) + { + swap(other); + return *this; + } + + private: + void swap(CuSparseVec &other) + { + std::swap(raw, other.raw); + } + }; + + class CuSparseBSR + { + public: + cusparseConstSpMatDescr_t raw = nullptr; + + CuSparseBSR() = default; + + CuSparseBSR(BSRView mat) + { + if (mat.block_dim == 1) + { + cusparseCreateConstCsr(&raw, + mat.dim, + mat.dim, + mat.non_zeros, + mat.rows.data(), + mat.cols.data(), + mat.vals.data(), + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F); + } + else + { + cusparseCreateConstBsr(&raw, + mat.dim, + mat.dim, + mat.non_zeros, + mat.block_dim, + mat.block_dim, + mat.rows.data(), + mat.cols.data(), + mat.vals.data(), + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F, + CUSPARSE_ORDER_ROW); + } + } + + ~CuSparseBSR() + { + if (raw != nullptr) + { + cusparseDestroySpMat(raw); + } + } + + CuSparseBSR(const CuSparseBSR &) = delete; + CuSparseBSR(CuSparseBSR &&other) { swap(other); } + CuSparseBSR &operator=(const CuSparseBSR &) = delete; + CuSparseBSR &operator=(CuSparseBSR &&other) + { + swap(other); + return *this; + } + + private: + void swap(CuSparseBSR &other) + { + std::swap(raw, other.raw); + } + }; + +} // namespace polysolve::linear::mas diff --git a/src/polysolve/linear/mas_utils/CudaUtils.cuh b/src/polysolve/linear/mas_utils/CudaUtils.cuh new file mode 100644 index 00000000..ddd72ba7 --- /dev/null +++ b/src/polysolve/linear/mas_utils/CudaUtils.cuh @@ -0,0 +1,52 @@ +#pragma once + +#include +#include +#include +#include + +#define __both__ __host__ __device__ + +// Convenient namespace alias for libcudacxx. +namespace cu = ::cuda; +namespace ctd = ::cuda::std; + +namespace polysolve::linear::mas +{ + /// @brief ceil(num/denom) + constexpr int div_round_up(int num, int denom) + { + return (num + denom - 1) / denom; + } + + struct CudaRuntime + { + cu::stream_ref stream; + cu::device_memory_pool_ref mr; + }; + + /// @brief Transfer device scalar src to host. + template + T device2host(const T *src, CudaRuntime rt) + { + T result; + cudaMemcpyAsync(&result, src, sizeof(T), cudaMemcpyDeviceToHost, + rt.stream.get()); + rt.stream.sync(); + return result; + } + + /// @brief Transfer host scalar val to device dst. Does not sync stream. + template + void host2device(T *dst, T val, CudaRuntime rt) + { + cudaMemcpyAsync(dst, &val, sizeof(T), cudaMemcpyHostToDevice, + rt.stream.get()); + } + + /// @brief Nullable device buffer. + /// It's very annoying device_buffer does not have default ctor for empty buffer. 
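+    /// (A disengaged Buf means "not yet allocated"; dereference with *buf only
+    /// after creating it via cu::make_buffer.)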
+ template <typename T> + using Buf = ctd::optional<cu::device_buffer<T>>; + +} // namespace polysolve::linear::mas diff --git a/src/polysolve/linear/mas_utils/GraphPartition.cpp b/src/polysolve/linear/mas_utils/GraphPartition.cpp new file mode 100644 index 00000000..2219883d --- /dev/null +++ b/src/polysolve/linear/mas_utils/GraphPartition.cpp @@ -0,0 +1,92 @@ +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +namespace polysolve::linear::mas +{ + void graph_partition(ctd::span<int> row_ptr, + ctd::span<int> cols, + ctd::span<int64_t> weights, + int max_part_size, + int &part_num, + std::vector<int> &part_id) + { + int node_num = row_ptr.size() - 1; + assert(node_num >= 0); + assert(max_part_size > 0); + assert(weights.empty() || weights.size() == cols.size()); + + part_id.resize(node_num, 0); + if (node_num <= max_part_size) + { + part_num = 1; + return; + } + + static_assert(sizeof(int) == sizeof(kaminpar_node_id_t), "int and KaMinPar node IDs differ in size"); + static_assert(sizeof(int) == sizeof(kaminpar_edge_id_t), "int and KaMinPar edge IDs differ in size"); + static_assert(sizeof(int64_t) == sizeof(kaminpar_edge_weight_t), + "int64 and KaMinPar edge weights differ in size (enable KAMINPAR_64BIT_WEIGHTS?)"); + + auto *xadj = reinterpret_cast<kaminpar_edge_id_t *>(row_ptr.data()); + auto *adjncy = reinterpret_cast<kaminpar_node_id_t *>(cols.data()); + auto *adjwgt = weights.empty() + ? nullptr + : reinterpret_cast<kaminpar_edge_weight_t *>(weights.data()); + + kaminpar_context_t *ctx = kaminpar_create_context_by_preset_name("default"); + assert(ctx); + + int thread_num = (std::thread::hardware_concurrency() != 0) + ? std::thread::hardware_concurrency() + : 1; + kaminpar_t *partitioner = kaminpar_create(thread_num, ctx); + assert(partitioner); + + kaminpar_set_output_level(partitioner, KAMINPAR_OUTPUT_LEVEL_QUIET); + kaminpar_borrow_and_mutate_graph( + partitioner, + node_num, + xadj, + adjncy, + nullptr, + adjwgt); + + // Implements Eq. 7 of arXiv:2411.06224. + + // Forcing a tight K on the k-way algorithm leads to a low-quality partition. + // It seems like max_part_size - 2 is a good balance. + part_num = div_round_up(node_num, max_part_size - 2); + assert(part_num >= 1); + std::vector<kaminpar_block_weight_t> max_block_weights( + part_num, + static_cast<kaminpar_block_weight_t>(max_part_size)); + std::vector<kaminpar_block_id_t> partition(node_num); + + // K-way partition with absolute max block weights. + // Since our node weights are uniform, this is equivalent to limiting partition size. + kaminpar_compute_partition_with_max_block_weights( + partitioner, + part_num, + max_block_weights.data(), + partition.data()); + + kaminpar_context_free(ctx); + kaminpar_free(partitioner); + + for (int i = 0; i < node_num; ++i) + { + part_id[i] = static_cast<int>(partition[i]); + } + } + +} // namespace polysolve::linear::mas diff --git a/src/polysolve/linear/mas_utils/GraphPartition.hpp b/src/polysolve/linear/mas_utils/GraphPartition.hpp new file mode 100644 index 00000000..3bd45dfd --- /dev/null +++ b/src/polysolve/linear/mas_utils/GraphPartition.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include +#include +#include + +#include + +namespace polysolve::linear::mas +{ + /// @brief K-way graph partition. + /// @param row_ptr[in,out] CSR graph topology. Will be modified! + /// @param cols[in,out] CSR graph topology. Will be modified! + /// @param weights[in,out] CSR graph edge weights. Empty span implies an unweighted graph. Will be modified! + /// @param max_part_size[in] Maximum partition size. + /// @param part_num[out] Total number of partitions. + /// @param part_id[out] Partition id of each graph node.
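+ /// + /// For intuition (hypothetical outcome; the exact assignment is up to KaMinPar): + /// a 5-node chain 0-1-2-3-4 with max_part_size = 3 could return part_num = 2 + /// with part_id = {0, 0, 0, 1, 1}.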
+ void graph_partition(ctd::span row_ptr, + ctd::span cols, + ctd::span weights, + int max_part_size, + int &part_num, + std::vector &part_id); +} // namespace polysolve::linear::mas diff --git a/src/polysolve/linear/mas_utils/InnerProduct.cu b/src/polysolve/linear/mas_utils/InnerProduct.cu new file mode 100644 index 00000000..cc93e23b --- /dev/null +++ b/src/polysolve/linear/mas_utils/InnerProduct.cu @@ -0,0 +1,46 @@ +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace polysolve::linear::mas +{ + + namespace + { + __global__ void inner_product_kernel( + ctd::span a, ctd::span b, double *out) + { + int tid = blockDim.x * blockIdx.x + threadIdx.x; + + double c = (tid < a.size()) ? a[tid] * b[tid] : 0.0; + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage tmp; + + double reduced = BlockReduce(tmp).Sum(c); + + if (threadIdx.x == 0) + { + cu::atomic_ref a_out{*out}; + a_out.fetch_add(reduced, ctd::memory_order_relaxed); + } + } + } // namespace + + void inner_product(ctd::span a, ctd::span b, + ctd::span out, CudaRuntime rt) + { + assert(a.size() == b.size()); + assert(out.size() == 1); + + cu::fill_bytes(rt.stream, out, 0); + int grid_num = div_round_up(a.size(), 128); + inner_product_kernel<<>>(a, b, out.data()); + } +} // namespace polysolve::linear::mas diff --git a/src/polysolve/linear/mas_utils/InnerProduct.hpp b/src/polysolve/linear/mas_utils/InnerProduct.hpp new file mode 100644 index 00000000..6fb584be --- /dev/null +++ b/src/polysolve/linear/mas_utils/InnerProduct.hpp @@ -0,0 +1,12 @@ +#pragma once + +#include +#include +#include + +namespace polysolve::linear::mas +{ + /// @brief Compute inner product a dot b. Does not sync implicitly. + void inner_product(ctd::span a, ctd::span b, + ctd::span out, CudaRuntime rt); +} // namespace polysolve::linear::mas diff --git a/src/polysolve/linear/mas_utils/MASPreconditioner.cu b/src/polysolve/linear/mas_utils/MASPreconditioner.cu new file mode 100644 index 00000000..a23435dc --- /dev/null +++ b/src/polysolve/linear/mas_utils/MASPreconditioner.cu @@ -0,0 +1,1117 @@ +#include + +// #ifndef SPDLOG_ACTIVE_LEVEL +// #define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_TRACE +// #endif + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace polysolve::linear::mas +{ + namespace + { + using clock = std::chrono::steady_clock; + + double elapsed_seconds(const std::chrono::time_point &begin) + { + return std::chrono::duration(clock::now() - begin).count(); + } + + // This is not tunable. + constexpr int BANK_SIZE = 32; + + /// @brief Convenience wrapper for CoarseMatrices. 
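+ /// + /// CoarseMatrices packs every level's matrices into one flat buffer: + /// | lv0_mat0 lv0_mat1 ... | lv1_mat0 ... | ... + /// matrix_per_level[lv] is the span over level lv's slice, so kernels address + /// one matrix as matrix_per_level[lv].data() + mat_id * mat_storage_size.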
+ struct CoarseMatricesRef + { + int mat_dim; + int mat_storage_size; + ctd::array, MAS_LEVEL_COUNT> matrix_per_level; + + CoarseMatricesRef(CoarseMatrices &mats) + { + mat_dim = mats.mat_dim; + mat_storage_size = mats.mat_storage_size; + + int scalar_offset = 0; + for (int i = 0; i < MAS_LEVEL_COUNT; ++i) + { + int level_size = mats.matrix_counts[i] * mats.mat_storage_size; + matrix_per_level[i] = + ctd::span(mats.data->data() + scalar_offset, level_size); + scalar_offset += level_size; + } + } + }; + + __global__ void build_padded_maps(ctd::span part_offsets, + ctd::span bsr_rows, + ctd::span real_to_padded, + ctd::span padded_to_real, + ctd::span real_num_per_row) + { + int padded_id = blockDim.x * blockIdx.x + threadIdx.x; + if (padded_id >= real_num_per_row.size()) + { + return; + } + + int part = padded_id / BANK_SIZE; + int local = padded_id % BANK_SIZE; + int part_begin = part_offsets[part]; + int part_end = part_offsets[part + 1]; + int part_size = part_end - part_begin; + + if (local < part_size) + { + int real_id = part_begin + local; + real_to_padded[real_id] = padded_id; + padded_to_real[padded_id] = real_id; + real_num_per_row[padded_id] = bsr_rows[real_id + 1] - bsr_rows[real_id]; + } + else + { + padded_to_real[padded_id] = -1; + real_num_per_row[padded_id] = 0; + } + } + + __global__ void fill_padded_cols(ctd::span bsr_rows, + ctd::span bsr_cols, + ctd::span real_to_padded, + ctd::span padded_rows, + ctd::span padded_cols) + { + int real_id = blockDim.x * blockIdx.x + threadIdx.x; + int node_num = bsr_rows.size() - 1; + if (real_id >= node_num) + { + return; + } + + int padded_id = real_to_padded[real_id]; + int dst = padded_rows[padded_id]; + for (int n = bsr_rows[real_id]; n < bsr_rows[real_id + 1]; ++n) + { + padded_cols[dst] = real_to_padded[bsr_cols[n]]; + ++dst; + } + } + + /// @brief Get coarse space CCO id. + /// @param map Coarse space map. + /// @param vid Vertex id. + /// @param level Coarse space level. Start at lv 0. + /// @return CCO id. + __both__ int get_coarse_space_id(ctd::span map, int vid, int level) + { + return map[vid * MAS_MAX_COARSE_LEVEL + level]; + } + + /// @brief Build local CCO mapping from input space -> coarse space lv 0 and collapse topology. + /// @param row_ptr CSR graph topology (may include self). + /// @param cols CSR graph topology (may include self). + /// @param padded_to_real Padded id -> real id. -1 denotes padding. + /// @param cco_num_per_bank CCO number per bank. + /// @param local_cco_ids Bank local CCO id at coarse space lv 0. + __global__ void build_local_cco_lv0(ctd::span row_ptr, + ctd::span cols, + ctd::span padded_to_real, + ctd::span cco_num_per_bank, + ctd::span local_cco_ids) + { + // Bank local neighbor masks. + __shared__ uint32_t neighbors[128]; + + int btid = threadIdx.x; // block local thread id + int tid = blockDim.x * blockIdx.x + btid; // global thread id + int wid = threadIdx.x / BANK_SIZE; // block local warp id + int lid = threadIdx.x % BANK_SIZE; // lane id + int bid = tid / BANK_SIZE; // bank id + + if (lid == 0 && bid < cco_num_per_bank.size()) + { + cco_num_per_bank[bid] = 0; + } + + int node_num = row_ptr.size() - 1; + if (tid >= node_num) + { + return; + } + // Skip virtual padding. + if (padded_to_real[tid] == -1) + { + local_cco_ids[tid] = -1; + return; + } + + // Build neighbor mask (including self) and collapse topology. 
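+ // Each lane records which lanes of its own bank it touches as a 32-bit mask; + // e.g. if lane 3 has in-bank neighbors 1 and 7, its mask is + // (1u << 1) | (1u << 3) | (1u << 7), self bit included.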
+ int row_begin = row_ptr[tid]; + int row_end = row_ptr[tid + 1]; + uint32_t neighbor = (1u << lid); + int out_of_bank_neighbor_count = 0; + for (int n = row_begin; n < row_end; ++n) + { + int neighbor_vid = cols[n]; + int neighbor_bid = neighbor_vid / BANK_SIZE; + if (bid == neighbor_bid) + { + neighbor |= (1u << (neighbor_vid % BANK_SIZE)); + } + else + { + // Compact topology to remove in-bank neighbors so we can skip + // additional filtering when building future coarse spaces. + cols[row_begin + out_of_bank_neighbor_count] = neighbor_vid; + ++out_of_bank_neighbor_count; + } + } + if (row_begin + out_of_bank_neighbor_count < row_end) + { + // -1 denotes neighbor list ending. + cols[row_begin + out_of_bank_neighbor_count] = -1; + } + neighbors[btid] = neighbor; + __syncwarp(); + + // Build connectivity mask (including self). + uint32_t connection = neighbor; + uint32_t visited = (1u << lid); + uint32_t to_visit = connection ^ visited; + while (to_visit) + { + int visiting = ctd::countr_zero(to_visit); + connection |= neighbors[visiting + wid * BANK_SIZE]; + visited |= (1u << visiting); + to_visit = connection ^ visited; + } + + // Find bank (warp) local connected component (CCO). + uint32_t cco_lead_lid = ctd::countr_zero(connection); + bool is_lead = (cco_lead_lid == lid); + uint32_t lead_lanes = __ballot_sync(0xFFFFFFFFu, is_lead); + uint32_t before_cco_lead_mask = static_cast((1ull << cco_lead_lid) - 1ull); + + // Write cco info back. + local_cco_ids[tid] = ctd::popcount(lead_lanes & before_cco_lead_mask); + if (lid == 0) + { + cco_num_per_bank[bid] = ctd::popcount(lead_lanes); + } + } + + /// @brief Build bank neighbor mask and collapse topology. + /// @param level Target coarse space level. + /// @param coarse_space_map Coarse space map. + /// @param row_ptr CSR graph topology (may include self). + /// @param cols CSR graph topology (may include self). + /// @param padded_to_real Padded id -> real id. -1 denotes padding. + /// @param neighbors Bank local neighbor mask. + __global__ void build_neighbor_masks_lvx(int level, + ctd::span coarse_space_map, + ctd::span row_ptr, + ctd::span cols, + ctd::span padded_to_real, + ctd::span neighbors) + { + int tid = blockDim.x * blockIdx.x + threadIdx.x; // global thread id + int node_num = row_ptr.size() - 1; + if (tid >= node_num) + { + return; + } + // Skip virtual padding. + if (padded_to_real[tid] == -1) + { + return; + } + + // Build neighbor mask (including self) and collapse topology. + int row_begin = row_ptr[tid]; + int row_end = row_ptr[tid + 1]; + int real_id = padded_to_real[tid]; + int cco_id = get_coarse_space_id(coarse_space_map, real_id, level - 1); + uint32_t neighbor = (1u << (cco_id % BANK_SIZE)); + int out_of_bank_neighbor_count = 0; + for (int n = row_begin; n < row_end; ++n) + { + int neighbor_vid = cols[n]; + if (neighbor_vid == -1) + { + break; + } + + int neighbor_real_id = padded_to_real[neighbor_vid]; + int neighbor_cco_id = + get_coarse_space_id(coarse_space_map, neighbor_real_id, level - 1); + if (cco_id / BANK_SIZE == neighbor_cco_id / BANK_SIZE) + { + neighbor |= (1u << (neighbor_cco_id % BANK_SIZE)); + } + else + { + cols[row_begin + out_of_bank_neighbor_count] = neighbor_vid; + ++out_of_bank_neighbor_count; + } + } + if (row_begin + out_of_bank_neighbor_count < row_end) + { + cols[row_begin + out_of_bank_neighbor_count] = -1; + } + + atomicOr(neighbors.data() + cco_id, neighbor); + } + + /// @brief Build local CCO mapping from coarse space level x-1 -> coarse space level x. 
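+ /// + /// Each lane ORs in the masks of newly reached lanes until its visited set + /// stops growing. E.g. with neighbor masks {0b011, 0b011, 0b100}, lanes 0 and 1 + /// close into one component led by lane 0 (local CCO 0) and lane 2 leads its + /// own (local CCO 1).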
+ /// @param cco_num Total CCO number at coarse space lv x-1. + /// @param cco_num_per_bank Independent CCO number per bank at coarse space lv x. + /// @param local_cco_ids Bank local CCO id at coarse space lv x. + __global__ void build_local_cco_lvx(int cco_num, + ctd::span neighbors, + ctd::span cco_num_per_bank, + ctd::span local_cco_ids) + { + int tid = blockDim.x * blockIdx.x + threadIdx.x; // global thread id + int bid = tid / BANK_SIZE; // bank id + int lid = tid % BANK_SIZE; // lane id + if (tid >= cco_num) + { + return; + } + + // Build connectivity mask (including self). + uint32_t connection = neighbors[tid]; + uint32_t visited = (1u << lid); + uint32_t to_visit = connection ^ visited; + while (to_visit) + { + int visiting = ctd::countr_zero(to_visit); + connection |= neighbors[visiting + bid * BANK_SIZE]; + visited |= (1u << visiting); + to_visit = connection ^ visited; + } + + // Find bank (warp) local connected component (CCO). + uint32_t cco_lead_lid = ctd::countr_zero(connection); + bool is_lead = (cco_lead_lid == lid); + uint32_t lead_lanes = __ballot_sync(0xFFFFFFFFu, is_lead); + uint32_t before_cco_lead_mask = static_cast((1ull << cco_lead_lid) - 1ull); + + // Write cco info back. + local_cco_ids[tid] = ctd::popcount(lead_lanes & before_cco_lead_mask); + if (lid == 0) + { + cco_num_per_bank[bid] = ctd::popcount(lead_lanes); + } + } + + /// @brief Compute global CCO id and write it to coarse space map. + /// + /// Compute global CCO id offset from prefix sum, then write result to coarse_space_map. + __global__ void aggregate_coarse_space_map_lv0(ctd::span local_cco_ids, + ctd::span cco_num_per_bank, + ctd::span cco_num_per_bank_summed, + ctd::span real_to_padded, + ctd::span coarse_space_map) + { + int real_id = blockDim.x * blockIdx.x + threadIdx.x; + if (real_id >= real_to_padded.size()) + { + return; + } + + int padded_id = real_to_padded[real_id]; + int bid = padded_id / BANK_SIZE; + int cco_id = + local_cco_ids[padded_id] + cco_num_per_bank_summed[bid] - cco_num_per_bank[bid]; + coarse_space_map[real_id * MAS_MAX_COARSE_LEVEL] = cco_id; + } + + /// @brief Compute global CCO id and write it to coarse space map. + /// + /// Compute global CCO id offset from prefix sum, then write result to coarse_space_map. + __global__ void aggregate_coarse_space_map_lvx(int level, + ctd::span local_cco_ids, + ctd::span cco_num_per_bank, + ctd::span cco_num_per_bank_summed, + ctd::span coarse_space_map) + { + int real_id = blockDim.x * blockIdx.x + threadIdx.x; + int node_num = coarse_space_map.size() / MAS_MAX_COARSE_LEVEL; + if (real_id >= node_num) + { + return; + } + + int prev_cco_id = get_coarse_space_id(coarse_space_map, real_id, level - 1); + int bid = prev_cco_id / BANK_SIZE; + int cco_id = + local_cco_ids[prev_cco_id] + cco_num_per_bank_summed[bid] - cco_num_per_bank[bid]; + coarse_space_map[real_id * MAS_MAX_COARSE_LEVEL + level] = cco_id; + } + + /// @brief Build coarse space map. + /// + /// Coarse space map real node id layout: + /// | node0_lv0 node0_lv1 ... node0_lv_max | node1_lv0 ... node1_lv_max | ... | + /// + /// @param row_ptr CSR graph topology (may include self). + /// @param cols CSR graph topology (may include self). + /// @param real_to_padded Real id -> padded id. + /// @param padded_to_real Padded id -> real id. -1 denotes virtual padding. + /// @rt Cuda runtime config. + /// @warning Will modify cols as side effect! 
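+ /// + /// Per level the pipeline is: (1) build bank-local neighbor masks, (2) close + /// them into per-bank connected components, (3) prefix-sum the per-bank CCO + /// counts to assign global CCO ids, stopping early once a level no longer + /// merges anything.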
+ CoarseSpace build_coarse_space(ctd::span<const int> row_ptr, + ctd::span<int> cols, + ctd::span<const int> real_to_padded, + ctd::span<const int> padded_to_real, + CudaRuntime rt) + { + ctd::array<int, MAS_MAX_COARSE_LEVEL> cco_nums{}; + + int padded_node_num = row_ptr.size() - 1; + int real_node_num = real_to_padded.size(); + int max_bank_per_level = div_round_up(padded_node_num, BANK_SIZE); + // Bank local CCO id. + auto local_cco_ids = + cu::make_buffer<int>(rt.stream, rt.mr, padded_node_num, cu::no_init); + auto cco_num_per_bank = + cu::make_buffer<int>(rt.stream, rt.mr, max_bank_per_level, cu::no_init); + // Inclusive prefix sum of cco_num_per_bank. + auto cco_num_per_bank_summed = + cu::make_buffer<int>(rt.stream, rt.mr, max_bank_per_level, cu::no_init); + auto coarse_space_map = + cu::make_buffer<int>(rt.stream, rt.mr, real_node_num * MAS_MAX_COARSE_LEVEL, cu::no_init); + + // Build coarse map level 0. + int grid_num = div_round_up(padded_node_num, 128); + build_local_cco_lv0<<<grid_num, 128, 0, rt.stream.get()>>>( + row_ptr, cols, padded_to_real, cco_num_per_bank, local_cco_ids); + + // Prefix sum to compute global CCO id offset. + size_t cub_tmp_size = 0; + cub::DeviceScan::InclusiveSum(nullptr, + cub_tmp_size, + cco_num_per_bank.data(), + cco_num_per_bank_summed.data(), + max_bank_per_level, + rt.stream.get()); + auto cub_tmp = + cu::make_buffer<char>(rt.stream, rt.mr, cub_tmp_size, cu::no_init); + cub::DeviceScan::InclusiveSum(cub_tmp.data(), + cub_tmp_size, + cco_num_per_bank.data(), + cco_num_per_bank_summed.data(), + max_bank_per_level, + rt.stream.get()); + + aggregate_coarse_space_map_lv0<<<grid_num, 128, 0, rt.stream.get()>>>( + local_cco_ids, cco_num_per_bank, cco_num_per_bank_summed, real_to_padded, coarse_space_map); + + int cco_num = device2host(cco_num_per_bank_summed.data() + max_bank_per_level - 1, rt); + cco_nums[0] = cco_num; + int level_num = 1; + + // Build coarse map level 1 ... MAS_MAX_COARSE_LEVEL-1 iteratively. + // We do: + // 1. Build bank local neighbor bit mask. + // 2. Build bank local connectivity mask and find local CCO partition. + // 3. Compute global CCO id and write to coarse space map. + auto neighbors = cu::make_buffer<uint32_t>(rt.stream, rt.mr, padded_node_num, cu::no_init); + for (int lv = 1; lv < MAS_MAX_COARSE_LEVEL; ++lv) + { + cudaMemsetAsync(neighbors.data(), 0, cco_num * sizeof(uint32_t), rt.stream.get()); + + grid_num = div_round_up(padded_node_num, 128); + build_neighbor_masks_lvx<<<grid_num, 128, 0, rt.stream.get()>>>( + lv, + coarse_space_map, + row_ptr, + cols, + padded_to_real, + ctd::span(neighbors.data(), cco_num)); + + int bank_num = div_round_up(cco_num, BANK_SIZE); + grid_num = div_round_up(cco_num, 128); + build_local_cco_lvx<<<grid_num, 128, 0, rt.stream.get()>>>( + cco_num, + ctd::span(neighbors.data(), cco_num), + ctd::span(cco_num_per_bank.data(), bank_num), + ctd::span(local_cco_ids.data(), cco_num)); + + cub::DeviceScan::InclusiveSum(cub_tmp.data(), + cub_tmp_size, + cco_num_per_bank.data(), + cco_num_per_bank_summed.data(), + bank_num, + rt.stream.get()); + aggregate_coarse_space_map_lvx<<<grid_num, 128, 0, rt.stream.get()>>>( + lv, + ctd::span(local_cco_ids.data(), cco_num), + ctd::span(cco_num_per_bank.data(), bank_num), + ctd::span(cco_num_per_bank_summed.data(), bank_num), + coarse_space_map); + + // No more CCOs to merge. + int next_cco_num = device2host(cco_num_per_bank_summed.data() + bank_num - 1, rt); + if (next_cco_num == cco_num) + { + break; + } + + cco_num = next_cco_num; + cco_nums[lv] = cco_num; + level_num = lv + 1; + } + + CoarseSpace cs; + cs.cco_nums = cco_nums; + cs.level_num = level_num; + cs.map = std::move(coarse_space_map); + return cs; + } + + /// @brief Compute offset into a row-major packed upper-triangular dim x dim matrix.
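+ /// + /// Row i starts at offset i * dim - i * (i + 1) / 2, so for dim = 3 the packed + /// layout is [(0,0) (0,1) (0,2) (1,1) (1,2) (2,2)] and + /// index_upper_mat(3, 2, 1) == index_upper_mat(3, 1, 2) == 4.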
+ __both__ int index_upper_mat(int dim, int i, int j) + { + if (i > j) + { + ctd::swap(i, j); + } + return i * dim - (i * (i + 1) / 2) + j; + } + + /// @brief Build coarse space matrices by accumulating padded space hessian. + __global__ void fill_coarse_matrices(BSRView mat_in, + CoarseMatricesRef mat_out, + ctd::span coarse_space_map, + ctd::span real_to_padded, + int coarse_level_num) + { + int tid = blockDim.x * blockIdx.x + threadIdx.x; // global thread id + if (tid >= mat_in.dim) + { + return; + } + + int padded_i = real_to_padded[tid]; + int block_size = mat_in.block_dim * mat_in.block_dim; + for (int nz = mat_in.rows[tid]; nz < mat_in.rows[tid + 1]; ++nz) + { + int j = mat_in.cols[nz]; + // Assume input mat is symmetric (Which is required by CG anyway). Skip lower. + if (tid > j) + { + continue; + } + + int padded_j = real_to_padded[j]; + int block_offset = nz * block_size; + // Padded space matrices. + if (padded_i / BANK_SIZE == padded_j / BANK_SIZE) + { + int mat_id = padded_i / BANK_SIZE; + int scalar_i_root = (padded_i % BANK_SIZE) * mat_in.block_dim; + int scalar_j_root = (padded_j % BANK_SIZE) * mat_in.block_dim; + double *mat = mat_out.matrix_per_level[0].data() + mat_id * mat_out.mat_storage_size; + for (int bi = 0; bi < mat_in.block_dim; ++bi) + { + for (int bj = 0; bj < mat_in.block_dim; ++bj) + { + int row = scalar_i_root + bi; + int col = scalar_j_root + bj; + // Write upper part of coarse matrix only. + if (row > col) + { + continue; + } + // For diagonal block (tid == j), write upper part only. + if (tid == j && bi > bj) + { + continue; + } + + double val = mat_in.vals[block_offset + bi * mat_in.block_dim + bj]; + atomicAdd(mat + index_upper_mat(mat_out.mat_dim, row, col), val); + } + } + } + // Coarse space matrices. + for (int lv = 0; lv < coarse_level_num; ++lv) + { + int cco_i = coarse_space_map[tid * MAS_MAX_COARSE_LEVEL + lv]; + int cco_j = coarse_space_map[j * MAS_MAX_COARSE_LEVEL + lv]; + if (cco_i / BANK_SIZE != cco_j / BANK_SIZE) + { + continue; + } + + int mat_id = cco_i / BANK_SIZE; + int scalar_i_root = (cco_i % BANK_SIZE) * mat_in.block_dim; + int scalar_j_root = (cco_j % BANK_SIZE) * mat_in.block_dim; + double *mat = mat_out.matrix_per_level[lv + 1].data() + mat_id * mat_out.mat_storage_size; + for (int bi = 0; bi < mat_in.block_dim; ++bi) + { + for (int bj = 0; bj < mat_in.block_dim; ++bj) + { + int row = scalar_i_root + bi; + int col = scalar_j_root + bj; + // Write upper part of coarse matrix only. + if (row > col) + { + continue; + } + // For diagonal block (tid == j), write upper part only. + if (tid == j && bi > bj) + { + continue; + } + + double val = mat_in.vals[block_offset + bi * mat_in.block_dim + bj]; + // When two different fine nodes collapse to the same coarse node, + // the coarse diagonal block must receive both A_ij and A_ji = A_ij^T. + if (tid != j && cco_i == cco_j) + { + val += mat_in.vals[block_offset + bj * mat_in.block_dim + bi]; + } + atomicAdd(mat + index_upper_mat(mat_out.mat_dim, row, col), val); + } + } + } + } + } + + __global__ void gather_multi_level_r( + ctd::span r, + ctd::span multi_level_r, + ctd::span real_to_padded, + ctd::span coarse_space_map, + ctd::array level_offsets, + int block_dim, + int coarse_level_num) + { + int real_id = blockDim.x * blockIdx.x + threadIdx.x; + if (real_id >= real_to_padded.size()) + { + return; + } + + int padded_id = real_to_padded[real_id]; + int src_root = real_id * block_dim; + int fine_root = padded_id * block_dim; + // Gather padded space. 
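+ // The padded level is a pure scatter (one fine node per padded slot); the + // coarse levels below accumulate instead, since several fine nodes map to one + // CCO: r_coarse[c] = sum of r over all fine nodes in c, hence the atomicAdd.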
+ for (int i = 0; i < block_dim; ++i) + { + multi_level_r[fine_root + i] = r[src_root + i]; + } + // Gather coarse space. + for (int lv = 0; lv < coarse_level_num; ++lv) + { + int cco_id = coarse_space_map[real_id * MAS_MAX_COARSE_LEVEL + lv]; + int dst_root = (level_offsets[lv + 1] + cco_id) * block_dim; + for (int i = 0; i < block_dim; ++i) + { + atomicAdd(multi_level_r.data() + dst_root + i, r[src_root + i]); + } + } + } + + /// @brief Matrix vector multiplication. A is a packed upper row-major matrix. + /// + /// This is the main bottleneck of the MAS preconditioner, but since we are VRAM + /// bandwidth bound there is little room to optimize: the compute-to-memory ratio + /// is simply too low. + template <int N, int BLOCK> + __global__ void symv_upper_packed(const double *A_upper, + const double *x, + double *y) + { + int mat_id = blockIdx.x; + int row = threadIdx.x; + constexpr int L = N * (N + 1) / 2; + const double *Amat = A_upper + mat_id * L; + + // All threads in the block load A and x into shared mem. + + __shared__ double sx[N]; + __shared__ double sA[L]; + + for (int i = threadIdx.x; i < N; i += BLOCK) + { + sx[i] = x[mat_id * N + i]; + } + for (int k = threadIdx.x; k < L; k += BLOCK) + { + sA[k] = Amat[k]; + } + + __syncthreads(); + + // Each thread does one row * x. + + if (row >= N) + { + return; + } + + double sum = 0.0; + for (int col = 0; col < N; ++col) + { + sum += sA[index_upper_mat(N, row, col)] * sx[col]; + } + + y[mat_id * N + row] = sum; + } + + void apply_inverse(const CoarseMatrices &mats, + ctd::span<const double> x, + ctd::span<double> y, + int block_dim, + CudaRuntime rt) + { + int mat_num = mats.total_matrix_num; + if (mat_num == 0) + { + return; + } + + if (block_dim == 1) + { + symv_upper_packed<32, 32><<<mat_num, 32, 0, rt.stream.get()>>>( + mats.data->data(), + x.data(), + y.data()); + return; + } + if (block_dim == 2) + { + symv_upper_packed<64, 64><<<mat_num, 64, 0, rt.stream.get()>>>( + mats.data->data(), + x.data(), + y.data()); + return; + } + if (block_dim == 3) + { + symv_upper_packed<96, 96><<<mat_num, 96, 0, rt.stream.get()>>>( + mats.data->data(), + x.data(), + y.data()); + return; + } + + throw std::runtime_error("[CudaPCG] MAS only supports block size 1, 2, or 3."); + } + + __global__ void gather_multi_level_z( + ctd::span<const double> multi_level_z, + ctd::span<double> z, + ctd::span<const int> real_to_padded, + ctd::span<const int> coarse_space_map, + ctd::array<int, MAS_LEVEL_COUNT> level_offsets, + int block_dim, + int coarse_level_num) + { + int real_id = blockDim.x * blockIdx.x + threadIdx.x; + if (real_id >= real_to_padded.size()) + { + return; + } + + // Padded space. + int padded_id = real_to_padded[real_id]; + int dst_root = real_id * block_dim; + for (int i = 0; i < block_dim; ++i) + { + z[dst_root + i] = multi_level_z[padded_id * block_dim + i]; + } + // Coarse space. + for (int lv = 0; lv < coarse_level_num; ++lv) + { + int cco_id = coarse_space_map[real_id * MAS_MAX_COARSE_LEVEL + lv]; + int src_root = (level_offsets[lv + 1] + cco_id) * block_dim; + for (int i = 0; i < block_dim; ++i) + { + z[dst_root + i] += multi_level_z[src_root + i]; + } + } + } + + __global__ void pad_zero_diagonal(double *mats, int mat_num, int mat_dim, int mat_storage_size) + { + int tid = blockDim.x * blockIdx.x + threadIdx.x; + int total_diag = mat_num * mat_dim; + if (tid >= total_diag) + { + return; + } + + int mat_id = tid / mat_dim; + int row = tid % mat_dim; + double *mat = mats + mat_id * mat_storage_size; + double *diag = mat + index_upper_mat(mat_dim, row, row); + if (*diag == 0.0) + { + *diag = 1.0; + } + } + + /// @brief Invert a packed symmetric matrix in-place using symmetric Gauss-Jordan sweeps.
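+ /// + /// Roughly, one sweep per pivot p: a_ij <- a_ij - a_ip * a_pj / a_pp for + /// i, j != p, the pivot column is rescaled by 1/a_pp, the pivot itself becomes + /// -1/a_pp, and a final global negation yields the inverse; only the packed + /// upper triangle is stored, symmetry supplies the rest. Any zero or + /// non-finite pivot flags *success = false.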
+ template + __global__ void batched_invert_upper(double *d_matrices, + bool *success) + { + int mat_idx = blockIdx.x; + constexpr int STORAGE = N * (N + 1) / 2; + double *d_A = d_matrices + mat_idx * STORAGE; + + __shared__ double s_A[STORAGE]; + __shared__ double s_col[N]; + __shared__ double s_pivot; + int tx = threadIdx.x; + + for (int i = tx; i < STORAGE; i += N) + { + s_A[i] = d_A[i]; + } + __syncthreads(); + + for (int pivot = 0; pivot < N; ++pivot) + { + if (tx == 0) + { + s_pivot = s_A[index_upper_mat(N, pivot, pivot)]; + if (!ctd::isfinite(s_pivot) || s_pivot == 0.0) + { + *success = false; + } + } + __syncthreads(); + + if (tx < N) + { + if (tx == pivot) + { + s_col[tx] = 0.0; + } + else + { + int row = ctd::min(tx, pivot); + int col = ctd::max(tx, pivot); + s_col[tx] = s_A[index_upper_mat(N, row, col)]; + } + } + __syncthreads(); + + if (tx < N && tx != pivot) + { + double a_ik = s_col[tx]; + for (int col = tx; col < N; ++col) + { + if (col == pivot) + { + continue; + } + + double updated = + s_A[index_upper_mat(N, tx, col)] - a_ik * s_col[col] / s_pivot; + if (!ctd::isfinite(updated)) + { + *success = false; + } + s_A[index_upper_mat(N, tx, col)] = updated; + } + } + __syncthreads(); + + if (tx == pivot) + { + double updated = -1.0 / s_pivot; + if (!ctd::isfinite(updated)) + { + *success = false; + } + else + { + s_A[index_upper_mat(N, pivot, pivot)] = updated; + } + } + else if (tx < N) + { + double updated = s_col[tx] / s_pivot; + if (!ctd::isfinite(updated)) + { + *success = false; + } + else + { + int row = ctd::min(tx, pivot); + int col = ctd::max(tx, pivot); + s_A[index_upper_mat(N, row, col)] = updated; + } + } + __syncthreads(); + } + + for (int i = tx; i < STORAGE; i += N) + { + double output = -s_A[i]; + if (!ctd::isfinite(output)) + { + *success = false; + } + d_A[i] = output; + } + } + + void invert_packed_matrices(double *mats, int mat_num, int block_dim, CudaRuntime rt) + { + if (mat_num == 0) + { + return; + } + + auto success = cu::make_buffer(rt.stream, rt.mr, 1, true); + + if (block_dim == 1) + { + batched_invert_upper<32><<>>( + mats, success.data()); + } + else if (block_dim == 2) + { + batched_invert_upper<64><<>>( + mats, success.data()); + } + else if (block_dim == 3) + { + batched_invert_upper<96><<>>( + mats, success.data()); + } + else + { + assert(false); + } + + bool host_success = device2host(success.data(), rt); + if (!host_success) + { + throw std::runtime_error("[CudaPCG] MAS packed inverse failed."); + } + } + + /// @brief Build symmetric upper triangular coarse space matrices. + CoarseMatrices build_sym_coarse_matrices(const CoarseSpace &cs, + BSRView mat, + ctd::span real_to_padded, + int padded_node_num, + CudaRuntime rt) + { + CoarseMatrices out; + out.mat_dim = BANK_SIZE * mat.block_dim; + out.mat_storage_size = out.mat_dim * (out.mat_dim + 1) / 2; + out.total_matrix_num = 0; + out.matrix_offsets[0] = 0; + out.matrix_counts[0] = padded_node_num / BANK_SIZE; + out.total_matrix_num += out.matrix_counts[0]; + for (int i = 0; i < cs.level_num; ++i) + { + out.matrix_offsets[i + 1] = out.total_matrix_num; + out.matrix_counts[i + 1] = div_round_up(cs.cco_nums[i], BANK_SIZE); + out.total_matrix_num += out.matrix_counts[i + 1]; + } + + out.data = cu::make_buffer( + rt.stream, + rt.mr, + out.total_matrix_num * out.mat_storage_size, + 0.0); + CoarseMatricesRef view{out}; + + // Gather. + int grid_num = div_round_up(mat.dim, 128); + fill_coarse_matrices<<>>( + mat, view, *cs.map, real_to_padded, cs.level_num); + + // Pad diagonal. 
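+ // Virtual padding slots produce all-zero rows/columns in the packed matrices; + // a unit diagonal keeps the Gauss-Jordan inverse well defined, and since the + // padded residual entries are zero those identity rows contribute nothing.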
+ int total_diag = out.total_matrix_num * out.mat_dim; + pad_zero_diagonal<<>>( + out.data->data(), out.total_matrix_num, out.mat_dim, out.mat_storage_size); + + // Compute inverse. + invert_packed_matrices(out.data->data(), out.total_matrix_num, mat.block_dim, rt); + return out; + } + } // namespace + + void MASPreconditioner::factorize(const BSRMatrix &A, + ctd::span part_offsets, + CudaRuntime rt) + { + assert(part_offsets.size() >= 2); + + BSRView view = A.view(); + assert(view.block_dim >= 1 && view.block_dim <= 3); + assert(part_offsets.back() == view.dim); + for (int i = 0; i + 1 < static_cast(part_offsets.size()); ++i) + { + assert(part_offsets[i + 1] - part_offsets[i] <= BANK_SIZE); + } + + initialized_ = false; + block_dim_ = view.block_dim; + vector_size_ = view.dim * view.block_dim; + + auto total_begin = clock::now(); + auto phase_begin = clock::now(); + + // Build padded topology. + int node_num = view.dim; + int part_num = part_offsets.size() - 1; + int padded_node_num = part_num * BANK_SIZE; + + auto d_part_offsets = cu::make_buffer(rt.stream, rt.mr, part_offsets.size(), cu::no_init); + cu::copy_bytes(rt.stream, part_offsets, d_part_offsets); + + padded_topology_.node_num = node_num; + padded_topology_.padded_node_num = padded_node_num; + padded_topology_.real_to_padded = cu::make_buffer(rt.stream, rt.mr, node_num, -1); + padded_topology_.padded_to_real = cu::make_buffer(rt.stream, rt.mr, padded_node_num, -1); + padded_topology_.rows = cu::make_buffer(rt.stream, rt.mr, padded_node_num + 1, cu::no_init); + padded_topology_.cols = cu::make_buffer(rt.stream, rt.mr, view.non_zeros, cu::no_init); + + auto read_num_per_row = cu::make_buffer(rt.stream, rt.mr, padded_node_num, 0); + build_padded_maps<<>>( + d_part_offsets, + view.rows, + *(padded_topology_.real_to_padded), + *(padded_topology_.padded_to_real), + read_num_per_row); + + // Build padded space row_ptr. + size_t cub_tmp_size = 0; + cub::DeviceScan::ExclusiveSum(nullptr, + cub_tmp_size, + read_num_per_row.data(), + padded_topology_.rows->data(), + padded_node_num, + rt.stream.get()); + auto cub_tmp = cu::make_buffer(rt.stream, rt.mr, cub_tmp_size, cu::no_init); + cub::DeviceScan::ExclusiveSum(cub_tmp.data(), + cub_tmp_size, + read_num_per_row.data(), + padded_topology_.rows->data(), + padded_node_num, + rt.stream.get()); + host2device(padded_topology_.rows->data() + padded_node_num, view.non_zeros, rt); + + fill_padded_cols<<>>( + view.rows, + view.cols, + *(padded_topology_.real_to_padded), + *(padded_topology_.rows), + *(padded_topology_.cols)); + + rt.stream.sync(); + SPDLOG_TRACE("CUDA_PCG MAS: build_padded_topology_device {:.6f}s", elapsed_seconds(phase_begin)); + + // Build coarse space hierarchy. + phase_begin = clock::now(); + coarse_space_ = build_coarse_space( + *(padded_topology_.rows), + *(padded_topology_.cols), + *(padded_topology_.real_to_padded), + *(padded_topology_.padded_to_real), + rt); + rt.stream.sync(); + SPDLOG_TRACE("CUDA_PCG MAS: build_coarse_space {:.6f}s", elapsed_seconds(phase_begin)); + + // Build coarse space matrices by 1. gather coarse space hessian from fine nodes 2. invert + phase_begin = clock::now(); + coarse_matrices_ = build_sym_coarse_matrices( + coarse_space_, + view, + *(padded_topology_.real_to_padded), + padded_topology_.padded_node_num, + rt); + rt.stream.sync(); + SPDLOG_TRACE("CUDA_PCG MAS: build_coarse_matrices {:.6f}s", elapsed_seconds(phase_begin)); + + // Allocate memory for coarse space residual (r) and preconditioned residual (z). 
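+ // Layout sketch: multi_level_r / multi_level_z hold all levels back to back, + // | padded lv0 | coarse lv1 | coarse lv2 | ... |, with level_offsets[i] the + // node offset of level i and total_level_nodes * block_dim scalars in total.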
+ phase_begin = clock::now(); + coarse_vectors_.level_offsets[0] = 0; + coarse_vectors_.level_sizes[0] = padded_topology_.padded_node_num; + coarse_vectors_.total_level_nodes = padded_topology_.padded_node_num; + for (int i = 0; i < coarse_space_.level_num; ++i) + { + coarse_vectors_.level_offsets[i + 1] = coarse_vectors_.total_level_nodes; + coarse_vectors_.level_sizes[i + 1] = coarse_matrices_.matrix_counts[i + 1] * BANK_SIZE; + coarse_vectors_.total_level_nodes += coarse_vectors_.level_sizes[i + 1]; + } + int total_level_scalars = coarse_vectors_.total_level_nodes * block_dim_; + coarse_vectors_.multi_level_r = + cu::make_buffer(rt.stream, rt.mr, total_level_scalars, 0.0); + coarse_vectors_.multi_level_z = + cu::make_buffer(rt.stream, rt.mr, total_level_scalars, 0.0); + rt.stream.sync(); + SPDLOG_TRACE("CUDA_PCG MAS: allocate_coarse_vectors {:.6f}s", elapsed_seconds(phase_begin)); + SPDLOG_TRACE("CUDA_PCG MAS: factorize_total {:.6f}s", elapsed_seconds(total_begin)); + + initialized_ = true; + } + + void MASPreconditioner::apply( + ctd::span r, + ctd::span z, + CudaRuntime rt) + { + if (!initialized_) + { + throw std::runtime_error("[CudaPCG] MASPreconditioner is not initialized."); + } + if (r.size() != z.size() || r.size() != vector_size_) + { + throw std::runtime_error("[CudaPCG] Invalid vector size for MAS preconditioner."); + } + + cu::fill_bytes(rt.stream, *(coarse_vectors_.multi_level_r), 0); + cu::fill_bytes(rt.stream, *(coarse_vectors_.multi_level_z), 0); + + int grid_num = div_round_up(padded_topology_.node_num, 128); + gather_multi_level_r<<>>( + r, + *(coarse_vectors_.multi_level_r), + *(padded_topology_.real_to_padded), + *(coarse_space_.map), + coarse_vectors_.level_offsets, + block_dim_, + coarse_space_.level_num); + + apply_inverse( + coarse_matrices_, + *(coarse_vectors_.multi_level_r), + *(coarse_vectors_.multi_level_z), + block_dim_, + rt); + + gather_multi_level_z<<>>( + *(coarse_vectors_.multi_level_z), + z, + *(padded_topology_.real_to_padded), + *(coarse_space_.map), + coarse_vectors_.level_offsets, + block_dim_, + coarse_space_.level_num); + } + +} // namespace polysolve::linear::mas diff --git a/src/polysolve/linear/mas_utils/MASPreconditioner.hpp b/src/polysolve/linear/mas_utils/MASPreconditioner.hpp new file mode 100644 index 00000000..cde2b77b --- /dev/null +++ b/src/polysolve/linear/mas_utils/MASPreconditioner.hpp @@ -0,0 +1,96 @@ +#pragma once + +#include +#include + +#include + +namespace polysolve::linear::mas +{ + // Our bank size is 32, so 4 coarse levels should cover 1M x 1M BSR matrix. + constexpr int MAS_MAX_COARSE_LEVEL = 4; + constexpr int MAS_LEVEL_COUNT = MAS_MAX_COARSE_LEVEL + 1; + + // Topology enhanced MAS has 2 levels of mapping: + // Real -> Padded: virtual nodes are inserted so each partition resides in one BANK. + // see fig.6 in arXiv:2411.06224. + // Padded -> Coarse Space: map fine nodes to coarse space CCO (Connected components). + + /// @brief Real <-> Padded space mapping. + struct PaddedTopology + { + Buf real_to_padded; ///< Real id to padded id. + Buf padded_to_real; ///< Padded id to real id. -1 = virtual padding. + Buf rows; ///< CSR row ptr. Virtual nodes are not included. + Buf cols; ///< CSR cols. Virtual nodes are not included. + int node_num = 0; ///< Real node num. + int padded_node_num = 0; ///< Padded space node num. + }; + + struct CoarseSpace + { + /// Coarse space map (real id -> coarse space id) layout: + /// | node0_lv0 node0_lv1 ... node0_lv_max | node1_lv0 ... node1_lv_max | ... 
| + Buf<int> map; + /// CCO numbers per coarse space level. + ctd::array<int, MAS_MAX_COARSE_LEVEL> cco_nums{}; + /// Total coarse space levels. Sometimes we terminate early because there are no more CCOs to merge. + int level_num = 0; + }; + + struct CoarseMatrices + { + /// Packed upper triangular row major coarse space matrices (inverse hessian). + /// | lv0_mat0 lv0_mat1 ... | lv1_mat0 ... | ... | + Buf<double> data; + /// Per coarse space level offset and counts. + ctd::array<int, MAS_LEVEL_COUNT> matrix_offsets{}; + ctd::array<int, MAS_LEVEL_COUNT> matrix_counts{}; + /// We support block dims 1, 2, 3, so the coarse space mat dim is 32, 64, or 96. + int mat_dim = 0; + int mat_storage_size = 0; + int total_matrix_num = 0; + }; + + struct CoarseVectors + { + /// | lv0 r | lv1 r | ... | + Buf<double> multi_level_r; + /// | lv0 z | lv1 z | ... | + Buf<double> multi_level_z; + /// Per coarse space level offset and counts. + ctd::array<int, MAS_LEVEL_COUNT> level_offsets{}; + ctd::array<int, MAS_LEVEL_COUNT> level_sizes{}; + int total_level_nodes = 0; + }; + + class MASPreconditioner + { + public: + bool empty() const + { + return !initialized_; + } + + /// @brief Initialize MAS preconditioner. Build coarse space, compute inverse, allocate buffers... + /// @param A Input matrix sorted by graph partition. + /// @param part_offsets Graph partition offsets, CSR row-ptr style. + /// @param rt Cuda runtime. + void factorize(const BSRMatrix &A, ctd::span<const int> part_offsets, CudaRuntime rt); + + /// @brief Apply MAS preconditioner. + /// @param r Residual. + /// @param z Preconditioned residual. + /// @param rt Cuda runtime. + void apply(ctd::span<const double> r, ctd::span<double> z, CudaRuntime rt); + + private: + bool initialized_ = false; + int vector_size_ = 0; + int block_dim_ = 0; + PaddedTopology padded_topology_; + CoarseSpace coarse_space_; + CoarseMatrices coarse_matrices_; + CoarseVectors coarse_vectors_; + }; +} // namespace polysolve::linear::mas diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 8f7b56ca..415294e1 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -30,6 +30,10 @@ if(POLYSOLVE_WITH_AMGCL) target_compile_definitions(unit_tests PRIVATE -DPOLYSOLVE_WITH_AMGCL) endif() +if (POLYSOLVE_WITH_CUDA) + target_compile_definitions(unit_tests PUBLIC POLYSOLVE_WITH_CUDA) +endif() + include(polyfem-data) target_link_libraries(unit_tests PRIVATE polyfem::data) diff --git a/tests/test_linear_solver.cpp b/tests/test_linear_solver.cpp index 0e157dea..37d3f186 100644 --- a/tests/test_linear_solver.cpp +++ b/tests/test_linear_solver.cpp @@ -127,6 +127,13 @@ TEST_CASE("all", "[solver]") json params; params[s]["tolerance"] = 1e-10; solver->set_parameters(params); + if (s == "CUDA_PCG") + { + params[s]["relative_tolerance"] = 0.0; + params[s]["absolute_tolerance"] = 1e-8; + params[s]["use_preconditioned_residual_norm"] = false; + solver->set_parameters(params); + } Eigen::VectorXd b(A.rows()); b.setRandom(); Eigen::VectorXd x(b.size()); @@ -194,6 +201,41 @@ TEST_CASE("eigen_params", "[solver]") } } +#ifdef POLYSOLVE_WITH_CUDA +// Test block dims 1, 2, 3 +TEST_CASE("cuda_pcg_block_dims", "[solver]") +{ + const std::string path = POLYFEM_DATA_DIR; + Eigen::SparseMatrix<double> A; + const bool ok = loadMarket(A, path + "/A_2.mat"); + REQUIRE(ok); + + for (int block_dim : {1, 2, 3}) + { + auto solver = Solver::create("CUDA_PCG", ""); + json params; + params["CUDA_PCG"]["block_dim"] = block_dim; + params["CUDA_PCG"]["relative_tolerance"] = 0.0; + params["CUDA_PCG"]["absolute_tolerance"] = 1e-8; + params["CUDA_PCG"]["use_preconditioned_residual_norm"] = false; + solver->set_parameters(params); + + Eigen::VectorXd b(A.rows()); + b.setRandom(); + Eigen::VectorXd
x(b.size()); + x.setZero(); + + solver->analyze_pattern(A, A.rows()); + solver->factorize(A); + solver->solve(b, x); + + const double err = (A * x - b).norm(); + INFO("block_dim: " + std::to_string(block_dim)); + REQUIRE(err < 1e-8); + } +} +#endif + TEST_CASE("pre_factor", "[solver]") { const std::string path = POLYFEM_DATA_DIR;