diff --git a/CMakeLists.txt b/CMakeLists.txt
index e1729bd2..709e6dd0 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,7 +7,7 @@ else()
 endif()
 
 # Check required CMake version
-set(REQUIRED_CMAKE_VERSION "3.18.0")
+set(REQUIRED_CMAKE_VERSION "3.24.0")
 if(POLYSOLVE_TOPLEVEL_PROJECT)
     cmake_minimum_required(VERSION ${REQUIRED_CMAKE_VERSION})
     SET(CMAKE_POLICY_VERSION_MINIMUM ${REQUIRED_CMAKE_VERSION})
@@ -75,6 +75,7 @@ endif()
 # Polysolve options
 option(POLYSOLVE_WITH_SANITIZERS "Enable sanitizers in compilation targets" OFF)
 option(POLYSOLVE_WITH_UNICODE "Use Unicode in logging messages" ON)
+option(POLYSOLVE_WITH_CUDA "Enable CUDA support" OFF)
 
 # Polysolve options for enabling/disabling optional libraries
 option(POLYSOLVE_WITH_ACCELERATE "Enable Apple Accelerate" ${POLYSOLVE_ON_APPLE_SILICON})
@@ -148,20 +149,79 @@ if(POLYSOLVE_WITH_MKL)
     set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDLL" CACHE STRING "Select the MSVC runtime library")
 endif()
 
+# Add empty library targets. Set them up later.
+add_library(polysolve)
+add_library(polysolve::polysolve ALIAS polysolve)
+add_library(polysolve_linear)
+add_library(polysolve::linear ALIAS polysolve_linear)
+
+################################################################################
+# CUDA
+################################################################################
+
+if (POLYSOLVE_WITH_CUDA)
+    # If CMAKE_CUDA_ARCHITECTURES was not specified, set it to native.
+    if(DEFINED CMAKE_CUDA_ARCHITECTURES)
+        message(STATUS "CMAKE_CUDA_ARCHITECTURES was specified, skipping auto-detection")
+    else()
+        message(STATUS "CMAKE_CUDA_ARCHITECTURES was not specified, setting it to native")
+        set(CMAKE_CUDA_ARCHITECTURES "native")
+    endif()
+    message(STATUS "Targeting CUDA_ARCHITECTURES \"${CMAKE_CUDA_ARCHITECTURES}\"")
+
+    # Enable CUDA support
+    enable_language(CUDA)
+    set(CMAKE_CUDA_STANDARD 17)
+    set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+
+    set_target_properties(polysolve PROPERTIES
+        CUDA_SEPARABLE_COMPILATION ON
+        CUDA_RESOLVE_DEVICE_SYMBOLS ON
+    )
+    set_target_properties(polysolve_linear PROPERTIES
+        CUDA_SEPARABLE_COMPILATION ON
+        CUDA_RESOLVE_DEVICE_SYMBOLS ON
+    )
+
+    # Enable __device__ lambda
+    target_compile_options(polysolve PRIVATE
+        $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>
+    )
+    target_compile_options(polysolve_linear PRIVATE
+        $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>
+    )
+    # Relax device constexpr usage
+    target_compile_options(polysolve PRIVATE
+        $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
+    )
+    target_compile_options(polysolve_linear PRIVATE
+        $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
+    )
+
+    if(APPLE)
+        # We need to add the path to the driver (libcuda.dylib) as an rpath,
+        # so that the static cuda runtime can find it at runtime.
+        set_property(TARGET polysolve
+            PROPERTY
+            BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
+    endif()
+endif()
+
 ################################################################################
 # PolySolve Library
 ################################################################################
 
-# Add an empty library and fill in the list of sources in `src/CMakeLists.txt`.
-add_library(polysolve_linear)
-add_library(polysolve::linear ALIAS polysolve_linear)
 add_subdirectory(src/polysolve)
 add_subdirectory(src/polysolve/linear)
 
-add_library(polysolve)
-add_library(polysolve::polysolve ALIAS polysolve)
 add_subdirectory(src/polysolve/nonlinear)
 
+# Ensure that changes in the CUDA linear library
+# trigger relinking of the aggregate polysolve
+# static library (and its CUDA device-link step).
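+# (LINK_DEPENDS adds a file-level dependency to the link step, so a rebuilt
+# polysolve_linear archive forces polysolve to relink rather than reuse a
+# stale device link.)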
+set_property(TARGET polysolve APPEND PROPERTY LINK_DEPENDS
+    "$<TARGET_FILE:polysolve_linear>")
+
 target_link_libraries(polysolve_linear PUBLIC polysolve_coverage_config)
 target_link_libraries(polysolve PUBLIC polysolve_coverage_config)
 
@@ -180,6 +240,11 @@ if(POLYSOLVE_LARGE_INDEX)
     target_compile_definitions(polysolve_linear PUBLIC POLYSOLVE_LARGE_INDEX)
 endif()
 
+if(POLYSOLVE_WITH_CUDA)
+    target_compile_definitions(polysolve PUBLIC POLYSOLVE_WITH_CUDA)
+    target_compile_definitions(polysolve_linear PUBLIC POLYSOLVE_WITH_CUDA)
+endif()
+
 target_compile_definitions(polysolve_linear PRIVATE POLYSOLVE_LINEAR_SPEC="${PROJECT_SOURCE_DIR}/linear-solver-spec.json")
 target_compile_definitions(polysolve PRIVATE POLYSOLVE_NON_LINEAR_SPEC="${PROJECT_SOURCE_DIR}/nonlinear-solver-spec.json")
 target_compile_definitions(polysolve_linear PUBLIC POLYSOLVE_JSON_SPEC_DIR="${PROJECT_SOURCE_DIR}")
@@ -189,6 +254,13 @@ target_compile_definitions(polysolve_linear PUBLIC POLYSOLVE_JSON_SPEC_DIR="${PR
 # Dependencies
 ################################################################################
 
+# CUDA runtime and cuSparse
+if (POLYSOLVE_WITH_CUDA)
+    find_package(CUDAToolkit REQUIRED)
+    target_link_libraries(polysolve PUBLIC CUDA::cudart CUDA::cusparse)
+    target_link_libraries(polysolve_linear PUBLIC CUDA::cudart CUDA::cusparse)
+endif()
+
 # ------
 # Linear
 # ------
@@ -229,8 +301,8 @@ if(POLYSOLVE_WITH_HYPRE)
     include(hypre)
     target_link_libraries(polysolve_linear PUBLIC HYPRE::HYPRE)
     target_compile_definitions(polysolve_linear PUBLIC POLYSOLVE_WITH_HYPRE)
-    if(HYPRE_WITH_MPI)
-        target_compile_definitions(polysolve_linear PUBLIC HYPRE_WITH_MPI)
+    if(HYPRE_ENABLE_MPI)
+        target_compile_definitions(polysolve_linear PUBLIC HYPRE_ENABLE_MPI)
     endif()
 endif()
 
@@ -304,6 +376,10 @@ endif()
 
 # cuSolver solvers
 if(POLYSOLVE_WITH_CUSOLVER)
+    if (NOT POLYSOLVE_WITH_CUDA)
+        message(FATAL_ERROR "POLYSOLVE_WITH_CUDA must be ON when POLYSOLVE_WITH_CUSOLVER is enabled")
+    endif()
+
     include(cusolverdn)
     if(TARGET CUDA::cusolver)
         target_link_libraries(polysolve_linear PUBLIC CUDA::cusolver)
@@ -331,6 +407,21 @@ if(POLYSOLVE_WITH_UNICODE)
     endif()
 endif()
 
+# CCCL
+if(POLYSOLVE_WITH_CUDA)
+    include(cccl)
+    target_link_libraries(polysolve_linear PUBLIC CCCL::CCCL)
+    target_link_libraries(polysolve PUBLIC CCCL::CCCL)
+endif()
+
+# KaMinPar (requires oneTBB)
+if(POLYSOLVE_WITH_CUDA)
+    include(kaminpar)
+    target_link_libraries(polysolve_linear PRIVATE KaMinPar::KaMinPar)
+endif()
+
+
 # ---------
 # Nonlinear
 # ---------
diff --git a/cmake/polysolve/polysolve_warnings.cmake b/cmake/polysolve/polysolve_warnings.cmake
index 20b16a61..cbad7bfa 100644
--- a/cmake/polysolve/polysolve_warnings.cmake
+++ b/cmake/polysolve/polysolve_warnings.cmake
@@ -10,7 +10,6 @@ endif()
 set(POLYSOLVE_WARNING_FLAGS
     -Wall
     -Wextra
-    -pedantic
 
     # -Wconversion
     #-Wunsafe-loop-optimizations # broken with C++11 loops
diff --git a/cmake/recipes/amgcl.cmake b/cmake/recipes/amgcl.cmake
index 300f7479..53849d5f 100644
--- a/cmake/recipes/amgcl.cmake
+++ b/cmake/recipes/amgcl.cmake
@@ -48,6 +48,18 @@ function(amgcl_import_target)
     )
 
     target_link_libraries(amgcl INTERFACE Boost::boost)
+
+    # AMGCL pollutes all downstream targets with its aggressive diagnostic flags,
+    # which makes CUDA code emit warnings on nearly every line, because nvcc
+    # generates gcc-specific intrinsics. Use a hacky regex replacement to purge
+    # -Wpedantic, -Wall, and -Wextra.
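+    # (The flags live on the imported target's INTERFACE_COMPILE_OPTIONS, so we
+    # read that property, strip the offending flags, and write it back below.)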
+    get_target_property(AMGCL_OPTIONS amgcl INTERFACE_COMPILE_OPTIONS)
+    if(AMGCL_OPTIONS)
+        string(REGEX REPLACE ";?-Wpedantic" "" AMGCL_OPTIONS_CLEAN "${AMGCL_OPTIONS}")
+        string(REGEX REPLACE ";?-Wextra" "" AMGCL_OPTIONS_CLEAN "${AMGCL_OPTIONS_CLEAN}")
+        string(REGEX REPLACE ";?-Wall" "" AMGCL_OPTIONS_CLEAN "${AMGCL_OPTIONS_CLEAN}")
+        set_target_properties(amgcl PROPERTIES INTERFACE_COMPILE_OPTIONS "${AMGCL_OPTIONS_CLEAN}")
+    endif()
+
 endfunction()
 
 amgcl_import_target()
diff --git a/cmake/recipes/cccl.cmake b/cmake/recipes/cccl.cmake
new file mode 100644
index 00000000..d335dacb
--- /dev/null
+++ b/cmake/recipes/cccl.cmake
@@ -0,0 +1,15 @@
+# cccl (https://github.com/NVIDIA/cccl)
+# License: Apache
+
+if(TARGET CCCL::CCCL)
+    return()
+endif()
+
+message(STATUS "Third-party: creating target 'CCCL::CCCL'")
+
+include(CPM)
+CPMAddPackage(
+    NAME CCCL
+    GITHUB_REPOSITORY NVIDIA/cccl
+    VERSION 3.3.0
+)
diff --git a/cmake/recipes/cusolverdn.cmake b/cmake/recipes/cusolverdn.cmake
index 43657ac0..a2215fb8 100644
--- a/cmake/recipes/cusolverdn.cmake
+++ b/cmake/recipes/cusolverdn.cmake
@@ -6,55 +6,11 @@ endif()
 
 message(STATUS "Third-party: creating targets 'CUDA::cusolver'")
 
-include(CheckLanguage)
-check_language(CUDA)
-if(CMAKE_CUDA_COMPILER)
-    enable_language(CUDA)
-
-    # We do not have a build recipe for this, so find it as a system installed library.
-    find_package(CUDAToolkit)
-    if(CUDAToolkit_FOUND)
-        set(CUDA_SEPARABLE_COMPILATION ON)
-        set(CUDA_PROPAGATE_HOST_FLAGS OFF)
-        set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -use_fast_math --expt-relaxed-constexpr -gencode arch=compute_86,code=sm_86")
-
-        target_compile_options(polysolve PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:
-
-        --generate-line-info
-        --use_fast_math
-        --relocatable-device-code=true
-
-        --ptxas-options=-v
-        --maxrregcount=7
-        --compiler-options
-        -fPIC # https://stackoverflow.com/questions/5311515/gcc-fpic-option
-
-        >
-
-        )
-
-        target_link_libraries(polysolve PUBLIC CUDA::toolkit)
-        set_target_properties(polysolve PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-        # Nvidia RTX8000 -> compute_75
-        # Nvidia V100 -> compute_70
-        # Nvidia 1080/1080Ti -> compute_61
-        # Nvidia 3080Ti -> compute_86
-        if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-            set(CMAKE_CUDA_ARCHITECTURES 70 75 86)
-        endif()
-        set_target_properties(polysolve PROPERTIES CUDA_ARCHITECTURES "70;75;86")
-
-        if(APPLE)
-            # We need to add the path to the driver (libcuda.dylib) as an rpath,
-            # so that the static cuda runtime can find it at runtime.
-            set_property(TARGET polysolve
-                PROPERTY
-                BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
-        endif()
-
-    else()
-        message(WARNING "cuSOLVER not found, solver will not be available.")
-    endif()
+# Use the cuSolver bundled with the CUDA Toolkit.
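+# (CUDA::toolkit is an interface target provided by FindCUDAToolkit that carries
+# the toolkit's include and link directories.)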
+find_package(CUDAToolkit)
+if (CUDAToolkit_FOUND)
+    target_link_libraries(polysolve PUBLIC CUDA::toolkit)
+    target_link_libraries(polysolve_linear PUBLIC CUDA::toolkit)
 else()
-    message(WARNING "CUDA not found, cuSOLVER will not be available.")
-endif()
\ No newline at end of file
+    message(WARNING "cuSOLVER not found, solver will not be available.")
+endif()
diff --git a/cmake/recipes/hypre.cmake b/cmake/recipes/hypre.cmake
index 7cfd0c2b..e3b19a6a 100644
--- a/cmake/recipes/hypre.cmake
+++ b/cmake/recipes/hypre.cmake
@@ -6,18 +6,28 @@ endif()
 
 message(STATUS "Third-party: creating target 'HYPRE::HYPRE'")
 
-set(HYPRE_WITH_MPI OFF CACHE INTERNAL "" FORCE)
-set(HYPRE_PRINT_ERRORS ON CACHE INTERNAL "" FORCE)
-set(HYPRE_BIGINT ON CACHE INTERNAL "" FORCE)
-set(HYPRE_USING_FEI OFF CACHE INTERNAL "" FORCE)
-set(HYPRE_USING_OPENMP OFF CACHE INTERNAL "" FORCE)
-set(HYPRE_SHARED OFF CACHE INTERNAL "" FORCE)
+set(HYPRE_ENABLE_MPI OFF CACHE INTERNAL "" FORCE)
+set(HYPRE_ENABLE_PRINT_ERRORS ON CACHE INTERNAL "" FORCE)
+set(HYPRE_ENABLE_BIGINT ON CACHE INTERNAL "" FORCE)
+set(HYPRE_ENABLE_FEI OFF CACHE INTERNAL "" FORCE)
+set(HYPRE_ENABLE_OPENMP OFF CACHE INTERNAL "" FORCE)
+set(HYPRE_SHARED OFF CACHE INTERNAL "" FORCE)
+
+# HYPRE unconditionally defines an "uninstall" target, which collides with the
+# identically named target defined by other libraries, since CMake requires
+# unique target names. This is a hacky workaround until upstream is fixed.
+macro(add_custom_target _target_name)
+    if("${_target_name}" STREQUAL "uninstall" AND TARGET uninstall)
+        # skip: HYPRE's uninstall target conflicts with an existing one
+    else()
+        _add_custom_target(${_target_name} ${ARGN})
+    endif()
+endmacro()
 
 include(CPM)
 CPMAddPackage(
     NAME hypre
     GITHUB_REPOSITORY hypre-space/hypre
-    GIT_TAG v2.28.0
+    GIT_TAG v3.1.0
     SOURCE_SUBDIR src
 )
 
 file(REMOVE "${hypre_SOURCE_DIR}/src/utilities/version")
diff --git a/cmake/recipes/kaminpar.cmake b/cmake/recipes/kaminpar.cmake
new file mode 100644
index 00000000..124398e3
--- /dev/null
+++ b/cmake/recipes/kaminpar.cmake
@@ -0,0 +1,29 @@
+# KaMinPar (https://github.com/KaHIP/KaMinPar)
+# License: MIT
+
+if(TARGET KaMinPar::KaMinPar)
+    return()
+endif()
+
+message(STATUS "Third-party: creating target 'KaMinPar::KaMinPar'")
+
+include(onetbb)
+
+include(CPM)
+CPMAddPackage(
+    NAME KaMinPar
+    VERSION 3.7.3
+    GITHUB_REPOSITORY KaHIP/KaMinPar
+    GIT_TAG v3.7.3
+    OPTIONS
+        "KAMINPAR_BUILD_APPS OFF"
+        "KAMINPAR_BUILD_IO OFF"
+        "KAMINPAR_BUILD_WITH_SPARSEHASH OFF"
+        "KAMINPAR_BUILD_WITH_DEBUG_SYMBOLS OFF"
+        "KAMINPAR_64BIT_WEIGHTS ON"
+        "KAMINPAR_ENABLE_TIMERS OFF"
+        "INSTALL_KAMINPAR OFF"
+        # The hack below forces KaMinPar to use our vendored TBB.
+        "KAMINPAR_DOWNLOAD_TBB ON"
+        "TBB_FOUND TRUE"
+)
diff --git a/linear-solver-spec.json b/linear-solver-spec.json
index 7f181bb6..ab585aea 100644
--- a/linear-solver-spec.json
+++ b/linear-solver-spec.json
@@ -15,7 +15,8 @@
             "Eigen::MINRES",
             "Pardiso",
             "Hypre",
-            "AMGCL"
+            "AMGCL",
+            "CUDA_PCG"
         ],
         "doc": "Settings for the linear solver."
     },
@@ -47,7 +48,8 @@
             "Eigen::ConjugateGradient",
             "Eigen::BiCGSTAB",
             "Eigen::GMRES",
-            "Eigen::MINRES"
+            "Eigen::MINRES",
+            "CUDA_PCG"
         ]
     },
     {
@@ -449,5 +451,60 @@
         "default": 0,
         "type": "float",
         "doc": "Aggregation epsilon strong."
+    },
+    {
+        "pointer": "/CUDA_PCG",
+        "default": null,
+        "type": "object",
+        "optional": [
+            "block_dim",
+            "max_iter",
+            "relative_tolerance",
+            "absolute_tolerance",
+            "lazy_partitioning",
+            "use_preconditioned_residual_norm"
+        ],
+        "doc": "Settings for the CUDA PCG solver."
+    },
+    {
+        "pointer": "/CUDA_PCG/block_dim",
+        "default": 1,
+        "type": "int",
+        "options": [
+            1,
+            2,
+            3
+        ],
+        "doc": "Block size of the BSR matrix the PCG solver uses internally. Choose 3 for 3D problems, 2 for 2D, etc."
+    },
+    {
+        "pointer": "/CUDA_PCG/max_iter",
+        "default": 10000,
+        "type": "int",
+        "doc": "Maximum number of iterations."
+    },
+    {
+        "pointer": "/CUDA_PCG/relative_tolerance",
+        "default": 0.0,
+        "type": "float",
+        "doc": "Relative convergence tolerance."
+    },
+    {
+        "pointer": "/CUDA_PCG/absolute_tolerance",
+        "default": 1e-8,
+        "type": "float",
+        "doc": "Absolute convergence tolerance."
+    },
+    {
+        "pointer": "/CUDA_PCG/lazy_partitioning",
+        "default": true,
+        "type": "bool",
+        "doc": "If true, reuse the first graph partition."
+    },
+    {
+        "pointer": "/CUDA_PCG/use_preconditioned_residual_norm",
+        "default": false,
+        "type": "bool",
+        "doc": "Use the preconditioned residual norm for the termination check."
+    }
-]
\ No newline at end of file
+]
diff --git a/src/polysolve/linear/CMakeLists.txt b/src/polysolve/linear/CMakeLists.txt
index a742a357..1eb58850 100644
--- a/src/polysolve/linear/CMakeLists.txt
+++ b/src/polysolve/linear/CMakeLists.txt
@@ -17,5 +17,25 @@ set(SOURCES
     SaddlePointSolver.hpp
 )
 
+if(POLYSOLVE_WITH_CUDA)
+    list(APPEND SOURCES
+        PCGSolver.cu
+        PCGSolver.hpp
+        mas_utils/BSRAdjacency.cpp
+        mas_utils/BSRAdjacency.hpp
+        mas_utils/BSRMatrix.cu
+        mas_utils/BSRMatrix.hpp
+        mas_utils/CuSparseWrapper.hpp
+        mas_utils/CudaUtils.cuh
+        mas_utils/InnerProduct.cu
+        mas_utils/InnerProduct.hpp
+        mas_utils/MASPreconditioner.cu
+        mas_utils/MASPreconditioner.hpp
+        mas_utils/GraphPartition.cpp
+        mas_utils/GraphPartition.hpp
+    )
+
+endif()
+
 source_group(TREE "${CMAKE_CURRENT_SOURCE_DIR}" PREFIX "Source Files" FILES ${SOURCES})
 target_sources(polysolve_linear PRIVATE ${SOURCES})
diff --git a/src/polysolve/linear/CuSolverDN.cu b/src/polysolve/linear/CuSolverDN.cu
index 29cf7570..e0d01d15 100644
--- a/src/polysolve/linear/CuSolverDN.cu
+++ b/src/polysolve/linear/CuSolverDN.cu
@@ -206,7 +206,7 @@ namespace polysolve::linear
 
 } // namespace polysolve::linear
 
-template polysolve::CuSolverDN<double>::CuSolverDN();
-template polysolve::CuSolverDN<float>::CuSolverDN();
+template polysolve::linear::CuSolverDN<double>::CuSolverDN();
+template polysolve::linear::CuSolverDN<float>::CuSolverDN();
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/polysolve/linear/HypreSolver.cpp b/src/polysolve/linear/HypreSolver.cpp
index 804eca1f..0901b880 100644
--- a/src/polysolve/linear/HypreSolver.cpp
+++ b/src/polysolve/linear/HypreSolver.cpp
@@ -15,7 +15,7 @@ namespace polysolve::linear
     HypreSolver::HypreSolver()
     {
         precond_num_ = 0;
-#ifdef HYPRE_WITH_MPI
+#ifdef HYPRE_ENABLE_MPI
         int done_already;
 
         MPI_Initialized(&done_already);
@@ -32,6 +32,10 @@ namespace polysolve::linear
             MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
         }
 #endif
+        if (!HYPRE_Initialized())
+        {
+            HYPRE_Initialize();
+        }
     }
 
     // Set solver parameters
@@ -91,7 +95,7 @@ namespace polysolve::linear
         has_matrix_ = true;
         const HYPRE_Int rows = Ain.rows();
         const HYPRE_Int cols = Ain.cols();
-#ifdef HYPRE_WITH_MPI
+#ifdef HYPRE_ENABLE_MPI
         HYPRE_IJMatrixCreate(MPI_COMM_WORLD, 0, rows - 1, 0, cols - 1, &A);
 #else
         HYPRE_IJMatrixCreate(0, 0, rows - 1, 0, cols - 1, &A);
@@ -285,7 +289,7 @@ namespace polysolve::linear
 
         /* Create solver */
         HYPRE_Solver solver, precond;
-#ifdef HYPRE_WITH_MPI
+#ifdef HYPRE_ENABLE_MPI
         HYPRE_ParCSRPCGCreate(MPI_COMM_WORLD, &solver);
 #else
         HYPRE_ParCSRPCGCreate(0, &solver);
diff --git a/src/polysolve/linear/PCGSolver.cu b/src/polysolve/linear/PCGSolver.cu
new
file mode 100644
index 00000000..de45c89e
--- /dev/null
+++ b/src/polysolve/linear/PCGSolver.cu
@@ -0,0 +1,649 @@
+#include "PCGSolver.hpp"
+
+// #ifndef SPDLOG_ACTIVE_LEVEL
+// #define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_TRACE
+// #endif
+
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace polysolve::linear
+{
+
+    using namespace mas;
+
+    namespace
+    {
+        using clock = std::chrono::steady_clock;
+
+        double elapsed_seconds(const std::chrono::time_point<clock> &begin)
+        {
+            return std::chrono::duration<double>(clock::now() - begin).count();
+        }
+
+        /// Device scalar division num / denom.
+        /// Defaults to zero when denom is small, because we only check the
+        /// termination condition every 10 PCG iterations and beta should be
+        /// zero once we have already arrived at the solution.
+        void scalar_division(
+            ctd::span<const double> num,
+            ctd::span<const double> denom,
+            ctd::span<double> out,
+            CudaRuntime rt)
+        {
+            auto op = [num, denom, out] __device__(int) {
+                out[0] = (ctd::abs(denom[0]) < 1e-20) ? 0.0 : (num[0] / denom[0]);
+            };
+            cub::DeviceFor::Bulk(1, op, rt.stream.get());
+        }
+
+        /// Compute alpha * x + beta * y.
+        /// @param h_alpha Host alpha.
+        /// @param d_alpha Device alpha. nullptr implies 1.0.
+        /// @param h_beta Host beta.
+        /// @param d_beta Device beta. nullptr implies 1.0.
+        /// @param x Device vector x.
+        /// @param y Device vector y and the output.
+        /// @param rt Cuda runtime.
+        void axpby(
+            double h_alpha,
+            const double *d_alpha,
+            double h_beta,
+            const double *d_beta,
+            ctd::span<const double> x,
+            ctd::span<double> y,
+            CudaRuntime rt)
+        {
+            auto op = [h_alpha, d_alpha, h_beta, d_beta, x, y] __device__(int idx) {
+                double alpha = h_alpha * ((d_alpha == nullptr) ? 1.0 : *d_alpha);
+                double beta = h_beta * ((d_beta == nullptr) ? 1.0 : *d_beta);
+                y[idx] = alpha * x[idx] + beta * y[idx];
+            };
+            cub::DeviceFor::Bulk(x.size(), op, rt.stream.get());
+        }
+
+        /// @brief Build a CSR-row-ptr-style partition offset array.
+        /// With all nodes sorted by partition id, the partition offset layout is:
+        /// | partition 0 start | partition 1 start | ... |
+        std::vector<int> build_part_offsets(ctd::span<const int> part_id, int part_num)
+        {
+            std::vector<int> offsets(part_num + 1, 0);
+            for (int old_id = 0; old_id < part_id.size(); ++old_id)
+            {
+                offsets[part_id[old_id] + 1] += 1;
+            }
+            for (int part = 0; part < part_num; ++part)
+            {
+                offsets[part + 1] += offsets[part];
+            }
+            return offsets;
+        }
+
+        /// @brief Build a permutation such that all nodes are sorted by partition.
+        std::vector<int> build_permutation(
+            const std::vector<int> &part_id,
+            const std::vector<int> &offsets)
+        {
+            std::vector<int> next = offsets;
+            std::vector<int> permutation(part_id.size(), 0);
+            for (int old_id = 0; old_id < part_id.size(); ++old_id)
+            {
+                int part = part_id[old_id];
+                permutation[old_id] = next[part];
+                next[part] += 1;
+            }
+            return permutation;
+        }
+
+        void permute_vector(
+            ctd::span<const double> input,
+            ctd::span<double> output,
+            ctd::span<const int> block_map,
+            int block_dim,
+            CudaRuntime rt)
+        {
+            auto op = [input, output, block_map, block_dim] __device__(int idx) {
+                int block_id = idx / block_dim;
+                int comp = idx % block_dim;
+                int mapped = block_map[block_id];
+                output[idx] = input[mapped * block_dim + comp];
+            };
+            cub::DeviceFor::Bulk(output.size(), op, rt.stream.get());
+        }
+
+    } // namespace
+
+    class CudaPCG::CudaPCGImpl
+    {
+    public:
+        int block_dim_ = 3; ///< BSR block dim.
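+        // (block_dim_ must be 1, 2, or 3; BSRAdjacency rejects other values.)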
+ int max_iter_ = 1e5; + int true_residual_period_ = 4; + double abs_tol_ = 1e-20; + double rel_tol_ = 1e-6; + bool lazy_partitioning_ = false; + bool use_preconditioned_residual_norm_ = true; + + int dim_ = 0; ///< Input matrix A dim. + int permuted_dim_ = 0; ///< Dim with block padding. + int iterations_ = 0; + double residual_norm_ = 0.0; + CudaPCGStatus status_ = CudaPCGStatus::Running; + + // == IMPORTANT == + // You must declare cu::stream and memory_pool before any device storage! + // Else stream and mem pool will be destroyed before cuda::buffer, causing segfault. + ctd::optional default_device_; + ctd::optional default_stream_; + ctd::optional default_mem_pool_; + CuSparseHandle cusparse_handle_; + + BSRMatrix A_; + MASPreconditioner mas_precond_; + std::vector permutation_; ///< Permutation from graph partition. + std::vector inv_permutation_; ///< Permutation from graph partition. + std::vector part_offsets_; ///< Partition offsets for sorted blocks. + CuSparseBSR sparse_A_; ///< CuSparse handle of A. + Buf d_permutation_; ///< Permutation from graph partition. + Buf d_inv_permutation_; ///< Permutation from graph partition. + Buf spmv_workspace_; ///< CuSparse temp. + + // PCG variables. + + Buf x_; + Buf b_; + Buf r_; + Buf p_; + Buf z_; + Buf Ap_; + Buf reduction_storage_; + Buf scalar_rz_; + Buf scalar_pAp_; + Buf scalar_alpha_; + Buf scalar_beta_; + Buf scalar_rz_old_; + Buf scalar_rr_; + + public: + CudaPCGImpl() + { + if (cu::devices.size() == 0) + { + throw std::runtime_error("No Nvidia GPU!!"); + } + + default_device_.emplace(cu::devices[0]); + default_stream_.emplace(*default_device_); + default_mem_pool_.emplace(*default_device_); + } + + void set_parameters(const json ¶ms) + { + if (params.contains("block_dim")) + block_dim_ = params["block_dim"]; + if (params.contains("max_iter")) + max_iter_ = params["max_iter"]; + if (params.contains("relative_tolerance")) + rel_tol_ = params["relative_tolerance"]; + if (params.contains("absolute_tolerance")) + abs_tol_ = params["absolute_tolerance"]; + if (params.contains("lazy_partitioning")) + lazy_partitioning_ = params["lazy_partitioning"]; + if (params.contains("use_preconditioned_residual_norm")) + use_preconditioned_residual_norm_ = params["use_preconditioned_residual_norm"]; + } + + void get_info(json ¶ms) const + { + params["solver_iter"] = iterations_; + params["solver_error"] = residual_norm_; + params["solver_status"] = pcg_status_to_string(status_); + } + + void analyze_pattern(const StiffnessMatrix &, const int) {} + + void build_partition_and_perm(BSRAdjacency &adj) + { + int part_num = 0; + std::vector part_id; + graph_partition( + adj.row_ptr, + adj.cols, + adj.weights, + 32, + part_num, + part_id); + + part_offsets_ = build_part_offsets(part_id, part_num); + permutation_ = build_permutation(part_id, part_offsets_); + inv_permutation_.resize(permutation_.size()); + for (int old_id = 0; old_id < permutation_.size(); ++old_id) + { + inv_permutation_[permutation_[old_id]] = old_id; + } + } + + // Allocate cusparse workspace buffer. 
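+        // (cusparseSpMV_bufferSize is queried once here; the resulting workspace
+        // is reused by every SpMV in the PCG loop.)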
+ void setup_cusparse(CudaRuntime rt) + { + sparse_A_ = CuSparseBSR(A_.view()); + + double alpha = 1.0; + double beta = 0.0; + CuSparseConstVec x_desc(*x_); + CuSparseVec y_desc(*Ap_); + size_t workspace_size = 0; + + cusparseSetStream(cusparse_handle_.raw, rt.stream.get()); + cusparseSpMV_bufferSize( + cusparse_handle_.raw, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + sparse_A_.raw, + x_desc.raw, + &beta, + y_desc.raw, + CUDA_R_64F, + CUSPARSE_SPMV_ALG_DEFAULT, + &workspace_size); + + spmv_workspace_ = cu::make_buffer( + rt.stream, + rt.mr, + workspace_size == 0 ? 1 : workspace_size, + cu::no_init); + } + + void spmv(ctd::span x, ctd::span y, CudaRuntime rt) + { + double alpha = 1.0; + double beta = 0.0; + CuSparseConstVec x_desc(x); + CuSparseVec y_desc(y); + + cusparseSetStream(cusparse_handle_.raw, rt.stream.get()); + cusparseSpMV( + cusparse_handle_.raw, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + sparse_A_.raw, + x_desc.raw, + &beta, + y_desc.raw, + CUDA_R_64F, + CUSPARSE_SPMV_ALG_DEFAULT, + spmv_workspace_->data()); + } + + void factorize(const StiffnessMatrix &A) + { + CudaRuntime rt{*default_stream_, default_mem_pool_->as_ref()}; + auto total_begin = clock::now(); + int block_n = div_round_up(A.rows(), block_dim_); + + // We do: + // 1. K-way graph parition. + // 2. Build permutation based on partition. + // 3. Initialize MAS preconditioner. + // 4. Allocates buffer for PCG loop. + + bool rebuild_partition = + !lazy_partitioning_ || permutation_.size() != block_n || part_offsets_.empty(); + if (rebuild_partition) + { + // Build adjacency for graph partition. + auto phase_begin = clock::now(); + BSRAdjacency adj{A, block_dim_}; + SPDLOG_TRACE("CUDA_PCG setup: topology_adj {:.6f}s", elapsed_seconds(phase_begin)); + + // Sort nodes based on parition. + phase_begin = clock::now(); + build_partition_and_perm(adj); + SPDLOG_TRACE("CUDA_PCG setup: graph_partition {:.6f}s", elapsed_seconds(phase_begin)); + } + else + { + SPDLOG_TRACE("CUDA_PCG setup: reuse_partition"); + } + + // Build new sorted BSR matrix for MAS initialization. + auto phase_begin = clock::now(); + A_ = BSRMatrix{ + A, + block_dim_, + ctd::span(permutation_.data(), permutation_.size()), + rt}; + SPDLOG_TRACE("CUDA_PCG setup: permuted_bsr {:.6f}s", elapsed_seconds(phase_begin)); + + BSRView view = A_.view(); + dim_ = A.rows(); + permuted_dim_ = view.block_dim * view.dim; + + // Initialize MAS. + phase_begin = clock::now(); + mas_precond_.factorize( + A_, + ctd::span(part_offsets_.data(), part_offsets_.size()), + rt); + rt.stream.sync(); + SPDLOG_TRACE("CUDA_PCG setup: mas_factorize {:.6f}s", elapsed_seconds(phase_begin)); + + // Copy permutation to device. + phase_begin = clock::now(); + d_permutation_ = cu::make_buffer(rt.stream, rt.mr, permutation_.size(), cu::no_init); + d_inv_permutation_ = + cu::make_buffer(rt.stream, rt.mr, inv_permutation_.size(), cu::no_init); + cu::copy_bytes(rt.stream, permutation_, *d_permutation_); + cu::copy_bytes(rt.stream, inv_permutation_, *d_inv_permutation_); + + // Allocates buffers for PCG loop. 
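+            // (All vectors are sized to permuted_dim_, i.e. padded to a whole
+            // number of BSR blocks; the entries past dim_ are kept at zero.)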
+ x_ = cu::make_buffer(rt.stream, rt.mr, permuted_dim_, cu::no_init); + b_ = cu::make_buffer(rt.stream, rt.mr, permuted_dim_, cu::no_init); + r_ = cu::make_buffer(rt.stream, rt.mr, permuted_dim_, cu::no_init); + p_ = cu::make_buffer(rt.stream, rt.mr, permuted_dim_, cu::no_init); + z_ = cu::make_buffer(rt.stream, rt.mr, permuted_dim_, cu::no_init); + Ap_ = cu::make_buffer(rt.stream, rt.mr, permuted_dim_, cu::no_init); + + scalar_rz_ = cu::make_buffer(rt.stream, rt.mr, 1, cu::no_init); + scalar_pAp_ = cu::make_buffer(rt.stream, rt.mr, 1, cu::no_init); + scalar_alpha_ = cu::make_buffer(rt.stream, rt.mr, 1, cu::no_init); + scalar_beta_ = cu::make_buffer(rt.stream, rt.mr, 1, cu::no_init); + scalar_rz_old_ = cu::make_buffer(rt.stream, rt.mr, 1, cu::no_init); + scalar_rr_ = cu::make_buffer(rt.stream, rt.mr, 1, cu::no_init); + rt.stream.sync(); + SPDLOG_TRACE("CUDA_PCG setup: device_buffers {:.6f}s", elapsed_seconds(phase_begin)); + + // Allocates buffers for CuSparse. + phase_begin = clock::now(); + setup_cusparse(rt); + rt.stream.sync(); + SPDLOG_TRACE("CUDA_PCG setup: cusparse {:.6f}s", elapsed_seconds(phase_begin)); + SPDLOG_TRACE("CUDA_PCG setup: total {:.6f}s", elapsed_seconds(total_begin)); + } + + void solve(const Eigen::Ref b, Eigen::Ref x) + { + status_ = CudaPCGStatus::Running; + + if (b.size() != x.size() || !check_buffer_size(b.size())) + { + throw std::runtime_error("[CudaPCG] Size mismatch. Did you forget to call factorize?"); + } + + CudaRuntime rt{*default_stream_, default_mem_pool_->as_ref()}; + + cu::copy_bytes( + rt.stream, + ctd::span(b.data(), dim_), + ctd::span(r_->data(), dim_)); + cu::fill_bytes( + rt.stream, ctd::span(r_->data() + dim_, permuted_dim_ - dim_), 0); + permute_vector(*r_, *b_, *d_inv_permutation_, A_.view().block_dim, rt); + + // The solver sometimes fails to converge if we use input x as initial value. + // Maybe the caller does not initialize x properly? + // Set initial x to zero to work around this issue for now. + cu::fill_bytes(rt.stream, *x_, 0); + + pcg_solve(rt); + + permute_vector(*x_, *r_, *d_permutation_, A_.view().block_dim, rt); + + cu::copy_bytes( + rt.stream, + ctd::span(r_->data(), dim_), + ctd::span(x.data(), dim_)); + rt.stream.sync(); + } + + bool check_buffer_size(int n) const + { + if (n <= 0 || mas_precond_.empty() || !x_ || !b_ || !r_ || !p_ || !z_ || !Ap_ + || !scalar_rz_ || !scalar_pAp_ || !scalar_alpha_ || !scalar_beta_ + || !scalar_rz_old_ || !scalar_rr_) + { + return false; + } + + BSRView view = A_.view(); + int block_n = (n + view.block_dim - 1) / view.block_dim; + int block_size = view.block_dim * view.block_dim; + + if (n != dim_ || block_n != view.dim) + { + return false; + } + + if (view.rows.size() != (view.dim + 1) + || view.cols.size() != view.non_zeros + || view.vals.size() != block_size * view.non_zeros) + { + return false; + } + + if (!d_permutation_ || !d_inv_permutation_ || sparse_A_.raw == nullptr || !spmv_workspace_ + || x_->size() != permuted_dim_ + || b_->size() != permuted_dim_ + || r_->size() != permuted_dim_ + || p_->size() != permuted_dim_ + || z_->size() != permuted_dim_ + || Ap_->size() != permuted_dim_) + { + return false; + } + + if (scalar_rz_->size() < 1 + || scalar_pAp_->size() < 1 + || scalar_alpha_->size() < 1 + || scalar_beta_->size() < 1 + || scalar_rz_old_->size() < 1 + || scalar_rr_->size() < 1) + { + return false; + } + + return true; + } + + /// https://www.cs.cmu.edu/~quake-papers/painless-conjugate-gradient.pdf + void pcg_solve(CudaRuntime rt) + { + // Compute initial residual r = b-Ax. 
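+            // (solve() zeroes x_ before calling us, so r starts out equal to b;
+            // the general form is kept in case a warm start is ever allowed.)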
+ spmv(*x_, *r_, rt); + axpby(1.0, nullptr, -1.0, nullptr, *b_, *r_, rt); + + // Compute z = M^-1 r. + mas_precond_.apply(*r_, *z_, rt); + // Initial search direction p = z; + cu::copy_bytes(rt.stream, *z_, *p_); + + // Compute rz = r^T M^-1 r. + inner_product(*r_, *z_, *scalar_rz_, rt); + const double rz0 = device2host(scalar_rz_->data(), rt); + if (ctd::isnan(rz0) || !ctd::isfinite(rz0)) + { + throw std::runtime_error("[CudaPCG] Invalid initial residual."); + } + + // Compute rr = r^T r. + double rr0 = 0.0; + if (!use_preconditioned_residual_norm_) + { + inner_product(*r_, *r_, *scalar_rr_, rt); + rr0 = device2host(scalar_rr_->data(), rt); + } + + // debug logging + auto iter_window_begin = clock::now(); + int iter_window_start = 1; + + for (int k = 1; k <= max_iter_; ++k) + { + // Compute Ap = A p. + spmv(*p_, *Ap_, rt); + // Compute pAp = p^T * A * p. + inner_product(*p_, *Ap_, *scalar_pAp_, rt); + // Compute alpha = (r M^-1 r) / (p^T A p). + scalar_division(*scalar_rz_, *scalar_pAp_, *scalar_alpha_, rt); + // Compute x = x + alpha A p. + axpby(1.0, scalar_alpha_->data(), 1.0, nullptr, *p_, *x_, rt); + + // Compute residual b-Ax directly. + if (k % true_residual_period_ == 0) + { + spmv(*x_, *r_, rt); + axpby(1.0, nullptr, -1.0, nullptr, *b_, *r_, rt); + } + // Compute residual update using r' = r - alpha A p. + // This saves one spmv but accumulates floating point error overtime. + else + { + axpby(-1.0, scalar_alpha_->data(), 1.0, nullptr, *Ap_, *r_, rt); + } + + // Compute z = M^-1 r. + mas_precond_.apply(*r_, *z_, rt); + + // Compute rz = r M^-1 r. + cu::copy_bytes(rt.stream, *scalar_rz_, *scalar_rz_old_); + inner_product(*r_, *z_, *scalar_rz_, rt); + + iterations_ = k; + bool converged = false; + + // Check convergence every 10 iterations. + if (k % 10 == 0) + { + if (use_preconditioned_residual_norm_) + { + double rz_new = device2host(scalar_rz_->data(), rt); + residual_norm_ = ctd::sqrt(rz_new); + if (rz_new <= rel_tol_ * rel_tol_ * rz0 || rz_new <= abs_tol_ * abs_tol_) + { + status_ = (rz_new <= abs_tol_ * abs_tol_) + ? CudaPCGStatus::ReachAbsoluteTolerance + : CudaPCGStatus::ReachRelativeTolerance; + converged = true; + } + } + else + { + inner_product(*r_, *r_, *scalar_rr_, rt); + double rr = device2host(scalar_rr_->data(), rt); + residual_norm_ = ctd::sqrt(rr); + if (rr <= rel_tol_ * rel_tol_ * rr0 || rr <= abs_tol_ * abs_tol_) + { + status_ = (rr <= abs_tol_ * abs_tol_) + ? CudaPCGStatus::ReachAbsoluteTolerance + : CudaPCGStatus::ReachRelativeTolerance; + converged = true; + } + } + } + + // debug logging + if (k % 100 == 0) + { + rt.stream.sync(); + SPDLOG_TRACE( + "CUDA_PCG iter {}-{}: {:.6f}s residual {:.6e}", + iter_window_start, + k, + elapsed_seconds(iter_window_begin), + residual_norm_); + iter_window_begin = clock::now(); + iter_window_start = k + 1; + } + + if (converged) + { + break; + } + + // Compute beta = rz / rz_old. + scalar_division(*scalar_rz_, *scalar_rz_old_, *scalar_beta_, rt); + // Compute direction update p' = M^-1 r + beta p. 
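+                // (axpby computes p = z + beta * p in place, with beta read from
+                // device memory.)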
+ axpby(1.0, nullptr, 1.0, scalar_beta_->data(), *z_, *p_, rt); + } + + if (iterations_ == max_iter_) + { + status_ = CudaPCGStatus::ReachMaxIterations; + } + + SPDLOG_TRACE( + "CUDA_PCG iter {}-{}: {:.6f}s residual {:.6e}", + iter_window_start, + iterations_, + elapsed_seconds(iter_window_begin), + residual_norm_); + } + }; + + CudaPCG::CudaPCG() + : impl_(std::make_unique()) + { + } + + CudaPCG::~CudaPCG() = default; + + void CudaPCG::set_parameters(const json ¶ms) + { + const std::string solver_name = name(); + if (!params.contains(solver_name)) + { + return; + } + + impl_->set_parameters(params[solver_name]); + } + + void CudaPCG::get_info(json ¶ms) const + { + impl_->get_info(params); + } + + void CudaPCG::analyze_pattern(const StiffnessMatrix &A, const int precond_num) + { + impl_->analyze_pattern(A, precond_num); + } + + void CudaPCG::factorize(const StiffnessMatrix &A) + { + impl_->factorize(A); + } + + void CudaPCG::solve(const Ref b, Ref x) + { + impl_->solve(b, x); + } + + std::string CudaPCG::name() const + { + return "CUDA_PCG"; + } + +} // namespace polysolve::linear diff --git a/src/polysolve/linear/PCGSolver.hpp b/src/polysolve/linear/PCGSolver.hpp new file mode 100644 index 00000000..556b696b --- /dev/null +++ b/src/polysolve/linear/PCGSolver.hpp @@ -0,0 +1,74 @@ +#pragma once + +#include "Solver.hpp" + +#include +#include + +namespace polysolve::linear +{ + enum class CudaPCGStatus + { + Running, + ReachRelativeTolerance, + ReachAbsoluteTolerance, + ReachMaxIterations, + }; + + inline std::string pcg_status_to_string(CudaPCGStatus stat) + { + switch (stat) + { + case CudaPCGStatus::Running: + return "Running"; + case CudaPCGStatus::ReachRelativeTolerance: + return "Reach relative tolerance"; + case CudaPCGStatus::ReachAbsoluteTolerance: + return "Reach absolute tolerance"; + case CudaPCGStatus::ReachMaxIterations: + return "Reach max iterations"; + default: + return "Unknown"; + } + } + + class CudaPCG : public Solver + { + + public: + CudaPCG(); + ~CudaPCG() override; + POLYSOLVE_DELETE_MOVE_COPY(CudaPCG) + + public: + ////////////////////// + // Public interface // + ////////////////////// + + // Set solver parameters + void set_parameters(const json ¶ms) override; + + // Retrieve information + void get_info(json ¶ms) const override; + + // Analyze sparsity pattern. This is a no-op + void analyze_pattern(const StiffnessMatrix &A, const int precond_num) override; + + // Factorize system matrix. 
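+        // (Builds the graph partition, the permuted BSR matrix, and the MAS
+        // preconditioner; must be called before solve().)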
+ void factorize(const StiffnessMatrix &A) override; + + // Solve the linear system Ax = b + void solve(const Ref b, Ref x) override; + + // Set solver tolerance + // void set_tolerance(const double tol) override; + + // Name of the solver type (for debugging purposes) + std::string name() const override; + + private: + class CudaPCGImpl; + std::unique_ptr impl_; + }; + +} // namespace polysolve::linear diff --git a/src/polysolve/linear/Solver.cpp b/src/polysolve/linear/Solver.cpp index e7c0c259..ad7b1d2b 100644 --- a/src/polysolve/linear/Solver.cpp +++ b/src/polysolve/linear/Solver.cpp @@ -63,6 +63,10 @@ namespace polysolve::linear { #ifdef POLYSOLVE_WITH_CUSOLVER #include "CuSolverDN.cuh" #endif +#ifdef POLYSOLVE_WITH_CUDA +#include "PCGSolver.hpp" +#endif + #include //////////////////////////////////////////////////////////////////////////////// @@ -399,6 +403,12 @@ namespace polysolve::linear { return std::make_unique>(); #endif +#ifdef POLYSOLVE_WITH_CUDA + } + else if (solver == "CUDA_PCG") + { + return std::make_unique(); +#endif #ifdef POLYSOLVE_WITH_HYPRE } else if (solver == "Hypre") @@ -531,6 +541,9 @@ namespace polysolve::linear "cuSolverDN", "cuSolverDN_float", #endif +#ifdef POLYSOLVE_WITH_CUDA + "CUDA_PCG", +#endif #ifdef POLYSOLVE_WITH_HYPRE "Hypre", #endif diff --git a/src/polysolve/linear/mas_utils/BSRAdjacency.cpp b/src/polysolve/linear/mas_utils/BSRAdjacency.cpp new file mode 100644 index 00000000..4c77ffdb --- /dev/null +++ b/src/polysolve/linear/mas_utils/BSRAdjacency.cpp @@ -0,0 +1,192 @@ +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace polysolve::linear::mas +{ + namespace + { + std::vector quantize_weight(cuda::std::span weight) + { + if (weight.empty()) + { + return {}; + } + + auto [min_it, max_it] = std::minmax_element(weight.begin(), weight.end()); + double min_w = *min_it; + double max_w = *max_it; + double w_range = max_w - min_w; + if (w_range <= 0.0) + { + return std::vector(weight.size(), 1); + } + + constexpr int MAX_QUANT = 1000000; + std::vector quant(weight.size()); + for (int i = 0; i < static_cast(weight.size()); ++i) + { + double scaled = (weight[i] - min_w) / w_range * MAX_QUANT; + quant[i] = static_cast(std::clamp(static_cast(scaled), 1, MAX_QUANT)); + } + return quant; + } + + /// Average upper and lower part of topology connection weight. 
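+        /// For an edge stored in both directions, (i,j) and (j,i) both receive
+        /// the mean of the two stored weights, symmetrizing the graph for the
+        /// partitioner.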
+ void avg_sym_weight(cuda::std::span row_ptr, + cuda::std::span cols, + cuda::std::span weights) + { + struct Sum + { + double val = 0.0; + int count = 0; + }; + + std::unordered_map pair_weight; + + auto ij_to_key = [](int i, int j) -> uint64_t { + int u = std::min(i, j); + int v = std::max(i, j); + return (static_cast(u) << 32) | static_cast(v); + }; + + for (int i = 0; i < static_cast(row_ptr.size()) - 1; ++i) + { + for (int n = row_ptr[i]; n < row_ptr[i + 1]; ++n) + { + int j = cols[n]; + Sum &stat = pair_weight[ij_to_key(i, j)]; + stat.val += weights[n]; + stat.count += 1; + } + } + + for (int i = 0; i < static_cast(row_ptr.size()) - 1; ++i) + { + for (int n = row_ptr[i]; n < row_ptr[i + 1]; ++n) + { + int j = cols[n]; + auto it = pair_weight.find(ij_to_key(i, j)); + if (it == pair_weight.end() || it->second.count == 0) + { + continue; + } + weights[n] = it->second.val / it->second.count; + } + } + } + + template + void build_host_adjacency(const StiffnessMatrix &A, + int dim, + std::vector &row_ptr, + std::vector &cols, + std::vector &weights) + { + using Iter = Eigen::SparseMatrix::InnerIterator; + + Eigen::SparseMatrix Arow = A; + Arow.makeCompressed(); + + row_ptr.assign(dim + 1, 0); + cols.clear(); + weights.clear(); + + std::vector norms; + + // One block row at a time. + for (int bi = 0; bi < dim; ++bi) + { + // Accumulate block Frobenius norm squared per block col. + std::unordered_map col2norm2; + + for (int li = 0; li < BLOCK_DIM; ++li) + { + int scalar_row = bi * BLOCK_DIM + li; + if (scalar_row >= Arow.rows()) + { + break; + } + + for (Iter it(Arow, scalar_row); it; ++it) + { + int bj = it.col() / BLOCK_DIM; + if (bj == bi) + { + continue; // self-excluding + } + + double v = it.value(); + col2norm2[bj] += v * v; + } + } + + std::vector row_cols; + row_cols.reserve(col2norm2.size()); + for (auto &kv : col2norm2) + { + row_cols.push_back(kv.first); + } + std::sort(row_cols.begin(), row_cols.end()); + + row_ptr[bi + 1] = static_cast(row_cols.size()); + for (int bj : row_cols) + { + cols.push_back(bj); + norms.push_back(std::sqrt(col2norm2[bj])); + } + } + + for (int i = 0; i < dim; ++i) + { + row_ptr[i + 1] += row_ptr[i]; + } + + // Stabilize weights for the graph partitioner: symmetrize, then quantize. + avg_sym_weight(row_ptr, cols, norms); + weights = quantize_weight(norms); + } + } // namespace + + BSRAdjacency::BSRAdjacency(const StiffnessMatrix &A, int block_dim) + { + if (A.cols() != A.rows() || A.cols() == 0 || A.rows() == 0 || A.nonZeros() == 0) + { + throw std::runtime_error("[CudaPcg] Factorization failed due to invalid A"); + } + if (A.cols() > std::numeric_limits::max() || A.rows() > std::numeric_limits::max()) + { + throw std::runtime_error("[CudaPcg] A is too large. 
Row/Col number exceeding int32 max."); + } + if (block_dim < 1 || block_dim > 3) + { + throw std::runtime_error("[CudaPcg] MAS only supports block size 1, 2, or 3."); + } + + int dim = (A.rows() + block_dim - 1) / block_dim; + if (block_dim == 1) + { + build_host_adjacency<1>(A, dim, row_ptr, cols, weights); + } + else if (block_dim == 2) + { + build_host_adjacency<2>(A, dim, row_ptr, cols, weights); + } + else + { + build_host_adjacency<3>(A, dim, row_ptr, cols, weights); + } + } + +} // namespace polysolve::linear::mas diff --git a/src/polysolve/linear/mas_utils/BSRAdjacency.hpp b/src/polysolve/linear/mas_utils/BSRAdjacency.hpp new file mode 100644 index 00000000..0a89eadb --- /dev/null +++ b/src/polysolve/linear/mas_utils/BSRAdjacency.hpp @@ -0,0 +1,23 @@ +#pragma once + +#include + +#include +#include + +namespace polysolve::linear::mas +{ + /// Host-only CSR adjacency for graph partitioning. + /// Self-excluding, with symmetrized + quantized positive integer weights. + class BSRAdjacency + { + public: + BSRAdjacency() = default; + BSRAdjacency(const StiffnessMatrix &A, int block_dim); + + std::vector row_ptr; + std::vector cols; + std::vector weights; + }; + +} // namespace polysolve::linear::mas diff --git a/src/polysolve/linear/mas_utils/BSRMatrix.cu b/src/polysolve/linear/mas_utils/BSRMatrix.cu new file mode 100644 index 00000000..01e95ef7 --- /dev/null +++ b/src/polysolve/linear/mas_utils/BSRMatrix.cu @@ -0,0 +1,445 @@ +#include + +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +namespace polysolve::linear::mas +{ + namespace + { + // Key bit layout: + // | block row idx (32) | block col idx (32) | + + __device__ uint64_t pack_key(int block_row, int block_col) + { + return (static_cast(static_cast(block_row)) << 32) | static_cast(block_col); + } + + __device__ int key_to_row(uint64_t key) + { + return static_cast(static_cast(key >> 32)); + } + + __device__ int key_to_col(uint64_t key) + { + return static_cast(static_cast(key)); + } + + // Payload bit layout: + // | is padding (1) | block local offset (31) | nnz index (32) | + + /// @brief Pack payload for radix sort. + /// @param is_padding If true, this is the diagonal padding of trailing block. + /// @param nnz_index Index of input CSC vals array. + /// @param local_offset Block local offset. Assumes Row major block. 
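+        /// e.g. with block_dim 3, the entry at block-local (row 1, col 2) gets
+        /// local_offset 1 * 3 + 2 = 5.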
+ __device__ uint64_t pack_payload(bool is_padding, int nnz_index, int local_offset) + { + uint64_t payload = 0; + payload |= static_cast(nnz_index); + payload |= (static_cast(static_cast(local_offset)) << 32); + if (is_padding) + { + payload |= (uint64_t{1} << 63); + } + return payload; + } + + __device__ bool payload_to_is_padding(uint64_t payload) + { + return (payload >> 63) != 0; + } + + __device__ int payload_to_local_offset(uint64_t payload) + { + return static_cast(static_cast(payload >> 32) & 0x7FFFFFFF); + } + + __device__ int payload_to_nnz_index(uint64_t payload) + { + return static_cast(static_cast(payload)); + } + + __global__ void build_col_of_nnz(int col_num, + ctd::span col_ptr, + ctd::span col_of_nnz) + { + int col = blockDim.x * blockIdx.x + threadIdx.x; + if (col >= col_num) + { + return; + } + + int begin = col_ptr[col]; + int end = col_ptr[col + 1]; + for (int k = begin; k < end; ++k) + { + col_of_nnz[k] = col; + } + } + + __global__ void build_keys_payloads(int nnz, + int padded, + int block_dim, + int dim_blocks, + const int *perm, // nullable + ctd::span col_of_nnz, + ctd::span row_idx, + ctd::span keys, + ctd::span payloads) + { + int tid = blockDim.x * blockIdx.x + threadIdx.x; + int nnz_total = nnz + padded; + if (tid >= nnz_total) + { + return; + } + + // Real node from input A. compute permuted block ij then pack. + if (tid < nnz) + { + int old_row = row_idx[tid]; + int old_col = col_of_nnz[tid]; + + int br_old = old_row / block_dim; + int bc_old = old_col / block_dim; + int br = (perm == nullptr) ? br_old : perm[br_old]; + int bc = (perm == nullptr) ? bc_old : perm[bc_old]; + + int offset = (old_row - br_old * block_dim) * block_dim + (old_col - bc_old * block_dim); + keys[tid] = pack_key(br, bc); + payloads[tid] = pack_payload(false, tid, offset); + } + // Padding for trailing block. Compute permuted block ij for tail block then pack. + else + { + int t = tid - nnz; + int tail_old = dim_blocks - 1; + int tail_new = (perm == nullptr) ? tail_old : perm[tail_old]; + int local = (block_dim - padded) + t; + int offset = local * block_dim + local; + + keys[tid] = pack_key(tail_new, tail_new); + payloads[tid] = pack_payload(true, 0, offset); + } + } + + __global__ void extract_block_rows(ctd::span unique_keys, + ctd::span block_rows) + { + int tid = blockDim.x * blockIdx.x + threadIdx.x; + if (tid >= unique_keys.size()) + { + return; + } + + block_rows[tid] = key_to_row(unique_keys[tid]); + } + + __global__ void fill_bsr_cols_vals(int block_dim, + ctd::span unique_keys, + ctd::span payload_offsets, + ctd::span payloads, + ctd::span csc_vals, + ctd::span bsr_cols, + ctd::span bsr_vals) + { + int bid = blockDim.x * blockIdx.x + threadIdx.x; + if (bid >= unique_keys.size()) + { + return; + } + + bsr_cols[bid] = key_to_col(unique_keys[bid]); + + int begin = payload_offsets[bid]; + int end = payload_offsets[bid + 1]; + int block_size = block_dim * block_dim; + int base = bid * block_size; + for (int i = begin; i < end; ++i) + { + uint64_t payload = payloads[i]; + int offset = payload_to_local_offset(payload); + // We pad diagonal entry to 1.0 for trailing blocks. + double val = payload_to_is_padding(payload) ? 
1.0 : csc_vals[payload_to_nnz_index(payload)]; + bsr_vals[base + offset] = val; + } + } + + int build_device_bsr_from_csc(ctd::span d_col_ptr, + ctd::span d_row_idx, + ctd::span d_vals, + int rows_num, + int cols_num, + int block_dim, + const int *d_perm, + Buf &out_rows, + Buf &out_cols, + Buf &out_vals, + CudaRuntime rt) + { + int nnz = d_row_idx.size(); + int dim_blocks = div_round_up(rows_num, block_dim); + int padded = block_dim * dim_blocks - rows_num; + int nnz_total = nnz + padded; + + // --------------------------------------------------------------------------- + // Map each nnz index to input col. + // --------------------------------------------------------------------------- + + auto col_of_nnz = cu::make_buffer(rt.stream, rt.mr, nnz, cu::no_init); + build_col_of_nnz<<>>( + cols_num, d_col_ptr, col_of_nnz); + + // --------------------------------------------------------------------------- + // Convert each non-zero to [key, payload] pair. + // --------------------------------------------------------------------------- + + auto keys_in = cu::make_buffer(rt.stream, rt.mr, nnz_total, cu::no_init); + auto payloads_in = cu::make_buffer(rt.stream, rt.mr, nnz_total, cu::no_init); + build_keys_payloads<<>>( + nnz, + padded, + block_dim, + dim_blocks, + d_perm, + col_of_nnz, + d_row_idx, + keys_in, + payloads_in); + + // --------------------------------------------------------------------------- + // Radix sort by key. Result should be in row major order. + // --------------------------------------------------------------------------- + + auto keys_alt = cu::make_buffer(rt.stream, rt.mr, nnz_total, cu::no_init); + auto payloads_alt = cu::make_buffer(rt.stream, rt.mr, nnz_total, cu::no_init); + cub::DoubleBuffer d_keys(keys_in.data(), keys_alt.data()); + cub::DoubleBuffer d_payloads(payloads_in.data(), payloads_alt.data()); + Buf cub_tmp; + auto make_cub_tmp = [&cub_tmp, rt](size_t required_size) { + if (!cub_tmp || cub_tmp->size() < required_size) + { + cub_tmp = cu::make_buffer(rt.stream, rt.mr, required_size, cu::no_init); + } + return cub_tmp->data(); + }; + + size_t sort_tmp_size = 0; + cub::DeviceRadixSort::SortPairs(nullptr, + sort_tmp_size, + d_keys, + d_payloads, + nnz_total, + 0, + 64, + rt.stream.get()); + cub::DeviceRadixSort::SortPairs(make_cub_tmp(sort_tmp_size), + sort_tmp_size, + d_keys, + d_payloads, + nnz_total, + 0, + 64, + rt.stream.get()); + + // --------------------------------------------------------------------------- + // Run-length encode. + // This step computes non-zero block num + payload count for each block. + // --------------------------------------------------------------------------- + + auto unique_keys = cu::make_buffer(rt.stream, rt.mr, nnz_total, cu::no_init); + auto counts = cu::make_buffer(rt.stream, rt.mr, nnz_total, cu::no_init); + auto num_runs = cu::make_buffer(rt.stream, rt.mr, 1, cu::no_init); + + size_t rle_tmp_size = 0; + cub::DeviceRunLengthEncode::Encode(nullptr, + rle_tmp_size, + d_keys.Current(), + unique_keys.data(), + counts.data(), + num_runs.data(), + nnz_total, + rt.stream.get()); + cub::DeviceRunLengthEncode::Encode(make_cub_tmp(rle_tmp_size), + rle_tmp_size, + d_keys.Current(), + unique_keys.data(), + counts.data(), + num_runs.data(), + nnz_total, + rt.stream.get()); + + // --------------------------------------------------------------------------- + // Histogram + ExclusiveSum + // This step count non-zero blocks per rows to compute row_ptr. 
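+            // (HistogramEven counts the non-zero blocks in each block row; the
+            // exclusive scan below turns those counts into BSR row offsets.)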
+ // --------------------------------------------------------------------------- + + int nnz_blocks = device2host(num_runs.data(), rt); + auto block_rows = cu::make_buffer(rt.stream, rt.mr, nnz_blocks, cu::no_init); + // Extract row index from packed key. + extract_block_rows<<>>( + ctd::span(unique_keys.data(), nnz_blocks), + block_rows); + // Histogram count nnz block per row. + auto hist = cu::make_buffer(rt.stream, rt.mr, dim_blocks + 1, 0); + size_t hist_tmp_size = 0; + cub::DeviceHistogram::HistogramEven(nullptr, + hist_tmp_size, + block_rows.data(), + hist.data(), + dim_blocks + 1, + 0, + dim_blocks, + nnz_blocks, + rt.stream.get()); + cub::DeviceHistogram::HistogramEven(make_cub_tmp(hist_tmp_size), + hist_tmp_size, + block_rows.data(), + hist.data(), + dim_blocks + 1, + 0, + dim_blocks, + nnz_blocks, + rt.stream.get()); + // Exclusive scan compute CSR row ptr. + out_rows = cu::make_buffer(rt.stream, rt.mr, dim_blocks + 1, cu::no_init); + size_t scan_tmp_size = 0; + cub::DeviceScan::ExclusiveSum(nullptr, + scan_tmp_size, + hist.data(), + out_rows->data(), + dim_blocks + 1, + rt.stream.get()); + cub::DeviceScan::ExclusiveSum(make_cub_tmp(scan_tmp_size), + scan_tmp_size, + hist.data(), + out_rows->data(), + dim_blocks + 1, + rt.stream.get()); + + // --------------------------------------------------------------------------- + // Fill cols and vals non-zero blocks. + // --------------------------------------------------------------------------- + + // Compute Payload offsets for each block. + auto payload_offsets = cu::make_buffer(rt.stream, rt.mr, nnz_blocks + 1, cu::no_init); + cudaMemsetAsync(counts.data() + nnz_blocks, 0, sizeof(int), rt.stream.get()); + size_t off_tmp_size = 0; + cub::DeviceScan::ExclusiveSum(nullptr, + off_tmp_size, + counts.data(), + payload_offsets.data(), + nnz_blocks + 1, + rt.stream.get()); + cub::DeviceScan::ExclusiveSum(make_cub_tmp(off_tmp_size), + off_tmp_size, + counts.data(), + payload_offsets.data(), + nnz_blocks + 1, + rt.stream.get()); + + // Fill cols and vals. + out_cols = cu::make_buffer(rt.stream, rt.mr, nnz_blocks, cu::no_init); + out_vals = cu::make_buffer(rt.stream, rt.mr, nnz_blocks * block_dim * block_dim, 0.0); + fill_bsr_cols_vals<<>>( + block_dim, + ctd::span(unique_keys.data(), nnz_blocks), + payload_offsets, + ctd::span(d_payloads.Current(), nnz_total), + d_vals, + *out_cols, + *out_vals); + + return nnz_blocks; + } + } // namespace + + BSRMatrix::BSRMatrix(const StiffnessMatrix &A, + int block_dim, + ctd::span permutation, + CudaRuntime rt) + { + static_assert(std::is_same_v, "MAS only support int32 index type."); + if (A.nonZeros() > std::numeric_limits::max()) + { + throw std::runtime_error("[CudaPcg] A is too large. Non-zero number exceeding int32 max."); + } + + // if A is not compressed. Copy and compress. + const StiffnessMatrix *A_csc = &A; + StiffnessMatrix compressed_A; + if (!A.isCompressed()) + { + compressed_A = A; + compressed_A.makeCompressed(); + A_csc = &compressed_A; + } + + block_dim_ = block_dim; + dim_ = div_round_up(A_csc->rows(), block_dim_); + + int rows_num = A_csc->rows(); + int cols_num = A_csc->cols(); + int nnz = A_csc->nonZeros(); + + // Upload input CSC to device. 
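+        // (Eigen's compressed CSC maps directly: outerIndexPtr() is the column
+        // pointer, innerIndexPtr() holds the row indices, valuePtr() the values.)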
+ auto d_col_ptr = cu::make_buffer(rt.stream, rt.mr, cols_num + 1, cu::no_init); + auto d_row_idx = cu::make_buffer(rt.stream, rt.mr, nnz, cu::no_init); + auto d_vals = cu::make_buffer(rt.stream, rt.mr, nnz, cu::no_init); + cudaMemcpyAsync( + d_col_ptr.data(), + A_csc->outerIndexPtr(), + static_cast(cols_num + 1) * sizeof(int), + cudaMemcpyHostToDevice, + rt.stream.get()); + cudaMemcpyAsync( + d_row_idx.data(), + A_csc->innerIndexPtr(), + static_cast(nnz) * sizeof(int), + cudaMemcpyHostToDevice, + rt.stream.get()); + cudaMemcpyAsync( + d_vals.data(), + A_csc->valuePtr(), + static_cast(nnz) * sizeof(double), + cudaMemcpyHostToDevice, + rt.stream.get()); + + // Upload permutation if non empty. + const int *d_perm_ptr = nullptr; + Buf d_perm; + if (!permutation.empty()) + { + d_perm = cu::make_buffer(rt.stream, rt.mr, permutation.size(), cu::no_init); + cu::copy_bytes(rt.stream, permutation, *d_perm); + d_perm_ptr = d_perm->data(); + } + + non_zeros_ = build_device_bsr_from_csc( + d_col_ptr, + d_row_idx, + d_vals, + rows_num, + cols_num, + block_dim_, + d_perm_ptr, + rows_, + cols_, + vals_, + rt); + + rt.stream.sync(); + } + +} // namespace polysolve::linear::mas diff --git a/src/polysolve/linear/mas_utils/BSRMatrix.hpp b/src/polysolve/linear/mas_utils/BSRMatrix.hpp new file mode 100644 index 00000000..ecf68f2a --- /dev/null +++ b/src/polysolve/linear/mas_utils/BSRMatrix.hpp @@ -0,0 +1,47 @@ +#pragma once + +#include +#include + +#include + +namespace polysolve::linear::mas +{ + struct BSRView + { + int dim; + int block_dim; + int non_zeros; + ctd::span rows; + ctd::span cols; + ctd::span vals; + }; + + /// @brief Block CSR matrix. + class BSRMatrix + { + public: + BSRMatrix() = default; + + /// @brief Build from host CSR matrix. Empty permutation implies identity. + /// Sync stream before return. 
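+        /// (All device work happens on rt.stream; the constructor syncs it, so
+        /// the buffers are ready to use on return.)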
+ BSRMatrix(const StiffnessMatrix &A, + int block_dim, + ctd::span permutation, + CudaRuntime rt); + + BSRView view() const + { + return BSRView{dim_, block_dim_, non_zeros_, *rows_, *cols_, *vals_}; + } + + private: + int dim_ = 0; + int block_dim_ = 0; + int non_zeros_ = 0; + Buf rows_; + Buf cols_; + Buf vals_; + }; + +} // namespace polysolve::linear::mas diff --git a/src/polysolve/linear/mas_utils/CuSparseWrapper.hpp b/src/polysolve/linear/mas_utils/CuSparseWrapper.hpp new file mode 100644 index 00000000..20c8065a --- /dev/null +++ b/src/polysolve/linear/mas_utils/CuSparseWrapper.hpp @@ -0,0 +1,182 @@ +#pragma once + +#include + +#include + +// -------------------------------------------------------------- +// RAII wrapper for varous cuSparse handle +// -------------------------------------------------------------- + +namespace polysolve::linear::mas +{ + class CuSparseHandle + { + public: + cusparseHandle_t raw = nullptr; + + CuSparseHandle() + { + cusparseCreate(&raw); + } + + ~CuSparseHandle() + { + if (raw != nullptr) + { + cusparseDestroy(raw); + } + } + + CuSparseHandle(const CuSparseHandle &) = delete; + CuSparseHandle(CuSparseHandle &&other) { swap(other); } + CuSparseHandle &operator=(const CuSparseHandle &) = delete; + CuSparseHandle &operator=(CuSparseHandle &&other) + { + swap(other); + return *this; + } + + private: + void swap(CuSparseHandle &other) + { + std::swap(raw, other.raw); + } + }; + + class CuSparseConstVec + { + public: + cusparseConstDnVecDescr_t raw = nullptr; + + CuSparseConstVec(ctd::span vec) + { + cusparseCreateConstDnVec(&raw, vec.size(), vec.data(), CUDA_R_64F); + } + + ~CuSparseConstVec() + { + if (raw != nullptr) + { + cusparseDestroyDnVec(raw); + } + } + + CuSparseConstVec(const CuSparseConstVec &) = delete; + CuSparseConstVec(CuSparseConstVec &&other) { swap(other); } + CuSparseConstVec &operator=(const CuSparseConstVec &) = delete; + CuSparseConstVec &operator=(CuSparseConstVec &&other) + { + swap(other); + return *this; + } + + private: + void swap(CuSparseConstVec &other) + { + std::swap(raw, other.raw); + } + }; + + /// @brief Minimal RAII wrapper for CuSparse dense vector. 
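+    /// Non-owning: the descriptor references the span's device memory, which
+    /// must outlive the wrapper.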
+ class CuSparseVec + { + public: + cusparseDnVecDescr_t raw = nullptr; + + CuSparseVec(ctd::span vec) + { + cusparseCreateDnVec(&raw, vec.size(), vec.data(), CUDA_R_64F); + } + + ~CuSparseVec() + { + if (raw != nullptr) + { + cusparseDestroyDnVec(raw); + } + } + + CuSparseVec(const CuSparseVec &) = delete; + CuSparseVec(CuSparseVec &&other) { swap(other); } + CuSparseVec &operator=(const CuSparseVec &) = delete; + CuSparseVec &operator=(CuSparseVec &&other) + { + swap(other); + return *this; + } + + private: + void swap(CuSparseVec &other) + { + std::swap(raw, other.raw); + } + }; + + class CuSparseBSR + { + public: + cusparseConstSpMatDescr_t raw = nullptr; + + CuSparseBSR() = default; + + CuSparseBSR(BSRView mat) + { + if (mat.block_dim == 1) + { + cusparseCreateConstCsr(&raw, + mat.dim, + mat.dim, + mat.non_zeros, + mat.rows.data(), + mat.cols.data(), + mat.vals.data(), + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F); + } + else + { + cusparseCreateConstBsr(&raw, + mat.dim, + mat.dim, + mat.non_zeros, + mat.block_dim, + mat.block_dim, + mat.rows.data(), + mat.cols.data(), + mat.vals.data(), + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F, + CUSPARSE_ORDER_ROW); + } + } + + ~CuSparseBSR() + { + if (raw != nullptr) + { + cusparseDestroySpMat(raw); + } + } + + CuSparseBSR(const CuSparseBSR &) = delete; + CuSparseBSR(CuSparseBSR &&other) { swap(other); } + CuSparseBSR &operator=(const CuSparseBSR &) = delete; + CuSparseBSR &operator=(CuSparseBSR &&other) + { + swap(other); + return *this; + } + + private: + void swap(CuSparseBSR &other) + { + std::swap(raw, other.raw); + } + }; + +} // namespace polysolve::linear::mas diff --git a/src/polysolve/linear/mas_utils/CudaUtils.cuh b/src/polysolve/linear/mas_utils/CudaUtils.cuh new file mode 100644 index 00000000..ddd72ba7 --- /dev/null +++ b/src/polysolve/linear/mas_utils/CudaUtils.cuh @@ -0,0 +1,52 @@ +#pragma once + +#include +#include +#include +#include + +#define __both__ __host__ __device__ + +// Convenient namespace alias for libcudacxx. +namespace cu = ::cuda; +namespace ctd = ::cuda::std; + +namespace polysolve::linear::mas +{ + /// @brief ceil(num/denom) + constexpr int div_round_up(int num, int denom) + { + return (num + denom - 1) / denom; + } + + struct CudaRuntime + { + cu::stream_ref stream; + cu::device_memory_pool_ref mr; + }; + + /// @brief Transfer device scalar src to host. + template + T device2host(const T *src, CudaRuntime rt) + { + T result; + cudaMemcpyAsync(&result, src, sizeof(T), cudaMemcpyDeviceToHost, + rt.stream.get()); + rt.stream.sync(); + return result; + } + + /// @brief Transfer host scalar val to device dst. Does not sync stream. + template + void host2device(T *dst, T val, CudaRuntime rt) + { + cudaMemcpyAsync(dst, &val, sizeof(T), cudaMemcpyHostToDevice, + rt.stream.get()); + } + + /// @brief Nullable device buffer. + /// It's very annoying device_buffer does not have default ctor for empty buffer. 
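+    /// (A disengaged Buf means "not yet allocated"; dereference with *buf only
+    /// after creating it via cu::make_buffer.)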
+ template <typename T> + using Buf = ctd::optional<cu::device_buffer<T>>; + +} // namespace polysolve::linear::mas diff --git a/src/polysolve/linear/mas_utils/GraphPartition.cpp b/src/polysolve/linear/mas_utils/GraphPartition.cpp new file mode 100644 index 00000000..2219883d --- /dev/null +++ b/src/polysolve/linear/mas_utils/GraphPartition.cpp @@ -0,0 +1,92 @@ +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +namespace polysolve::linear::mas +{ + void graph_partition(ctd::span<int> row_ptr, + ctd::span<int> cols, + ctd::span<int64_t> weights, + int max_part_size, + int &part_num, + std::vector<int> &part_id) + { + int node_num = row_ptr.size() - 1; + assert(node_num >= 0); + assert(max_part_size > 0); + assert(weights.empty() || weights.size() == cols.size()); + + part_id.resize(node_num, 0); + if (node_num <= max_part_size) + { + part_num = 1; + return; + } + + static_assert(sizeof(int) == sizeof(kaminpar_node_id_t), "int and KaMinPar node IDs differ in size"); + static_assert(sizeof(int) == sizeof(kaminpar_edge_id_t), "int and KaMinPar edge IDs differ in size"); + static_assert(sizeof(int64_t) == sizeof(kaminpar_edge_weight_t), + "int64 and KaMinPar edge weights differ in size (enable KAMINPAR_64BIT_WEIGHTS?)"); + + auto *xadj = reinterpret_cast<kaminpar_edge_id_t *>(row_ptr.data()); + auto *adjncy = reinterpret_cast<kaminpar_node_id_t *>(cols.data()); + auto *adjwgt = weights.empty() + ? nullptr + : reinterpret_cast<kaminpar_edge_weight_t *>(weights.data()); + + kaminpar_context_t *ctx = kaminpar_create_context_by_preset_name("default"); + assert(ctx); + + int thread_num = (std::thread::hardware_concurrency() != 0) + ? std::thread::hardware_concurrency() + : 1; + kaminpar_t *partitioner = kaminpar_create(thread_num, ctx); + assert(partitioner); + + kaminpar_set_output_level(partitioner, KAMINPAR_OUTPUT_LEVEL_QUIET); + kaminpar_borrow_and_mutate_graph( + partitioner, + node_num, + xadj, + adjncy, + nullptr, + adjwgt); + + // Implements Eq. 7 of arXiv:2411.06224. + + // Forcing a tight K on the k-way algorithm leads to a low-quality partition. + // It seems like max_part_size - 2 is a good balance. + part_num = div_round_up(node_num, max_part_size - 2); + assert(part_num >= 1); + std::vector<kaminpar_block_weight_t> max_block_weights( + part_num, + static_cast<kaminpar_block_weight_t>(max_part_size)); + std::vector<kaminpar_block_id_t> partition(node_num); + + // K-way partition with absolute max block weights. + // Since our node weights are uniform, this is equivalent to limiting partition size. + kaminpar_compute_partition_with_max_block_weights( + partitioner, + part_num, + max_block_weights.data(), + partition.data()); + + kaminpar_context_free(ctx); + kaminpar_free(partitioner); + + for (int i = 0; i < node_num; ++i) + { + part_id[i] = static_cast<int>(partition[i]); + } + } + +} // namespace polysolve::linear::mas diff --git a/src/polysolve/linear/mas_utils/GraphPartition.hpp b/src/polysolve/linear/mas_utils/GraphPartition.hpp new file mode 100644 index 00000000..3bd45dfd --- /dev/null +++ b/src/polysolve/linear/mas_utils/GraphPartition.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include +#include +#include + +#include + +namespace polysolve::linear::mas +{ + /// @brief K-way graph partition. + /// @param row_ptr[in,out] CSR graph topology. Will be modified! + /// @param cols[in,out] CSR graph topology. Will be modified! + /// @param weights[in,out] CSR graph edge weights. Empty span implies an unweighted graph. Will be modified! + /// @param max_part_size[in] Maximum partition size. + /// @param part_num[out] Total number of partitions. + /// @param part_id[out] Partition id of each graph node.
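+ /// + /// For intuition (hypothetical outcome; the exact assignment is up to KaMinPar): + /// a 5-node chain 0-1-2-3-4 with max_part_size = 3 could return part_num = 2 + /// with part_id = {0, 0, 0, 1, 1}.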
+ void graph_partition(ctd::span row_ptr, + ctd::span cols, + ctd::span weights, + int max_part_size, + int &part_num, + std::vector &part_id); +} // namespace polysolve::linear::mas diff --git a/src/polysolve/linear/mas_utils/InnerProduct.cu b/src/polysolve/linear/mas_utils/InnerProduct.cu new file mode 100644 index 00000000..cc93e23b --- /dev/null +++ b/src/polysolve/linear/mas_utils/InnerProduct.cu @@ -0,0 +1,46 @@ +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace polysolve::linear::mas +{ + + namespace + { + __global__ void inner_product_kernel( + ctd::span a, ctd::span b, double *out) + { + int tid = blockDim.x * blockIdx.x + threadIdx.x; + + double c = (tid < a.size()) ? a[tid] * b[tid] : 0.0; + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage tmp; + + double reduced = BlockReduce(tmp).Sum(c); + + if (threadIdx.x == 0) + { + cu::atomic_ref a_out{*out}; + a_out.fetch_add(reduced, ctd::memory_order_relaxed); + } + } + } // namespace + + void inner_product(ctd::span a, ctd::span b, + ctd::span out, CudaRuntime rt) + { + assert(a.size() == b.size()); + assert(out.size() == 1); + + cu::fill_bytes(rt.stream, out, 0); + int grid_num = div_round_up(a.size(), 128); + inner_product_kernel<<>>(a, b, out.data()); + } +} // namespace polysolve::linear::mas diff --git a/src/polysolve/linear/mas_utils/InnerProduct.hpp b/src/polysolve/linear/mas_utils/InnerProduct.hpp new file mode 100644 index 00000000..6fb584be --- /dev/null +++ b/src/polysolve/linear/mas_utils/InnerProduct.hpp @@ -0,0 +1,12 @@ +#pragma once + +#include +#include +#include + +namespace polysolve::linear::mas +{ + /// @brief Compute inner product a dot b. Does not sync implicitly. + void inner_product(ctd::span a, ctd::span b, + ctd::span out, CudaRuntime rt); +} // namespace polysolve::linear::mas diff --git a/src/polysolve/linear/mas_utils/MASPreconditioner.cu b/src/polysolve/linear/mas_utils/MASPreconditioner.cu new file mode 100644 index 00000000..a23435dc --- /dev/null +++ b/src/polysolve/linear/mas_utils/MASPreconditioner.cu @@ -0,0 +1,1117 @@ +#include + +// #ifndef SPDLOG_ACTIVE_LEVEL +// #define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_TRACE +// #endif + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace polysolve::linear::mas +{ + namespace + { + using clock = std::chrono::steady_clock; + + double elapsed_seconds(const std::chrono::time_point &begin) + { + return std::chrono::duration(clock::now() - begin).count(); + } + + // This is not tunable. + constexpr int BANK_SIZE = 32; + + /// @brief Convenience wrapper for CoarseMatrices. 
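+ /// + /// CoarseMatrices packs every level's matrices into one flat buffer: + /// | lv0_mat0 lv0_mat1 ... | lv1_mat0 ... | ... + /// matrix_per_level[lv] is the span over level lv's slice, so kernels address + /// one matrix as matrix_per_level[lv].data() + mat_id * mat_storage_size.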
+ struct CoarseMatricesRef + { + int mat_dim; + int mat_storage_size; + ctd::array, MAS_LEVEL_COUNT> matrix_per_level; + + CoarseMatricesRef(CoarseMatrices &mats) + { + mat_dim = mats.mat_dim; + mat_storage_size = mats.mat_storage_size; + + int scalar_offset = 0; + for (int i = 0; i < MAS_LEVEL_COUNT; ++i) + { + int level_size = mats.matrix_counts[i] * mats.mat_storage_size; + matrix_per_level[i] = + ctd::span(mats.data->data() + scalar_offset, level_size); + scalar_offset += level_size; + } + } + }; + + __global__ void build_padded_maps(ctd::span part_offsets, + ctd::span bsr_rows, + ctd::span real_to_padded, + ctd::span padded_to_real, + ctd::span real_num_per_row) + { + int padded_id = blockDim.x * blockIdx.x + threadIdx.x; + if (padded_id >= real_num_per_row.size()) + { + return; + } + + int part = padded_id / BANK_SIZE; + int local = padded_id % BANK_SIZE; + int part_begin = part_offsets[part]; + int part_end = part_offsets[part + 1]; + int part_size = part_end - part_begin; + + if (local < part_size) + { + int real_id = part_begin + local; + real_to_padded[real_id] = padded_id; + padded_to_real[padded_id] = real_id; + real_num_per_row[padded_id] = bsr_rows[real_id + 1] - bsr_rows[real_id]; + } + else + { + padded_to_real[padded_id] = -1; + real_num_per_row[padded_id] = 0; + } + } + + __global__ void fill_padded_cols(ctd::span bsr_rows, + ctd::span bsr_cols, + ctd::span real_to_padded, + ctd::span padded_rows, + ctd::span padded_cols) + { + int real_id = blockDim.x * blockIdx.x + threadIdx.x; + int node_num = bsr_rows.size() - 1; + if (real_id >= node_num) + { + return; + } + + int padded_id = real_to_padded[real_id]; + int dst = padded_rows[padded_id]; + for (int n = bsr_rows[real_id]; n < bsr_rows[real_id + 1]; ++n) + { + padded_cols[dst] = real_to_padded[bsr_cols[n]]; + ++dst; + } + } + + /// @brief Get coarse space CCO id. + /// @param map Coarse space map. + /// @param vid Vertex id. + /// @param level Coarse space level. Start at lv 0. + /// @return CCO id. + __both__ int get_coarse_space_id(ctd::span map, int vid, int level) + { + return map[vid * MAS_MAX_COARSE_LEVEL + level]; + } + + /// @brief Build local CCO mapping from input space -> coarse space lv 0 and collapse topology. + /// @param row_ptr CSR graph topology (may include self). + /// @param cols CSR graph topology (may include self). + /// @param padded_to_real Padded id -> real id. -1 denotes padding. + /// @param cco_num_per_bank CCO number per bank. + /// @param local_cco_ids Bank local CCO id at coarse space lv 0. + __global__ void build_local_cco_lv0(ctd::span row_ptr, + ctd::span cols, + ctd::span padded_to_real, + ctd::span cco_num_per_bank, + ctd::span local_cco_ids) + { + // Bank local neighbor masks. + __shared__ uint32_t neighbors[128]; + + int btid = threadIdx.x; // block local thread id + int tid = blockDim.x * blockIdx.x + btid; // global thread id + int wid = threadIdx.x / BANK_SIZE; // block local warp id + int lid = threadIdx.x % BANK_SIZE; // lane id + int bid = tid / BANK_SIZE; // bank id + + if (lid == 0 && bid < cco_num_per_bank.size()) + { + cco_num_per_bank[bid] = 0; + } + + int node_num = row_ptr.size() - 1; + if (tid >= node_num) + { + return; + } + // Skip virtual padding. + if (padded_to_real[tid] == -1) + { + local_cco_ids[tid] = -1; + return; + } + + // Build neighbor mask (including self) and collapse topology. 
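+ // Each lane records which lanes of its own bank it touches as a 32-bit mask; + // e.g. if lane 3 has in-bank neighbors 1 and 7, its mask is + // (1u << 1) | (1u << 3) | (1u << 7), self bit included.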
+ int row_begin = row_ptr[tid]; + int row_end = row_ptr[tid + 1]; + uint32_t neighbor = (1u << lid); + int out_of_bank_neighbor_count = 0; + for (int n = row_begin; n < row_end; ++n) + { + int neighbor_vid = cols[n]; + int neighbor_bid = neighbor_vid / BANK_SIZE; + if (bid == neighbor_bid) + { + neighbor |= (1u << (neighbor_vid % BANK_SIZE)); + } + else + { + // Compact topology to remove in-bank neighbors so we can skip + // additional filtering when building future coarse spaces. + cols[row_begin + out_of_bank_neighbor_count] = neighbor_vid; + ++out_of_bank_neighbor_count; + } + } + if (row_begin + out_of_bank_neighbor_count < row_end) + { + // -1 denotes neighbor list ending. + cols[row_begin + out_of_bank_neighbor_count] = -1; + } + neighbors[btid] = neighbor; + __syncwarp(); + + // Build connectivity mask (including self). + uint32_t connection = neighbor; + uint32_t visited = (1u << lid); + uint32_t to_visit = connection ^ visited; + while (to_visit) + { + int visiting = ctd::countr_zero(to_visit); + connection |= neighbors[visiting + wid * BANK_SIZE]; + visited |= (1u << visiting); + to_visit = connection ^ visited; + } + + // Find bank (warp) local connected component (CCO). + uint32_t cco_lead_lid = ctd::countr_zero(connection); + bool is_lead = (cco_lead_lid == lid); + uint32_t lead_lanes = __ballot_sync(0xFFFFFFFFu, is_lead); + uint32_t before_cco_lead_mask = static_cast((1ull << cco_lead_lid) - 1ull); + + // Write cco info back. + local_cco_ids[tid] = ctd::popcount(lead_lanes & before_cco_lead_mask); + if (lid == 0) + { + cco_num_per_bank[bid] = ctd::popcount(lead_lanes); + } + } + + /// @brief Build bank neighbor mask and collapse topology. + /// @param level Target coarse space level. + /// @param coarse_space_map Coarse space map. + /// @param row_ptr CSR graph topology (may include self). + /// @param cols CSR graph topology (may include self). + /// @param padded_to_real Padded id -> real id. -1 denotes padding. + /// @param neighbors Bank local neighbor mask. + __global__ void build_neighbor_masks_lvx(int level, + ctd::span coarse_space_map, + ctd::span row_ptr, + ctd::span cols, + ctd::span padded_to_real, + ctd::span neighbors) + { + int tid = blockDim.x * blockIdx.x + threadIdx.x; // global thread id + int node_num = row_ptr.size() - 1; + if (tid >= node_num) + { + return; + } + // Skip virtual padding. + if (padded_to_real[tid] == -1) + { + return; + } + + // Build neighbor mask (including self) and collapse topology. + int row_begin = row_ptr[tid]; + int row_end = row_ptr[tid + 1]; + int real_id = padded_to_real[tid]; + int cco_id = get_coarse_space_id(coarse_space_map, real_id, level - 1); + uint32_t neighbor = (1u << (cco_id % BANK_SIZE)); + int out_of_bank_neighbor_count = 0; + for (int n = row_begin; n < row_end; ++n) + { + int neighbor_vid = cols[n]; + if (neighbor_vid == -1) + { + break; + } + + int neighbor_real_id = padded_to_real[neighbor_vid]; + int neighbor_cco_id = + get_coarse_space_id(coarse_space_map, neighbor_real_id, level - 1); + if (cco_id / BANK_SIZE == neighbor_cco_id / BANK_SIZE) + { + neighbor |= (1u << (neighbor_cco_id % BANK_SIZE)); + } + else + { + cols[row_begin + out_of_bank_neighbor_count] = neighbor_vid; + ++out_of_bank_neighbor_count; + } + } + if (row_begin + out_of_bank_neighbor_count < row_end) + { + cols[row_begin + out_of_bank_neighbor_count] = -1; + } + + atomicOr(neighbors.data() + cco_id, neighbor); + } + + /// @brief Build local CCO mapping from coarse space level x-1 -> coarse space level x. 
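+ /// + /// Each lane ORs in the masks of newly reached lanes until its visited set + /// stops growing. E.g. with neighbor masks {0b011, 0b011, 0b100}, lanes 0 and 1 + /// close into one component led by lane 0 (local CCO 0) and lane 2 leads its + /// own (local CCO 1).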
+ /// @param cco_num Total CCO number at coarse space lv x-1. + /// @param cco_num_per_bank Independent CCO number per bank at coarse space lv x. + /// @param local_cco_ids Bank local CCO id at coarse space lv x. + __global__ void build_local_cco_lvx(int cco_num, + ctd::span neighbors, + ctd::span cco_num_per_bank, + ctd::span local_cco_ids) + { + int tid = blockDim.x * blockIdx.x + threadIdx.x; // global thread id + int bid = tid / BANK_SIZE; // bank id + int lid = tid % BANK_SIZE; // lane id + if (tid >= cco_num) + { + return; + } + + // Build connectivity mask (including self). + uint32_t connection = neighbors[tid]; + uint32_t visited = (1u << lid); + uint32_t to_visit = connection ^ visited; + while (to_visit) + { + int visiting = ctd::countr_zero(to_visit); + connection |= neighbors[visiting + bid * BANK_SIZE]; + visited |= (1u << visiting); + to_visit = connection ^ visited; + } + + // Find bank (warp) local connected component (CCO). + uint32_t cco_lead_lid = ctd::countr_zero(connection); + bool is_lead = (cco_lead_lid == lid); + uint32_t lead_lanes = __ballot_sync(0xFFFFFFFFu, is_lead); + uint32_t before_cco_lead_mask = static_cast((1ull << cco_lead_lid) - 1ull); + + // Write cco info back. + local_cco_ids[tid] = ctd::popcount(lead_lanes & before_cco_lead_mask); + if (lid == 0) + { + cco_num_per_bank[bid] = ctd::popcount(lead_lanes); + } + } + + /// @brief Compute global CCO id and write it to coarse space map. + /// + /// Compute global CCO id offset from prefix sum, then write result to coarse_space_map. + __global__ void aggregate_coarse_space_map_lv0(ctd::span local_cco_ids, + ctd::span cco_num_per_bank, + ctd::span cco_num_per_bank_summed, + ctd::span real_to_padded, + ctd::span coarse_space_map) + { + int real_id = blockDim.x * blockIdx.x + threadIdx.x; + if (real_id >= real_to_padded.size()) + { + return; + } + + int padded_id = real_to_padded[real_id]; + int bid = padded_id / BANK_SIZE; + int cco_id = + local_cco_ids[padded_id] + cco_num_per_bank_summed[bid] - cco_num_per_bank[bid]; + coarse_space_map[real_id * MAS_MAX_COARSE_LEVEL] = cco_id; + } + + /// @brief Compute global CCO id and write it to coarse space map. + /// + /// Compute global CCO id offset from prefix sum, then write result to coarse_space_map. + __global__ void aggregate_coarse_space_map_lvx(int level, + ctd::span local_cco_ids, + ctd::span cco_num_per_bank, + ctd::span cco_num_per_bank_summed, + ctd::span coarse_space_map) + { + int real_id = blockDim.x * blockIdx.x + threadIdx.x; + int node_num = coarse_space_map.size() / MAS_MAX_COARSE_LEVEL; + if (real_id >= node_num) + { + return; + } + + int prev_cco_id = get_coarse_space_id(coarse_space_map, real_id, level - 1); + int bid = prev_cco_id / BANK_SIZE; + int cco_id = + local_cco_ids[prev_cco_id] + cco_num_per_bank_summed[bid] - cco_num_per_bank[bid]; + coarse_space_map[real_id * MAS_MAX_COARSE_LEVEL + level] = cco_id; + } + + /// @brief Build coarse space map. + /// + /// Coarse space map real node id layout: + /// | node0_lv0 node0_lv1 ... node0_lv_max | node1_lv0 ... node1_lv_max | ... | + /// + /// @param row_ptr CSR graph topology (may include self). + /// @param cols CSR graph topology (may include self). + /// @param real_to_padded Real id -> padded id. + /// @param padded_to_real Padded id -> real id. -1 denotes virtual padding. + /// @rt Cuda runtime config. + /// @warning Will modify cols as side effect! 
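+ /// + /// Per level the pipeline is: (1) build bank-local neighbor masks, (2) close + /// them into per-bank connected components, (3) prefix-sum the per-bank CCO + /// counts to assign global CCO ids, stopping early once a level no longer + /// merges anything.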
+ CoarseSpace build_coarse_space(ctd::span<const int> row_ptr, + ctd::span<int> cols, + ctd::span<const int> real_to_padded, + ctd::span<const int> padded_to_real, + CudaRuntime rt) + { + ctd::array<int, MAS_MAX_COARSE_LEVEL> cco_nums{}; + + int padded_node_num = row_ptr.size() - 1; + int real_node_num = real_to_padded.size(); + int max_bank_per_level = div_round_up(padded_node_num, BANK_SIZE); + // Bank local CCO id. + auto local_cco_ids = + cu::make_buffer<int>(rt.stream, rt.mr, padded_node_num, cu::no_init); + auto cco_num_per_bank = + cu::make_buffer<int>(rt.stream, rt.mr, max_bank_per_level, cu::no_init); + // Inclusive prefix sum of cco_num_per_bank. + auto cco_num_per_bank_summed = + cu::make_buffer<int>(rt.stream, rt.mr, max_bank_per_level, cu::no_init); + auto coarse_space_map = + cu::make_buffer<int>(rt.stream, rt.mr, real_node_num * MAS_MAX_COARSE_LEVEL, cu::no_init); + + // Build coarse map level 0. + int grid_num = div_round_up(padded_node_num, 128); + build_local_cco_lv0<<<grid_num, 128, 0, rt.stream.get()>>>( + row_ptr, cols, padded_to_real, cco_num_per_bank, local_cco_ids); + + // Prefix sum to compute global CCO id offset. + size_t cub_tmp_size = 0; + cub::DeviceScan::InclusiveSum(nullptr, + cub_tmp_size, + cco_num_per_bank.data(), + cco_num_per_bank_summed.data(), + max_bank_per_level, + rt.stream.get()); + auto cub_tmp = + cu::make_buffer<char>(rt.stream, rt.mr, cub_tmp_size, cu::no_init); + cub::DeviceScan::InclusiveSum(cub_tmp.data(), + cub_tmp_size, + cco_num_per_bank.data(), + cco_num_per_bank_summed.data(), + max_bank_per_level, + rt.stream.get()); + + aggregate_coarse_space_map_lv0<<<grid_num, 128, 0, rt.stream.get()>>>( + local_cco_ids, cco_num_per_bank, cco_num_per_bank_summed, real_to_padded, coarse_space_map); + + int cco_num = device2host(cco_num_per_bank_summed.data() + max_bank_per_level - 1, rt); + cco_nums[0] = cco_num; + int level_num = 1; + + // Build coarse map level 1 ... MAS_MAX_COARSE_LEVEL-1 iteratively. + // We do: + // 1. Build bank local neighbor bit mask. + // 2. Build bank local connectivity mask and find local CCO partition. + // 3. Compute global CCO id and write to coarse space map. + auto neighbors = cu::make_buffer<uint32_t>(rt.stream, rt.mr, padded_node_num, cu::no_init); + for (int lv = 1; lv < MAS_MAX_COARSE_LEVEL; ++lv) + { + cudaMemsetAsync(neighbors.data(), 0, cco_num * sizeof(uint32_t), rt.stream.get()); + + grid_num = div_round_up(padded_node_num, 128); + build_neighbor_masks_lvx<<<grid_num, 128, 0, rt.stream.get()>>>( + lv, + coarse_space_map, + row_ptr, + cols, + padded_to_real, + ctd::span(neighbors.data(), cco_num)); + + int bank_num = div_round_up(cco_num, BANK_SIZE); + grid_num = div_round_up(cco_num, 128); + build_local_cco_lvx<<<grid_num, 128, 0, rt.stream.get()>>>( + cco_num, + ctd::span(neighbors.data(), cco_num), + ctd::span(cco_num_per_bank.data(), bank_num), + ctd::span(local_cco_ids.data(), cco_num)); + + cub::DeviceScan::InclusiveSum(cub_tmp.data(), + cub_tmp_size, + cco_num_per_bank.data(), + cco_num_per_bank_summed.data(), + bank_num, + rt.stream.get()); + aggregate_coarse_space_map_lvx<<<grid_num, 128, 0, rt.stream.get()>>>( + lv, + ctd::span(local_cco_ids.data(), cco_num), + ctd::span(cco_num_per_bank.data(), bank_num), + ctd::span(cco_num_per_bank_summed.data(), bank_num), + coarse_space_map); + + // No more CCOs to merge. + int next_cco_num = device2host(cco_num_per_bank_summed.data() + bank_num - 1, rt); + if (next_cco_num == cco_num) + { + break; + } + + cco_num = next_cco_num; + cco_nums[lv] = cco_num; + level_num = lv + 1; + } + + CoarseSpace cs; + cs.cco_nums = cco_nums; + cs.level_num = level_num; + cs.map = std::move(coarse_space_map); + return cs; + } + + /// @brief Compute offset into a row-major packed upper-triangular dim x dim matrix.
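+ /// + /// Row i starts at offset i * dim - i * (i + 1) / 2, so for dim = 3 the packed + /// layout is [(0,0) (0,1) (0,2) (1,1) (1,2) (2,2)] and + /// index_upper_mat(3, 2, 1) == index_upper_mat(3, 1, 2) == 4.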
+ __both__ int index_upper_mat(int dim, int i, int j) + { + if (i > j) + { + ctd::swap(i, j); + } + return i * dim - (i * (i + 1) / 2) + j; + } + + /// @brief Build coarse space matrices by accumulating padded space hessian. + __global__ void fill_coarse_matrices(BSRView mat_in, + CoarseMatricesRef mat_out, + ctd::span coarse_space_map, + ctd::span real_to_padded, + int coarse_level_num) + { + int tid = blockDim.x * blockIdx.x + threadIdx.x; // global thread id + if (tid >= mat_in.dim) + { + return; + } + + int padded_i = real_to_padded[tid]; + int block_size = mat_in.block_dim * mat_in.block_dim; + for (int nz = mat_in.rows[tid]; nz < mat_in.rows[tid + 1]; ++nz) + { + int j = mat_in.cols[nz]; + // Assume input mat is symmetric (Which is required by CG anyway). Skip lower. + if (tid > j) + { + continue; + } + + int padded_j = real_to_padded[j]; + int block_offset = nz * block_size; + // Padded space matrices. + if (padded_i / BANK_SIZE == padded_j / BANK_SIZE) + { + int mat_id = padded_i / BANK_SIZE; + int scalar_i_root = (padded_i % BANK_SIZE) * mat_in.block_dim; + int scalar_j_root = (padded_j % BANK_SIZE) * mat_in.block_dim; + double *mat = mat_out.matrix_per_level[0].data() + mat_id * mat_out.mat_storage_size; + for (int bi = 0; bi < mat_in.block_dim; ++bi) + { + for (int bj = 0; bj < mat_in.block_dim; ++bj) + { + int row = scalar_i_root + bi; + int col = scalar_j_root + bj; + // Write upper part of coarse matrix only. + if (row > col) + { + continue; + } + // For diagonal block (tid == j), write upper part only. + if (tid == j && bi > bj) + { + continue; + } + + double val = mat_in.vals[block_offset + bi * mat_in.block_dim + bj]; + atomicAdd(mat + index_upper_mat(mat_out.mat_dim, row, col), val); + } + } + } + // Coarse space matrices. + for (int lv = 0; lv < coarse_level_num; ++lv) + { + int cco_i = coarse_space_map[tid * MAS_MAX_COARSE_LEVEL + lv]; + int cco_j = coarse_space_map[j * MAS_MAX_COARSE_LEVEL + lv]; + if (cco_i / BANK_SIZE != cco_j / BANK_SIZE) + { + continue; + } + + int mat_id = cco_i / BANK_SIZE; + int scalar_i_root = (cco_i % BANK_SIZE) * mat_in.block_dim; + int scalar_j_root = (cco_j % BANK_SIZE) * mat_in.block_dim; + double *mat = mat_out.matrix_per_level[lv + 1].data() + mat_id * mat_out.mat_storage_size; + for (int bi = 0; bi < mat_in.block_dim; ++bi) + { + for (int bj = 0; bj < mat_in.block_dim; ++bj) + { + int row = scalar_i_root + bi; + int col = scalar_j_root + bj; + // Write upper part of coarse matrix only. + if (row > col) + { + continue; + } + // For diagonal block (tid == j), write upper part only. + if (tid == j && bi > bj) + { + continue; + } + + double val = mat_in.vals[block_offset + bi * mat_in.block_dim + bj]; + // When two different fine nodes collapse to the same coarse node, + // the coarse diagonal block must receive both A_ij and A_ji = A_ij^T. + if (tid != j && cco_i == cco_j) + { + val += mat_in.vals[block_offset + bj * mat_in.block_dim + bi]; + } + atomicAdd(mat + index_upper_mat(mat_out.mat_dim, row, col), val); + } + } + } + } + } + + __global__ void gather_multi_level_r( + ctd::span r, + ctd::span multi_level_r, + ctd::span real_to_padded, + ctd::span coarse_space_map, + ctd::array level_offsets, + int block_dim, + int coarse_level_num) + { + int real_id = blockDim.x * blockIdx.x + threadIdx.x; + if (real_id >= real_to_padded.size()) + { + return; + } + + int padded_id = real_to_padded[real_id]; + int src_root = real_id * block_dim; + int fine_root = padded_id * block_dim; + // Gather padded space. 
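+ // The padded level is a pure scatter (one fine node per padded slot); the + // coarse levels below accumulate instead, since several fine nodes map to one + // CCO: r_coarse[c] = sum of r over all fine nodes in c, hence the atomicAdd.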
+ for (int i = 0; i < block_dim; ++i) + { + multi_level_r[fine_root + i] = r[src_root + i]; + } + // Gather coarse space. + for (int lv = 0; lv < coarse_level_num; ++lv) + { + int cco_id = coarse_space_map[real_id * MAS_MAX_COARSE_LEVEL + lv]; + int dst_root = (level_offsets[lv + 1] + cco_id) * block_dim; + for (int i = 0; i < block_dim; ++i) + { + atomicAdd(multi_level_r.data() + dst_root + i, r[src_root + i]); + } + } + } + + /// @brief Matrix vector multiplication. A is a packed upper row-major matrix. + /// + /// This is the main bottleneck of the MAS preconditioner, but since we are VRAM + /// bandwidth bound there is little room to optimize: the compute-to-memory ratio + /// is simply too low. + template <int N, int BLOCK> + __global__ void symv_upper_packed(const double *A_upper, + const double *x, + double *y) + { + int mat_id = blockIdx.x; + int row = threadIdx.x; + constexpr int L = N * (N + 1) / 2; + const double *Amat = A_upper + mat_id * L; + + // All threads in the block load A and x into shared mem. + + __shared__ double sx[N]; + __shared__ double sA[L]; + + for (int i = threadIdx.x; i < N; i += BLOCK) + { + sx[i] = x[mat_id * N + i]; + } + for (int k = threadIdx.x; k < L; k += BLOCK) + { + sA[k] = Amat[k]; + } + + __syncthreads(); + + // Each thread does one row * x. + + if (row >= N) + { + return; + } + + double sum = 0.0; + for (int col = 0; col < N; ++col) + { + sum += sA[index_upper_mat(N, row, col)] * sx[col]; + } + + y[mat_id * N + row] = sum; + } + + void apply_inverse(const CoarseMatrices &mats, + ctd::span<const double> x, + ctd::span<double> y, + int block_dim, + CudaRuntime rt) + { + int mat_num = mats.total_matrix_num; + if (mat_num == 0) + { + return; + } + + if (block_dim == 1) + { + symv_upper_packed<32, 32><<<mat_num, 32, 0, rt.stream.get()>>>( + mats.data->data(), + x.data(), + y.data()); + return; + } + if (block_dim == 2) + { + symv_upper_packed<64, 64><<<mat_num, 64, 0, rt.stream.get()>>>( + mats.data->data(), + x.data(), + y.data()); + return; + } + if (block_dim == 3) + { + symv_upper_packed<96, 96><<<mat_num, 96, 0, rt.stream.get()>>>( + mats.data->data(), + x.data(), + y.data()); + return; + } + + throw std::runtime_error("[CudaPCG] MAS only supports block size 1, 2, or 3."); + } + + __global__ void gather_multi_level_z( + ctd::span<const double> multi_level_z, + ctd::span<double> z, + ctd::span<const int> real_to_padded, + ctd::span<const int> coarse_space_map, + ctd::array<int, MAS_LEVEL_COUNT> level_offsets, + int block_dim, + int coarse_level_num) + { + int real_id = blockDim.x * blockIdx.x + threadIdx.x; + if (real_id >= real_to_padded.size()) + { + return; + } + + // Padded space. + int padded_id = real_to_padded[real_id]; + int dst_root = real_id * block_dim; + for (int i = 0; i < block_dim; ++i) + { + z[dst_root + i] = multi_level_z[padded_id * block_dim + i]; + } + // Coarse space. + for (int lv = 0; lv < coarse_level_num; ++lv) + { + int cco_id = coarse_space_map[real_id * MAS_MAX_COARSE_LEVEL + lv]; + int src_root = (level_offsets[lv + 1] + cco_id) * block_dim; + for (int i = 0; i < block_dim; ++i) + { + z[dst_root + i] += multi_level_z[src_root + i]; + } + } + } + + __global__ void pad_zero_diagonal(double *mats, int mat_num, int mat_dim, int mat_storage_size) + { + int tid = blockDim.x * blockIdx.x + threadIdx.x; + int total_diag = mat_num * mat_dim; + if (tid >= total_diag) + { + return; + } + + int mat_id = tid / mat_dim; + int row = tid % mat_dim; + double *mat = mats + mat_id * mat_storage_size; + double *diag = mat + index_upper_mat(mat_dim, row, row); + if (*diag == 0.0) + { + *diag = 1.0; + } + } + + /// @brief Invert a packed symmetric matrix in-place using symmetric Gauss-Jordan sweeps.
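+ /// + /// Roughly, one sweep per pivot p: a_ij <- a_ij - a_ip * a_pj / a_pp for + /// i, j != p, the pivot column is rescaled by 1/a_pp, the pivot itself becomes + /// -1/a_pp, and a final global negation yields the inverse; only the packed + /// upper triangle is stored, symmetry supplies the rest. Any zero or + /// non-finite pivot flags *success = false.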
+ template + __global__ void batched_invert_upper(double *d_matrices, + bool *success) + { + int mat_idx = blockIdx.x; + constexpr int STORAGE = N * (N + 1) / 2; + double *d_A = d_matrices + mat_idx * STORAGE; + + __shared__ double s_A[STORAGE]; + __shared__ double s_col[N]; + __shared__ double s_pivot; + int tx = threadIdx.x; + + for (int i = tx; i < STORAGE; i += N) + { + s_A[i] = d_A[i]; + } + __syncthreads(); + + for (int pivot = 0; pivot < N; ++pivot) + { + if (tx == 0) + { + s_pivot = s_A[index_upper_mat(N, pivot, pivot)]; + if (!ctd::isfinite(s_pivot) || s_pivot == 0.0) + { + *success = false; + } + } + __syncthreads(); + + if (tx < N) + { + if (tx == pivot) + { + s_col[tx] = 0.0; + } + else + { + int row = ctd::min(tx, pivot); + int col = ctd::max(tx, pivot); + s_col[tx] = s_A[index_upper_mat(N, row, col)]; + } + } + __syncthreads(); + + if (tx < N && tx != pivot) + { + double a_ik = s_col[tx]; + for (int col = tx; col < N; ++col) + { + if (col == pivot) + { + continue; + } + + double updated = + s_A[index_upper_mat(N, tx, col)] - a_ik * s_col[col] / s_pivot; + if (!ctd::isfinite(updated)) + { + *success = false; + } + s_A[index_upper_mat(N, tx, col)] = updated; + } + } + __syncthreads(); + + if (tx == pivot) + { + double updated = -1.0 / s_pivot; + if (!ctd::isfinite(updated)) + { + *success = false; + } + else + { + s_A[index_upper_mat(N, pivot, pivot)] = updated; + } + } + else if (tx < N) + { + double updated = s_col[tx] / s_pivot; + if (!ctd::isfinite(updated)) + { + *success = false; + } + else + { + int row = ctd::min(tx, pivot); + int col = ctd::max(tx, pivot); + s_A[index_upper_mat(N, row, col)] = updated; + } + } + __syncthreads(); + } + + for (int i = tx; i < STORAGE; i += N) + { + double output = -s_A[i]; + if (!ctd::isfinite(output)) + { + *success = false; + } + d_A[i] = output; + } + } + + void invert_packed_matrices(double *mats, int mat_num, int block_dim, CudaRuntime rt) + { + if (mat_num == 0) + { + return; + } + + auto success = cu::make_buffer(rt.stream, rt.mr, 1, true); + + if (block_dim == 1) + { + batched_invert_upper<32><<>>( + mats, success.data()); + } + else if (block_dim == 2) + { + batched_invert_upper<64><<>>( + mats, success.data()); + } + else if (block_dim == 3) + { + batched_invert_upper<96><<>>( + mats, success.data()); + } + else + { + assert(false); + } + + bool host_success = device2host(success.data(), rt); + if (!host_success) + { + throw std::runtime_error("[CudaPCG] MAS packed inverse failed."); + } + } + + /// @brief Build symmetric upper triangular coarse space matrices. + CoarseMatrices build_sym_coarse_matrices(const CoarseSpace &cs, + BSRView mat, + ctd::span real_to_padded, + int padded_node_num, + CudaRuntime rt) + { + CoarseMatrices out; + out.mat_dim = BANK_SIZE * mat.block_dim; + out.mat_storage_size = out.mat_dim * (out.mat_dim + 1) / 2; + out.total_matrix_num = 0; + out.matrix_offsets[0] = 0; + out.matrix_counts[0] = padded_node_num / BANK_SIZE; + out.total_matrix_num += out.matrix_counts[0]; + for (int i = 0; i < cs.level_num; ++i) + { + out.matrix_offsets[i + 1] = out.total_matrix_num; + out.matrix_counts[i + 1] = div_round_up(cs.cco_nums[i], BANK_SIZE); + out.total_matrix_num += out.matrix_counts[i + 1]; + } + + out.data = cu::make_buffer( + rt.stream, + rt.mr, + out.total_matrix_num * out.mat_storage_size, + 0.0); + CoarseMatricesRef view{out}; + + // Gather. + int grid_num = div_round_up(mat.dim, 128); + fill_coarse_matrices<<>>( + mat, view, *cs.map, real_to_padded, cs.level_num); + + // Pad diagonal. 
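+ // Virtual padding slots produce all-zero rows/columns in the packed matrices; + // a unit diagonal keeps the Gauss-Jordan inverse well defined, and since the + // padded residual entries are zero those identity rows contribute nothing.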
+ int total_diag = out.total_matrix_num * out.mat_dim; + pad_zero_diagonal<<>>( + out.data->data(), out.total_matrix_num, out.mat_dim, out.mat_storage_size); + + // Compute inverse. + invert_packed_matrices(out.data->data(), out.total_matrix_num, mat.block_dim, rt); + return out; + } + } // namespace + + void MASPreconditioner::factorize(const BSRMatrix &A, + ctd::span part_offsets, + CudaRuntime rt) + { + assert(part_offsets.size() >= 2); + + BSRView view = A.view(); + assert(view.block_dim >= 1 && view.block_dim <= 3); + assert(part_offsets.back() == view.dim); + for (int i = 0; i + 1 < static_cast(part_offsets.size()); ++i) + { + assert(part_offsets[i + 1] - part_offsets[i] <= BANK_SIZE); + } + + initialized_ = false; + block_dim_ = view.block_dim; + vector_size_ = view.dim * view.block_dim; + + auto total_begin = clock::now(); + auto phase_begin = clock::now(); + + // Build padded topology. + int node_num = view.dim; + int part_num = part_offsets.size() - 1; + int padded_node_num = part_num * BANK_SIZE; + + auto d_part_offsets = cu::make_buffer(rt.stream, rt.mr, part_offsets.size(), cu::no_init); + cu::copy_bytes(rt.stream, part_offsets, d_part_offsets); + + padded_topology_.node_num = node_num; + padded_topology_.padded_node_num = padded_node_num; + padded_topology_.real_to_padded = cu::make_buffer(rt.stream, rt.mr, node_num, -1); + padded_topology_.padded_to_real = cu::make_buffer(rt.stream, rt.mr, padded_node_num, -1); + padded_topology_.rows = cu::make_buffer(rt.stream, rt.mr, padded_node_num + 1, cu::no_init); + padded_topology_.cols = cu::make_buffer(rt.stream, rt.mr, view.non_zeros, cu::no_init); + + auto read_num_per_row = cu::make_buffer(rt.stream, rt.mr, padded_node_num, 0); + build_padded_maps<<>>( + d_part_offsets, + view.rows, + *(padded_topology_.real_to_padded), + *(padded_topology_.padded_to_real), + read_num_per_row); + + // Build padded space row_ptr. + size_t cub_tmp_size = 0; + cub::DeviceScan::ExclusiveSum(nullptr, + cub_tmp_size, + read_num_per_row.data(), + padded_topology_.rows->data(), + padded_node_num, + rt.stream.get()); + auto cub_tmp = cu::make_buffer(rt.stream, rt.mr, cub_tmp_size, cu::no_init); + cub::DeviceScan::ExclusiveSum(cub_tmp.data(), + cub_tmp_size, + read_num_per_row.data(), + padded_topology_.rows->data(), + padded_node_num, + rt.stream.get()); + host2device(padded_topology_.rows->data() + padded_node_num, view.non_zeros, rt); + + fill_padded_cols<<>>( + view.rows, + view.cols, + *(padded_topology_.real_to_padded), + *(padded_topology_.rows), + *(padded_topology_.cols)); + + rt.stream.sync(); + SPDLOG_TRACE("CUDA_PCG MAS: build_padded_topology_device {:.6f}s", elapsed_seconds(phase_begin)); + + // Build coarse space hierarchy. + phase_begin = clock::now(); + coarse_space_ = build_coarse_space( + *(padded_topology_.rows), + *(padded_topology_.cols), + *(padded_topology_.real_to_padded), + *(padded_topology_.padded_to_real), + rt); + rt.stream.sync(); + SPDLOG_TRACE("CUDA_PCG MAS: build_coarse_space {:.6f}s", elapsed_seconds(phase_begin)); + + // Build coarse space matrices by 1. gather coarse space hessian from fine nodes 2. invert + phase_begin = clock::now(); + coarse_matrices_ = build_sym_coarse_matrices( + coarse_space_, + view, + *(padded_topology_.real_to_padded), + padded_topology_.padded_node_num, + rt); + rt.stream.sync(); + SPDLOG_TRACE("CUDA_PCG MAS: build_coarse_matrices {:.6f}s", elapsed_seconds(phase_begin)); + + // Allocate memory for coarse space residual (r) and preconditioned residual (z). 
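+ // Layout sketch: multi_level_r / multi_level_z hold all levels back to back, + // | padded lv0 | coarse lv1 | coarse lv2 | ... |, with level_offsets[i] the + // node offset of level i and total_level_nodes * block_dim scalars in total.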
+ phase_begin = clock::now(); + coarse_vectors_.level_offsets[0] = 0; + coarse_vectors_.level_sizes[0] = padded_topology_.padded_node_num; + coarse_vectors_.total_level_nodes = padded_topology_.padded_node_num; + for (int i = 0; i < coarse_space_.level_num; ++i) + { + coarse_vectors_.level_offsets[i + 1] = coarse_vectors_.total_level_nodes; + coarse_vectors_.level_sizes[i + 1] = coarse_matrices_.matrix_counts[i + 1] * BANK_SIZE; + coarse_vectors_.total_level_nodes += coarse_vectors_.level_sizes[i + 1]; + } + int total_level_scalars = coarse_vectors_.total_level_nodes * block_dim_; + coarse_vectors_.multi_level_r = + cu::make_buffer(rt.stream, rt.mr, total_level_scalars, 0.0); + coarse_vectors_.multi_level_z = + cu::make_buffer(rt.stream, rt.mr, total_level_scalars, 0.0); + rt.stream.sync(); + SPDLOG_TRACE("CUDA_PCG MAS: allocate_coarse_vectors {:.6f}s", elapsed_seconds(phase_begin)); + SPDLOG_TRACE("CUDA_PCG MAS: factorize_total {:.6f}s", elapsed_seconds(total_begin)); + + initialized_ = true; + } + + void MASPreconditioner::apply( + ctd::span r, + ctd::span z, + CudaRuntime rt) + { + if (!initialized_) + { + throw std::runtime_error("[CudaPCG] MASPreconditioner is not initialized."); + } + if (r.size() != z.size() || r.size() != vector_size_) + { + throw std::runtime_error("[CudaPCG] Invalid vector size for MAS preconditioner."); + } + + cu::fill_bytes(rt.stream, *(coarse_vectors_.multi_level_r), 0); + cu::fill_bytes(rt.stream, *(coarse_vectors_.multi_level_z), 0); + + int grid_num = div_round_up(padded_topology_.node_num, 128); + gather_multi_level_r<<>>( + r, + *(coarse_vectors_.multi_level_r), + *(padded_topology_.real_to_padded), + *(coarse_space_.map), + coarse_vectors_.level_offsets, + block_dim_, + coarse_space_.level_num); + + apply_inverse( + coarse_matrices_, + *(coarse_vectors_.multi_level_r), + *(coarse_vectors_.multi_level_z), + block_dim_, + rt); + + gather_multi_level_z<<>>( + *(coarse_vectors_.multi_level_z), + z, + *(padded_topology_.real_to_padded), + *(coarse_space_.map), + coarse_vectors_.level_offsets, + block_dim_, + coarse_space_.level_num); + } + +} // namespace polysolve::linear::mas diff --git a/src/polysolve/linear/mas_utils/MASPreconditioner.hpp b/src/polysolve/linear/mas_utils/MASPreconditioner.hpp new file mode 100644 index 00000000..cde2b77b --- /dev/null +++ b/src/polysolve/linear/mas_utils/MASPreconditioner.hpp @@ -0,0 +1,96 @@ +#pragma once + +#include +#include + +#include + +namespace polysolve::linear::mas +{ + // Our bank size is 32, so 4 coarse levels should cover 1M x 1M BSR matrix. + constexpr int MAS_MAX_COARSE_LEVEL = 4; + constexpr int MAS_LEVEL_COUNT = MAS_MAX_COARSE_LEVEL + 1; + + // Topology enhanced MAS has 2 levels of mapping: + // Real -> Padded: virtual nodes are inserted so each partition resides in one BANK. + // see fig.6 in arXiv:2411.06224. + // Padded -> Coarse Space: map fine nodes to coarse space CCO (Connected components). + + /// @brief Real <-> Padded space mapping. + struct PaddedTopology + { + Buf real_to_padded; ///< Real id to padded id. + Buf padded_to_real; ///< Padded id to real id. -1 = virtual padding. + Buf rows; ///< CSR row ptr. Virtual nodes are not included. + Buf cols; ///< CSR cols. Virtual nodes are not included. + int node_num = 0; ///< Real node num. + int padded_node_num = 0; ///< Padded space node num. + }; + + struct CoarseSpace + { + /// Coarse space map (real id -> coarse space id) layout: + /// | node0_lv0 node0_lv1 ... node0_lv_max | node1_lv0 ... node1_lv_max | ... 
| + Buf<int> map; + /// CCO numbers per coarse space level. + ctd::array<int, MAS_MAX_COARSE_LEVEL> cco_nums{}; + /// Total coarse space levels. Sometimes we terminate early because there are no more CCOs to merge. + int level_num = 0; + }; + + struct CoarseMatrices + { + /// Packed upper triangular row major coarse space matrices (inverse hessian). + /// | lv0_mat0 lv0_mat1 ... | lv1_mat0 ... | ... | + Buf<double> data; + /// Per coarse space level offset and counts. + ctd::array<int, MAS_LEVEL_COUNT> matrix_offsets{}; + ctd::array<int, MAS_LEVEL_COUNT> matrix_counts{}; + /// We support block dims 1, 2, 3, so the coarse space mat dim is 32, 64, or 96. + int mat_dim = 0; + int mat_storage_size = 0; + int total_matrix_num = 0; + }; + + struct CoarseVectors + { + /// | lv0 r | lv1 r | ... | + Buf<double> multi_level_r; + /// | lv0 z | lv1 z | ... | + Buf<double> multi_level_z; + /// Per coarse space level offset and counts. + ctd::array<int, MAS_LEVEL_COUNT> level_offsets{}; + ctd::array<int, MAS_LEVEL_COUNT> level_sizes{}; + int total_level_nodes = 0; + }; + + class MASPreconditioner + { + public: + bool empty() const + { + return !initialized_; + } + + /// @brief Initialize MAS preconditioner. Build coarse space, compute inverse, allocate buffers... + /// @param A Input matrix sorted by graph partition. + /// @param part_offsets Graph partition offsets, CSR row-ptr style. + /// @param rt Cuda runtime. + void factorize(const BSRMatrix &A, ctd::span<const int> part_offsets, CudaRuntime rt); + + /// @brief Apply MAS preconditioner. + /// @param r Residual. + /// @param z Preconditioned residual. + /// @param rt Cuda runtime. + void apply(ctd::span<const double> r, ctd::span<double> z, CudaRuntime rt); + + private: + bool initialized_ = false; + int vector_size_ = 0; + int block_dim_ = 0; + PaddedTopology padded_topology_; + CoarseSpace coarse_space_; + CoarseMatrices coarse_matrices_; + CoarseVectors coarse_vectors_; + }; +} // namespace polysolve::linear::mas diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 8f7b56ca..415294e1 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -30,6 +30,10 @@ if(POLYSOLVE_WITH_AMGCL) target_compile_definitions(unit_tests PRIVATE -DPOLYSOLVE_WITH_AMGCL) endif() +if (POLYSOLVE_WITH_CUDA) + target_compile_definitions(unit_tests PUBLIC POLYSOLVE_WITH_CUDA) +endif() + include(polyfem-data) target_link_libraries(unit_tests PRIVATE polyfem::data) diff --git a/tests/test_linear_solver.cpp b/tests/test_linear_solver.cpp index 0e157dea..37d3f186 100644 --- a/tests/test_linear_solver.cpp +++ b/tests/test_linear_solver.cpp @@ -127,6 +127,13 @@ TEST_CASE("all", "[solver]") json params; params[s]["tolerance"] = 1e-10; solver->set_parameters(params); + if (s == "CUDA_PCG") + { + params[s]["relative_tolerance"] = 0.0; + params[s]["absolute_tolerance"] = 1e-8; + params[s]["use_preconditioned_residual_norm"] = false; + solver->set_parameters(params); + } Eigen::VectorXd b(A.rows()); b.setRandom(); Eigen::VectorXd x(b.size()); @@ -194,6 +201,41 @@ TEST_CASE("eigen_params", "[solver]") } } +#ifdef POLYSOLVE_WITH_CUDA +// Test block dims 1, 2, 3 +TEST_CASE("cuda_pcg_block_dims", "[solver]") +{ + const std::string path = POLYFEM_DATA_DIR; + Eigen::SparseMatrix<double> A; + const bool ok = loadMarket(A, path + "/A_2.mat"); + REQUIRE(ok); + + for (int block_dim : {1, 2, 3}) + { + auto solver = Solver::create("CUDA_PCG", ""); + json params; + params["CUDA_PCG"]["block_dim"] = block_dim; + params["CUDA_PCG"]["relative_tolerance"] = 0.0; + params["CUDA_PCG"]["absolute_tolerance"] = 1e-8; + params["CUDA_PCG"]["use_preconditioned_residual_norm"] = false; + solver->set_parameters(params); + + Eigen::VectorXd b(A.rows()); + b.setRandom(); + Eigen::VectorXd
x(b.size()); + x.setZero(); + + solver->analyze_pattern(A, A.rows()); + solver->factorize(A); + solver->solve(b, x); + + const double err = (A * x - b).norm(); + INFO("block_dim: " + std::to_string(block_dim)); + REQUIRE(err < 1e-8); + } +} +#endif + TEST_CASE("pre_factor", "[solver]") { const std::string path = POLYFEM_DATA_DIR;