From 4df38516281858214f7aa4752d172bf4b75a7d4d Mon Sep 17 00:00:00 2001 From: constracktor <74077030+constracktor@users.noreply.github.com> Date: Wed, 29 Apr 2026 10:19:43 +0200 Subject: [PATCH 01/13] Initial reference draft --- reference/.clang-format | 174 ++++++++++++++++++ reference/CMakeLists.txt | 91 +++++++++ reference/CMakePresets.json | 17 ++ reference/compile.sh | 99 ++++++++++ reference/core/CMakeLists.txt | 62 +++++++ reference/core/include/adapter_cblas_fp64.hpp | 23 +++ reference/core/include/cholesky_factor.hpp | 20 ++ reference/core/include/functions.hpp | 23 +++ reference/core/include/matrix_generation.hpp | 26 +++ reference/core/include/validate.hpp | 28 +++ reference/core/src/adapter_cblas_fp64.cpp | 19 ++ reference/core/src/cholesky_factor.cpp | 15 ++ reference/core/src/functions.cpp | 20 ++ reference/core/src/matrix_generation.cpp | 32 ++++ reference/core/src/validate.cpp | 72 ++++++++ reference/main.cpp | 119 ++++++++++++ reference/run.sh | 47 +++++ 17 files changed, 887 insertions(+) create mode 100644 reference/.clang-format create mode 100644 reference/CMakeLists.txt create mode 100644 reference/CMakePresets.json create mode 100755 reference/compile.sh create mode 100644 reference/core/CMakeLists.txt create mode 100644 reference/core/include/adapter_cblas_fp64.hpp create mode 100644 reference/core/include/cholesky_factor.hpp create mode 100644 reference/core/include/functions.hpp create mode 100644 reference/core/include/matrix_generation.hpp create mode 100644 reference/core/include/validate.hpp create mode 100644 reference/core/src/adapter_cblas_fp64.cpp create mode 100644 reference/core/src/cholesky_factor.cpp create mode 100644 reference/core/src/functions.cpp create mode 100644 reference/core/src/matrix_generation.cpp create mode 100644 reference/core/src/validate.cpp create mode 100644 reference/main.cpp create mode 100755 reference/run.sh diff --git a/reference/.clang-format b/reference/.clang-format new file mode 100644 index 
0000000..e8d875c --- /dev/null +++ b/reference/.clang-format @@ -0,0 +1,174 @@ +--- +Language: Cpp +AccessModifierOffset: -2 +AlignAfterOpenBracket: Align +AlignArrayOfStructures: None +AlignConsecutiveAssignments: None +AlignConsecutiveBitFields: None +AlignConsecutiveDeclarations: None +AlignConsecutiveMacros: None +AlignConsecutiveShortCaseStatements: + Enabled: true + AcrossEmptyLines: false + AcrossComments: false + AlignCaseColons: false +AlignEscapedNewlines: Right +AlignOperands: Align +AlignTrailingComments: + Kind: Always +AllowAllArgumentsOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowBreakBeforeNoexceptSpecifier: OnlyWithParen +AllowShortBlocksOnASingleLine: Empty +AllowShortCaseLabelsOnASingleLine: true +AllowShortEnumsOnASingleLine: true +AllowShortFunctionsOnASingleLine: All +AllowShortIfStatementsOnASingleLine: Never +AllowShortLambdasOnASingleLine: All +AllowShortLoopsOnASingleLine: true +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakAfterReturnType: None +AlwaysBreakTemplateDeclarations: Yes +BinPackArguments: false +BinPackParameters: false +BitFieldColonSpacing: Both +BraceWrapping: + AfterCaseLabel: false + AfterClass: true + AfterControlStatement: Always + AfterEnum: false + AfterFunction: true + AfterNamespace: true + AfterObjCDeclaration: true + AfterStruct: true + AfterUnion: true + AfterExternBlock: false + BeforeCatch: true + BeforeElse: true + BeforeLambdaBody: true + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: false + SplitEmptyRecord: false + SplitEmptyNamespace: false +BreakAfterAttributes: Never +BreakAfterJavaFieldAnnotations: false +BreakBeforeBinaryOperators: NonAssignment +BreakBeforeBraces: Custom +BreakBeforeConceptDeclarations: Always +BreakBeforeInlineASMColon: OnlyMultiline +BreakBeforeTernaryOperators: true +BreakConstructorInitializers: AfterColon +BreakInheritanceList: AfterComma +BreakStringLiterals: true +ColumnLimit: 120 
+CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: false +DerivePointerAlignment: false +DisableFormat: false +EmptyLineAfterAccessModifier: Never +EmptyLineBeforeAccessModifier: LogicalBlock +FixNamespaceComments: true +ForEachMacros: [ 'foreach', 'Q_FOREACH', 'BOOST_FOREACH' ] +IfMacros: [ ] +IncludeBlocks: Regroup +IncludeCategories: + - Regex: '^"gprat/' + Priority: 1 + - Regex: '^"(tests|bindings)/' + Priority: 2 + - Regex: '^"(fmt|catch2|pybind)' + Priority: 3 + - Regex: '^.*' + Priority: 4 +IncludeIsMainRegex: '(Test)?$' +IncludeIsMainSourceRegex: '(\.cu|\.hip)' +IndentAccessModifiers: false +IndentCaseBlocks: true +IndentCaseLabels: true +IndentExternBlock: NoIndent +IndentGotoLabels: false +IndentPPDirectives: None +IndentRequiresClause: false +IndentWidth: 4 +IndentWrappedFunctionNames: false +InsertBraces: true +InsertNewlineAtEOF: true +InsertTrailingCommas: None +IntegerLiteralSeparator: + Binary: 8 + Decimal: 3 + DecimalMinDigits: 5 + Hex: -1 +KeepEmptyLinesAtEOF: false +KeepEmptyLinesAtTheStartOfBlocks: false +LambdaBodyIndentation: Signature +LineEnding: DeriveLF +MacroBlockBegin: '' +MacroBlockEnd: '' +Macros: [ ] +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +NamespaceMacros: [ ] +PPIndentWidth: -1 +PackConstructorInitializers: Never +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakOpenParenthesis: 0 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyIndentedWhitespace: 1 +PenaltyReturnTypeOnItsOwnLine: 60 +PointerAlignment: Right +QualifierAlignment: Custom +QualifierOrder: [ 'inline', 'constexpr', 'static', 'friend', 'restrict', 'const', 'volatile', 'type' ] +ReferenceAlignment: Pointer +ReflowComments: true +RemoveBracesLLVM: false +RemoveParentheses: Leave +RemoveSemicolon: true 
+RequiresClausePosition: OwnLine +RequiresExpressionIndentation: OuterScope +SeparateDefinitionBlocks: Always +ShortNamespaceLines: 1 +SortIncludes: CaseInsensitive +SortUsingDeclarations: LexicographicNumeric +SpaceAfterCStyleCast: true +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceAroundPointerQualifiers: Default +SpaceBeforeAssignmentOperators: true +SpaceBeforeCaseColon: false +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeJsonColon: false +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceBeforeSquareBrackets: false +SpaceInEmptyBlock: true +SpacesBeforeTrailingComments: 2 +SpacesInAngles: false +SpacesInContainerLiterals: true +SpacesInLineCommentPrefix: + Minimum: 1 + Maximum: 1 +SpacesInParens: Never +SpacesInSquareBrackets: false +Standard: c++17 +StatementAttributeLikeMacros: [ ] +StatementMacros: [ 'Q_UNUSED', 'QT_REQUIRE_VERSION' ] +TabWidth: 4 +TypeNames: [ ] +TypenameMacros: [ ] +UseTab: Never +WhitespaceSensitiveMacros: [ 'STRINGIZE', 'PP_STRINGIZE', 'BOOST_PP_STRINGIZE' ] +... + diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt new file mode 100644 index 0000000..e63612f --- /dev/null +++ b/reference/CMakeLists.txt @@ -0,0 +1,91 @@ +cmake_minimum_required(VERSION 3.23) +project(cholesky_reference) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# What to build? 
+option(BUILD_CORE "Build the core library" ON) +option(ENABLE_MKL "Enable Intel oneMKL support (threaded)" OFF) +option( + ENABLE_VALIDATION + "Compute ||A - L*L^T||_F / ||A||_F after each factorization (off by default)" + OFF) +option(ENABLE_FORMAT_TARGETS "Enable clang-format / cmake-format targets" + ${PROJECT_IS_TOP_LEVEL}) + +if(ENABLE_FORMAT_TARGETS) + find_package(format QUIET) + if(NOT format_FOUND) + include(FetchContent) + FetchContent_Declare( + format + GIT_REPOSITORY https://github.com/TheLartians/Format.cmake.git + GIT_TAG v1.8.1 + QUIET) + FetchContent_MakeAvailable(format) + endif() +endif() + +if(NOT CMAKE_SKIP_INSTALL_RULES) + # Our installs follow the standard GNU directory layout. This include needs to + # come first since we need the CMAKE_INSTALL_* in the CMakeLists.txt of each + # target. + include(GNUInstallDirs) +endif() + +if(BUILD_CORE) + if(ENABLE_MKL) + # Threaded Intel oneMKL: ask MKL to use its OpenMP runtime ('intel_thread'). + # This is the only difference from the OpenMP/HPX builds, which pin + # MKL_THREADING=sequential because they parallelise at the tile level. + # Here the parallelism lives inside dpotrf itself, so we want the + # vendor-threaded backend. + set(MKL_INTERFACE_FULL "intel_lp64") + set(MKL_THREADING "intel_thread") + find_package(MKL CONFIG REQUIRED) + + if(MKL_FOUND) + message(STATUS "Intel oneMKL Library found (threaded: ${MKL_THREADING})") + else() + message(FATAL_ERROR "No BLAS Library found") + endif() + else() + # Threaded OpenBLAS. The library name is the same as the sequential one, + # but the Spack environment loaded by compile.sh selects an OpenBLAS built + # with threads=openmp. 
+ find_library(OpenBLAS_LIB NAMES openblas REQUIRED) + + if(OpenBLAS_LIB) + message(STATUS "OpenBLAS Library found at ${OpenBLAS_LIB}") + find_path( + OpenBLAS_INCLUDE_DIR + NAMES cblas.h + PATH_SUFFIXES openblas) + if(NOT OpenBLAS_INCLUDE_DIR) + message(FATAL_ERROR "OpenBLAS include directory not found") + endif() + + message(STATUS "OpenBLAS include dir: ${OpenBLAS_INCLUDE_DIR}") + else() + message(FATAL_ERROR "No BLAS Library found") + endif() + endif() + + # OpenMP is required for the matrix-generation parallel loop and to pick up + # the OpenMP runtime that threaded OpenBLAS / threaded MKL share. + find_package(OpenMP REQUIRED) + + add_subdirectory(core) + + # Add the executable + add_executable(cholesky_reference main.cpp) + + # Link the libraries + target_link_libraries(cholesky_reference PUBLIC Cholesky::core + OpenMP::OpenMP_CXX) + + if(ENABLE_VALIDATION) + target_compile_definitions(cholesky_reference PRIVATE ENABLE_VALIDATION) + endif() +endif() diff --git a/reference/CMakePresets.json b/reference/CMakePresets.json new file mode 100644 index 0000000..f3839f8 --- /dev/null +++ b/reference/CMakePresets.json @@ -0,0 +1,17 @@ +{ + "version": 6, + "cmakeMinimumRequired": { + "major": 3, + "minor": 22, + "patch": 0 + }, + "configurePresets": [ + { + "name": "clang-tidy", + "hidden": true, + "cacheVariables": { + "CMAKE_CXX_CLANG_TIDY": "clang-tidy;--header-filter=^${sourceDir}/" + } + } + ] +} diff --git a/reference/compile.sh b/reference/compile.sh new file mode 100755 index 0000000..f896d8c --- /dev/null +++ b/reference/compile.sh @@ -0,0 +1,99 @@ +#!/bin/bash +# Usage: compile.sh +# +# Builds the parallel-BLAS reference benchmark: a single threaded +# LAPACKE_dpotrf call on the full matrix, used as a baseline against the +# tiled OpenMP / HPX implementations. GCC only. 
+# +# CMake project options can be overridden via environment variables +# (defaults match the project's CMakeLists.txt defaults): +# ENABLE_MKL ON|OFF (default OFF) - link threaded Intel oneMKL +# instead of threaded OpenBLAS +# ENABLE_VALIDATION ON|OFF (default OFF) - residual check after each +# factorisation +# +# Examples: +# ./compile.sh +# ENABLE_MKL=ON ./compile.sh +# ENABLE_VALIDATION=ON ./compile.sh +################################################################################ +set -e # Exit immediately if a command exits with a non-zero status. + +################################################################################ +# CMake project options (env-var overridable; defaults match CMakeLists.txt) +################################################################################ +: "${ENABLE_MKL:=OFF}" +: "${ENABLE_VALIDATION:=OFF}" + +for var in ENABLE_MKL ENABLE_VALIDATION; do + case "${!var}" in + ON | OFF) ;; + *) + echo "Error: $var must be ON or OFF (got '${!var}')." >&2 + exit 1 + ;; + esac +done + +################################################################################ +# Toolchain selection (gcc only) +################################################################################ +select_toolchain() { + module load gcc/14.2.0 + export CC=gcc + export CXX=g++ +} + +################################################################################ +# Configurations +# +# The reference benchmark uses *threaded* OpenBLAS / MKL — that is the whole +# point of this directory. The OpenMP and HPX builds, by contrast, pin the +# BLAS to its sequential variant because they parallelise at the tile level. +################################################################################ +if command -v spack &>/dev/null; then + echo "Spack command found. 
Loading libraries (gcc)" + # Get current hostname + HOSTNAME=$(hostname -s) + + if [[ "$HOSTNAME" == "ipvs-epyc1" ]]; then + # Compiler + select_toolchain + if [[ "$ENABLE_MKL" == "OFF" ]]; then + # OpenBLAS built with OpenMP threading + spack load openblas@0.3.28%gcc@14.2.0 threads=openmp + fi + + elif [[ "$HOSTNAME" == "nasrin0" || "$HOSTNAME" == "nasrin1" ]]; then + # Compiler + select_toolchain + if [[ "$ENABLE_MKL" == "OFF" ]]; then + # OpenBLAS built with OpenMP threading + spack load openblas@0.3.28%gcc@14.2.0 arch=linux-almalinux9-zen3 threads=openmp + fi + + else + echo "Hostname is $HOSTNAME — no action taken." + fi +else + echo "Spack command not found. Exiting." +fi + +################################################################################ +# Compile code +################################################################################ +rm -rf build && mkdir build && cd build + +echo "CMake options:" +echo " ENABLE_MKL = $ENABLE_MKL" +echo " ENABLE_VALIDATION = $ENABLE_VALIDATION" + +cmake -DCMAKE_BUILD_TYPE=Release \ + -DENABLE_MKL="$ENABLE_MKL" \ + -DENABLE_VALIDATION="$ENABLE_VALIDATION" \ + .. +make -j VERBOSE=1 +cd .. 
+ +# Launch Example +# OMP_NUM_THREADS=128 OMP_PROC_BIND=close OMP_PLACES=cores ./build/cholesky_reference --size_start 65536 --size_stop 65536 --loop 20 diff --git a/reference/core/CMakeLists.txt b/reference/core/CMakeLists.txt new file mode 100644 index 0000000..0453e4f --- /dev/null +++ b/reference/core/CMakeLists.txt @@ -0,0 +1,62 @@ +set(SOURCE_FILES src/matrix_generation.cpp src/functions.cpp + src/cholesky_factor.cpp src/adapter_cblas_fp64.cpp) + +if(ENABLE_VALIDATION) + list(APPEND SOURCE_FILES src/validate.cpp) +endif() + +add_library(cholesky_core STATIC ${SOURCE_FILES}) + +set_property(TARGET cholesky_core PROPERTY EXPORT_NAME core) +add_library(Cholesky::core ALIAS cholesky_core) + +# Add them as PRIVATE sources here so they show up in project files Can't use +# PUBLIC etc., see: https://stackoverflow.com/a/62465051 +file(GLOB_RECURSE header_files CONFIGURE_DEPENDS include/*.hpp) +target_sources(cholesky_core PRIVATE ${header_files}) + +# Link OpenMP libraries (used by the parallel matrix generator) +target_link_libraries(cholesky_core PUBLIC OpenMP::OpenMP_CXX) + +# Include directories +target_include_directories( + cholesky_core PUBLIC "$") + +# Link BLAS +if(ENABLE_MKL) + # Link threaded Intel oneMKL + target_link_libraries( + cholesky_core PUBLIC MKL::mkl_intel_lp64 MKL::mkl_core MKL::MKL + MKL::mkl_intel_thread) +else() + # Link threaded OpenBLAS (the library name is the same; threading is + # determined by the OpenBLAS build that compile.sh's Spack env selects). + target_link_libraries(cholesky_core PUBLIC ${OpenBLAS_LIB}) + target_include_directories(cholesky_core PUBLIC ${OpenBLAS_INCLUDE_DIR}) +endif() + +if(ENABLE_MKL) + target_compile_definitions(cholesky_core PUBLIC ENABLE_MKL) +endif() + +target_compile_features(cholesky_core PUBLIC cxx_std_17) + +set_property(TARGET cholesky_core PROPERTY POSITION_INDEPENDENT_CODE ON) + +if(NOT CMAKE_SKIP_INSTALL_RULES) + # We need to manually install those into CMAKE_INSTALL_INCLUDEDIR. 
Below + # install(TARGETS ...) only sets up the paths for the exported targets. + install( + DIRECTORY include/ + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" + COMPONENT Development) + + install( + TARGETS cholesky_core + EXPORT CholeskyTargets + RUNTIME COMPONENT Runtime + LIBRARY COMPONENT Runtime NAMELINK_COMPONENT Development + ARCHIVE COMPONENT Development + INCLUDES + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}") +endif() diff --git a/reference/core/include/adapter_cblas_fp64.hpp b/reference/core/include/adapter_cblas_fp64.hpp new file mode 100644 index 0000000..139945c --- /dev/null +++ b/reference/core/include/adapter_cblas_fp64.hpp @@ -0,0 +1,23 @@ +#ifndef CPU_ADAPTER_CBLAS_FP64_H +#define CPU_ADAPTER_CBLAS_FP64_H + +#pragma once + +#include + +using vector = std::vector; + +// LAPACK level 3 operations + +/** + * @brief FP64 In-place Cholesky decomposition of A using a single, threaded + * LAPACKE_dpotrf call (no tiling). This is the parallel-BLAS reference + * implementation that the OpenMP and HPX tiled variants are compared + * against. + * + * @param A row-major matrix of size N*N to be factorised in place + * @param N matrix dimension + */ +void potrf(vector &A, const int N); + +#endif // end of CPU_ADAPTER_CBLAS_FP64_H diff --git a/reference/core/include/cholesky_factor.hpp b/reference/core/include/cholesky_factor.hpp new file mode 100644 index 0000000..f54237d --- /dev/null +++ b/reference/core/include/cholesky_factor.hpp @@ -0,0 +1,20 @@ +#ifndef CPU_CHOLESKY_FACTOR_H +#define CPU_CHOLESKY_FACTOR_H + +#pragma once + +#include + +namespace cpu +{ + +/** + * @brief Run a single, threaded LAPACKE_dpotrf on the full N x N row-major + * matrix @p A. This is the reference (non-tiled) parallel BLAS + * Cholesky factorisation that the OpenMP / HPX tiled variants are + * benchmarked against. 
+ */ +void parallel_blas_cholesky(std::vector &A, int N); + +} // end of namespace cpu +#endif // end of CPU_CHOLESKY_FACTOR_H diff --git a/reference/core/include/functions.hpp b/reference/core/include/functions.hpp new file mode 100644 index 0000000..63614d8 --- /dev/null +++ b/reference/core/include/functions.hpp @@ -0,0 +1,23 @@ +#ifndef CPU_FUNCTIONS_H +#define CPU_FUNCTIONS_H + +#pragma once + +#include +#include + +namespace cpu +{ + +/** + * @brief Time a single threaded LAPACKE_dpotrf call on the @p A buffer + * (row-major, N x N). The buffer is factorised in place. + * + * @param A row-major matrix; on return contains the lower-triangular factor L + * @param N matrix dimension + * @return wall-clock elapsed time in seconds + */ +double cholesky(std::vector &A, std::size_t N); + +} // namespace cpu +#endif // end of CPU_FUNCTIONS_H diff --git a/reference/core/include/matrix_generation.hpp b/reference/core/include/matrix_generation.hpp new file mode 100644 index 0000000..22a3206 --- /dev/null +++ b/reference/core/include/matrix_generation.hpp @@ -0,0 +1,26 @@ +#ifndef MATRIX_GENERATION_H +#define MATRIX_GENERATION_H + +#pragma once + +#include +#include + +/** + * @brief Generate a deterministic, dense, row-major SPD matrix of size N x N. + * + * Entries are uniform on [0, 1) using a per-row seed; the diagonal is shifted + * by +N to guarantee strict diagonal dominance and therefore symmetric + * positive definiteness. The result is stored as a single contiguous + * std::vector of length N*N in row-major order, ready to be passed to + * LAPACKE_dpotrf. + * + * Generation is parallelised with OpenMP across rows so it does not dominate + * the timed factorisation phase. 
+ * + * @param N matrix dimension + * @return owning row-major buffer of length N*N + */ +std::vector gen_matrix(std::size_t N); + +#endif diff --git a/reference/core/include/validate.hpp b/reference/core/include/validate.hpp new file mode 100644 index 0000000..6cf829c --- /dev/null +++ b/reference/core/include/validate.hpp @@ -0,0 +1,28 @@ +#ifndef CPU_VALIDATE_H +#define CPU_VALIDATE_H + +#pragma once + +#include +#include + +namespace cpu +{ + +/** + * @brief Compute the relative Cholesky residual ||A - L * L^T||_F / ||A||_F + * for the dense, row-major reference factorisation. + * + * The original A is regenerated on the fly with the same deterministic seed + * used by gen_matrix, so no extra storage is needed. + * + * @param N matrix dimension (must match the factorisation) + * @param L row-major buffer of length N*N holding the factor returned by + * LAPACKE_dpotrf with uplo='L' (only the lower triangle is read) + * @return relative Frobenius residual + */ +double cholesky_residual(std::size_t N, const std::vector &L); + +} // namespace cpu + +#endif // end of CPU_VALIDATE_H diff --git a/reference/core/src/adapter_cblas_fp64.cpp b/reference/core/src/adapter_cblas_fp64.cpp new file mode 100644 index 0000000..566290f --- /dev/null +++ b/reference/core/src/adapter_cblas_fp64.cpp @@ -0,0 +1,19 @@ +#include "adapter_cblas_fp64.hpp" + +#ifdef ENABLE_MKL +// MKL CBLAS / LAPACKE +#include "mkl_cblas.h" +#include "mkl_lapacke.h" +#else +#include "cblas.h" +#include "lapacke.h" +#endif + +void potrf(vector &A, const int N) +{ + // Single threaded LAPACKE call on the full matrix. dpotrf2 is the + // recursive variant, which is what the OpenMP / HPX variants use on + // their diagonal tiles, so picking it here keeps the underlying kernel + // identical and isolates the parallelism source as the only difference. 
+ LAPACKE_dpotrf2(LAPACK_ROW_MAJOR, 'L', N, A.data(), N); +} diff --git a/reference/core/src/cholesky_factor.cpp b/reference/core/src/cholesky_factor.cpp new file mode 100644 index 0000000..4fd93fb --- /dev/null +++ b/reference/core/src/cholesky_factor.cpp @@ -0,0 +1,15 @@ +#include "cholesky_factor.hpp" + +#include "adapter_cblas_fp64.hpp" + +namespace cpu +{ + +void parallel_blas_cholesky(std::vector &A, int N) +{ + // The whole factorisation is one threaded LAPACKE call; the BLAS library + // takes care of dispatching work across the available threads. + potrf(A, N); +} + +} // end of namespace cpu diff --git a/reference/core/src/functions.cpp b/reference/core/src/functions.cpp new file mode 100644 index 0000000..abe4cb0 --- /dev/null +++ b/reference/core/src/functions.cpp @@ -0,0 +1,20 @@ +#include "functions.hpp" + +#include "cholesky_factor.hpp" +#include + +namespace cpu +{ + +double cholesky(std::vector &A, std::size_t N) +{ + auto start = std::chrono::high_resolution_clock::now(); + /////////////////////////////////////////////////////////////////////////// + // Launch Cholesky decomposition: A = L * L^T (one multi-threaded LAPACKE call) + parallel_blas_cholesky(A, static_cast(N)); + /////////////////////////////////////////////////////////////////////////// + auto stop = std::chrono::high_resolution_clock::now(); + return (stop - start).count() / 1e9; +} + +} // end of namespace cpu diff --git a/reference/core/src/matrix_generation.cpp b/reference/core/src/matrix_generation.cpp new file mode 100644 index 0000000..b0db740 --- /dev/null +++ b/reference/core/src/matrix_generation.cpp @@ -0,0 +1,32 @@ +#include "matrix_generation.hpp" + +#include +#include + +std::vector gen_matrix(std::size_t N) +{ + // Row-major dense buffer + std::vector A(N * N); + + // The matrix is built row by row in parallel. Each row uses its own RNG + // seeded by the row index, so the matrix is deterministic and + // reproducible regardless of the number of threads. 
Off-diagonal entries + // are mirrored to keep A symmetric; the diagonal is shifted by +N to + // guarantee strict diagonal dominance (and therefore SPD), mirroring the + // +N*n_tiles shift used by the tiled variants when n_tiles == 1. +#pragma omp parallel for schedule(static) + for (std::size_t i = 0; i < N; ++i) + { + std::mt19937 generator(static_cast(i + 1)); + std::uniform_real_distribution distribute(0.0, 1.0); + for (std::size_t j = 0; j <= i; ++j) + { + const double v = distribute(generator); + A[i * N + j] = v; + A[j * N + i] = v; + } + A[i * N + i] += static_cast(N); + } + + return A; +} diff --git a/reference/core/src/validate.cpp b/reference/core/src/validate.cpp new file mode 100644 index 0000000..5a43cc8 --- /dev/null +++ b/reference/core/src/validate.cpp @@ -0,0 +1,72 @@ +#include "validate.hpp" + +#include "matrix_generation.hpp" + +#ifdef ENABLE_MKL +#include "mkl_cblas.h" +#else +#include "cblas.h" +#endif + +#include +#include +#include +#include + +namespace cpu +{ + +double cholesky_residual(std::size_t N, const std::vector &L) +{ + // Build a working copy of L with its strictly upper triangle zeroed out. + // dpotrf with uplo='L' leaves the upper triangle untouched (it still + // contains the original A values), so we must mask it before forming + // L * L^T with a plain dgemm. + std::vector Lwork(L); + for (std::size_t i = 0; i < N; ++i) + { + for (std::size_t j = i + 1; j < N; ++j) + { + Lwork[i * N + j] = 0.0; + } + } + + // Compute LLt = L * L^T (full N x N) with a single dgemm. + std::vector LLt(N * N, 0.0); + cblas_dgemm( + CblasRowMajor, + CblasNoTrans, + CblasTrans, + static_cast(N), + static_cast(N), + static_cast(N), + 1.0, + Lwork.data(), + static_cast(N), + Lwork.data(), + static_cast(N), + 0.0, + LLt.data(), + static_cast(N)); + + // Regenerate the original A deterministically and accumulate Frobenius + // norms of (A - LLt) and A. 
+ const std::vector A = gen_matrix(N); + + double r_norm_sq = 0.0; + double a_norm_sq = 0.0; + for (std::size_t idx = 0; idx < A.size(); ++idx) + { + const double d = A[idx] - LLt[idx]; + r_norm_sq += d * d; + a_norm_sq += A[idx] * A[idx]; + } + + if (a_norm_sq == 0.0) + { + return 0.0; + } + return std::sqrt(r_norm_sq / a_norm_sq); +} + +} // namespace cpu diff --git a/reference/main.cpp b/reference/main.cpp new file mode 100644 index 0000000..0d58b8f --- /dev/null +++ b/reference/main.cpp @@ -0,0 +1,119 @@ +#include "functions.hpp" +#include "matrix_generation.hpp" +#ifdef ENABLE_VALIDATION +#include "validate.hpp" +#endif +#include +#include +#include +#include +#include +#include + +int main(int argc, char *argv[]) +{ + /////////////////////////////////////////////////////////////////////////// + // cmdline arguments + // + // The reference benchmark calls a single threaded LAPACKE_dpotrf on the + // full matrix, so there is no tiling axis. We still accept --tiles_start + // / --tiles_stop for CLI compatibility with the openmp/ and hpx/ binaries + // (they are silently ignored), which keeps any shared driver script + // unchanged. + std::size_t loop = 1; + std::size_t size_start = 32, size_stop = 128; + + for (int i = 1; i < argc; ++i) + { + std::string arg = argv[i]; + if (arg == "--loop" && i + 1 < argc) + { + loop = std::stoul(argv[++i]); + } + else if (arg == "--size_start" && i + 1 < argc) + { + size_start = std::stoul(argv[++i]); + } + else if (arg == "--size_stop" && i + 1 < argc) + { + size_stop = std::stoul(argv[++i]); + } + else if ((arg == "--tiles_start" || arg == "--tiles_stop") && i + 1 < argc) + { + // Accept-and-ignore for CLI parity with the tiled variants. 
+ ++i; + } + } + /////////////////////////////////////////////////////////////////////////// + // configuration + const std::size_t LOOP = loop; + + const std::size_t START_SIZE = size_start; + const std::size_t STOP_SIZE = size_stop; + const std::size_t STEP_SIZE = 2; + + // print and write results + bool HEADER_FLAG = true; + std::string runtime_file_path = "runtimes_reference_cholesky_"; + if (START_SIZE != STOP_SIZE) + { + runtime_file_path += std::string("size_"); + } + runtime_file_path += std::to_string(LOOP) + std::string(".txt"); + std::ofstream runtime_file; + runtime_file.open(runtime_file_path, std::ios_base::app); + + for (std::size_t size = START_SIZE; size <= STOP_SIZE; size = size * STEP_SIZE) + { + for (std::size_t l = 0; l < LOOP; l++) + { + // header for output file -- columns mirror the openmp/hpx output so + // results from all three benchmarks can be merged on (problem_size). + // The reference has no tiling, so tile_size == problem_size and + // n_tiles == 1. + std::string header = "threads;problem_size;tile_size;n_tiles"; + std::string values = std::to_string(omp_get_max_threads()); + values += std::string(";") + std::to_string(size); + values += std::string(";") + std::to_string(size); + values += std::string(";") + std::to_string(1); + /////////////////////////////////////////////////////////////////// + // Single mode: parallel-BLAS reference dpotrf on the full matrix. 
+ std::vector modes = { "reference" }; + + for (const auto &mode : modes) + { + auto A = gen_matrix(size); + auto cholesky_cpu = cpu::cholesky(A, size); + + header += ";" + mode; + values += ";" + std::to_string(cholesky_cpu); + +#ifdef ENABLE_VALIDATION + // Validate by computing relative residual ||A - L L^T||_F / ||A||_F + constexpr double residual_tol = 1e-10; + const double residual = cpu::cholesky_residual(size, A); + std::cout << "[validate] mode=" << mode << " size=" << size << " residual=" << residual << std::endl; + if (!(residual <= residual_tol)) // catches NaN too + { + std::cerr << "Validation warning: variant '" << mode << "' residual " << residual + << " exceeds tolerance " << residual_tol << " (size=" << size << ")" << std::endl; + } +#endif + } + /////////////////////////////////////////////////////////////////// + // print/write header only once + if (HEADER_FLAG) + { + HEADER_FLAG = false; + std::cout << header << std::endl; + runtime_file << header << std::endl; + } + // print/write runtimes + std::cout << values << std::endl; + runtime_file << values << std::endl; + } + } + + runtime_file.close(); + return 0; +} diff --git a/reference/run.sh b/reference/run.sh new file mode 100755 index 0000000..0600513 --- /dev/null +++ b/reference/run.sh @@ -0,0 +1,47 @@ +#!/bin/bash +#SBATCH --job-name=cholesky_reference +#SBATCH --output=logs/cholesky_reference_%j.out +#SBATCH --error=logs/cholesky_reference_%j.err +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=128 +#SBATCH --time=144:00:00 +#SBATCH --exclusive +# +# Usage: run.sh +# +# Submit example: +# sbatch run.sh +# +# Runs the parallel-BLAS reference benchmark — a single threaded +# LAPACKE_dpotrf call on the full matrix — as a baseline for the OpenMP and +# HPX tiled implementations. GCC only. + +set -e # Exit immediately if a command exits with a non-zero status. 
+ +################################################################################ +# Toolchain runtime selection (gcc only) +################################################################################ +module load gcc/14.2.0 + +# Use the current working directory as the base path; build/ must exist here +SCRIPT_DIR="$(pwd)" + +# OpenMP settings — the threaded BLAS picks these up to spread dpotrf across +# all the cores. Both OpenBLAS (threads=openmp) and threaded MKL respect the +# standard OMP_* environment. +export OMP_NUM_THREADS=128 +export OMP_PROC_BIND=close +export OMP_PLACES=cores + +# Make sure threaded MKL uses the OpenMP runtime if ENABLE_MKL=ON was used at +# build time. Harmless when linking OpenBLAS. +export MKL_NUM_THREADS=${MKL_NUM_THREADS:-$OMP_NUM_THREADS} + +echo "Running with gcc runtime" + +# Run executable +srun --cpu-bind=cores "$SCRIPT_DIR/build/cholesky_reference" \ + --loop 20 \ + --size_start 65536 \ + --size_stop 65536 From bd93795b50a01d710d815dbf6aeef943d62d9f6b Mon Sep 17 00:00:00 2001 From: constracktor <74077030+constracktor@users.noreply.github.com> Date: Wed, 29 Apr 2026 12:59:49 +0200 Subject: [PATCH 02/13] Adjust OpenBLAS --- reference/compile.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/reference/compile.sh b/reference/compile.sh index f896d8c..d0d2727 100755 --- a/reference/compile.sh +++ b/reference/compile.sh @@ -56,12 +56,12 @@ if command -v spack &>/dev/null; then # Get current hostname HOSTNAME=$(hostname -s) - if [[ "$HOSTNAME" == "ipvs-epyc1" ]]; then + if [[ "$HOSTNAME" == "ipvs-epyc1" || "$HOSTNAME" == "ipvs-epyc2" ]]; then # Compiler select_toolchain if [[ "$ENABLE_MKL" == "OFF" ]]; then # OpenBLAS built with OpenMP threading - spack load openblas@0.3.28%gcc@14.2.0 threads=openmp + spack load openblas@0.3.28%gcc@14.2.0 threads=openmp ilp64=true fi elif [[ "$HOSTNAME" == "nasrin0" || "$HOSTNAME" == "nasrin1" ]]; then From 8de7ed7e7dae6cec90e721d9516bd5c922eec85a Mon Sep 17 00:00:00 2001 From: constracktor 
<74077030+constracktor@users.noreply.github.com> Date: Wed, 29 Apr 2026 13:01:33 +0200 Subject: [PATCH 03/13] Adjust README --- README.md | 101 +++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 78 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 7f06a60..0287f4b 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Cholesky-Bench -Cholesky-Bench benchmarks right-looking tiled Cholesky factorization from fork-join to asynchronous tasks across several parallelism models, currently comparing OpenMP and HPX implementations side by side. +Cholesky-Bench benchmarks right-looking tiled Cholesky factorization from fork-join to asynchronous tasks across several parallelism models, currently comparing OpenMP and HPX implementations side by side. A non-tiled parallel-BLAS reference is also included as a baseline. ## Variants @@ -24,27 +24,40 @@ Cholesky-Bench benchmarks right-looking tiled Cholesky factorization from fork-j | `loop_two` | Collapsed fork-join with dynamic schedule for trailing-update | | `async_void` | Fully asynchronous tasking with dataflow using `hpx::shared_future` | +### Reference (`reference/`) + +| Mode | Description | +|------|-------------| +| `reference` | Single threaded `LAPACKE_dpotrf` call on the full matrix; no tiling. Parallelism is delegated entirely to a threaded BLAS (OpenBLAS built with `threads=openmp`, or threaded Intel oneMKL via `ENABLE_MKL=ON`). | +| `plasma` | Single `plasma_dpotrf` call on the full matrix. PLASMA does its own tiled, OpenMP-task-based parallel Cholesky internally; tile size is left at PLASMA's built-in default. Built only when `ENABLE_PLASMA=ON`. | + +This directory is the natural baseline for the OpenMP and HPX tiled implementations: the `reference` mode isolates the contribution of vendor-provided dense-LA parallelism, and the `plasma` mode adds a true tiled-parallel competitor that uses the same OpenMP runtime as the in-house variants. 
+ ## Dependencies -Both implementations share the same sequential BLAS backend and are built with CMake (≥ 3.23) and C++20. +All three implementations are built with CMake (≥ 3.23) and C++20. The OpenMP and HPX directories link against a *sequential* BLAS (parallelism is at the tile level); the `reference/` directory links against a *threaded* BLAS instead. -| Dependency | OpenMP | HPX | -|---|---|---| -| OpenBLAS 0.3.28 | ✓ (default) | ✓ (default) | -| Intel oneMKL | optional (`ENABLE_MKL=ON`) | optional (`ENABLE_MKL=ON`) | -| HPX 1.11.0 + jemalloc | — | ✓ | -| GCC 14.2.0 | ✓ | ✓ | -| LLVM/Clang 22.1.2 | optional | — | +| Dependency | OpenMP | HPX | Reference | +|---|---|---|---| +| OpenBLAS 0.3.28 (sequential) | ✓ (default) | ✓ (default) | — | +| OpenBLAS 0.3.28 (`threads=openmp`) | — | — | ✓ (default) | +| Intel oneMKL (sequential) | optional (`ENABLE_MKL=ON`) | optional (`ENABLE_MKL=ON`) | — | +| Intel oneMKL (`intel_thread`) | — | — | optional (`ENABLE_MKL=ON`) | +| PLASMA | — | — | optional (`ENABLE_PLASMA=ON`) | +| HPX 1.11.0 + jemalloc | — | ✓ | — | +| GCC 14.2.0 | ✓ | ✓ | ✓ | +| LLVM/Clang 22.1.2 | optional | — | — | Dependencies are managed via [Spack](https://spack.io/). The compile scripts auto-detect the host system and load the correct Spack environment. ## Build -From within the `openmp/` or `hpx/` directory, run: +From within the `openmp/`, `hpx/`, or `reference/` directory, run: ```bash -./compile.sh [gcc|llvm] # OpenMP: gcc (default) or llvm -./compile.sh # HPX: always gcc +./compile.sh [gcc|llvm] # OpenMP: gcc (default) or llvm +./compile.sh # HPX: always gcc +./compile.sh # Reference: always gcc ``` The script clears and recreates the `build/` directory, then runs CMake in Release mode followed by a parallel make. 
@@ -55,10 +68,11 @@ These can be set as environment variables before calling `compile.sh`: | Option | Default | Description | |--------|---------|-------------| -| `ENABLE_VALIDATION` | `OFF` | After each factorization, compute the relative residual ‖A − LL^T‖_F / ‖A‖_F and warn if it exceeds 1e-10. Mutually exclusive with `DISABLE_COMPUTATION`. | -| `DISABLE_COMPUTATION` | `OFF` | Replace all BLAS/tile-generation calls with no-ops. The task graph and loops remain intact, so scheduling overhead can be measured in isolation. | -| `ENABLE_DYNAMIC_SCHEDULE` | `OFF` | *(OpenMP only)* Use `schedule(dynamic,1)` on the trailing-update worksharing loops in `for_collapse`. Requires the LLVM toolchain; rejected at compile time with GCC. | -| `ENABLE_MKL` | `OFF` | Link against Intel oneMKL instead of OpenBLAS. | +| `ENABLE_VALIDATION` | `OFF` | After each factorization, compute the relative residual ‖A − LL^T‖_F / ‖A‖_F and warn if it exceeds 1e-10. In `openmp/` and `hpx/`, mutually exclusive with `DISABLE_COMPUTATION`. | +| `DISABLE_COMPUTATION` | `OFF` | *(`openmp/` and `hpx/` only)* Replace all BLAS/tile-generation calls with no-ops. The task graph and loops remain intact, so scheduling overhead can be measured in isolation. | +| `ENABLE_DYNAMIC_SCHEDULE` | `OFF` | *(`openmp/` only)* Use `schedule(dynamic,1)` on the trailing-update worksharing loops in `for_collapse`. Requires the LLVM toolchain; rejected at compile time with GCC. | +| `ENABLE_MKL` | `OFF` | Link against Intel oneMKL instead of OpenBLAS. In `openmp/` and `hpx/` this is the *sequential* MKL; in `reference/` it is the *threaded* MKL. | +| `ENABLE_PLASMA` | `OFF` | *(`reference/` only)* Also build the PLASMA tiled-Cholesky variant. Adds a `plasma` column alongside `reference` in the runtime output. 
| **Examples:** @@ -71,6 +85,12 @@ ENABLE_DYNAMIC_SCHEDULE=ON ./compile.sh llvm # HPX: measure pure scheduling overhead DISABLE_COMPUTATION=ON ./compile.sh + +# Reference: threaded MKL baseline +ENABLE_MKL=ON ./compile.sh + +# Reference: also build the PLASMA tiled-Cholesky variant +ENABLE_PLASMA=ON ./compile.sh ``` ## Run @@ -89,16 +109,22 @@ OMP_NUM_THREADS=128 OMP_PROC_BIND=close OMP_PLACES=cores \ --hpx:threads=128 \ --loop=1 --size_start=1024 --size_stop=65536 \ --tiles_start=64 --tiles_stop=64 + +# Reference (parallel BLAS, no tiling) +OMP_NUM_THREADS=128 OMP_PROC_BIND=close OMP_PLACES=cores \ + ./build/cholesky_reference \ + --loop 1 --size_start 1024 --size_stop 65536 ``` ### Via SLURM -Both directories contain a `run.sh` that is a ready-to-submit SLURM batch script (128 CPUs, exclusive node, 144-hour wall time): +All three directories contain a `run.sh` that is a ready-to-submit SLURM batch script (128 CPUs, exclusive node, 144-hour wall time): ```bash -sbatch openmp/run.sh # gcc runtime (default) -sbatch openmp/run.sh llvm # llvm runtime +sbatch openmp/run.sh # gcc runtime (default) +sbatch openmp/run.sh llvm # llvm runtime sbatch hpx/run.sh +sbatch reference/run.sh ``` ### Command-line arguments @@ -107,7 +133,7 @@ sbatch hpx/run.sh |----------|---------|-------------| | `--loop` / `--loop=` | 1 | Number of timed repetitions per configuration | | `--size_start` / `--size_stop` | 32 / 128 | Problem size range (doubled each step) | -| `--tiles_start` / `--tiles_stop` | 16 / 32 | Tile count range (doubled each step) | +| `--tiles_start` / `--tiles_stop` | 16 / 32 | Tile count range (doubled each step). Accepted but ignored by the `reference/` binary, which has no tiling axis. 
| ## Output @@ -116,6 +142,7 @@ Results are appended to a text file in the working directory: ``` runtimes_openmp_cholesky_<suffix>.txt runtimes_hpx_cholesky_<suffix>.txt +runtimes_reference_cholesky_<suffix>.txt ``` The suffix encodes which dimension is swept: `tile_` if tiles vary, `size_` if size varies, followed by the loop count. The file uses `;`-separated columns: @@ -125,6 +152,13 @@ threads;problem_size;tile_size;n_tiles;for_collapse;for_naive;task_naive;task_de 128;65536;1024;64;3.14;3.21;2.98;2.87 ``` +The `reference/` binary reports a `reference` column (and a `plasma` column when built with `ENABLE_PLASMA=ON`), with `tile_size = problem_size` and `n_tiles = 1`, so its runtime files merge cleanly with the tiled benchmarks on the `problem_size` key: + +``` +threads;problem_size;tile_size;n_tiles;reference;plasma +128;65536;65536;1;2.71;2.45 +``` + The same lines are also printed to stdout. ## Repository structure @@ -150,7 +184,26 @@ The same lines are also printed to stdout. │ ├── tile_generation.cpp │ ├── validate.cpp │ └── adapter_cblas_fp64.cpp -└── hpx/ +├── hpx/ +│ ├── CMakeLists.txt +│ ├── CMakePresets.json +│ ├── compile.sh # build script (gcc only) +│ ├── run.sh # SLURM job script +│ ├── main.cpp +│ └── core/ +│ ├── include/ +│ │ ├── cholesky_factor.hpp +│ │ ├── functions.hpp +│ │ ├── tile_generation.hpp +│ │ ├── validate.hpp +│ │ └── adapter_cblas_fp64.hpp +│ └── src/ +│ ├── cholesky_factor.cpp +│ ├── functions.cpp +│ ├── tile_generation.cpp +│ ├── validate.cpp +│ └── adapter_cblas_fp64.cpp +└── reference/ ├── CMakeLists.txt ├── CMakePresets.json ├── compile.sh # build script (gcc only) @@ -160,13 +213,15 @@ The same lines are also printed to stdout. 
├── include/ │ ├── cholesky_factor.hpp │ ├── functions.hpp - │ ├── tile_generation.hpp + │ ├── matrix_generation.hpp + │ ├── plasma_factor.hpp # only used when ENABLE_PLASMA=ON │ ├── validate.hpp │ └── adapter_cblas_fp64.hpp └── src/ ├── cholesky_factor.cpp ├── functions.cpp - ├── tile_generation.cpp + ├── matrix_generation.cpp + ├── plasma_factor.cpp # only built when ENABLE_PLASMA=ON ├── validate.cpp └── adapter_cblas_fp64.cpp ``` From d1edae32b93dadbfe319c0035179e633f003f1a2 Mon Sep 17 00:00:00 2001 From: constracktor <74077030+constracktor@users.noreply.github.com> Date: Wed, 29 Apr 2026 13:50:42 +0200 Subject: [PATCH 04/13] Add PLASMA draft --- reference/CMakeLists.txt | 26 +++++++++++++++++ reference/compile.sh | 15 +++++++++- reference/core/CMakeLists.txt | 13 +++++++++ reference/core/include/cholesky_factor.hpp | 34 ++++++++++++++++++---- reference/core/include/functions.hpp | 13 +++++---- reference/core/include/plasma_factor.hpp | 26 +++++++++++++++++ reference/core/src/cholesky_factor.cpp | 27 ++++++++++++++--- reference/core/src/functions.cpp | 7 +++-- reference/core/src/plasma_factor.cpp | 25 ++++++++++++++++ reference/main.cpp | 27 +++++++++++++++-- 10 files changed, 193 insertions(+), 20 deletions(-) create mode 100644 reference/core/include/plasma_factor.hpp create mode 100644 reference/core/src/plasma_factor.cpp diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index e63612f..084b4fb 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -7,6 +7,10 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) # What to build? 
option(BUILD_CORE "Build the core library" ON) option(ENABLE_MKL "Enable Intel oneMKL support (threaded)" OFF) +option( + ENABLE_PLASMA + "Build the PLASMA tiled-Cholesky reference variant in addition to the LAPACKE_dpotrf one" + OFF) option( ENABLE_VALIDATION "Compute ||A - L*L^T||_F / ||A||_F after each factorization (off by default)" @@ -76,6 +80,23 @@ if(BUILD_CORE) # the OpenMP runtime that threaded OpenBLAS / threaded MKL share. find_package(OpenMP REQUIRED) + if(ENABLE_PLASMA) + # PLASMA exposes its own tiled parallel Cholesky (plasma_dpotrf). Spack + # installs it as a single shared library plus a coreblas helper; we look + # for both and link whichever is present. + find_path(PLASMA_INCLUDE_DIR plasma.h) + if(NOT PLASMA_INCLUDE_DIR) + message(FATAL_ERROR "ENABLE_PLASMA=ON but plasma.h was not found") + endif() + find_library(PLASMA_LIB NAMES plasma REQUIRED) + find_library(PLASMA_CORE_BLAS_LIB NAMES coreblas plasma_core_blas) + message(STATUS "PLASMA include dir: ${PLASMA_INCLUDE_DIR}") + message(STATUS "PLASMA library: ${PLASMA_LIB}") + if(PLASMA_CORE_BLAS_LIB) + message(STATUS "PLASMA coreblas library: ${PLASMA_CORE_BLAS_LIB}") + endif() + endif() + add_subdirectory(core) # Add the executable @@ -88,4 +109,9 @@ if(BUILD_CORE) if(ENABLE_VALIDATION) target_compile_definitions(cholesky_reference PRIVATE ENABLE_VALIDATION) endif() + + if(ENABLE_PLASMA) + target_compile_definitions(cholesky_reference PRIVATE ENABLE_PLASMA) + target_include_directories(cholesky_reference PRIVATE ${PLASMA_INCLUDE_DIR}) + endif() endif() diff --git a/reference/compile.sh b/reference/compile.sh index d0d2727..0fa48b7 100755 --- a/reference/compile.sh +++ b/reference/compile.sh @@ -9,12 +9,16 @@ # (defaults match the project's CMakeLists.txt defaults): # ENABLE_MKL ON|OFF (default OFF) - link threaded Intel oneMKL # instead of threaded OpenBLAS +# ENABLE_PLASMA ON|OFF (default OFF) - also build the PLASMA tiled +# Cholesky variant (extra +# 'plasma' column in the output) # 
ENABLE_VALIDATION ON|OFF (default OFF) - residual check after each # factorisation # # Examples: # ./compile.sh # ENABLE_MKL=ON ./compile.sh +# ENABLE_PLASMA=ON ./compile.sh # ENABLE_VALIDATION=ON ./compile.sh ################################################################################ set -e # Exit immediately if a command exits with a non-zero status. @@ -23,9 +27,10 @@ set -e # Exit immediately if a command exits with a non-zero status. # CMake project options (env-var overridable; defaults match CMakeLists.txt) ################################################################################ : "${ENABLE_MKL:=OFF}" +: "${ENABLE_PLASMA:=OFF}" : "${ENABLE_VALIDATION:=OFF}" -for var in ENABLE_MKL ENABLE_VALIDATION; do +for var in ENABLE_MKL ENABLE_PLASMA ENABLE_VALIDATION; do case "${!var}" in ON | OFF) ;; *) @@ -63,6 +68,9 @@ if command -v spack &>/dev/null; then # OpenBLAS built with OpenMP threading spack load openblas@0.3.28%gcc@14.2.0 threads=openmp ilp64=true fi + if [[ "$ENABLE_PLASMA" == "ON" ]]; then + spack load plasma%gcc@14.2.0 + fi elif [[ "$HOSTNAME" == "nasrin0" || "$HOSTNAME" == "nasrin1" ]]; then # Compiler @@ -71,6 +79,9 @@ if command -v spack &>/dev/null; then # OpenBLAS built with OpenMP threading spack load openblas@0.3.28%gcc@14.2.0 arch=linux-almalinux9-zen3 threads=openmp fi + if [[ "$ENABLE_PLASMA" == "ON" ]]; then + spack load plasma%gcc@14.2.0 arch=linux-almalinux9-zen3 + fi else echo "Hostname is $HOSTNAME — no action taken." @@ -86,10 +97,12 @@ rm -rf build && mkdir build && cd build echo "CMake options:" echo " ENABLE_MKL = $ENABLE_MKL" +echo " ENABLE_PLASMA = $ENABLE_PLASMA" echo " ENABLE_VALIDATION = $ENABLE_VALIDATION" cmake -DCMAKE_BUILD_TYPE=Release \ -DENABLE_MKL="$ENABLE_MKL" \ + -DENABLE_PLASMA="$ENABLE_PLASMA" \ -DENABLE_VALIDATION="$ENABLE_VALIDATION" \ .. 
make -j VERBOSE=1 diff --git a/reference/core/CMakeLists.txt b/reference/core/CMakeLists.txt index 0453e4f..f7b6535 100644 --- a/reference/core/CMakeLists.txt +++ b/reference/core/CMakeLists.txt @@ -5,6 +5,10 @@ if(ENABLE_VALIDATION) list(APPEND SOURCE_FILES src/validate.cpp) endif() +if(ENABLE_PLASMA) + list(APPEND SOURCE_FILES src/plasma_factor.cpp) +endif() + add_library(cholesky_core STATIC ${SOURCE_FILES}) set_property(TARGET cholesky_core PROPERTY EXPORT_NAME core) @@ -39,6 +43,15 @@ if(ENABLE_MKL) target_compile_definitions(cholesky_core PUBLIC ENABLE_MKL) endif() +if(ENABLE_PLASMA) + target_compile_definitions(cholesky_core PUBLIC ENABLE_PLASMA) + target_include_directories(cholesky_core PUBLIC ${PLASMA_INCLUDE_DIR}) + target_link_libraries(cholesky_core PUBLIC ${PLASMA_LIB}) + if(PLASMA_CORE_BLAS_LIB) + target_link_libraries(cholesky_core PUBLIC ${PLASMA_CORE_BLAS_LIB}) + endif() +endif() + target_compile_features(cholesky_core PUBLIC cxx_std_17) set_property(TARGET cholesky_core PROPERTY POSITION_INDEPENDENT_CODE ON) diff --git a/reference/core/include/cholesky_factor.hpp b/reference/core/include/cholesky_factor.hpp index f54237d..8ec3c4a 100644 --- a/reference/core/include/cholesky_factor.hpp +++ b/reference/core/include/cholesky_factor.hpp @@ -3,18 +3,42 @@ #pragma once +#include +#include #include namespace cpu { /** - * @brief Run a single, threaded LAPACKE_dpotrf on the full N x N row-major - * matrix @p A. This is the reference (non-tiled) parallel BLAS - * Cholesky factorisation that the OpenMP / HPX tiled variants are - * benchmarked against. + * @brief Reference Cholesky variants. + * + * - reference : single threaded LAPACKE_dpotrf2 call (no tiling; parallelism + * lives entirely inside the threaded BLAS). + * - plasma : single plasma_dpotrf call (PLASMA's own tiled parallel + * Cholesky over the OpenMP runtime). 
*/ -void parallel_blas_cholesky(std::vector &A, int N); +enum class Variant { reference, plasma }; + +inline Variant to_variant(const std::string &s) +{ + if (s == "reference") + { + return Variant::reference; + } + if (s == "plasma") + { + return Variant::plasma; + } + throw std::invalid_argument("Unknown Variant: " + s); +} + +/** + * @brief Run the requested reference variant on the full row-major N x N + * matrix @p A. Factorisation is in place; @p A holds the lower + * triangular factor L on return. + */ +void parallel_blas_cholesky(Variant variant, std::vector &A, int N); } // end of namespace cpu #endif // end of CPU_CHOLESKY_FACTOR_H diff --git a/reference/core/include/functions.hpp b/reference/core/include/functions.hpp index 63614d8..0740b4d 100644 --- a/reference/core/include/functions.hpp +++ b/reference/core/include/functions.hpp @@ -4,20 +4,23 @@ #pragma once #include +#include #include namespace cpu { /** - * @brief Time a single threaded LAPACKE_dpotrf call on the @p A buffer - * (row-major, N x N). The buffer is factorised in place. + * @brief Time a single call to the requested reference variant + * ('reference' or 'plasma') on the @p A buffer (row-major, N x N). + * The buffer is factorised in place. 
* - * @param A row-major matrix; on return contains the lower-triangular factor L - * @param N matrix dimension + * @param A row-major matrix; on return contains the lower-triangular factor L + * @param N matrix dimension + * @param variant which reference path to time * @return wall-clock elapsed time in seconds */ -double cholesky(std::vector &A, std::size_t N); +double cholesky(std::vector &A, std::size_t N, const std::string &variant); } // namespace cpu #endif // end of CPU_FUNCTIONS_H diff --git a/reference/core/include/plasma_factor.hpp b/reference/core/include/plasma_factor.hpp new file mode 100644 index 0000000..cfd85d7 --- /dev/null +++ b/reference/core/include/plasma_factor.hpp @@ -0,0 +1,26 @@ +#ifndef CPU_PLASMA_FACTOR_H +#define CPU_PLASMA_FACTOR_H + +#pragma once + +#include + +namespace cpu +{ + +/** + * @brief PLASMA tiled Cholesky on a row-major N x N buffer. + * + * PLASMA's high-level API is column-major, so we ask for @c PlasmaUpper: + * the upper triangle in PLASMA's column-major view aliases the lower + * triangle in our row-major view, which is the layout the validation + * routine expects (and which matches the LAPACKE_dpotrf2 reference). + * + * Caller is responsible for having invoked plasma_init() at startup; that + * cost is intentionally amortised over all timed calls and stays out of the + * timed region. 
+ */ +void plasma_cholesky(std::vector &A, int N); + +} // end of namespace cpu +#endif // end of CPU_PLASMA_FACTOR_H diff --git a/reference/core/src/cholesky_factor.cpp b/reference/core/src/cholesky_factor.cpp index 4fd93fb..677feba 100644 --- a/reference/core/src/cholesky_factor.cpp +++ b/reference/core/src/cholesky_factor.cpp @@ -1,15 +1,34 @@ #include "cholesky_factor.hpp" #include "adapter_cblas_fp64.hpp" +#ifdef ENABLE_PLASMA +#include "plasma_factor.hpp" +#endif + +#include namespace cpu { -void parallel_blas_cholesky(std::vector &A, int N) +void parallel_blas_cholesky(Variant variant, std::vector &A, int N) { - // The whole factorisation is one threaded LAPACKE call; the BLAS library - // takes care of dispatching work across the available threads. - potrf(A, N); + switch (variant) + { + case Variant::reference: + // Single threaded LAPACKE call on the full matrix; the BLAS + // library dispatches work across the available threads. + potrf(A, N); + return; + + case Variant::plasma: +#ifdef ENABLE_PLASMA + plasma_cholesky(A, N); + return; +#else + throw std::invalid_argument( + "Variant 'plasma' requested but the binary was built without ENABLE_PLASMA=ON"); +#endif + } } } // end of namespace cpu diff --git a/reference/core/src/functions.cpp b/reference/core/src/functions.cpp index abe4cb0..e2986ea 100644 --- a/reference/core/src/functions.cpp +++ b/reference/core/src/functions.cpp @@ -6,12 +6,13 @@ namespace cpu { -double cholesky(std::vector &A, std::size_t N) +double cholesky(std::vector &A, std::size_t N, const std::string &variant) { + const Variant v = to_variant(variant); auto start = std::chrono::high_resolution_clock::now(); /////////////////////////////////////////////////////////////////////////// - // Launch Cholesky decomposition: A = L * L^T (single threaded LAPACKE call) - parallel_blas_cholesky(A, static_cast(N)); + // Launch Cholesky decomposition: A = L * L^T (single dispatched call) + parallel_blas_cholesky(v, A, static_cast(N)); 
/////////////////////////////////////////////////////////////////////////// auto stop = std::chrono::high_resolution_clock::now(); return (stop - start).count() / 1e9; diff --git a/reference/core/src/plasma_factor.cpp b/reference/core/src/plasma_factor.cpp new file mode 100644 index 0000000..2618c38 --- /dev/null +++ b/reference/core/src/plasma_factor.cpp @@ -0,0 +1,25 @@ +#include "plasma_factor.hpp" + +#include + +#include +#include + +namespace cpu +{ + +void plasma_cholesky(std::vector &A, int N) +{ + // PLASMA is column-major. Our buffer is row-major and the matrix is + // symmetric, so we can pass it through unchanged and ask PLASMA to write + // its result into the upper triangle of its column-major view -- that + // upper triangle aliases the lower triangle of our row-major view, which + // is the layout the validator (and the LAPACKE reference path) expects. + const int info = plasma_dpotrf(PlasmaUpper, N, A.data(), N); + if (info != 0) + { + throw std::runtime_error("plasma_dpotrf failed with info=" + std::to_string(info)); + } +} + +} // end of namespace cpu diff --git a/reference/main.cpp b/reference/main.cpp index 0d58b8f..96d52e4 100644 --- a/reference/main.cpp +++ b/reference/main.cpp @@ -3,10 +3,14 @@ #ifdef ENABLE_VALIDATION #include "validate.hpp" #endif +#ifdef ENABLE_PLASMA +#include +#endif #include #include #include #include +#include #include #include @@ -63,6 +67,15 @@ int main(int argc, char *argv[]) std::ofstream runtime_file; runtime_file.open(runtime_file_path, std::ios_base::app); +#ifdef ENABLE_PLASMA + // PLASMA spins up its own context and worker pool; do this once so the + // cost is not folded into any timed factorisation. 
+ if (plasma_init() != 0) + { + throw std::runtime_error("plasma_init() failed"); + } +#endif + for (std::size_t size = START_SIZE; size <= STOP_SIZE; size = size * STEP_SIZE) { for (std::size_t l = 0; l < LOOP; l++) @@ -77,13 +90,18 @@ int main(int argc, char *argv[]) values += std::string(";") + std::to_string(size); values += std::string(";") + std::to_string(1); /////////////////////////////////////////////////////////////////// - // Single mode: parallel-BLAS reference dpotrf on the full matrix. + // Reference modes: + // reference -> single threaded LAPACKE_dpotrf2 on the full matrix + // plasma -> single plasma_dpotrf (added when ENABLE_PLASMA=ON) std::vector modes = { "reference" }; +#ifdef ENABLE_PLASMA + modes.push_back("plasma"); +#endif for (const auto &mode : modes) { auto A = gen_matrix(size); - auto cholesky_cpu = cpu::cholesky(A, size); + auto cholesky_cpu = cpu::cholesky(A, size, mode); header += ";" + mode; values += ";" + std::to_string(cholesky_cpu); @@ -115,5 +133,10 @@ int main(int argc, char *argv[]) } runtime_file.close(); + +#ifdef ENABLE_PLASMA + plasma_finalize(); +#endif + return 0; } From eb3e93b2847c44a24707804c9cf4b63a8b8fb424 Mon Sep 17 00:00:00 2001 From: constracktor <74077030+constracktor@users.noreply.github.com> Date: Wed, 29 Apr 2026 23:48:33 +0200 Subject: [PATCH 05/13] Add plasma guard and tile variant --- README.md | 22 +++-- reference/core/include/cholesky_factor.hpp | 16 ++-- reference/core/include/plasma_factor.hpp | 28 ++++++- reference/core/src/cholesky_factor.cpp | 9 ++ reference/core/src/plasma_factor.cpp | 98 ++++++++++++++++++++++ reference/main.cpp | 53 ++++++++++-- 6 files changed, 209 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 0287f4b..2b0ebff 100644 --- a/README.md +++ b/README.md @@ -29,9 +29,21 @@ Cholesky-Bench benchmarks right-looking tiled Cholesky factorization from fork-j | Mode | Description | |------|-------------| | `reference` | Single threaded `LAPACKE_dpotrf` call on the 
full matrix; no tiling. Parallelism is delegated entirely to a threaded BLAS (OpenBLAS built with `threads=openmp`, or threaded Intel oneMKL via `ENABLE_MKL=ON`). | -| `plasma` | Single `plasma_dpotrf` call on the full matrix. PLASMA does its own tiled, OpenMP-task-based parallel Cholesky internally; tile size is left at PLASMA's built-in default. Built only when `ENABLE_PLASMA=ON`. | +| `plasma` | Single `plasma_dpotrf` call on the full matrix (PLASMA's high-level synchronous API). PLASMA does its own tiled, OpenMP-task-based parallel Cholesky internally; tile size is left at PLASMA's built-in default. Built only when `ENABLE_PLASMA=ON`. | +| `plasma_tile` | `plasma_omp_dpotrf` over a manually-built `plasma_desc_t` (PLASMA's asynchronous tile interface). Allocates the tile-layout backing store in user code (so PLASMA's `_create` routines never run) and wraps it via `plasma_desc_general_init`, which avoids the int32 overflow that bounds the `plasma` mode. Built only when `ENABLE_PLASMA=ON`. | -This directory is the natural baseline for the OpenMP and HPX tiled implementations: the `reference` mode isolates the contribution of vendor-provided dense-LA parallelism, and the `plasma` mode adds a true tiled-parallel competitor that uses the same OpenMP runtime as the in-house variants. +This directory is the natural baseline for the OpenMP and HPX tiled implementations: the `reference` mode isolates the contribution of vendor-provided dense-LA parallelism, and the two `plasma*` modes add true tiled-parallel competitors that use the same OpenMP runtime as the in-house variants. 
+ +#### PLASMA descriptor int32 overflow + +PLASMA 24.8.7's `plasma_desc_*_create()` routines compute their tile-storage size as `int * int` before casting to `size_t`, which silently overflows once the padded tile-area exceeds `INT32_MAX`: + +| Path | Behaviour past the boundary (default `nb=256`) | +|------|------------------------------------------------| +| `plasma` (high-level, triangular descriptor) | Skipped for `N > 65280`. The benchmark detects the overflow condition before invoking PLASMA and records `nan` for that cell instead of triggering PLASMA's multi-line `malloc() failed` diagnostic. | +| `plasma_tile` (tile API, user-allocated buffer) | Continues to run. The tile path allocates its own tile-layout backing store with `size_t` arithmetic and wraps it via `plasma_desc_general_init`, so no `_create`/malloc happens inside PLASMA at all. The int32 ceiling does not apply. | + +Patching `(size_t)` casts into `control/descriptor.c` in the spack PLASMA package removes the ceiling for the high-level path too and the guard becomes a no-op. ## Dependencies @@ -152,11 +164,11 @@ threads;problem_size;tile_size;n_tiles;for_collapse;for_naive;task_naive;task_de 128;65536;1024;64;3.14;3.21;2.98;2.87 ``` -The `reference/` binary reports a `reference` column (and a `plasma` column when built with `ENABLE_PLASMA=ON`), with `tile_size = problem_size` and `n_tiles = 1`, so its runtime files merge cleanly with the tiled benchmarks on the `problem_size` key: +The `reference/` binary reports a `reference` column (and `plasma` + `plasma_tile` columns when built with `ENABLE_PLASMA=ON`), with `tile_size = problem_size` and `n_tiles = 1`, so its runtime files merge cleanly with the tiled benchmarks on the `problem_size` key: ``` -threads;problem_size;tile_size;n_tiles;reference;plasma -128;65536;65536;1;2.71;2.45 +threads;problem_size;tile_size;n_tiles;reference;plasma;plasma_tile +128;65280;65280;1;2.71;68.12;71.30 ``` The same lines are also printed to stdout. 
diff --git a/reference/core/include/cholesky_factor.hpp b/reference/core/include/cholesky_factor.hpp index 8ec3c4a..f7f1b2f 100644 --- a/reference/core/include/cholesky_factor.hpp +++ b/reference/core/include/cholesky_factor.hpp @@ -13,12 +13,14 @@ namespace cpu /** * @brief Reference Cholesky variants. * - * - reference : single threaded LAPACKE_dpotrf2 call (no tiling; parallelism - * lives entirely inside the threaded BLAS). - * - plasma : single plasma_dpotrf call (PLASMA's own tiled parallel - * Cholesky over the OpenMP runtime). + * - reference : single threaded LAPACKE_dpotrf2 call (no tiling; + * parallelism lives entirely inside the threaded BLAS). + * - plasma : single plasma_dpotrf call (PLASMA's high-level + * synchronous Cholesky over the OpenMP runtime). + * - plasma_tile : plasma_omp_dpotrf called over a manually-built tile + * descriptor (PLASMA's asynchronous tile interface). */ -enum class Variant { reference, plasma }; +enum class Variant { reference, plasma, plasma_tile }; inline Variant to_variant(const std::string &s) { @@ -30,6 +32,10 @@ inline Variant to_variant(const std::string &s) { return Variant::plasma; } + if (s == "plasma_tile") + { + return Variant::plasma_tile; + } throw std::invalid_argument("Unknown Variant: " + s); } diff --git a/reference/core/include/plasma_factor.hpp b/reference/core/include/plasma_factor.hpp index cfd85d7..a07edbe 100644 --- a/reference/core/include/plasma_factor.hpp +++ b/reference/core/include/plasma_factor.hpp @@ -9,7 +9,8 @@ namespace cpu { /** - * @brief PLASMA tiled Cholesky on a row-major N x N buffer. + * @brief PLASMA tiled Cholesky on a row-major N x N buffer using the + * high-level synchronous API (plasma_dpotrf). 
* * PLASMA's high-level API is column-major, so we ask for @c PlasmaUpper: * the upper triangle in PLASMA's column-major view aliases the lower @@ -19,8 +20,33 @@ namespace cpu * Caller is responsible for having invoked plasma_init() at startup; that * cost is intentionally amortised over all timed calls and stays out of the * timed region. + * + * Throws @c std::runtime_error before calling PLASMA if the descriptor + * size computation inside plasma_desc_triangular_create() would overflow + * int32 (PLASMA 24.8.7 still does this multiplication in @c int). This + * keeps PLASMA's own multi-line error spam off stderr when the surrounding + * sweep walks past N=65280. */ void plasma_cholesky(std::vector &A, int N); +/** + * @brief PLASMA tiled Cholesky on a row-major N x N buffer using the + * asynchronous tile interface (plasma_omp_dpotrf). + * + * Allocates the tile-layout backing store ourselves with size_t + * arithmetic, then wraps it in a @c plasma_desc_t via + * plasma_desc_general_init -- which performs no malloc and therefore + * sidesteps PLASMA 24.8.7's int32 overflow inside the create routines. + * This means the tile path is expected to keep working past N>65280 + * where the high-level @c plasma_cholesky aborts. + * + * After the descriptor is set up, PLASMA's tile-API routines translate + * our row-major buffer into tile layout (plasma_omp_dge2desc), run the + * tiled factorisation (plasma_omp_dpotrf with PlasmaUpper), and + * translate back (plasma_omp_ddesc2ge). The output layout matches the + * high-level path: row-major lower triangle holds L. 
+ */ +void plasma_tile_cholesky(std::vector &A, int N); + } // end of namespace cpu #endif // end of CPU_PLASMA_FACTOR_H diff --git a/reference/core/src/cholesky_factor.cpp b/reference/core/src/cholesky_factor.cpp index 677feba..81f054e 100644 --- a/reference/core/src/cholesky_factor.cpp +++ b/reference/core/src/cholesky_factor.cpp @@ -28,6 +28,15 @@ void parallel_blas_cholesky(Variant variant, std::vector &A, int N) throw std::invalid_argument( "Variant 'plasma' requested but the binary was built without ENABLE_PLASMA=ON"); #endif + + case Variant::plasma_tile: +#ifdef ENABLE_PLASMA + plasma_tile_cholesky(A, N); + return; +#else + throw std::invalid_argument( + "Variant 'plasma_tile' requested but the binary was built without ENABLE_PLASMA=ON"); +#endif } } diff --git a/reference/core/src/plasma_factor.cpp b/reference/core/src/plasma_factor.cpp index 2618c38..d7fae86 100644 --- a/reference/core/src/plasma_factor.cpp +++ b/reference/core/src/plasma_factor.cpp @@ -2,14 +2,56 @@ #include +#include +#include #include #include +#include namespace cpu { +namespace +{ + +// PLASMA's default tile size for fp64 (typical 24.x default). We hardcode +// this rather than calling plasma_get(PlasmaNb, ...) so the overflow guard +// below stays portable across PLASMA versions. If you tune via +// plasma_set(PlasmaNb, ...) at startup, keep this matching. +constexpr int kPlasmaDefaultNb = 256; + +// Pre-flight: would PLASMA's int32 multiplication for descriptor sizing +// overflow? PLASMA 24.8.7's plasma_desc_*_create routines compute the +// total tile-layout backing-store size as int*int and then cast to size_t, +// so the malloc gets a sign-extended-negative argument and fails for any +// padded total >= INT32_MAX. We replicate the math here and throw before +// invoking PLASMA, which avoids the multi-line PLASMA ERROR diagnostic on +// stderr and keeps the surrounding sweep clean. +// +// Only used for the high-level path. 
The tile path bypasses _create entirely +// by allocating its tile buffer in user code, so it does not need this. +void guard_descriptor_overflow(int N, int nb, bool triangular, const char *which) +{ + const long long mt = (N + nb - 1) / nb; + const long long padded = + triangular ? (mt * (mt + 1) / 2) * static_cast(nb) * nb + : mt * mt * static_cast(nb) * nb; + if (padded > static_cast(INT_MAX)) + { + throw std::runtime_error( + std::string(which) + ": skipped to avoid PLASMA descriptor int32 overflow at N=" + std::to_string(N) + + " (nb=" + std::to_string(nb) + ", mt=" + std::to_string(mt) + + ", padded elements=" + std::to_string(padded) + " > INT32_MAX)"); + } +} + +} // anonymous namespace void plasma_cholesky(std::vector &A, int N) { + // High-level plasma_dpotrf allocates a triangular tile descriptor + // internally; overflow check uses the triangular size formula. + guard_descriptor_overflow(N, kPlasmaDefaultNb, /*triangular=*/true, "plasma_dpotrf"); + // PLASMA is column-major. Our buffer is row-major and the matrix is // symmetric, so we can pass it through unchanged and ask PLASMA to write // its result into the upper triangle of its column-major view -- that @@ -22,4 +64,60 @@ void plasma_cholesky(std::vector &A, int N) } } +void plasma_tile_cholesky(std::vector &A, int N) +{ + // The tile path sidesteps PLASMA 24.8.7's int32 overflow in + // plasma_desc_general_create() by *allocating the tile-layout backing + // store ourselves* and handing PLASMA a descriptor that merely wraps + // it. plasma_desc_general_init does no malloc, so the buggy + // multiplication is never reached. Our std::vector handles size_t + // arithmetic correctly and frees the buffer on scope exit. + // + // PLASMA may still hit additional int math on its internal tile-offset + // computations during execution; if so, plasma_omp_dpotrf will mark + // the sequence with a non-zero status, we'll throw, and main.cpp's + // try/catch will record nan for this cell. 
But the malloc-overflow + // failure that hits at N>~46080 with the create path is gone. + const int nb = kPlasmaDefaultNb; + const long long mt_ll = (N + nb - 1) / nb; + const int mt = static_cast(mt_ll); + const int lm = mt * nb; // padded leading dimension; fits int32 even for huge N + const std::size_t tile_buf_elements = static_cast(lm) * static_cast(lm); + std::vector tile_buf(tile_buf_elements); + + plasma_desc_t descA; + int retval = plasma_desc_general_init(PlasmaRealDouble, tile_buf.data(), nb, nb, lm, lm, 0, 0, N, N, &descA); + if (retval != PlasmaSuccess) + { + throw std::runtime_error("plasma_desc_general_init failed with retval=" + std::to_string(retval)); + } + + // PLASMA 24.8.7's tile interface uses stack-allocated sequence/request + // structs (no plasma_sequence_create/destroy, no PlasmaRequestInitializer + // macro). Zero-init lands status=0=PlasmaSuccess, which is the expected + // pre-call state for both structs. + plasma_sequence_t sequence{}; + plasma_request_t request{}; + + // Translate row-major buffer -> tile descriptor, factor in place on the + // descriptor, translate back. Same PlasmaUpper convention as the + // high-level path, so the resulting layout (row-major lower triangle = L) + // matches what the validator expects. +#pragma omp parallel +#pragma omp master + { + plasma_omp_dge2desc(A.data(), N, descA, &sequence, &request); + plasma_omp_dpotrf(PlasmaUpper, descA, &sequence, &request); + plasma_omp_ddesc2ge(descA, A.data(), N, &sequence, &request); + } + + // No plasma_desc_destroy: the descriptor never owned the buffer (we did), + // and tile_buf goes out of scope here. No sequence destroy: stack-alloc. 
+ + if (sequence.status != PlasmaSuccess) + { + throw std::runtime_error("plasma tile sequence failed with status=" + std::to_string(sequence.status)); + } +} + } // end of namespace cpu diff --git a/reference/main.cpp b/reference/main.cpp index 96d52e4..f6b8c17 100644 --- a/reference/main.cpp +++ b/reference/main.cpp @@ -6,9 +6,12 @@ #ifdef ENABLE_PLASMA #include #endif +#include #include +#include #include #include +#include #include #include #include @@ -91,19 +94,57 @@ int main(int argc, char *argv[]) values += std::string(";") + std::to_string(1); /////////////////////////////////////////////////////////////////// // Reference modes: - // reference -> single threaded LAPACKE_dpotrf2 on the full matrix - // plasma -> single plasma_dpotrf (added when ENABLE_PLASMA=ON) - std::vector modes = { "reference" }; + // reference -> single threaded LAPACKE_dpotrf2 on the full + // matrix (currently disabled; uncomment the + // initializer below to re-enable) + // plasma -> single plasma_dpotrf (high-level synchronous + // PLASMA API; added when ENABLE_PLASMA=ON) + // plasma_tile -> plasma_omp_dpotrf over a manually-built + // plasma_desc_t (PLASMA's asynchronous tile + // interface; added when ENABLE_PLASMA=ON) + std::vector modes = { + // "reference", + }; #ifdef ENABLE_PLASMA modes.push_back("plasma"); + modes.push_back("plasma_tile"); #endif for (const auto &mode : modes) { - auto A = gen_matrix(size); - auto cholesky_cpu = cpu::cholesky(A, size, mode); - header += ";" + mode; + + // We let one mode fail (e.g. PLASMA running out of memory at + // very large N -- its high-level wrapper allocates an extra + // tiled triangular copy on top of the input buffer) without + // killing the whole sweep. The failed cell is recorded as NaN + // and we continue with the next mode and size. + std::vector A; + try + { + A = gen_matrix(size); + } + catch (const std::exception &e) + { + std::cerr << "Error: gen_matrix(size=" << size << ") threw '" << e.what() + << "'. 
Recording NaN for variant '" << mode << "'." << std::endl; + values += ";nan"; + continue; + } + + double cholesky_cpu = std::numeric_limits::quiet_NaN(); + try + { + cholesky_cpu = cpu::cholesky(A, size, mode); + } + catch (const std::exception &e) + { + std::cerr << "Error: variant '" << mode << "' failed at size=" << size << ": " << e.what() + << ". Recording NaN and continuing." << std::endl; + values += ";nan"; + continue; + } + values += ";" + std::to_string(cholesky_cpu); #ifdef ENABLE_VALIDATION From e545c35b8a3b499cf5734bd82353fda0c77d1150 Mon Sep 17 00:00:00 2001 From: constracktor <74077030+constracktor@users.noreply.github.com> Date: Thu, 30 Apr 2026 10:00:37 +0200 Subject: [PATCH 06/13] Improved plasma tiled --- README.md | 12 +++--- reference/core/include/plasma_factor.hpp | 26 +++++++----- reference/core/src/plasma_factor.cpp | 51 ++++++++++++++---------- 3 files changed, 53 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index 2b0ebff..aeb7bdb 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ Cholesky-Bench benchmarks right-looking tiled Cholesky factorization from fork-j |------|-------------| | `reference` | Single threaded `LAPACKE_dpotrf` call on the full matrix; no tiling. Parallelism is delegated entirely to a threaded BLAS (OpenBLAS built with `threads=openmp`, or threaded Intel oneMKL via `ENABLE_MKL=ON`). | | `plasma` | Single `plasma_dpotrf` call on the full matrix (PLASMA's high-level synchronous API). PLASMA does its own tiled, OpenMP-task-based parallel Cholesky internally; tile size is left at PLASMA's built-in default. Built only when `ENABLE_PLASMA=ON`. | -| `plasma_tile` | `plasma_omp_dpotrf` over a manually-built `plasma_desc_t` (PLASMA's asynchronous tile interface). Allocates the tile-layout backing store in user code (so PLASMA's `_create` routines never run) and wraps it via `plasma_desc_general_init`, which avoids the int32 overflow that bounds the `plasma` mode. Built only when `ENABLE_PLASMA=ON`. 
| +| `plasma_tile` | `plasma_omp_dpotrf` over a manually-built `plasma_desc_t` (PLASMA's asynchronous tile interface). Allocates an *uninitialised* general (full N×N) tile-layout backing store in user code and wraps it via `plasma_desc_general_init` — PLASMA's `_create` routines never run. Leaving the buffer uninitialised lets `plasma_omp_dge2desc` first-touch each tile from its consuming core, so pages land on the right NUMA node and a chunk of the runtime gap with `plasma` closes. Built only when `ENABLE_PLASMA=ON`. | This directory is the natural baseline for the OpenMP and HPX tiled implementations: the `reference` mode isolates the contribution of vendor-provided dense-LA parallelism, and the two `plasma*` modes add true tiled-parallel competitors that use the same OpenMP runtime as the in-house variants. @@ -38,12 +38,12 @@ This directory is the natural baseline for the OpenMP and HPX tiled implementati PLASMA 24.8.7's `plasma_desc_*_create()` routines compute their tile-storage size as `int * int` before casting to `size_t`, which silently overflows once the padded tile-area exceeds `INT32_MAX`: -| Path | Behaviour past the boundary (default `nb=256`) | -|------|------------------------------------------------| -| `plasma` (high-level, triangular descriptor) | Skipped for `N > 65280`. The benchmark detects the overflow condition before invoking PLASMA and records `nan` for that cell instead of triggering PLASMA's multi-line `malloc() failed` diagnostic. | -| `plasma_tile` (tile API, user-allocated buffer) | Continues to run. The tile path allocates its own tile-layout backing store with `size_t` arithmetic and wraps it via `plasma_desc_general_init`, so no `_create`/malloc happens inside PLASMA at all. The int32 ceiling does not apply. 
| +| Path | Boundary (default `nb=256`) | Behaviour past the boundary | +|------|------------------------------|------------------------------| +| `plasma` (high-level, triangular descriptor) | `N > 65280` | Skipped before invoking PLASMA. Records `nan` instead of triggering PLASMA's multi-line `malloc() failed` diagnostic. | +| `plasma_tile` (tile API, user-allocated general buffer) | `N > 46080` | Skipped before invoking PLASMA. The user-allocated buffer avoids `_create`'s malloc-overflow, but PLASMA does additional int32 tile-offset arithmetic *during execution* of `plasma_omp_dpotrf`, which segfaults past this boundary. The guard makes the failure clean. | -Patching `(size_t)` casts into `control/descriptor.c` in the spack PLASMA package removes the ceiling for the high-level path too and the guard becomes a no-op. +Patching `(size_t)` casts into `control/descriptor.c` and the tile-offset code in the spack PLASMA package removes both ceilings, and the guards become no-ops. ## Dependencies diff --git a/reference/core/include/plasma_factor.hpp b/reference/core/include/plasma_factor.hpp index a07edbe..f7639ea 100644 --- a/reference/core/include/plasma_factor.hpp +++ b/reference/core/include/plasma_factor.hpp @@ -33,18 +33,26 @@ void plasma_cholesky(std::vector &A, int N); * @brief PLASMA tiled Cholesky on a row-major N x N buffer using the * asynchronous tile interface (plasma_omp_dpotrf). * - * Allocates the tile-layout backing store ourselves with size_t - * arithmetic, then wraps it in a @c plasma_desc_t via + * Allocates an *uninitialised* general (full N x N) tile-layout backing + * store ourselves and wraps it in a @c plasma_desc_t via * plasma_desc_general_init -- which performs no malloc and therefore * sidesteps PLASMA 24.8.7's int32 overflow inside the create routines. - * This means the tile path is expected to keep working past N>65280 - * where the high-level @c plasma_cholesky aborts. 
* - * After the descriptor is set up, PLASMA's tile-API routines translate - * our row-major buffer into tile layout (plasma_omp_dge2desc), run the - * tiled factorisation (plasma_omp_dpotrf with PlasmaUpper), and - * translate back (plasma_omp_ddesc2ge). The output layout matches the - * high-level path: row-major lower triangle holds L. + * Leaving the buffer uninitialised lets plasma_omp_dge2desc first-touch + * each tile from its consuming core, so pages land on the right NUMA + * node instead of all on the main thread's. That is the optimisation + * that closes part of the runtime gap with @c plasma_cholesky; the + * remainder of the gap is the wider working-set of the general + * descriptor (full N*N tile area vs the high-level path's triangular + * mt*(mt+1)/2 area), which would only be recovered by switching to + * @c plasma_desc_triangular_init -- attempted but found incompatible + * with the dge2desc/ddesc2ge translation routines in PLASMA 24.8.7. + * + * Note: PLASMA does int32 tile-offset arithmetic during execution as + * well, so the tile path is also bounded by an int32 overflow guard + * (general formula). Past the bound this function throws and + * @c main.cpp's catch handler records @c nan rather than letting PLASMA + * segfault. */ void plasma_tile_cholesky(std::vector &A, int N); diff --git a/reference/core/src/plasma_factor.cpp b/reference/core/src/plasma_factor.cpp index d7fae86..897a05d 100644 --- a/reference/core/src/plasma_factor.cpp +++ b/reference/core/src/plasma_factor.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -27,8 +28,11 @@ constexpr int kPlasmaDefaultNb = 256; // invoking PLASMA, which avoids the multi-line PLASMA ERROR diagnostic on // stderr and keeps the surrounding sweep clean. // -// Only used for the high-level path. The tile path bypasses _create entirely -// by allocating its tile buffer in user code, so it does not need this. +// Used for both paths. 
The high-level path needs it because of the malloc +// inside _create; the tile path needs it because PLASMA also does int32 +// tile-offset arithmetic *during execution* (segfaults at N>~46080 with the +// general descriptor and default nb), even though we allocate the buffer +// ourselves and bypass _create entirely. void guard_descriptor_overflow(int N, int nb, bool triangular, const char *which) { const long long mt = (N + nb - 1) / nb; @@ -66,36 +70,44 @@ void plasma_cholesky(std::vector &A, int N) void plasma_tile_cholesky(std::vector &A, int N) { - // The tile path sidesteps PLASMA 24.8.7's int32 overflow in - // plasma_desc_general_create() by *allocating the tile-layout backing - // store ourselves* and handing PLASMA a descriptor that merely wraps - // it. plasma_desc_general_init does no malloc, so the buggy - // multiplication is never reached. Our std::vector handles size_t - // arithmetic correctly and frees the buffer on scope exit. + // Pre-flight: PLASMA does int32 tile-offset arithmetic during execution + // (not just inside _create), so the general descriptor still hits an + // overflow ceiling at N>~46080 with the default nb. Without this guard + // plasma_omp_dpotrf segfaults rather than failing cleanly. + guard_descriptor_overflow(N, kPlasmaDefaultNb, /*triangular=*/false, "plasma_omp_dpotrf"); + + // The tile path bypasses PLASMA's _create allocator (which has the + // int32-multiply malloc bug) by allocating the tile-layout backing + // store ourselves and wrapping it with plasma_desc_general_init. _init + // performs no malloc, so the buggy multiplication is never reached. // - // PLASMA may still hit additional int math on its internal tile-offset - // computations during execution; if so, plasma_omp_dpotrf will mark - // the sequence with a non-zero status, we'll throw, and main.cpp's - // try/catch will record nan for this cell. But the malloc-overflow - // failure that hits at N>~46080 with the create path is gone. 
+ // The buffer is *uninitialised* (new double[N], not value-initialised + // with std::vector). Two reasons: (1) skips a multi-GB zero-init pass + // run on the main thread, and (2) lets plasma_omp_dge2desc first-touch + // each tile from its consuming core, so pages land on the right NUMA + // node instead of all on the main thread's node. That's what shaves + // time off the general-descriptor tile path here. + const int nb = kPlasmaDefaultNb; const long long mt_ll = (N + nb - 1) / nb; const int mt = static_cast(mt_ll); const int lm = mt * nb; // padded leading dimension; fits int32 even for huge N + const std::size_t tile_buf_elements = static_cast(lm) * static_cast(lm); - std::vector tile_buf(tile_buf_elements); + + std::unique_ptr tile_buf(new double[tile_buf_elements]); plasma_desc_t descA; - int retval = plasma_desc_general_init(PlasmaRealDouble, tile_buf.data(), nb, nb, lm, lm, 0, 0, N, N, &descA); + int retval = + plasma_desc_general_init(PlasmaRealDouble, tile_buf.get(), nb, nb, lm, lm, 0, 0, N, N, &descA); if (retval != PlasmaSuccess) { throw std::runtime_error("plasma_desc_general_init failed with retval=" + std::to_string(retval)); } // PLASMA 24.8.7's tile interface uses stack-allocated sequence/request - // structs (no plasma_sequence_create/destroy, no PlasmaRequestInitializer - // macro). Zero-init lands status=0=PlasmaSuccess, which is the expected - // pre-call state for both structs. + // structs. Zero-init lands status=0=PlasmaSuccess, the expected + // pre-call state. plasma_sequence_t sequence{}; plasma_request_t request{}; @@ -111,9 +123,6 @@ void plasma_tile_cholesky(std::vector &A, int N) plasma_omp_ddesc2ge(descA, A.data(), N, &sequence, &request); } - // No plasma_desc_destroy: the descriptor never owned the buffer (we did), - // and tile_buf goes out of scope here. No sequence destroy: stack-alloc. 
- if (sequence.status != PlasmaSuccess) { throw std::runtime_error("plasma tile sequence failed with status=" + std::to_string(sequence.status)); From 2a9d90ba87019e6587f48da30cc61cc0db671f40 Mon Sep 17 00:00:00 2001 From: constracktor <74077030+constracktor@users.noreply.github.com> Date: Thu, 30 Apr 2026 10:48:05 +0200 Subject: [PATCH 07/13] Remove tiled plasma --- README.md | 31 +++--- reference/CMakeLists.txt | 8 ++ reference/compile.sh | 15 ++- reference/core/include/cholesky_factor.hpp | 16 +-- reference/core/include/plasma_factor.hpp | 38 ++----- reference/core/src/cholesky_factor.cpp | 9 -- reference/core/src/plasma_factor.cpp | 120 +++------------------ reference/main.cpp | 34 +++--- 8 files changed, 85 insertions(+), 186 deletions(-) diff --git a/README.md b/README.md index aeb7bdb..3307869 100644 --- a/README.md +++ b/README.md @@ -28,22 +28,21 @@ Cholesky-Bench benchmarks right-looking tiled Cholesky factorization from fork-j | Mode | Description | |------|-------------| -| `reference` | Single threaded `LAPACKE_dpotrf` call on the full matrix; no tiling. Parallelism is delegated entirely to a threaded BLAS (OpenBLAS built with `threads=openmp`, or threaded Intel oneMKL via `ENABLE_MKL=ON`). | +| `reference` | Single threaded `LAPACKE_dpotrf` call on the full matrix; no tiling. Parallelism is delegated entirely to a threaded BLAS (OpenBLAS built with `threads=openmp`, or threaded Intel oneMKL via `ENABLE_MKL=ON`). Enabled by default; disable with `DISABLE_BLAS_REFERENCE=ON`. | | `plasma` | Single `plasma_dpotrf` call on the full matrix (PLASMA's high-level synchronous API). PLASMA does its own tiled, OpenMP-task-based parallel Cholesky internally; tile size is left at PLASMA's built-in default. Built only when `ENABLE_PLASMA=ON`. | -| `plasma_tile` | `plasma_omp_dpotrf` over a manually-built `plasma_desc_t` (PLASMA's asynchronous tile interface). 
Allocates an *uninitialised* general (full N×N) tile-layout backing store in user code and wraps it via `plasma_desc_general_init` — PLASMA's `_create` routines never run. Leaving the buffer uninitialised lets `plasma_omp_dge2desc` first-touch each tile from its consuming core, so pages land on the right NUMA node and a chunk of the runtime gap with `plasma` closes. Built only when `ENABLE_PLASMA=ON`. | -This directory is the natural baseline for the OpenMP and HPX tiled implementations: the `reference` mode isolates the contribution of vendor-provided dense-LA parallelism, and the two `plasma*` modes add true tiled-parallel competitors that use the same OpenMP runtime as the in-house variants. +This directory is the natural baseline for the OpenMP and HPX tiled implementations: the `reference` mode isolates the contribution of vendor-provided dense-LA parallelism, and the `plasma` mode adds a tiled-parallel competitor that uses the same OpenMP runtime as the in-house variants. #### PLASMA descriptor int32 overflow -PLASMA 24.8.7's `plasma_desc_*_create()` routines compute their tile-storage size as `int * int` before casting to `size_t`, which silently overflows once the padded tile-area exceeds `INT32_MAX`: +PLASMA 24.8.7's `plasma_desc_*_create()` routines compute their tile-storage size as `int * int` before casting to `size_t`, which silently overflows once the padded triangular tile-area exceeds `INT32_MAX`. With the default `nb=256`, the boundary is at `N=65280` (`mt=255`). -| Path | Boundary (default `nb=256`) | Behaviour past the boundary | -|------|------------------------------|------------------------------| -| `plasma` (high-level, triangular descriptor) | `N > 65280` | Skipped before invoking PLASMA. Records `nan` instead of triggering PLASMA's multi-line `malloc() failed` diagnostic. | -| `plasma_tile` (tile API, user-allocated general buffer) | `N > 46080` | Skipped before invoking PLASMA. 
The user-allocated buffer avoids `_create`'s malloc-overflow, but PLASMA does additional int32 tile-offset arithmetic *during execution* of `plasma_omp_dpotrf`, which segfaults past this boundary. The guard makes the failure clean. | +The benchmark handles this transparently: -Patching `(size_t)` casts into `control/descriptor.c` and the tile-offset code in the spack PLASMA package removes both ceilings, and the guards become no-ops. +- For sweep sizes `N` in `(65280, 65536]` the working size is **clamped to 65280** for the whole row (both `reference` and `plasma` run at 65280, and the `problem_size` column reports 65280). This keeps the largest practical PLASMA point on the curve without touching the underlying PLASMA build. +- For `N > 65536` `plasma` records `nan`. `reference` (LAPACKE) is unaffected by the int32 ceiling and continues normally. + +Patching `(size_t)` casts into `control/descriptor.c` in the spack PLASMA package removes the ceiling and the clamp + guard become no-ops. ## Dependencies @@ -84,7 +83,8 @@ These can be set as environment variables before calling `compile.sh`: | `DISABLE_COMPUTATION` | `OFF` | *(`openmp/` and `hpx/` only)* Replace all BLAS/tile-generation calls with no-ops. The task graph and loops remain intact, so scheduling overhead can be measured in isolation. | | `ENABLE_DYNAMIC_SCHEDULE` | `OFF` | *(`openmp/` only)* Use `schedule(dynamic,1)` on the trailing-update worksharing loops in `for_collapse`. Requires the LLVM toolchain; rejected at compile time with GCC. | | `ENABLE_MKL` | `OFF` | Link against Intel oneMKL instead of OpenBLAS. In `openmp/` and `hpx/` this is the *sequential* MKL; in `reference/` it is the *threaded* MKL. | -| `ENABLE_PLASMA` | `OFF` | *(`reference/` only)* Also build the PLASMA tiled-Cholesky variant. Adds a `plasma` column alongside `reference` in the runtime output. | +| `ENABLE_PLASMA` | `OFF` | *(`reference/` only)* Also build the PLASMA `plasma_dpotrf` variant. 
Adds a `plasma` column alongside `reference` in the runtime output. | +| `DISABLE_BLAS_REFERENCE` | `OFF` | *(`reference/` only)* Skip the LAPACKE_dpotrf reference mode at runtime, so only `plasma` runs (when `ENABLE_PLASMA=ON`). Linking is unchanged — PLASMA and validation still need cblas/lapacke symbols. | **Examples:** @@ -103,6 +103,9 @@ ENABLE_MKL=ON ./compile.sh # Reference: also build the PLASMA tiled-Cholesky variant ENABLE_PLASMA=ON ./compile.sh + +# Reference: PLASMA only, skip the LAPACKE_dpotrf column at runtime +DISABLE_BLAS_REFERENCE=ON ENABLE_PLASMA=ON ./compile.sh ``` ## Run @@ -164,11 +167,11 @@ threads;problem_size;tile_size;n_tiles;for_collapse;for_naive;task_naive;task_de 128;65536;1024;64;3.14;3.21;2.98;2.87 ``` -The `reference/` binary reports a `reference` column (and `plasma` + `plasma_tile` columns when built with `ENABLE_PLASMA=ON`), with `tile_size = problem_size` and `n_tiles = 1`, so its runtime files merge cleanly with the tiled benchmarks on the `problem_size` key: +The `reference/` binary reports a `reference` column (suppressed by `DISABLE_BLAS_REFERENCE=ON`) plus a `plasma` column when built with `ENABLE_PLASMA=ON`, with `tile_size = problem_size` and `n_tiles = 1`, so its runtime files merge cleanly with the tiled benchmarks on the `problem_size` key: ``` -threads;problem_size;tile_size;n_tiles;reference;plasma;plasma_tile -128;65280;65280;1;2.71;68.12;71.30 +threads;problem_size;tile_size;n_tiles;reference;plasma +128;65280;65280;1;2.71;68.12 ``` The same lines are also printed to stdout. @@ -238,6 +241,8 @@ The same lines are also printed to stdout. └── adapter_cblas_fp64.cpp ``` +When `DISABLE_BLAS_REFERENCE=ON`, `adapter_cblas_fp64.cpp` and `validate.cpp` are still compiled and linked (they share cblas/lapacke symbols with PLASMA's BLAS dependency); only the runtime dispatch of the `reference` mode is skipped. + ## Contributing We would be happy to expand Cholesky-Bench to additional asynchronous many-task (AMT) runtimes. 
If you have an implementation you would like to add, feel free to open a pull request. diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index 084b4fb..111fae1 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -11,6 +11,10 @@ option( ENABLE_PLASMA "Build the PLASMA tiled-Cholesky reference variant in addition to the LAPACKE_dpotrf one" OFF) +option( + DISABLE_BLAS_REFERENCE + "Skip the LAPACKE_dpotrf reference mode at runtime. Linking is unchanged (PLASMA and validation still need cblas/lapacke)." + OFF) option( ENABLE_VALIDATION "Compute ||A - L*L^T||_F / ||A||_F after each factorization (off by default)" @@ -114,4 +118,8 @@ if(BUILD_CORE) target_compile_definitions(cholesky_reference PRIVATE ENABLE_PLASMA) target_include_directories(cholesky_reference PRIVATE ${PLASMA_INCLUDE_DIR}) endif() + + if(DISABLE_BLAS_REFERENCE) + target_compile_definitions(cholesky_reference PRIVATE DISABLE_BLAS_REFERENCE) + endif() endif() diff --git a/reference/compile.sh b/reference/compile.sh index 0fa48b7..3bffd73 100755 --- a/reference/compile.sh +++ b/reference/compile.sh @@ -12,6 +12,9 @@ # ENABLE_PLASMA ON|OFF (default OFF) - also build the PLASMA tiled # Cholesky variant (extra # 'plasma' column in the output) +# DISABLE_BLAS_REFERENCE ON|OFF (default OFF) - skip the LAPACKE_dpotrf +# reference mode at runtime +# (linking unchanged) # ENABLE_VALIDATION ON|OFF (default OFF) - residual check after each # factorisation # @@ -19,6 +22,7 @@ # ./compile.sh # ENABLE_MKL=ON ./compile.sh # ENABLE_PLASMA=ON ./compile.sh +# DISABLE_BLAS_REFERENCE=ON ENABLE_PLASMA=ON ./compile.sh # ENABLE_VALIDATION=ON ./compile.sh ################################################################################ set -e # Exit immediately if a command exits with a non-zero status. @@ -28,9 +32,10 @@ set -e # Exit immediately if a command exits with a non-zero status. 
################################################################################ : "${ENABLE_MKL:=OFF}" : "${ENABLE_PLASMA:=OFF}" +: "${DISABLE_BLAS_REFERENCE:=OFF}" : "${ENABLE_VALIDATION:=OFF}" -for var in ENABLE_MKL ENABLE_PLASMA ENABLE_VALIDATION; do +for var in ENABLE_MKL ENABLE_PLASMA DISABLE_BLAS_REFERENCE ENABLE_VALIDATION; do case "${!var}" in ON | OFF) ;; *) @@ -96,13 +101,15 @@ fi rm -rf build && mkdir build && cd build echo "CMake options:" -echo " ENABLE_MKL = $ENABLE_MKL" -echo " ENABLE_PLASMA = $ENABLE_PLASMA" -echo " ENABLE_VALIDATION = $ENABLE_VALIDATION" +echo " ENABLE_MKL = $ENABLE_MKL" +echo " ENABLE_PLASMA = $ENABLE_PLASMA" +echo " DISABLE_BLAS_REFERENCE = $DISABLE_BLAS_REFERENCE" +echo " ENABLE_VALIDATION = $ENABLE_VALIDATION" cmake -DCMAKE_BUILD_TYPE=Release \ -DENABLE_MKL="$ENABLE_MKL" \ -DENABLE_PLASMA="$ENABLE_PLASMA" \ + -DDISABLE_BLAS_REFERENCE="$DISABLE_BLAS_REFERENCE" \ -DENABLE_VALIDATION="$ENABLE_VALIDATION" \ .. make -j VERBOSE=1 diff --git a/reference/core/include/cholesky_factor.hpp b/reference/core/include/cholesky_factor.hpp index f7f1b2f..5828475 100644 --- a/reference/core/include/cholesky_factor.hpp +++ b/reference/core/include/cholesky_factor.hpp @@ -13,14 +13,12 @@ namespace cpu /** * @brief Reference Cholesky variants. * - * - reference : single threaded LAPACKE_dpotrf2 call (no tiling; - * parallelism lives entirely inside the threaded BLAS). - * - plasma : single plasma_dpotrf call (PLASMA's high-level - * synchronous Cholesky over the OpenMP runtime). - * - plasma_tile : plasma_omp_dpotrf called over a manually-built tile - * descriptor (PLASMA's asynchronous tile interface). + * - reference : single threaded LAPACKE_dpotrf2 call (no tiling; + * parallelism lives entirely inside the threaded BLAS). + * - plasma : single plasma_dpotrf call (PLASMA's high-level + * synchronous Cholesky over the OpenMP runtime). 
*/ -enum class Variant { reference, plasma, plasma_tile }; +enum class Variant { reference, plasma }; inline Variant to_variant(const std::string &s) { @@ -32,10 +30,6 @@ inline Variant to_variant(const std::string &s) { return Variant::plasma; } - if (s == "plasma_tile") - { - return Variant::plasma_tile; - } throw std::invalid_argument("Unknown Variant: " + s); } diff --git a/reference/core/include/plasma_factor.hpp b/reference/core/include/plasma_factor.hpp index f7639ea..d15868e 100644 --- a/reference/core/include/plasma_factor.hpp +++ b/reference/core/include/plasma_factor.hpp @@ -21,40 +21,14 @@ namespace cpu * cost is intentionally amortised over all timed calls and stays out of the * timed region. * - * Throws @c std::runtime_error before calling PLASMA if the descriptor - * size computation inside plasma_desc_triangular_create() would overflow - * int32 (PLASMA 24.8.7 still does this multiplication in @c int). This - * keeps PLASMA's own multi-line error spam off stderr when the surrounding - * sweep walks past N=65280. + * Throws @c std::runtime_error before calling PLASMA when the descriptor + * size computation inside plasma_desc_*_create() would overflow int32 + * (PLASMA 24.8.7 still does this multiplication in @c int). With the + * default @c nb=256 the boundary is at @c N=65280; main.cpp transparently + * clamps any iteration size in @c (65280, 65536] down to 65280, so this + * guard fires only for @c N>65536 (which then becomes a @c nan cell). */ void plasma_cholesky(std::vector &A, int N); -/** - * @brief PLASMA tiled Cholesky on a row-major N x N buffer using the - * asynchronous tile interface (plasma_omp_dpotrf). - * - * Allocates an *uninitialised* general (full N x N) tile-layout backing - * store ourselves and wraps it in a @c plasma_desc_t via - * plasma_desc_general_init -- which performs no malloc and therefore - * sidesteps PLASMA 24.8.7's int32 overflow inside the create routines. 
- * - * Leaving the buffer uninitialised lets plasma_omp_dge2desc first-touch - * each tile from its consuming core, so pages land on the right NUMA - * node instead of all on the main thread's. That is the optimisation - * that closes part of the runtime gap with @c plasma_cholesky; the - * remainder of the gap is the wider working-set of the general - * descriptor (full N*N tile area vs the high-level path's triangular - * mt*(mt+1)/2 area), which would only be recovered by switching to - * @c plasma_desc_triangular_init -- attempted but found incompatible - * with the dge2desc/ddesc2ge translation routines in PLASMA 24.8.7. - * - * Note: PLASMA does int32 tile-offset arithmetic during execution as - * well, so the tile path is also bounded by an int32 overflow guard - * (general formula). Past the bound this function throws and - * @c main.cpp's catch handler records @c nan rather than letting PLASMA - * segfault. - */ -void plasma_tile_cholesky(std::vector &A, int N); - } // end of namespace cpu #endif // end of CPU_PLASMA_FACTOR_H diff --git a/reference/core/src/cholesky_factor.cpp b/reference/core/src/cholesky_factor.cpp index 81f054e..677feba 100644 --- a/reference/core/src/cholesky_factor.cpp +++ b/reference/core/src/cholesky_factor.cpp @@ -28,15 +28,6 @@ void parallel_blas_cholesky(Variant variant, std::vector &A, int N) throw std::invalid_argument( "Variant 'plasma' requested but the binary was built without ENABLE_PLASMA=ON"); #endif - - case Variant::plasma_tile: -#ifdef ENABLE_PLASMA - plasma_tile_cholesky(A, N); - return; -#else - throw std::invalid_argument( - "Variant 'plasma_tile' requested but the binary was built without ENABLE_PLASMA=ON"); -#endif } } diff --git a/reference/core/src/plasma_factor.cpp b/reference/core/src/plasma_factor.cpp index 897a05d..323e5ed 100644 --- a/reference/core/src/plasma_factor.cpp +++ b/reference/core/src/plasma_factor.cpp @@ -2,59 +2,32 @@ #include -#include -#include -#include #include #include -#include namespace 
cpu { -namespace -{ - -// PLASMA's default tile size for fp64 (typical 24.x default). We hardcode -// this rather than calling plasma_get(PlasmaNb, ...) so the overflow guard -// below stays portable across PLASMA versions. If you tune via -// plasma_set(PlasmaNb, ...) at startup, keep this matching. -constexpr int kPlasmaDefaultNb = 256; -// Pre-flight: would PLASMA's int32 multiplication for descriptor sizing -// overflow? PLASMA 24.8.7's plasma_desc_*_create routines compute the -// total tile-layout backing-store size as int*int and then cast to size_t, -// so the malloc gets a sign-extended-negative argument and fails for any -// padded total >= INT32_MAX. We replicate the math here and throw before -// invoking PLASMA, which avoids the multi-line PLASMA ERROR diagnostic on -// stderr and keeps the surrounding sweep clean. -// -// Used for both paths. The high-level path needs it because of the malloc -// inside _create; the tile path needs it because PLASMA also does int32 -// tile-offset arithmetic *during execution* (segfaults at N>~46080 with the -// general descriptor and default nb), even though we allocate the buffer -// ourselves and bypass _create entirely. -void guard_descriptor_overflow(int N, int nb, bool triangular, const char *which) +void plasma_cholesky(std::vector &A, int N) { - const long long mt = (N + nb - 1) / nb; - const long long padded = - triangular ? (mt * (mt + 1) / 2) * static_cast(nb) * nb - : mt * mt * static_cast(nb) * nb; - if (padded > static_cast(INT_MAX)) + // PLASMA 24.8.7's plasma_desc_*_create routines compute their tile-storage + // size as int*int and then cast to size_t, so the malloc gets a + // sign-extended-negative argument and fails for any padded total + // >= INT32_MAX. With the default nb=256 the triangular padded element + // count first crosses INT32_MAX at N=65281 (mt=256), so any N>65280 hits + // the bug. Guard before invoking PLASMA so the multi-line PLASMA ERROR + // diagnostic does not reach stderr. 
+ // + // main.cpp transparently clamps iteration sizes in (65280, 65536] down to + // 65280, so in practice this guard only fires for N>65536 -- which then + // becomes a nan cell via main.cpp's per-mode catch handler. + constexpr int kPlasmaMaxN = 65280; + if (N > kPlasmaMaxN) { throw std::runtime_error( - std::string(which) + ": skipped to avoid PLASMA descriptor int32 overflow at N=" + std::to_string(N) - + " (nb=" + std::to_string(nb) + ", mt=" + std::to_string(mt) - + ", padded elements=" + std::to_string(padded) + " > INT32_MAX)"); + "plasma_dpotrf: skipped to avoid PLASMA descriptor int32 overflow at N=" + std::to_string(N) + + " (max supported with default nb=256: " + std::to_string(kPlasmaMaxN) + ")"); } -} - -} // anonymous namespace - -void plasma_cholesky(std::vector &A, int N) -{ - // High-level plasma_dpotrf allocates a triangular tile descriptor - // internally; overflow check uses the triangular size formula. - guard_descriptor_overflow(N, kPlasmaDefaultNb, /*triangular=*/true, "plasma_dpotrf"); // PLASMA is column-major. Our buffer is row-major and the matrix is // symmetric, so we can pass it through unchanged and ask PLASMA to write @@ -68,65 +41,4 @@ void plasma_cholesky(std::vector &A, int N) } } -void plasma_tile_cholesky(std::vector &A, int N) -{ - // Pre-flight: PLASMA does int32 tile-offset arithmetic during execution - // (not just inside _create), so the general descriptor still hits an - // overflow ceiling at N>~46080 with the default nb. Without this guard - // plasma_omp_dpotrf segfaults rather than failing cleanly. - guard_descriptor_overflow(N, kPlasmaDefaultNb, /*triangular=*/false, "plasma_omp_dpotrf"); - - // The tile path bypasses PLASMA's _create allocator (which has the - // int32-multiply malloc bug) by allocating the tile-layout backing - // store ourselves and wrapping it with plasma_desc_general_init. _init - // performs no malloc, so the buggy multiplication is never reached. 
- // - // The buffer is *uninitialised* (new double[N], not value-initialised - // with std::vector). Two reasons: (1) skips a multi-GB zero-init pass - // run on the main thread, and (2) lets plasma_omp_dge2desc first-touch - // each tile from its consuming core, so pages land on the right NUMA - // node instead of all on the main thread's node. That's what shaves - // time off the general-descriptor tile path here. - - const int nb = kPlasmaDefaultNb; - const long long mt_ll = (N + nb - 1) / nb; - const int mt = static_cast(mt_ll); - const int lm = mt * nb; // padded leading dimension; fits int32 even for huge N - - const std::size_t tile_buf_elements = static_cast(lm) * static_cast(lm); - - std::unique_ptr tile_buf(new double[tile_buf_elements]); - - plasma_desc_t descA; - int retval = - plasma_desc_general_init(PlasmaRealDouble, tile_buf.get(), nb, nb, lm, lm, 0, 0, N, N, &descA); - if (retval != PlasmaSuccess) - { - throw std::runtime_error("plasma_desc_general_init failed with retval=" + std::to_string(retval)); - } - - // PLASMA 24.8.7's tile interface uses stack-allocated sequence/request - // structs. Zero-init lands status=0=PlasmaSuccess, the expected - // pre-call state. - plasma_sequence_t sequence{}; - plasma_request_t request{}; - - // Translate row-major buffer -> tile descriptor, factor in place on the - // descriptor, translate back. Same PlasmaUpper convention as the - // high-level path, so the resulting layout (row-major lower triangle = L) - // matches what the validator expects. 
-#pragma omp parallel -#pragma omp master - { - plasma_omp_dge2desc(A.data(), N, descA, &sequence, &request); - plasma_omp_dpotrf(PlasmaUpper, descA, &sequence, &request); - plasma_omp_ddesc2ge(descA, A.data(), N, &sequence, &request); - } - - if (sequence.status != PlasmaSuccess) - { - throw std::runtime_error("plasma tile sequence failed with status=" + std::to_string(sequence.status)); - } -} - } // end of namespace cpu diff --git a/reference/main.cpp b/reference/main.cpp index f6b8c17..ef17ccc 100644 --- a/reference/main.cpp +++ b/reference/main.cpp @@ -79,8 +79,19 @@ int main(int argc, char *argv[]) } #endif - for (std::size_t size = START_SIZE; size <= STOP_SIZE; size = size * STEP_SIZE) + for (std::size_t input_size = START_SIZE; input_size <= STOP_SIZE; input_size = input_size * STEP_SIZE) { + // PLASMA 24.8.7's triangular descriptor allocation overflows int32 for + // N>65280 with the default nb=256. For sweep sizes in (65280, 65536] + // we transparently clamp the working size down to 65280 so the row + // still produces a real plasma timing instead of a nan. Sizes beyond + // 65536 fall through and the per-mode catch handler records nan. 
+ std::size_t size = input_size; + if (size > 65280 && size <= 65536) + { + size = 65280; + } + for (std::size_t l = 0; l < LOOP; l++) { // header for output file -- columns mirror the openmp/hpx output so @@ -94,20 +105,17 @@ int main(int argc, char *argv[]) values += std::string(";") + std::to_string(1); /////////////////////////////////////////////////////////////////// // Reference modes: - // reference -> single threaded LAPACKE_dpotrf2 on the full - // matrix (currently disabled; uncomment the - // initializer below to re-enable) - // plasma -> single plasma_dpotrf (high-level synchronous - // PLASMA API; added when ENABLE_PLASMA=ON) - // plasma_tile -> plasma_omp_dpotrf over a manually-built - // plasma_desc_t (PLASMA's asynchronous tile - // interface; added when ENABLE_PLASMA=ON) - std::vector modes = { - // "reference", - }; + // reference -> single threaded LAPACKE_dpotrf2 on the full + // matrix. Enabled by default; disable at build + // time with DISABLE_BLAS_REFERENCE=ON. + // plasma -> single plasma_dpotrf (high-level synchronous + // PLASMA API). Built only when ENABLE_PLASMA=ON. 
+ std::vector modes = {}; +#ifndef DISABLE_BLAS_REFERENCE + modes.push_back("reference"); +#endif #ifdef ENABLE_PLASMA modes.push_back("plasma"); - modes.push_back("plasma_tile"); #endif for (const auto &mode : modes) From 417878c8c53805254b1d051f30dcf1044d019eec Mon Sep 17 00:00:00 2001 From: constracktor <74077030+constracktor@users.noreply.github.com> Date: Thu, 30 Apr 2026 20:52:41 +0200 Subject: [PATCH 08/13] Cleanup --- README.md | 8 +-- hpx/core/include/adapter_cblas_fp64.hpp | 8 +-- reference/CMakeLists.txt | 16 ------ reference/compile.sh | 41 ++++++++------- reference/core/CMakeLists.txt | 7 +-- reference/core/include/adapter_cblas_fp64.hpp | 10 ++-- .../core/include/adapter_plasma_fp64.hpp | 22 ++++++++ reference/core/include/cholesky_factor.hpp | 9 ++-- reference/core/include/functions.hpp | 8 +-- reference/core/include/matrix_generation.hpp | 6 +-- reference/core/include/plasma_factor.hpp | 34 ------------- reference/core/include/validate.hpp | 4 +- reference/core/src/adapter_cblas_fp64.cpp | 9 +--- reference/core/src/adapter_plasma_fp64.cpp | 30 +++++++++++ reference/core/src/cholesky_factor.cpp | 15 ++---- reference/core/src/functions.cpp | 6 +-- reference/core/src/matrix_generation.cpp | 6 +-- reference/core/src/plasma_factor.cpp | 44 ---------------- reference/core/src/validate.cpp | 6 +-- reference/main.cpp | 51 +++---------------- reference/run.sh | 14 ++--- 21 files changed, 120 insertions(+), 234 deletions(-) create mode 100644 reference/core/include/adapter_plasma_fp64.hpp delete mode 100644 reference/core/include/plasma_factor.hpp create mode 100644 reference/core/src/adapter_plasma_fp64.cpp delete mode 100644 reference/core/src/plasma_factor.cpp diff --git a/README.md b/README.md index 3307869..28fdb26 100644 --- a/README.md +++ b/README.md @@ -139,7 +139,7 @@ All three directories contain a `run.sh` that is a ready-to-submit SLURM batch s sbatch openmp/run.sh # gcc runtime (default) sbatch openmp/run.sh llvm # llvm runtime sbatch hpx/run.sh 
-sbatch reference/run.sh +sbatch reference/run.sh # gcc runtime; defaults to N=65280 (see PLASMA boundary note) ``` ### Command-line arguments @@ -171,7 +171,7 @@ The `reference/` binary reports a `reference` column (suppressed by `DISABLE_BLA ``` threads;problem_size;tile_size;n_tiles;reference;plasma -128;65280;65280;1;2.71;68.12 +128;65280;65280;1;5.21;68.12 ``` The same lines are also printed to stdout. @@ -229,14 +229,14 @@ The same lines are also printed to stdout. │ ├── cholesky_factor.hpp │ ├── functions.hpp │ ├── matrix_generation.hpp - │ ├── plasma_factor.hpp # only used when ENABLE_PLASMA=ON + │ ├── adapter_plasma_fp64.hpp # only used when ENABLE_PLASMA=ON │ ├── validate.hpp │ └── adapter_cblas_fp64.hpp └── src/ ├── cholesky_factor.cpp ├── functions.cpp ├── matrix_generation.cpp - ├── plasma_factor.cpp # only built when ENABLE_PLASMA=ON + ├── adapter_plasma_fp64.cpp # only built when ENABLE_PLASMA=ON ├── validate.cpp └── adapter_cblas_fp64.cpp ``` diff --git a/hpx/core/include/adapter_cblas_fp64.hpp b/hpx/core/include/adapter_cblas_fp64.hpp index 5440833..91ce5c9 100644 --- a/hpx/core/include/adapter_cblas_fp64.hpp +++ b/hpx/core/include/adapter_cblas_fp64.hpp @@ -126,7 +126,7 @@ void gemm(const vector &A, * @param dep_future dependency future to wait on before executing * @param A matrix to be factorized (mutated in-place) * @param N matrix dimension - * @return void future signalling completion + * @return void future signaling completion */ void_future potrf_f(void_future dep_future, vector &A, const int N); @@ -140,7 +140,7 @@ void_future potrf_f(void_future dep_future, vector &A, const int N); * @param M second dimension * @param transpose_L transpose flag for L * @param side_L side flag for L - * @return void future signalling completion + * @return void future signaling completion */ void_future trsm_f(void_future dep_L, void_future dep_A, @@ -158,7 +158,7 @@ void_future trsm_f(void_future dep_L, * @param A base matrix (mutated in-place) * @param 
B symmetric update matrix * @param N matrix dimension - * @return void future signalling completion + * @return void future signaling completion */ void_future syrk_f(void_future dep_A, void_future dep_B, vector &A, const vector &B, const int N); @@ -175,7 +175,7 @@ void_future syrk_f(void_future dep_A, void_future dep_B, vector &A, const vector * @param K third matrix dimension * @param transpose_A transpose flag for A * @param transpose_B transpose flag for B - * @return void future signalling completion + * @return void future signaling completion */ void_future gemm_f(void_future dep_A, diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index 111fae1..a9b5b90 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -36,19 +36,11 @@ if(ENABLE_FORMAT_TARGETS) endif() if(NOT CMAKE_SKIP_INSTALL_RULES) - # Our installs follow the standard GNU directory layout. This include needs to - # come first since we need the CMAKE_INSTALL_* in the CMakeLists.txt of each - # target. include(GNUInstallDirs) endif() if(BUILD_CORE) if(ENABLE_MKL) - # Threaded Intel oneMKL: ask MKL to use its OpenMP runtime ('intel_thread'). - # This is the only difference from the OpenMP/HPX builds, which pin - # MKL_THREADING=sequential because they parallelise at the tile level. - # Here the parallelism lives inside dpotrf itself, so we want the - # vendor-threaded backend. set(MKL_INTERFACE_FULL "intel_lp64") set(MKL_THREADING "intel_thread") find_package(MKL CONFIG REQUIRED) @@ -59,9 +51,6 @@ if(BUILD_CORE) message(FATAL_ERROR "No BLAS Library found") endif() else() - # Threaded OpenBLAS. The library name is the same as the sequential one, - # but the Spack environment loaded by compile.sh selects an OpenBLAS built - # with threads=openmp. 
find_library(OpenBLAS_LIB NAMES openblas REQUIRED) if(OpenBLAS_LIB) @@ -80,14 +69,9 @@ if(BUILD_CORE) endif() endif() - # OpenMP is required for the matrix-generation parallel loop and to pick up - # the OpenMP runtime that threaded OpenBLAS / threaded MKL share. find_package(OpenMP REQUIRED) if(ENABLE_PLASMA) - # PLASMA exposes its own tiled parallel Cholesky (plasma_dpotrf). Spack - # installs it as a single shared library plus a coreblas helper; we look - # for both and link whichever is present. find_path(PLASMA_INCLUDE_DIR plasma.h) if(NOT PLASMA_INCLUDE_DIR) message(FATAL_ERROR "ENABLE_PLASMA=ON but plasma.h was not found") diff --git a/reference/compile.sh b/reference/compile.sh index 3bffd73..2f20af3 100755 --- a/reference/compile.sh +++ b/reference/compile.sh @@ -1,22 +1,21 @@ #!/bin/bash # Usage: compile.sh # -# Builds the parallel-BLAS reference benchmark: a single threaded +# Builds the parallel-BLAS reference benchmark: a single tile parallel # LAPACKE_dpotrf call on the full matrix, used as a baseline against the -# tiled OpenMP / HPX implementations. GCC only. +# tiled fork-join and tasking implementations. 
# # CMake project options can be overridden via environment variables # (defaults match the project's CMakeLists.txt defaults): -# ENABLE_MKL ON|OFF (default OFF) - link threaded Intel oneMKL -# instead of threaded OpenBLAS -# ENABLE_PLASMA ON|OFF (default OFF) - also build the PLASMA tiled -# Cholesky variant (extra -# 'plasma' column in the output) -# DISABLE_BLAS_REFERENCE ON|OFF (default OFF) - skip the LAPACKE_dpotrf -# reference mode at runtime -# (linking unchanged) -# ENABLE_VALIDATION ON|OFF (default OFF) - residual check after each -# factorisation +# ENABLE_MKL ON|OFF (default OFF) - link threaded Intel oneMKL +# instead of threaded OpenBLAS +# ENABLE_PLASMA ON|OFF (default OFF) - also build the PLASMA +# plasma_dpotrf variant (extra +# 'plasma' column in the output) +# DISABLE_BLAS_REFERENCE ON|OFF (default OFF) - skip the LAPACKE_dpotrf +# reference at runtime +# ENABLE_VALIDATION ON|OFF (default OFF) - residual check after each +# factorization # # Examples: # ./compile.sh @@ -46,7 +45,7 @@ for var in ENABLE_MKL ENABLE_PLASMA DISABLE_BLAS_REFERENCE ENABLE_VALIDATION; do done ################################################################################ -# Toolchain selection (gcc only) +# Toolchain selection ################################################################################ select_toolchain() { module load gcc/14.2.0 @@ -57,12 +56,11 @@ select_toolchain() { ################################################################################ # Configurations # -# The reference benchmark uses *threaded* OpenBLAS / MKL — that is the whole -# point of this directory. The OpenMP and HPX builds, by contrast, pin the -# BLAS to its sequential variant because they parallelise at the tile level. +# The reference benchmark uses *threaded* BLAS as they operate on a single tile +# and do not parallelize at the tile level. 
################################################################################ if command -v spack &>/dev/null; then - echo "Spack command found. Loading libraries (gcc)" + echo "Spack command found. Loading libraries." # Get current hostname HOSTNAME=$(hostname -s) @@ -74,7 +72,7 @@ if command -v spack &>/dev/null; then spack load openblas@0.3.28%gcc@14.2.0 threads=openmp ilp64=true fi if [[ "$ENABLE_PLASMA" == "ON" ]]; then - spack load plasma%gcc@14.2.0 + spack load plasma%gcc@14.2.0 ^openblas@0.3.28%gcc@14.2.0 threads=openmp fi elif [[ "$HOSTNAME" == "nasrin0" || "$HOSTNAME" == "nasrin1" ]]; then @@ -82,10 +80,10 @@ if command -v spack &>/dev/null; then select_toolchain if [[ "$ENABLE_MKL" == "OFF" ]]; then # OpenBLAS built with OpenMP threading - spack load openblas@0.3.28%gcc@14.2.0 arch=linux-almalinux9-zen3 threads=openmp + spack load openblas@0.3.28%gcc@14.2.0 arch=linux-almalinux9-zen3 threads=openmp ilp64=true fi if [[ "$ENABLE_PLASMA" == "ON" ]]; then - spack load plasma%gcc@14.2.0 arch=linux-almalinux9-zen3 + spack load plasma%gcc@14.2.0 arch=linux-almalinux9-zen3 openblas@0.3.28%gcc@14.2.0 threads=openmp fi else @@ -116,4 +114,5 @@ make -j VERBOSE=1 cd .. 
# Launch Example -# OMP_NUM_THREADS=128 OMP_PROC_BIND=close OMP_PLACES=cores ./build/cholesky_reference --size_start 65536 --size_stop 65536 --loop 20 +# OMP_NUM_THREADS=128 OMP_PROC_BIND=close OMP_PLACES=cores \ +# ./build/cholesky_reference --size_start 1024 --size_stop 65536 --loop 1 diff --git a/reference/core/CMakeLists.txt b/reference/core/CMakeLists.txt index f7b6535..eea5cbf 100644 --- a/reference/core/CMakeLists.txt +++ b/reference/core/CMakeLists.txt @@ -6,7 +6,7 @@ if(ENABLE_VALIDATION) endif() if(ENABLE_PLASMA) - list(APPEND SOURCE_FILES src/plasma_factor.cpp) + list(APPEND SOURCE_FILES src/adapter_plasma_fp64.cpp) endif() add_library(cholesky_core STATIC ${SOURCE_FILES}) @@ -33,8 +33,7 @@ if(ENABLE_MKL) cholesky_core PUBLIC MKL::mkl_intel_lp64 MKL::mkl_core MKL::MKL MKL::mkl_intel_thread) else() - # Link threaded OpenBLAS (the library name is the same; threading is - # determined by the OpenBLAS build that compile.sh's Spack env selects). + # Link threaded OpenBLAS target_link_libraries(cholesky_core PUBLIC ${OpenBLAS_LIB}) target_include_directories(cholesky_core PUBLIC ${OpenBLAS_INCLUDE_DIR}) endif() @@ -57,8 +56,6 @@ target_compile_features(cholesky_core PUBLIC cxx_std_17) set_property(TARGET cholesky_core PROPERTY POSITION_INDEPENDENT_CODE ON) if(NOT CMAKE_SKIP_INSTALL_RULES) - # We need to manually install those into CMAKE_INSTALL_INCLUDEDIR. Below - # install(TARGETS ...) only setups the paths for the exported targets. install( DIRECTORY include/ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" diff --git a/reference/core/include/adapter_cblas_fp64.hpp b/reference/core/include/adapter_cblas_fp64.hpp index 139945c..11a79e4 100644 --- a/reference/core/include/adapter_cblas_fp64.hpp +++ b/reference/core/include/adapter_cblas_fp64.hpp @@ -10,14 +10,12 @@ using vector = std::vector; // LAPACK level 3 operations /** - * @brief FP64 In-place Cholesky decomposition of A using a single, threaded - * LAPACKE_dpotrf call (no tiling). 
This is the parallel-BLAS reference - * implementation that the OpenMP and HPX tiled variants are compared - * against. + * @brief FP64 In-place Cholesky decomposition of A using a threaded + * LAPACKE_dpotrf call. * - * @param A row-major matrix of size N*N to be factorised in place + * @param A row-major matrix of size N*N to be factorized in place * @param N matrix dimension */ -void potrf(vector &A, const int N); +void lapacke_potrf(vector &A, const int N); #endif // end of CPU_ADAPTER_CBLAS_FP64_H diff --git a/reference/core/include/adapter_plasma_fp64.hpp b/reference/core/include/adapter_plasma_fp64.hpp new file mode 100644 index 0000000..3edd661 --- /dev/null +++ b/reference/core/include/adapter_plasma_fp64.hpp @@ -0,0 +1,22 @@ +#ifndef CPU_ADAPTER_PLASMA_FP64_H +#define CPU_ADAPTER_PLASMA_FP64_H + +#pragma once + +#include + +namespace cpu +{ + +/** + * @brief PLASMA tiled Cholesky on a row-major N x N buffer using the + * high-level synchronous API (plasma_dpotrf). + * + * Throws @c std::runtime_error before calling PLASMA when the descriptor + * size computation inside plasma_desc_*_create() would overflow int32. + * + */ +void plasma_potrf(std::vector &A, int N); + +} // end of namespace cpu +#endif // end of CPU_ADAPTER_PLASMA_FP64_H diff --git a/reference/core/include/cholesky_factor.hpp b/reference/core/include/cholesky_factor.hpp index 5828475..e1e48e2 100644 --- a/reference/core/include/cholesky_factor.hpp +++ b/reference/core/include/cholesky_factor.hpp @@ -13,9 +13,8 @@ namespace cpu /** * @brief Reference Cholesky variants. * - * - reference : single threaded LAPACKE_dpotrf2 call (no tiling; - * parallelism lives entirely inside the threaded BLAS). - * - plasma : single plasma_dpotrf call (PLASMA's high-level + * - reference : threaded LAPACKE_dpotrf2 call + * - plasma : plasma_dpotrf call (PLASMA's high-level * synchronous Cholesky over the OpenMP runtime). 
*/ enum class Variant { reference, plasma }; @@ -35,10 +34,10 @@ inline Variant to_variant(const std::string &s) /** * @brief Run the requested reference variant on the full row-major N x N - * matrix @p A. Factorisation is in place; @p A holds the lower + * matrix. Factorization is in place; @p matrix holds the lower * triangular factor L on return. */ -void parallel_blas_cholesky(Variant variant, std::vector &A, int N); +void parallel_cholesky(Variant variant, std::vector &matrix, int N); } // end of namespace cpu #endif // end of CPU_CHOLESKY_FACTOR_H diff --git a/reference/core/include/functions.hpp b/reference/core/include/functions.hpp index 0740b4d..f7e74ba 100644 --- a/reference/core/include/functions.hpp +++ b/reference/core/include/functions.hpp @@ -12,15 +12,15 @@ namespace cpu /** * @brief Time a single call to the requested reference variant - * ('reference' or 'plasma') on the @p A buffer (row-major, N x N). - * The buffer is factorised in place. + * ('reference' or 'plasma') on the @p matrix buffer (row-major, N x N). + * The buffer is factorized in place. 
* - * @param A row-major matrix; on return contains the lower-triangular factor L + * @param matrix row-major matrix; on return contains the lower-triangular factor L * @param N matrix dimension * @param variant which reference path to time * @return wall-clock elapsed time in seconds */ -double cholesky(std::vector &A, std::size_t N, const std::string &variant); +double cholesky(std::vector &matrix, std::size_t N, const std::string &variant); } // namespace cpu #endif // end of CPU_FUNCTIONS_H diff --git a/reference/core/include/matrix_generation.hpp b/reference/core/include/matrix_generation.hpp index 22a3206..967398b 100644 --- a/reference/core/include/matrix_generation.hpp +++ b/reference/core/include/matrix_generation.hpp @@ -12,11 +12,7 @@ * Entries are uniform on [0, 1) using a per-row seed; the diagonal is shifted * by +N to guarantee strict diagonal dominance and therefore symmetric * positive definiteness. The result is stored as a single contiguous - * std::vector of length N*N in row-major order, ready to be passed to - * LAPACKE_dpotrf. - * - * Generation is parallelised with OpenMP across rows so it does not dominate - * the timed factorisation phase. + * std::vector of length N*N in row-major order. * * @param N matrix dimension * @return owning row-major buffer of length N*N diff --git a/reference/core/include/plasma_factor.hpp b/reference/core/include/plasma_factor.hpp deleted file mode 100644 index d15868e..0000000 --- a/reference/core/include/plasma_factor.hpp +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef CPU_PLASMA_FACTOR_H -#define CPU_PLASMA_FACTOR_H - -#pragma once - -#include - -namespace cpu -{ - -/** - * @brief PLASMA tiled Cholesky on a row-major N x N buffer using the - * high-level synchronous API (plasma_dpotrf). 
- * - * PLASMA's high-level API is column-major, so we ask for @c PlasmaUpper: - * the upper triangle in PLASMA's column-major view aliases the lower - * triangle in our row-major view, which is the layout the validation - * routine expects (and which matches the LAPACKE_dpotrf2 reference). - * - * Caller is responsible for having invoked plasma_init() at startup; that - * cost is intentionally amortised over all timed calls and stays out of the - * timed region. - * - * Throws @c std::runtime_error before calling PLASMA when the descriptor - * size computation inside plasma_desc_*_create() would overflow int32 - * (PLASMA 24.8.7 still does this multiplication in @c int). With the - * default @c nb=256 the boundary is at @c N=65280; main.cpp transparently - * clamps any iteration size in @c (65280, 65536] down to 65280, so this - * guard fires only for @c N>65536 (which then becomes a @c nan cell). - */ -void plasma_cholesky(std::vector &A, int N); - -} // end of namespace cpu -#endif // end of CPU_PLASMA_FACTOR_H diff --git a/reference/core/include/validate.hpp b/reference/core/include/validate.hpp index 6cf829c..4c666d0 100644 --- a/reference/core/include/validate.hpp +++ b/reference/core/include/validate.hpp @@ -11,12 +11,12 @@ namespace cpu /** * @brief Compute the relative Cholesky residual ||A - L * L^T||_F / ||A||_F - * for the dense, row-major reference factorisation. + * for the dense, row-major reference factorization. * * The original A is regenerated on the fly with the same deterministic seed * used by gen_matrix, so no extra storage is needed. 
* - * @param N matrix dimension (must match the factorisation) + * @param N matrix dimension (must match the factorization) * @param L row-major buffer of length N*N holding the factor returned by * LAPACKE_dpotrf with uplo='L' (only the lower triangle is read) * @return relative Frobenius residual diff --git a/reference/core/src/adapter_cblas_fp64.cpp b/reference/core/src/adapter_cblas_fp64.cpp index 566290f..264d442 100644 --- a/reference/core/src/adapter_cblas_fp64.cpp +++ b/reference/core/src/adapter_cblas_fp64.cpp @@ -9,11 +9,4 @@ #include "lapacke.h" #endif -void potrf(vector &A, const int N) -{ - // Single threaded LAPACKE call on the full matrix. dpotrf2 is the - // recursive variant, which is what the OpenMP / HPX variants use on - // their diagonal tiles, so picking it here keeps the underlying kernel - // identical and isolates the parallelism source as the only difference. - LAPACKE_dpotrf2(LAPACK_ROW_MAJOR, 'L', N, A.data(), N); -} +void lapacke_potrf(vector &A, const int N) { LAPACKE_dpotrf2(LAPACK_ROW_MAJOR, 'L', N, A.data(), N); } diff --git a/reference/core/src/adapter_plasma_fp64.cpp b/reference/core/src/adapter_plasma_fp64.cpp new file mode 100644 index 0000000..06147ad --- /dev/null +++ b/reference/core/src/adapter_plasma_fp64.cpp @@ -0,0 +1,30 @@ +#include "adapter_plasma_fp64.hpp" + +#include +#include +#include + +namespace cpu +{ + +void plasma_potrf(std::vector &A, int N) +{ + constexpr int k_plasma_max_n = 65'280; + if (N > k_plasma_max_n) + { + throw std::runtime_error( + "plasma_dpotrf: skipped to avoid PLASMA descriptor int32 overflow at N=" + std::to_string(N) + + " (max supported with default nb=256: " + std::to_string(k_plasma_max_n) + ")"); + } + + // PLASMA is column-major. 
Our buffer is row-major and the matrix is + // symmetric, so we can pass it through unchanged and ask PLASMA to write + // its result into the upper triangle of its column-major view + const int info = plasma_dpotrf(PlasmaUpper, N, A.data(), N); + if (info != 0) + { + throw std::runtime_error("plasma_dpotrf failed with info=" + std::to_string(info)); + } +} + +} // end of namespace cpu diff --git a/reference/core/src/cholesky_factor.cpp b/reference/core/src/cholesky_factor.cpp index 677feba..25bbe20 100644 --- a/reference/core/src/cholesky_factor.cpp +++ b/reference/core/src/cholesky_factor.cpp @@ -2,7 +2,7 @@ #include "adapter_cblas_fp64.hpp" #ifdef ENABLE_PLASMA -#include "plasma_factor.hpp" +#include "adapter_plasma_fp64.hpp" #endif #include @@ -10,23 +10,18 @@ namespace cpu { -void parallel_blas_cholesky(Variant variant, std::vector &A, int N) +void parallel_cholesky(Variant variant, std::vector &matrix, int N) { switch (variant) { - case Variant::reference: - // Single threaded LAPACKE call on the full matrix; the BLAS - // library dispatches work across the available threads. 
- potrf(A, N); - return; + case Variant::reference: lapacke_potrf(matrix, N); return; case Variant::plasma: #ifdef ENABLE_PLASMA - plasma_cholesky(A, N); + plasma_potrf(matrix, N); return; #else - throw std::invalid_argument( - "Variant 'plasma' requested but the binary was built without ENABLE_PLASMA=ON"); + throw std::invalid_argument("Variant 'plasma' requested but the binary was built without ENABLE_PLASMA=ON"); #endif } } diff --git a/reference/core/src/functions.cpp b/reference/core/src/functions.cpp index e2986ea..1f15f26 100644 --- a/reference/core/src/functions.cpp +++ b/reference/core/src/functions.cpp @@ -6,13 +6,13 @@ namespace cpu { -double cholesky(std::vector &A, std::size_t N, const std::string &variant) +double cholesky(std::vector &matrix, std::size_t N, const std::string &variant) { const Variant v = to_variant(variant); auto start = std::chrono::high_resolution_clock::now(); /////////////////////////////////////////////////////////////////////////// - // Launch Cholesky decomposition: A = L * L^T (single dispatched call) - parallel_blas_cholesky(v, A, static_cast(N)); + // Launch Cholesky decomposition: A = L * L^T + parallel_cholesky(v, matrix, static_cast(N)); /////////////////////////////////////////////////////////////////////////// auto stop = std::chrono::high_resolution_clock::now(); return (stop - start).count() / 1e9; diff --git a/reference/core/src/matrix_generation.cpp b/reference/core/src/matrix_generation.cpp index b0db740..a67ff5a 100644 --- a/reference/core/src/matrix_generation.cpp +++ b/reference/core/src/matrix_generation.cpp @@ -5,15 +5,11 @@ std::vector gen_matrix(std::size_t N) { - // Row-major dense buffer std::vector A(N * N); // The matrix is built row by row in parallel. Each row uses its own RNG // seeded by the row index, so the matrix is deterministic and - // reproducible regardless of the number of threads. 
Off-diagonal entries - // are mirrored to keep A symmetric; the diagonal is shifted by +N to - // guarantee strict diagonal dominance (and therefore SPD), mirroring the - // +N*n_tiles shift used by the tiled variants when n_tiles == 1. + // reproducible regardless of the number of threads. #pragma omp parallel for schedule(static) for (std::size_t i = 0; i < N; ++i) { diff --git a/reference/core/src/plasma_factor.cpp b/reference/core/src/plasma_factor.cpp deleted file mode 100644 index 323e5ed..0000000 --- a/reference/core/src/plasma_factor.cpp +++ /dev/null @@ -1,44 +0,0 @@ -#include "plasma_factor.hpp" - -#include - -#include -#include - -namespace cpu -{ - -void plasma_cholesky(std::vector &A, int N) -{ - // PLASMA 24.8.7's plasma_desc_*_create routines compute their tile-storage - // size as int*int and then cast to size_t, so the malloc gets a - // sign-extended-negative argument and fails for any padded total - // >= INT32_MAX. With the default nb=256 the triangular padded element - // count first crosses INT32_MAX at N=65281 (mt=256), so any N>65280 hits - // the bug. Guard before invoking PLASMA so the multi-line PLASMA ERROR - // diagnostic does not reach stderr. - // - // main.cpp transparently clamps iteration sizes in (65280, 65536] down to - // 65280, so in practice this guard only fires for N>65536 -- which then - // becomes a nan cell via main.cpp's per-mode catch handler. - constexpr int kPlasmaMaxN = 65280; - if (N > kPlasmaMaxN) - { - throw std::runtime_error( - "plasma_dpotrf: skipped to avoid PLASMA descriptor int32 overflow at N=" + std::to_string(N) - + " (max supported with default nb=256: " + std::to_string(kPlasmaMaxN) + ")"); - } - - // PLASMA is column-major. 
Our buffer is row-major and the matrix is - // symmetric, so we can pass it through unchanged and ask PLASMA to write - // its result into the upper triangle of its column-major view -- that - // upper triangle aliases the lower triangle of our row-major view, which - // is the layout the validator (and the LAPACKE reference path) expects. - const int info = plasma_dpotrf(PlasmaUpper, N, A.data(), N); - if (info != 0) - { - throw std::runtime_error("plasma_dpotrf failed with info=" + std::to_string(info)); - } -} - -} // end of namespace cpu diff --git a/reference/core/src/validate.cpp b/reference/core/src/validate.cpp index 5a43cc8..8b1f647 100644 --- a/reference/core/src/validate.cpp +++ b/reference/core/src/validate.cpp @@ -8,7 +8,6 @@ #include "cblas.h" #endif -#include #include #include #include @@ -19,9 +18,6 @@ namespace cpu double cholesky_residual(std::size_t N, const std::vector &L) { // Build a working copy of L with its strictly upper triangle zeroed out. - // dpotrf with uplo='L' leaves the upper triangle untouched (it still - // contains the original A values), so we must mask it before forming - // L * L^T with a plain dgemm. std::vector Lwork(L); for (std::size_t i = 0; i < N; ++i) { @@ -49,7 +45,7 @@ double cholesky_residual(std::size_t N, const std::vector &L) LLt.data(), static_cast(N)); - // Regenerate the original A deterministically and accumulate Frobenius + // Regenerate the original matrix A deterministically and accumulate Frobenius // norms of (A - LLt) and A. 
const std::vector A = gen_matrix(N); diff --git a/reference/main.cpp b/reference/main.cpp index ef17ccc..b4c8585 100644 --- a/reference/main.cpp +++ b/reference/main.cpp @@ -6,7 +6,6 @@ #ifdef ENABLE_PLASMA #include #endif -#include #include #include #include @@ -21,12 +20,6 @@ int main(int argc, char *argv[]) { /////////////////////////////////////////////////////////////////////////// // cmdline arguments - // - // The reference benchmark calls a single threaded LAPACKE_dpotrf on the - // full matrix, so there is no tiling axis. We still accept --tiles_start - // / --tiles_stop for CLI compatibility with the openmp/ and hpx/ binaries - // (they are silently ignored), which keeps any shared driver script - // unchanged. std::size_t loop = 1; std::size_t size_start = 32, size_stop = 128; @@ -71,8 +64,6 @@ int main(int argc, char *argv[]) runtime_file.open(runtime_file_path, std::ios_base::app); #ifdef ENABLE_PLASMA - // PLASMA spins up its own context and worker pool; do this once so the - // cost is not folded into any timed factorisation. if (plasma_init() != 0) { throw std::runtime_error("plasma_init() failed"); @@ -81,23 +72,18 @@ int main(int argc, char *argv[]) for (std::size_t input_size = START_SIZE; input_size <= STOP_SIZE; input_size = input_size * STEP_SIZE) { - // PLASMA 24.8.7's triangular descriptor allocation overflows int32 for + // PLASMA's triangular descriptor allocation overflows int32 for // N>65280 with the default nb=256. For sweep sizes in (65280, 65536] - // we transparently clamp the working size down to 65280 so the row - // still produces a real plasma timing instead of a nan. Sizes beyond + // we transparently clamp the working size down to 65280. Sizes beyond // 65536 fall through and the per-mode catch handler records nan. 
std::size_t size = input_size; - if (size > 65280 && size <= 65536) + if (size > 65'280 && size <= 65'536) { - size = 65280; + size = 65'280; } for (std::size_t l = 0; l < LOOP; l++) { - // header for output file -- columns mirror the openmp/hpx output so - // results from all three benchmarks can be merged on (problem_size). - // The reference has no tiling, so tile_size == problem_size and - // n_tiles == 1. std::string header = "threads;problem_size;tile_size;n_tiles"; std::string values = std::to_string(omp_get_max_threads()); values += std::string(";") + std::to_string(size); @@ -105,11 +91,6 @@ int main(int argc, char *argv[]) values += std::string(";") + std::to_string(1); /////////////////////////////////////////////////////////////////// // Reference modes: - // reference -> single threaded LAPACKE_dpotrf2 on the full - // matrix. Enabled by default; disable at build - // time with DISABLE_BLAS_REFERENCE=ON. - // plasma -> single plasma_dpotrf (high-level synchronous - // PLASMA API). Built only when ENABLE_PLASMA=ON. std::vector modes = {}; #ifndef DISABLE_BLAS_REFERENCE modes.push_back("reference"); @@ -122,28 +103,12 @@ int main(int argc, char *argv[]) { header += ";" + mode; - // We let one mode fail (e.g. PLASMA running out of memory at - // very large N -- its high-level wrapper allocates an extra - // tiled triangular copy on top of the input buffer) without - // killing the whole sweep. The failed cell is recorded as NaN - // and we continue with the next mode and size. - std::vector A; - try - { - A = gen_matrix(size); - } - catch (const std::exception &e) - { - std::cerr << "Error: gen_matrix(size=" << size << ") threw '" << e.what() - << "'. Recording NaN for variant '" << mode << "'." 
<< std::endl; - values += ";nan"; - continue; - } - + std::vector matrix = gen_matrix(size); + // NaN guard double cholesky_cpu = std::numeric_limits::quiet_NaN(); try { - cholesky_cpu = cpu::cholesky(A, size, mode); + cholesky_cpu = cpu::cholesky(matrix, size, mode); } catch (const std::exception &e) { @@ -158,7 +123,7 @@ int main(int argc, char *argv[]) #ifdef ENABLE_VALIDATION // Validate by computing relative residual ||A - L L^T||_F / ||A||_F constexpr double residual_tol = 1e-10; - const double residual = cpu::cholesky_residual(size, A); + const double residual = cpu::cholesky_residual(size, matrix); std::cout << "[validate] mode=" << mode << " size=" << size << " residual=" << residual << std::endl; if (!(residual <= residual_tol)) // catches NaN too { diff --git a/reference/run.sh b/reference/run.sh index 0600513..0b5c772 100755 --- a/reference/run.sh +++ b/reference/run.sh @@ -12,30 +12,24 @@ # # Submit example: # sbatch run.sh -# -# Runs the parallel-BLAS reference benchmark — a single threaded -# LAPACKE_dpotrf call on the full matrix — as a baseline for the OpenMP and -# HPX tiled implementations. GCC only. set -e # Exit immediately if a command exits with a non-zero status. ################################################################################ -# Toolchain runtime selection (gcc only) +# Toolchain runtime selection ################################################################################ module load gcc/14.2.0 # Resolve directory where the script is located SCRIPT_DIR="$(pwd)" -# OpenMP settings — the threaded BLAS picks these up to spread dpotrf across -# all the cores. Both OpenBLAS (threads=openmp) and threaded MKL respect the -# standard OMP_* environment. +# OpenMP settings export OMP_NUM_THREADS=128 export OMP_PROC_BIND=close export OMP_PLACES=cores # Make sure threaded MKL uses the OpenMP runtime if ENABLE_MKL=ON was used at -# build time. Harmless when linking OpenBLAS. +# build time. 
export MKL_NUM_THREADS=${MKL_NUM_THREADS:-$OMP_NUM_THREADS} echo "Running with gcc runtime" @@ -43,5 +37,5 @@ echo "Running with gcc runtime" # Run executable srun --cpu-bind=cores "$SCRIPT_DIR/build/cholesky_reference" \ --loop 20 \ - --size_start 65536 \ + --size_start 1024 \ --size_stop 65536 From f2742da81087189a47ae31865ac1f6c19fcd4b8d Mon Sep 17 00:00:00 2001 From: constracktor <74077030+constracktor@users.noreply.github.com> Date: Fri, 1 May 2026 00:01:24 +0200 Subject: [PATCH 09/13] Rename to lapacke --- README.md | 18 +++++++++--------- reference/core/include/cholesky_factor.hpp | 12 ++++++------ reference/core/src/cholesky_factor.cpp | 2 +- reference/main.cpp | 2 +- 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 28fdb26..d2f9494 100644 --- a/README.md +++ b/README.md @@ -28,10 +28,10 @@ Cholesky-Bench benchmarks right-looking tiled Cholesky factorization from fork-j | Mode | Description | |------|-------------| -| `reference` | Single threaded `LAPACKE_dpotrf` call on the full matrix; no tiling. Parallelism is delegated entirely to a threaded BLAS (OpenBLAS built with `threads=openmp`, or threaded Intel oneMKL via `ENABLE_MKL=ON`). Enabled by default; disable with `DISABLE_BLAS_REFERENCE=ON`. | +| `lapacke` | Single threaded `LAPACKE_dpotrf` call on the full matrix; no tiling. Parallelism is delegated entirely to a threaded BLAS (OpenBLAS built with `threads=openmp`, or threaded Intel oneMKL via `ENABLE_MKL=ON`). Enabled by default; disable with `DISABLE_BLAS_REFERENCE=ON`. | | `plasma` | Single `plasma_dpotrf` call on the full matrix (PLASMA's high-level synchronous API). PLASMA does its own tiled, OpenMP-task-based parallel Cholesky internally; tile size is left at PLASMA's built-in default. Built only when `ENABLE_PLASMA=ON`. 
| -This directory is the natural baseline for the OpenMP and HPX tiled implementations: the `reference` mode isolates the contribution of vendor-provided dense-LA parallelism, and the `plasma` mode adds a tiled-parallel competitor that uses the same OpenMP runtime as the in-house variants. +This directory is the natural baseline for the OpenMP and HPX tiled implementations: the `lapacke` mode isolates the contribution of vendor-provided dense-LA parallelism, and the `plasma` mode adds a tiled-parallel competitor that uses the same OpenMP runtime as the in-house variants. #### PLASMA descriptor int32 overflow @@ -39,8 +39,8 @@ PLASMA 24.8.7's `plasma_desc_*_create()` routines compute their tile-storage siz The benchmark handles this transparently: -- For sweep sizes `N` in `(65280, 65536]` the working size is **clamped to 65280** for the whole row (both `reference` and `plasma` run at 65280, and the `problem_size` column reports 65280). This keeps the largest practical PLASMA point on the curve without touching the underlying PLASMA build. -- For `N > 65536` `plasma` records `nan`. `reference` (LAPACKE) is unaffected by the int32 ceiling and continues normally. +- For sweep sizes `N` in `(65280, 65536]` the working size is **clamped to 65280** for the whole row (both `lapacke` and `plasma` run at 65280, and the `problem_size` column reports 65280). This keeps the largest practical PLASMA point on the curve without touching the underlying PLASMA build. +- For `N > 65536` `plasma` records `nan`. `lapacke` is unaffected by the int32 ceiling and continues normally. Patching `(size_t)` casts into `control/descriptor.c` in the spack PLASMA package removes the ceiling and the clamp + guard become no-ops. @@ -83,8 +83,8 @@ These can be set as environment variables before calling `compile.sh`: | `DISABLE_COMPUTATION` | `OFF` | *(`openmp/` and `hpx/` only)* Replace all BLAS/tile-generation calls with no-ops. 
The task graph and loops remain intact, so scheduling overhead can be measured in isolation. | | `ENABLE_DYNAMIC_SCHEDULE` | `OFF` | *(`openmp/` only)* Use `schedule(dynamic,1)` on the trailing-update worksharing loops in `for_collapse`. Requires the LLVM toolchain; rejected at compile time with GCC. | | `ENABLE_MKL` | `OFF` | Link against Intel oneMKL instead of OpenBLAS. In `openmp/` and `hpx/` this is the *sequential* MKL; in `reference/` it is the *threaded* MKL. | -| `ENABLE_PLASMA` | `OFF` | *(`reference/` only)* Also build the PLASMA `plasma_dpotrf` variant. Adds a `plasma` column alongside `reference` in the runtime output. | -| `DISABLE_BLAS_REFERENCE` | `OFF` | *(`reference/` only)* Skip the LAPACKE_dpotrf reference mode at runtime, so only `plasma` runs (when `ENABLE_PLASMA=ON`). Linking is unchanged — PLASMA and validation still need cblas/lapacke symbols. | +| `ENABLE_PLASMA` | `OFF` | *(`reference/` only)* Also build the PLASMA `plasma_dpotrf` variant. Adds a `plasma` column alongside `lapacke` in the runtime output. | +| `DISABLE_BLAS_REFERENCE` | `OFF` | *(`reference/` only)* Skip the `lapacke` mode at runtime, so only `plasma` runs (when `ENABLE_PLASMA=ON`). Linking is unchanged — PLASMA and validation still need cblas/lapacke symbols. 
| **Examples:** @@ -167,10 +167,10 @@ threads;problem_size;tile_size;n_tiles;for_collapse;for_naive;task_naive;task_de 128;65536;1024;64;3.14;3.21;2.98;2.87 ``` -The `reference/` binary reports a `reference` column (suppressed by `DISABLE_BLAS_REFERENCE=ON`) plus a `plasma` column when built with `ENABLE_PLASMA=ON`, with `tile_size = problem_size` and `n_tiles = 1`, so its runtime files merge cleanly with the tiled benchmarks on the `problem_size` key: +The `reference/` binary reports a `lapacke` column (suppressed by `DISABLE_BLAS_REFERENCE=ON`) plus a `plasma` column when built with `ENABLE_PLASMA=ON`, with `tile_size = problem_size` and `n_tiles = 1`, so its runtime files merge cleanly with the tiled benchmarks on the `problem_size` key: ``` -threads;problem_size;tile_size;n_tiles;reference;plasma +threads;problem_size;tile_size;n_tiles;lapacke;plasma 128;65280;65280;1;5.21;68.12 ``` @@ -241,7 +241,7 @@ The same lines are also printed to stdout. └── adapter_cblas_fp64.cpp ``` -When `DISABLE_BLAS_REFERENCE=ON`, `adapter_cblas_fp64.cpp` and `validate.cpp` are still compiled and linked (they share cblas/lapacke symbols with PLASMA's BLAS dependency); only the runtime dispatch of the `reference` mode is skipped. +When `DISABLE_BLAS_REFERENCE=ON`, `adapter_cblas_fp64.cpp` and `validate.cpp` are still compiled and linked (they share cblas/lapacke symbols with PLASMA's BLAS dependency); only the runtime dispatch of the `lapacke` mode is skipped. ## Contributing diff --git a/reference/core/include/cholesky_factor.hpp b/reference/core/include/cholesky_factor.hpp index e1e48e2..9bcf784 100644 --- a/reference/core/include/cholesky_factor.hpp +++ b/reference/core/include/cholesky_factor.hpp @@ -13,17 +13,17 @@ namespace cpu /** * @brief Reference Cholesky variants. * - * - reference : threaded LAPACKE_dpotrf2 call - * - plasma : plasma_dpotrf call (PLASMA's high-level - * synchronous Cholesky over the OpenMP runtime). 
+ * - lapacke : threaded LAPACKE_dpotrf2 call + * - plasma : plasma_dpotrf call (PLASMA's high-level + * synchronous Cholesky over the OpenMP runtime). */ -enum class Variant { reference, plasma }; +enum class Variant { lapacke, plasma }; inline Variant to_variant(const std::string &s) { - if (s == "reference") + if (s == "lapacke") { - return Variant::reference; + return Variant::lapacke; } if (s == "plasma") { diff --git a/reference/core/src/cholesky_factor.cpp b/reference/core/src/cholesky_factor.cpp index 25bbe20..3a20132 100644 --- a/reference/core/src/cholesky_factor.cpp +++ b/reference/core/src/cholesky_factor.cpp @@ -14,7 +14,7 @@ void parallel_cholesky(Variant variant, std::vector &matrix, int N) { switch (variant) { - case Variant::reference: lapacke_potrf(matrix, N); return; + case Variant::lapacke: lapacke_potrf(matrix, N); return; case Variant::plasma: #ifdef ENABLE_PLASMA diff --git a/reference/main.cpp b/reference/main.cpp index b4c8585..2aceb09 100644 --- a/reference/main.cpp +++ b/reference/main.cpp @@ -93,7 +93,7 @@ int main(int argc, char *argv[]) // Reference modes: std::vector modes = {}; #ifndef DISABLE_BLAS_REFERENCE - modes.push_back("reference"); + modes.push_back("lapacke"); #endif #ifdef ENABLE_PLASMA modes.push_back("plasma"); From 493d11e6c4bba03a2940438457789a068a0c435b Mon Sep 17 00:00:00 2001 From: constracktor <74077030+constracktor@users.noreply.github.com> Date: Fri, 1 May 2026 10:35:31 +0200 Subject: [PATCH 10/13] Cropped mode for Plasma only --- README.md | 8 +++----- reference/main.cpp | 35 ++++++++++++++++++----------------- 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index d2f9494..1e21cfa 100644 --- a/README.md +++ b/README.md @@ -39,11 +39,9 @@ PLASMA 24.8.7's `plasma_desc_*_create()` routines compute their tile-storage siz The benchmark handles this transparently: -- For sweep sizes `N` in `(65280, 65536]` the working size is **clamped to 65280** for the whole row (both `lapacke` 
and `plasma` run at 65280, and the `problem_size` column reports 65280). This keeps the largest practical PLASMA point on the curve without touching the underlying PLASMA build. +- For sweep sizes `N` in `(65280, 65536]`, **only `plasma` is silently clamped down to 65280** for that iteration; `lapacke` runs at the full `N`. The `problem_size` column reports the original `N`, so `plasma`'s timing in this range corresponds to the 65280 compute even though the row is labelled with the input size. - For `N > 65536` `plasma` records `nan`. `lapacke` is unaffected by the int32 ceiling and continues normally. -Patching `(size_t)` casts into `control/descriptor.c` in the spack PLASMA package removes the ceiling and the clamp + guard become no-ops. - ## Dependencies All three implementations are built with CMake (≥ 3.23) and C++20. The OpenMP and HPX directories link against a *sequential* BLAS (parallelism is at the tile level); the `reference/` directory links against a *threaded* BLAS instead. @@ -54,12 +52,12 @@ All three implementations are built with CMake (≥ 3.23) and C++20. The OpenMP | OpenBLAS 0.3.28 (`threads=openmp`) | — | — | ✓ (default) | | Intel oneMKL (sequential) | optional (`ENABLE_MKL=ON`) | optional (`ENABLE_MKL=ON`) | — | | Intel oneMKL (`intel_thread`) | — | — | optional (`ENABLE_MKL=ON`) | -| PLASMA | — | — | optional (`ENABLE_PLASMA=ON`) | +| PLASMA 24.8.7 | — | — | optional (`ENABLE_PLASMA=ON`) | | HPX 1.11.0 + jemalloc | — | ✓ | — | | GCC 14.2.0 | ✓ | ✓ | ✓ | | LLVM/Clang 22.1.2 | optional | — | — | -Dependencies are managed via [Spack](https://spack.io/). The compile scripts auto-detect the host system and load the correct Spack environment. +Dependencies are managed via [Spack](https://spack.io/). 
## Build diff --git a/reference/main.cpp b/reference/main.cpp index 2aceb09..38b768a 100644 --- a/reference/main.cpp +++ b/reference/main.cpp @@ -70,18 +70,8 @@ int main(int argc, char *argv[]) } #endif - for (std::size_t input_size = START_SIZE; input_size <= STOP_SIZE; input_size = input_size * STEP_SIZE) + for (std::size_t size = START_SIZE; size <= STOP_SIZE; size = size * STEP_SIZE) { - // PLASMA's triangular descriptor allocation overflows int32 for - // N>65280 with the default nb=256. For sweep sizes in (65280, 65536] - // we transparently clamp the working size down to 65280. Sizes beyond - // 65536 fall through and the per-mode catch handler records nan. - std::size_t size = input_size; - if (size > 65'280 && size <= 65'536) - { - size = 65'280; - } - for (std::size_t l = 0; l < LOOP; l++) { std::string header = "threads;problem_size;tile_size;n_tiles"; @@ -102,17 +92,27 @@ int main(int argc, char *argv[]) for (const auto &mode : modes) { header += ";" + mode; + std::size_t mode_size = size; + + // PLASMA's triangular descriptor allocation + // overflows int32 for N>65280 with the default nb=256. For + // input sizes in (65280, 65536] we silently clamp PLASMA's + // working size down to 65280. Other modes run at the requested size. + if (mode == "plasma" && mode_size > 65'280 && mode_size <= 65'536) + { + mode_size = 65'280; + } - std::vector matrix = gen_matrix(size); + std::vector matrix = gen_matrix(mode_size); // NaN guard double cholesky_cpu = std::numeric_limits::quiet_NaN(); try { - cholesky_cpu = cpu::cholesky(matrix, size, mode); + cholesky_cpu = cpu::cholesky(matrix, mode_size, mode); } catch (const std::exception &e) { - std::cerr << "Error: variant '" << mode << "' failed at size=" << size << ": " << e.what() + std::cerr << "Error: variant '" << mode << "' failed at size=" << mode_size << ": " << e.what() << ". Recording NaN and continuing."
<< std::endl; values += ";nan"; continue; @@ -123,12 +123,13 @@ int main(int argc, char *argv[]) #ifdef ENABLE_VALIDATION // Validate by computing relative residual ||A - L L^T||_F / ||A||_F constexpr double residual_tol = 1e-10; - const double residual = cpu::cholesky_residual(size, matrix); - std::cout << "[validate] mode=" << mode << " size=" << size << " residual=" << residual << std::endl; + const double residual = cpu::cholesky_residual(mode_size, matrix); + std::cout << "[validate] mode=" << mode << " size=" << mode_size << " residual=" << residual + << std::endl; if (!(residual <= residual_tol)) // catches NaN too { std::cerr << "Validation warning: variant '" << mode << "' residual " << residual - << " exceeds tolerance " << residual_tol << " (size=" << size << ")" << std::endl; + << " exceeds tolerance " << residual_tol << " (size=" << mode_size << ")" << std::endl; } #endif } From 879774dcc367b304f5504d6dfe16031e99f38920 Mon Sep 17 00:00:00 2001 From: constracktor <74077030+constracktor@users.noreply.github.com> Date: Fri, 1 May 2026 10:43:09 +0200 Subject: [PATCH 11/13] Add enable lapacke --- README.md | 10 +++++----- reference/CMakeLists.txt | 10 +++++----- reference/compile.sh | 20 ++++++++++---------- reference/main.cpp | 2 +- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 1e21cfa..c3eecfc 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ Cholesky-Bench benchmarks right-looking tiled Cholesky factorization from fork-j | Mode | Description | |------|-------------| -| `lapacke` | Single threaded `LAPACKE_dpotrf` call on the full matrix; no tiling. Parallelism is delegated entirely to a threaded BLAS (OpenBLAS built with `threads=openmp`, or threaded Intel oneMKL via `ENABLE_MKL=ON`). Enabled by default; disable with `DISABLE_BLAS_REFERENCE=ON`. | +| `lapacke` | Single threaded `LAPACKE_dpotrf` call on the full matrix; no tiling. 
Parallelism is delegated entirely to a threaded BLAS (OpenBLAS built with `threads=openmp`, or threaded Intel oneMKL via `ENABLE_MKL=ON`). Enabled by default; disable with `ENABLE_LAPACKE=OFF`. | | `plasma` | Single `plasma_dpotrf` call on the full matrix (PLASMA's high-level synchronous API). PLASMA does its own tiled, OpenMP-task-based parallel Cholesky internally; tile size is left at PLASMA's built-in default. Built only when `ENABLE_PLASMA=ON`. | This directory is the natural baseline for the OpenMP and HPX tiled implementations: the `lapacke` mode isolates the contribution of vendor-provided dense-LA parallelism, and the `plasma` mode adds a tiled-parallel competitor that uses the same OpenMP runtime as the in-house variants. @@ -82,7 +82,7 @@ These can be set as environment variables before calling `compile.sh`: | `ENABLE_DYNAMIC_SCHEDULE` | `OFF` | *(`openmp/` only)* Use `schedule(dynamic,1)` on the trailing-update worksharing loops in `for_collapse`. Requires the LLVM toolchain; rejected at compile time with GCC. | | `ENABLE_MKL` | `OFF` | Link against Intel oneMKL instead of OpenBLAS. In `openmp/` and `hpx/` this is the *sequential* MKL; in `reference/` it is the *threaded* MKL. | | `ENABLE_PLASMA` | `OFF` | *(`reference/` only)* Also build the PLASMA `plasma_dpotrf` variant. Adds a `plasma` column alongside `lapacke` in the runtime output. | -| `DISABLE_BLAS_REFERENCE` | `OFF` | *(`reference/` only)* Skip the `lapacke` mode at runtime, so only `plasma` runs (when `ENABLE_PLASMA=ON`). Linking is unchanged — PLASMA and validation still need cblas/lapacke symbols. | +| `ENABLE_LAPACKE` | `ON` | *(`reference/` only)* Run the `lapacke` mode at runtime. Set `OFF` to skip it (e.g. when only `plasma` is wanted). Linking is unchanged either way — PLASMA and validation still need cblas/lapacke symbols. 
| **Examples:** @@ -103,7 +103,7 @@ ENABLE_MKL=ON ./compile.sh ENABLE_PLASMA=ON ./compile.sh # Reference: PLASMA only, skip the LAPACKE_dpotrf column at runtime -DISABLE_BLAS_REFERENCE=ON ENABLE_PLASMA=ON ./compile.sh +ENABLE_LAPACKE=OFF ENABLE_PLASMA=ON ./compile.sh ``` ## Run @@ -165,7 +165,7 @@ threads;problem_size;tile_size;n_tiles;for_collapse;for_naive;task_naive;task_de 128;65536;1024;64;3.14;3.21;2.98;2.87 ``` -The `reference/` binary reports a `lapacke` column (suppressed by `DISABLE_BLAS_REFERENCE=ON`) plus a `plasma` column when built with `ENABLE_PLASMA=ON`, with `tile_size = problem_size` and `n_tiles = 1`, so its runtime files merge cleanly with the tiled benchmarks on the `problem_size` key: +The `reference/` binary reports a `lapacke` column (suppressed by `ENABLE_LAPACKE=OFF`) plus a `plasma` column when built with `ENABLE_PLASMA=ON`, with `tile_size = problem_size` and `n_tiles = 1`, so its runtime files merge cleanly with the tiled benchmarks on the `problem_size` key: ``` threads;problem_size;tile_size;n_tiles;lapacke;plasma @@ -239,7 +239,7 @@ The same lines are also printed to stdout. └── adapter_cblas_fp64.cpp ``` -When `DISABLE_BLAS_REFERENCE=ON`, `adapter_cblas_fp64.cpp` and `validate.cpp` are still compiled and linked (they share cblas/lapacke symbols with PLASMA's BLAS dependency); only the runtime dispatch of the `lapacke` mode is skipped. +When `ENABLE_LAPACKE=OFF`, `adapter_cblas_fp64.cpp` and `validate.cpp` are still compiled and linked (they share cblas/lapacke symbols with PLASMA's BLAS dependency); only the runtime dispatch of the `lapacke` mode is skipped. ## Contributing diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index a9b5b90..e8045b8 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -12,9 +12,9 @@ option( "Build the PLASMA tiled-Cholesky reference variant in addition to the LAPACKE_dpotrf one" OFF) option( - DISABLE_BLAS_REFERENCE - "Skip the LAPACKE_dpotrf reference mode at runtime. 
Linking is unchanged (PLASMA and validation still need cblas/lapacke)." - OFF) + ENABLE_LAPACKE + "Run the LAPACKE_dpotrf reference mode at runtime (on by default). Linking is unchanged either way (PLASMA and validation still need cblas/lapacke)." + ON) option( ENABLE_VALIDATION "Compute ||A - L*L^T||_F / ||A||_F after each factorization (off by default)" @@ -103,7 +103,7 @@ if(BUILD_CORE) target_include_directories(cholesky_reference PRIVATE ${PLASMA_INCLUDE_DIR}) endif() - if(DISABLE_BLAS_REFERENCE) - target_compile_definitions(cholesky_reference PRIVATE DISABLE_BLAS_REFERENCE) + if(ENABLE_LAPACKE) + target_compile_definitions(cholesky_reference PRIVATE ENABLE_LAPACKE) endif() endif() diff --git a/reference/compile.sh b/reference/compile.sh index 2f20af3..b5e66c0 100755 --- a/reference/compile.sh +++ b/reference/compile.sh @@ -12,8 +12,8 @@ # ENABLE_PLASMA ON|OFF (default OFF) - also build the PLASMA # plasma_dpotrf variant (extra # 'plasma' column in the output) -# DISABLE_BLAS_REFERENCE ON|OFF (default OFF) - skip the LAPACKE_dpotrf -# reference at runtime +# ENABLE_LAPACKE ON|OFF (default ON) - run the LAPACKE_dpotrf +# reference mode at runtime # ENABLE_VALIDATION ON|OFF (default OFF) - residual check after each # factorization # @@ -21,7 +21,7 @@ # ./compile.sh # ENABLE_MKL=ON ./compile.sh # ENABLE_PLASMA=ON ./compile.sh -# DISABLE_BLAS_REFERENCE=ON ENABLE_PLASMA=ON ./compile.sh +# ENABLE_LAPACKE=OFF ENABLE_PLASMA=ON ./compile.sh # ENABLE_VALIDATION=ON ./compile.sh ################################################################################ set -e # Exit immediately if a command exits with a non-zero status. @@ -31,10 +31,10 @@ set -e # Exit immediately if a command exits with a non-zero status. 
################################################################################ : "${ENABLE_MKL:=OFF}" : "${ENABLE_PLASMA:=OFF}" -: "${DISABLE_BLAS_REFERENCE:=OFF}" +: "${ENABLE_LAPACKE:=ON}" : "${ENABLE_VALIDATION:=OFF}" -for var in ENABLE_MKL ENABLE_PLASMA DISABLE_BLAS_REFERENCE ENABLE_VALIDATION; do +for var in ENABLE_MKL ENABLE_PLASMA ENABLE_LAPACKE ENABLE_VALIDATION; do case "${!var}" in ON | OFF) ;; *) @@ -99,15 +99,15 @@ fi rm -rf build && mkdir build && cd build echo "CMake options:" -echo " ENABLE_MKL = $ENABLE_MKL" -echo " ENABLE_PLASMA = $ENABLE_PLASMA" -echo " DISABLE_BLAS_REFERENCE = $DISABLE_BLAS_REFERENCE" -echo " ENABLE_VALIDATION = $ENABLE_VALIDATION" +echo " ENABLE_MKL = $ENABLE_MKL" +echo " ENABLE_PLASMA = $ENABLE_PLASMA" +echo " ENABLE_LAPACKE = $ENABLE_LAPACKE" +echo " ENABLE_VALIDATION = $ENABLE_VALIDATION" cmake -DCMAKE_BUILD_TYPE=Release \ -DENABLE_MKL="$ENABLE_MKL" \ -DENABLE_PLASMA="$ENABLE_PLASMA" \ - -DDISABLE_BLAS_REFERENCE="$DISABLE_BLAS_REFERENCE" \ + -DENABLE_LAPACKE="$ENABLE_LAPACKE" \ -DENABLE_VALIDATION="$ENABLE_VALIDATION" \ .. 
make -j VERBOSE=1 diff --git a/reference/main.cpp b/reference/main.cpp index 38b768a..3c824c9 100644 --- a/reference/main.cpp +++ b/reference/main.cpp @@ -82,7 +82,7 @@ int main(int argc, char *argv[]) /////////////////////////////////////////////////////////////////// // Reference modes: std::vector modes = {}; -#ifndef DISABLE_BLAS_REFERENCE +#ifdef ENABLE_LAPACKE modes.push_back("lapacke"); #endif #ifdef ENABLE_PLASMA From 9e44539f7830b96820b972eead77b3c92d630a32 Mon Sep 17 00:00:00 2001 From: constracktor <74077030+constracktor@users.noreply.github.com> Date: Fri, 1 May 2026 11:12:32 +0200 Subject: [PATCH 12/13] Unify formatting --- hpx/.clang-format => .clang-format | 0 .github/workflows/lint.yml | 24 ++-- CMakeLists.txt | 14 +++ README.md | 17 ++- hpx/CMakeLists.txt | 17 +-- hpx/CMakePresets.json | 17 --- openmp/.clang-format | 174 ----------------------------- openmp/CMakeLists.txt | 17 +-- openmp/CMakePresets.json | 17 --- reference/.clang-format | 174 ----------------------------- reference/CMakeLists.txt | 17 +-- reference/CMakePresets.json | 17 --- reference/core/CMakeLists.txt | 5 +- 13 files changed, 50 insertions(+), 460 deletions(-) rename hpx/.clang-format => .clang-format (100%) create mode 100644 CMakeLists.txt delete mode 100644 hpx/CMakePresets.json delete mode 100644 openmp/.clang-format delete mode 100644 openmp/CMakePresets.json delete mode 100644 reference/.clang-format delete mode 100644 reference/CMakePresets.json diff --git a/hpx/.clang-format b/.clang-format similarity index 100% rename from hpx/.clang-format rename to .clang-format diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index cd6047c..54ed8f7 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -19,14 +19,18 @@ jobs: - name: Install cmakelang run: pip3 install cmakelang - - name: Configure dummy project - run: cd hpx && cmake -B build-fmt -DBUILD_CORE=OFF -DCLANG_FORMAT_PROGRAM=$(which clang-format-18) + - name: Configure top-level 
format project + # The repo-root CMakeLists.txt is a format-only coordinator that pulls + # in TheLartians/Format.cmake and exposes the *-clang-format and + # *-cmake-format targets. The actual builds live in openmp/, hpx/, + # and reference/ and are not configured here. + run: cmake -B build-fmt -DCLANG_FORMAT_PROGRAM=$(which clang-format-18) - name: Check code formatting id: clangformat run: | set +e - cd hpx && cmake --build build-fmt --target check-clang-format + cmake --build build-fmt --target check-clang-format status=$? if [ $status -ne 0 ]; then echo "Formatting errors found!" @@ -37,14 +41,14 @@ jobs: fi - name: Check CMake formatting - # Let's run the CMake formatting checks even if our code is mis-formatted. + # Run CMake formatting checks even if the C++ check failed. if: success() || steps.clangformat.conclusion == 'failure' - # Note that diff generation for cmake-format is somewhat broken in the upstream project. - # Diffs always end up with incorrect paths so manual fixes would be necessary, which we sidestep - # by re-formatting in-place and then using `git diff`. + # Diff generation for cmake-format is somewhat broken upstream (paths + # come out wrong), so we sidestep it by fixing in place and using + # `git diff` to produce the patch. run: | set +e - cd hpx && cmake --build build-fmt --target check-cmake-format + cmake --build build-fmt --target check-cmake-format status=$? if [ $status -ne 0 ]; then echo "Formatting errors found!" @@ -61,5 +65,5 @@ jobs: with: name: Formatting fix .patch files path: | - hpx/clang-format.patch - hpx/cmake-format.patch + clang-format.patch + cmake-format.patch diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..4184e03 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,14 @@ +cmake_minimum_required(VERSION 3.14) +# Top-level coordinator for *source formatting only*. 
Each subdirectory owns its +# own standalone CMake project (with its own dependencies and its own +# compile.sh); this file exists so the clang-format / cmake-format integration +# can be configured once for the whole repository. +project(cholesky_bench LANGUAGES NONE) + +include(FetchContent) +FetchContent_Declare( + format + GIT_REPOSITORY https://github.com/TheLartians/Format.cmake.git + GIT_TAG v1.8.1 + QUIET) +FetchContent_MakeAvailable(format) diff --git a/README.md b/README.md index c3eecfc..1096d29 100644 --- a/README.md +++ b/README.md @@ -178,9 +178,10 @@ The same lines are also printed to stdout. ``` . +├── .clang-format # repo-wide style; governs all three subtrees +├── CMakeLists.txt # top-level coordinator (formatting only; LANGUAGES NONE) ├── openmp/ │ ├── CMakeLists.txt -│ ├── CMakePresets.json │ ├── compile.sh # build script (gcc or llvm) │ ├── run.sh # SLURM job script │ ├── main.cpp @@ -199,7 +200,6 @@ The same lines are also printed to stdout. │ └── adapter_cblas_fp64.cpp ├── hpx/ │ ├── CMakeLists.txt -│ ├── CMakePresets.json │ ├── compile.sh # build script (gcc only) │ ├── run.sh # SLURM job script │ ├── main.cpp @@ -218,7 +218,6 @@ The same lines are also printed to stdout. │ └── adapter_cblas_fp64.cpp └── reference/ ├── CMakeLists.txt - ├── CMakePresets.json ├── compile.sh # build script (gcc only) ├── run.sh # SLURM job script ├── main.cpp @@ -241,6 +240,18 @@ The same lines are also printed to stdout. When `ENABLE_LAPACKE=OFF`, `adapter_cblas_fp64.cpp` and `validate.cpp` are still compiled and linked (they share cblas/lapacke symbols with PLASMA's BLAS dependency); only the runtime dispatch of the `lapacke` mode is skipped. +## Formatting + +A repository-wide [`.clang-format`](.clang-format) governs all three subtrees. 
The top-level [`CMakeLists.txt`](CMakeLists.txt) wires up `clang-format` and `cmake-format` targets via [Format.cmake](https://github.com/TheLartians/Format.cmake); configure once from the repo root and use the targets: + +```bash +cmake -B build-fmt +cmake --build build-fmt --target check-clang-format # CI-style check +cmake --build build-fmt --target fix-clang-format # apply formatting +``` + +Each subproject (`openmp/`, `hpx/`, `reference/`) is its own standalone CMake project with its own dependencies, so the top-level `CMakeLists.txt` only handles formatting — actual builds still happen from inside each subdirectory via its `compile.sh`. + ## Contributing We would be happy to expand Cholesky-Bench to additional asynchronous many-task (AMT) runtimes. If you have an implementation you would like to add, feel free to open a pull request. diff --git a/hpx/CMakeLists.txt b/hpx/CMakeLists.txt index b2b7fdc..356c529 100644 --- a/hpx/CMakeLists.txt +++ b/hpx/CMakeLists.txt @@ -15,9 +15,6 @@ option( DISABLE_COMPUTATION "Replace all BLAS/LAPACK calls and tile generation with no-ops; keeps the dataflow graph intact so HPX scheduling overhead can be measured in isolation" OFF) -option(ENABLE_FORMAT_TARGETS "Enable clang-format / cmake-format targets" - ${PROJECT_IS_TOP_LEVEL}) - if(ENABLE_VALIDATION AND DISABLE_COMPUTATION) message( FATAL_ERROR @@ -25,18 +22,8 @@ if(ENABLE_VALIDATION AND DISABLE_COMPUTATION) "residual validation needs a real factorization to check against.") endif() -if(ENABLE_FORMAT_TARGETS) - find_package(format QUIET) - if(NOT format_FOUND) - include(FetchContent) - FetchContent_Declare( - format - GIT_REPOSITORY https://github.com/TheLartians/Format.cmake.git - GIT_TAG v1.8.1 - QUIET) - FetchContent_MakeAvailable(format) - endif() -endif() +# clang-format / cmake-format integration is hoisted to the top-level +# CMakeLists.txt; configure from the repo root to use it. 
if(NOT CMAKE_SKIP_INSTALL_RULES) # Our installs follow the standard GNU directory layout. This include needs to diff --git a/hpx/CMakePresets.json b/hpx/CMakePresets.json deleted file mode 100644 index f3839f8..0000000 --- a/hpx/CMakePresets.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "version": 6, - "cmakeMinimumRequired": { - "major": 3, - "minor": 22, - "patch": 0 - }, - "configurePresets": [ - { - "name": "clang-tidy", - "hidden": true, - "cacheVariables": { - "CMAKE_CXX_CLANG_TIDY": "clang-tidy;--header-filter=^${sourceDir}/" - } - } - ] -} diff --git a/openmp/.clang-format b/openmp/.clang-format deleted file mode 100644 index e8d875c..0000000 --- a/openmp/.clang-format +++ /dev/null @@ -1,174 +0,0 @@ ---- -Language: Cpp -AccessModifierOffset: -2 -AlignAfterOpenBracket: Align -AlignArrayOfStructures: None -AlignConsecutiveAssignments: None -AlignConsecutiveBitFields: None -AlignConsecutiveDeclarations: None -AlignConsecutiveMacros: None -AlignConsecutiveShortCaseStatements: - Enabled: true - AcrossEmptyLines: false - AcrossComments: false - AlignCaseColons: false -AlignEscapedNewlines: Right -AlignOperands: Align -AlignTrailingComments: - Kind: Always -AllowAllArgumentsOnNextLine: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowBreakBeforeNoexceptSpecifier: OnlyWithParen -AllowShortBlocksOnASingleLine: Empty -AllowShortCaseLabelsOnASingleLine: true -AllowShortEnumsOnASingleLine: true -AllowShortFunctionsOnASingleLine: All -AllowShortIfStatementsOnASingleLine: Never -AllowShortLambdasOnASingleLine: All -AllowShortLoopsOnASingleLine: true -AlwaysBreakAfterDefinitionReturnType: None -AlwaysBreakBeforeMultilineStrings: false -AlwaysBreakAfterReturnType: None -AlwaysBreakTemplateDeclarations: Yes -BinPackArguments: false -BinPackParameters: false -BitFieldColonSpacing: Both -BraceWrapping: - AfterCaseLabel: false - AfterClass: true - AfterControlStatement: Always - AfterEnum: false - AfterFunction: true - AfterNamespace: true - AfterObjCDeclaration: true - 
AfterStruct: true - AfterUnion: true - AfterExternBlock: false - BeforeCatch: true - BeforeElse: true - BeforeLambdaBody: true - BeforeWhile: false - IndentBraces: false - SplitEmptyFunction: false - SplitEmptyRecord: false - SplitEmptyNamespace: false -BreakAfterAttributes: Never -BreakAfterJavaFieldAnnotations: false -BreakBeforeBinaryOperators: NonAssignment -BreakBeforeBraces: Custom -BreakBeforeConceptDeclarations: Always -BreakBeforeInlineASMColon: OnlyMultiline -BreakBeforeTernaryOperators: true -BreakConstructorInitializers: AfterColon -BreakInheritanceList: AfterComma -BreakStringLiterals: true -ColumnLimit: 120 -CommentPragmas: '^ IWYU pragma:' -CompactNamespaces: false -ConstructorInitializerIndentWidth: 4 -ContinuationIndentWidth: 4 -Cpp11BracedListStyle: false -DerivePointerAlignment: false -DisableFormat: false -EmptyLineAfterAccessModifier: Never -EmptyLineBeforeAccessModifier: LogicalBlock -FixNamespaceComments: true -ForEachMacros: [ 'foreach', 'Q_FOREACH', 'BOOST_FOREACH' ] -IfMacros: [ ] -IncludeBlocks: Regroup -IncludeCategories: - - Regex: '^"gprat/' - Priority: 1 - - Regex: '^"(tests|bindings)/' - Priority: 2 - - Regex: '^"(fmt|catch2|pybind)' - Priority: 3 - - Regex: '^.*' - Priority: 4 -IncludeIsMainRegex: '(Test)?$' -IncludeIsMainSourceRegex: '(\.cu|\.hip)' -IndentAccessModifiers: false -IndentCaseBlocks: true -IndentCaseLabels: true -IndentExternBlock: NoIndent -IndentGotoLabels: false -IndentPPDirectives: None -IndentRequiresClause: false -IndentWidth: 4 -IndentWrappedFunctionNames: false -InsertBraces: true -InsertNewlineAtEOF: true -InsertTrailingCommas: None -IntegerLiteralSeparator: - Binary: 8 - Decimal: 3 - DecimalMinDigits: 5 - Hex: -1 -KeepEmptyLinesAtEOF: false -KeepEmptyLinesAtTheStartOfBlocks: false -LambdaBodyIndentation: Signature -LineEnding: DeriveLF -MacroBlockBegin: '' -MacroBlockEnd: '' -Macros: [ ] -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -NamespaceMacros: [ ] -PPIndentWidth: -1 -PackConstructorInitializers: 
Never -PenaltyBreakAssignment: 2 -PenaltyBreakBeforeFirstCallParameter: 19 -PenaltyBreakComment: 300 -PenaltyBreakFirstLessLess: 120 -PenaltyBreakOpenParenthesis: 0 -PenaltyBreakString: 1000 -PenaltyBreakTemplateDeclaration: 10 -PenaltyExcessCharacter: 1000000 -PenaltyIndentedWhitespace: 1 -PenaltyReturnTypeOnItsOwnLine: 60 -PointerAlignment: Right -QualifierAlignment: Custom -QualifierOrder: [ 'inline', 'constexpr', 'static', 'friend', 'restrict', 'const', 'volatile', 'type' ] -ReferenceAlignment: Pointer -ReflowComments: true -RemoveBracesLLVM: false -RemoveParentheses: Leave -RemoveSemicolon: true -RequiresClausePosition: OwnLine -RequiresExpressionIndentation: OuterScope -SeparateDefinitionBlocks: Always -ShortNamespaceLines: 1 -SortIncludes: CaseInsensitive -SortUsingDeclarations: LexicographicNumeric -SpaceAfterCStyleCast: true -SpaceAfterLogicalNot: false -SpaceAfterTemplateKeyword: true -SpaceAroundPointerQualifiers: Default -SpaceBeforeAssignmentOperators: true -SpaceBeforeCaseColon: false -SpaceBeforeCpp11BracedList: false -SpaceBeforeCtorInitializerColon: true -SpaceBeforeInheritanceColon: true -SpaceBeforeJsonColon: false -SpaceBeforeParens: ControlStatements -SpaceBeforeRangeBasedForLoopColon: true -SpaceBeforeSquareBrackets: false -SpaceInEmptyBlock: true -SpacesBeforeTrailingComments: 2 -SpacesInAngles: false -SpacesInContainerLiterals: true -SpacesInLineCommentPrefix: - Minimum: 1 - Maximum: 1 -SpacesInParens: Never -SpacesInSquareBrackets: false -Standard: c++17 -StatementAttributeLikeMacros: [ ] -StatementMacros: [ 'Q_UNUSED', 'QT_REQUIRE_VERSION' ] -TabWidth: 4 -TypeNames: [ ] -TypenameMacros: [ ] -UseTab: Never -WhitespaceSensitiveMacros: [ 'STRINGIZE', 'PP_STRINGIZE', 'BOOST_PP_STRINGIZE' ] -... 
- diff --git a/openmp/CMakeLists.txt b/openmp/CMakeLists.txt index f506c0e..aba403a 100644 --- a/openmp/CMakeLists.txt +++ b/openmp/CMakeLists.txt @@ -19,9 +19,6 @@ option( ENABLE_DYNAMIC_SCHEDULE "Use schedule(dynamic, 1) on the trailing-update worksharing loops in for_collapse. OFF by default so GCC builds compile out of the box. Turn ON for LLVM builds where the dynamic schedule is supported and gives better load balancing." OFF) -option(ENABLE_FORMAT_TARGETS "Enable clang-format / cmake-format targets" - ${PROJECT_IS_TOP_LEVEL}) - if(ENABLE_VALIDATION AND DISABLE_COMPUTATION) message( FATAL_ERROR @@ -29,18 +26,8 @@ if(ENABLE_VALIDATION AND DISABLE_COMPUTATION) "residual validation needs a real factorization to check against.") endif() -if(ENABLE_FORMAT_TARGETS) - find_package(format QUIET) - if(NOT format_FOUND) - include(FetchContent) - FetchContent_Declare( - format - GIT_REPOSITORY https://github.com/TheLartians/Format.cmake.git - GIT_TAG v1.8.1 - QUIET) - FetchContent_MakeAvailable(format) - endif() -endif() +# clang-format / cmake-format integration is hoisted to the top-level +# CMakeLists.txt; configure from the repo root to use it. if(NOT CMAKE_SKIP_INSTALL_RULES) # Our installs follow the standard GNU directory layout. 
This include needs to diff --git a/openmp/CMakePresets.json b/openmp/CMakePresets.json deleted file mode 100644 index f3839f8..0000000 --- a/openmp/CMakePresets.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "version": 6, - "cmakeMinimumRequired": { - "major": 3, - "minor": 22, - "patch": 0 - }, - "configurePresets": [ - { - "name": "clang-tidy", - "hidden": true, - "cacheVariables": { - "CMAKE_CXX_CLANG_TIDY": "clang-tidy;--header-filter=^${sourceDir}/" - } - } - ] -} diff --git a/reference/.clang-format b/reference/.clang-format deleted file mode 100644 index e8d875c..0000000 --- a/reference/.clang-format +++ /dev/null @@ -1,174 +0,0 @@ ---- -Language: Cpp -AccessModifierOffset: -2 -AlignAfterOpenBracket: Align -AlignArrayOfStructures: None -AlignConsecutiveAssignments: None -AlignConsecutiveBitFields: None -AlignConsecutiveDeclarations: None -AlignConsecutiveMacros: None -AlignConsecutiveShortCaseStatements: - Enabled: true - AcrossEmptyLines: false - AcrossComments: false - AlignCaseColons: false -AlignEscapedNewlines: Right -AlignOperands: Align -AlignTrailingComments: - Kind: Always -AllowAllArgumentsOnNextLine: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowBreakBeforeNoexceptSpecifier: OnlyWithParen -AllowShortBlocksOnASingleLine: Empty -AllowShortCaseLabelsOnASingleLine: true -AllowShortEnumsOnASingleLine: true -AllowShortFunctionsOnASingleLine: All -AllowShortIfStatementsOnASingleLine: Never -AllowShortLambdasOnASingleLine: All -AllowShortLoopsOnASingleLine: true -AlwaysBreakAfterDefinitionReturnType: None -AlwaysBreakBeforeMultilineStrings: false -AlwaysBreakAfterReturnType: None -AlwaysBreakTemplateDeclarations: Yes -BinPackArguments: false -BinPackParameters: false -BitFieldColonSpacing: Both -BraceWrapping: - AfterCaseLabel: false - AfterClass: true - AfterControlStatement: Always - AfterEnum: false - AfterFunction: true - AfterNamespace: true - AfterObjCDeclaration: true - AfterStruct: true - AfterUnion: true - AfterExternBlock: false - 
BeforeCatch: true - BeforeElse: true - BeforeLambdaBody: true - BeforeWhile: false - IndentBraces: false - SplitEmptyFunction: false - SplitEmptyRecord: false - SplitEmptyNamespace: false -BreakAfterAttributes: Never -BreakAfterJavaFieldAnnotations: false -BreakBeforeBinaryOperators: NonAssignment -BreakBeforeBraces: Custom -BreakBeforeConceptDeclarations: Always -BreakBeforeInlineASMColon: OnlyMultiline -BreakBeforeTernaryOperators: true -BreakConstructorInitializers: AfterColon -BreakInheritanceList: AfterComma -BreakStringLiterals: true -ColumnLimit: 120 -CommentPragmas: '^ IWYU pragma:' -CompactNamespaces: false -ConstructorInitializerIndentWidth: 4 -ContinuationIndentWidth: 4 -Cpp11BracedListStyle: false -DerivePointerAlignment: false -DisableFormat: false -EmptyLineAfterAccessModifier: Never -EmptyLineBeforeAccessModifier: LogicalBlock -FixNamespaceComments: true -ForEachMacros: [ 'foreach', 'Q_FOREACH', 'BOOST_FOREACH' ] -IfMacros: [ ] -IncludeBlocks: Regroup -IncludeCategories: - - Regex: '^"gprat/' - Priority: 1 - - Regex: '^"(tests|bindings)/' - Priority: 2 - - Regex: '^"(fmt|catch2|pybind)' - Priority: 3 - - Regex: '^.*' - Priority: 4 -IncludeIsMainRegex: '(Test)?$' -IncludeIsMainSourceRegex: '(\.cu|\.hip)' -IndentAccessModifiers: false -IndentCaseBlocks: true -IndentCaseLabels: true -IndentExternBlock: NoIndent -IndentGotoLabels: false -IndentPPDirectives: None -IndentRequiresClause: false -IndentWidth: 4 -IndentWrappedFunctionNames: false -InsertBraces: true -InsertNewlineAtEOF: true -InsertTrailingCommas: None -IntegerLiteralSeparator: - Binary: 8 - Decimal: 3 - DecimalMinDigits: 5 - Hex: -1 -KeepEmptyLinesAtEOF: false -KeepEmptyLinesAtTheStartOfBlocks: false -LambdaBodyIndentation: Signature -LineEnding: DeriveLF -MacroBlockBegin: '' -MacroBlockEnd: '' -Macros: [ ] -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -NamespaceMacros: [ ] -PPIndentWidth: -1 -PackConstructorInitializers: Never -PenaltyBreakAssignment: 2 
-PenaltyBreakBeforeFirstCallParameter: 19 -PenaltyBreakComment: 300 -PenaltyBreakFirstLessLess: 120 -PenaltyBreakOpenParenthesis: 0 -PenaltyBreakString: 1000 -PenaltyBreakTemplateDeclaration: 10 -PenaltyExcessCharacter: 1000000 -PenaltyIndentedWhitespace: 1 -PenaltyReturnTypeOnItsOwnLine: 60 -PointerAlignment: Right -QualifierAlignment: Custom -QualifierOrder: [ 'inline', 'constexpr', 'static', 'friend', 'restrict', 'const', 'volatile', 'type' ] -ReferenceAlignment: Pointer -ReflowComments: true -RemoveBracesLLVM: false -RemoveParentheses: Leave -RemoveSemicolon: true -RequiresClausePosition: OwnLine -RequiresExpressionIndentation: OuterScope -SeparateDefinitionBlocks: Always -ShortNamespaceLines: 1 -SortIncludes: CaseInsensitive -SortUsingDeclarations: LexicographicNumeric -SpaceAfterCStyleCast: true -SpaceAfterLogicalNot: false -SpaceAfterTemplateKeyword: true -SpaceAroundPointerQualifiers: Default -SpaceBeforeAssignmentOperators: true -SpaceBeforeCaseColon: false -SpaceBeforeCpp11BracedList: false -SpaceBeforeCtorInitializerColon: true -SpaceBeforeInheritanceColon: true -SpaceBeforeJsonColon: false -SpaceBeforeParens: ControlStatements -SpaceBeforeRangeBasedForLoopColon: true -SpaceBeforeSquareBrackets: false -SpaceInEmptyBlock: true -SpacesBeforeTrailingComments: 2 -SpacesInAngles: false -SpacesInContainerLiterals: true -SpacesInLineCommentPrefix: - Minimum: 1 - Maximum: 1 -SpacesInParens: Never -SpacesInSquareBrackets: false -Standard: c++17 -StatementAttributeLikeMacros: [ ] -StatementMacros: [ 'Q_UNUSED', 'QT_REQUIRE_VERSION' ] -TabWidth: 4 -TypeNames: [ ] -TypenameMacros: [ ] -UseTab: Never -WhitespaceSensitiveMacros: [ 'STRINGIZE', 'PP_STRINGIZE', 'BOOST_PP_STRINGIZE' ] -... 
- diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index e8045b8..3f50b6b 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -19,21 +19,8 @@ option( ENABLE_VALIDATION "Compute ||A - L*L^T||_F / ||A||_F after each factorization (off by default)" OFF) -option(ENABLE_FORMAT_TARGETS "Enable clang-format / cmake-format targets" - ${PROJECT_IS_TOP_LEVEL}) - -if(ENABLE_FORMAT_TARGETS) - find_package(format QUIET) - if(NOT format_FOUND) - include(FetchContent) - FetchContent_Declare( - format - GIT_REPOSITORY https://github.com/TheLartians/Format.cmake.git - GIT_TAG v1.8.1 - QUIET) - FetchContent_MakeAvailable(format) - endif() -endif() +# clang-format / cmake-format integration is hoisted to the top-level +# CMakeLists.txt; configure from the repo root to use it. if(NOT CMAKE_SKIP_INSTALL_RULES) include(GNUInstallDirs) diff --git a/reference/CMakePresets.json b/reference/CMakePresets.json deleted file mode 100644 index f3839f8..0000000 --- a/reference/CMakePresets.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "version": 6, - "cmakeMinimumRequired": { - "major": 3, - "minor": 22, - "patch": 0 - }, - "configurePresets": [ - { - "name": "clang-tidy", - "hidden": true, - "cacheVariables": { - "CMAKE_CXX_CLANG_TIDY": "clang-tidy;--header-filter=^${sourceDir}/" - } - } - ] -} diff --git a/reference/core/CMakeLists.txt b/reference/core/CMakeLists.txt index eea5cbf..b74c17f 100644 --- a/reference/core/CMakeLists.txt +++ b/reference/core/CMakeLists.txt @@ -29,9 +29,8 @@ target_include_directories( # Link BLAS if(ENABLE_MKL) # Link threaded Intel oneMKL - target_link_libraries( - cholesky_core PUBLIC MKL::mkl_intel_lp64 MKL::mkl_core MKL::MKL - MKL::mkl_intel_thread) + target_link_libraries(cholesky_core PUBLIC MKL::mkl_intel_lp64 MKL::mkl_core + MKL::MKL MKL::mkl_intel_thread) else() # Link threaded OpenBLAS target_link_libraries(cholesky_core PUBLIC ${OpenBLAS_LIB}) From ad29902d6471b63c18fa1acbbd2eb977ba916284 Mon Sep 17 00:00:00 2001 From: 
constracktor <74077030+constracktor@users.noreply.github.com> Date: Fri, 1 May 2026 11:27:42 +0200 Subject: [PATCH 13/13] Final cleanup --- .github/workflows/lint.yml | 4 ---- README.md | 20 ++++---------------- hpx/CMakeLists.txt | 3 --- openmp/CMakeLists.txt | 3 --- reference/CMakeLists.txt | 2 -- 5 files changed, 4 insertions(+), 28 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 54ed8f7..0c82399 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -20,10 +20,6 @@ jobs: run: pip3 install cmakelang - name: Configure top-level format project - # The repo-root CMakeLists.txt is a format-only coordinator that pulls - # in TheLartians/Format.cmake and exposes the *-clang-format and - # *-cmake-format targets. The actual builds live in openmp/, hpx/, - # and reference/ and are not configured here. run: cmake -B build-fmt -DCLANG_FORMAT_PROGRAM=$(which clang-format-18) - name: Check code formatting diff --git a/README.md b/README.md index 1096d29..c2b694a 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Cholesky-Bench -Cholesky-Bench benchmarks right-looking tiled Cholesky factorization from fork-join to asynchronous tasks across several parallelism models, currently comparing OpenMP and HPX implementations side by side. A non-tiled parallel-BLAS reference is also included as a baseline. +Cholesky-Bench benchmarks right-looking tiled Cholesky factorization from fork-join to asynchronous tasks across several parallelism models, currently comparing OpenMP and HPX implementations side by side. A non-tiled parallel reference is also included as a baseline. ## Variants @@ -160,20 +160,8 @@ runtimes_reference_cholesky_.txt The suffix encodes which dimension is swept: `tile_` if tiles vary, `size_` if size varies, followed by the loop count. 
The file uses `;`-separated columns: -``` -threads;problem_size;tile_size;n_tiles;for_collapse;for_naive;task_naive;task_depend -128;65536;1024;64;3.14;3.21;2.98;2.87 -``` - The `reference/` binary reports a `lapacke` column (suppressed by `ENABLE_LAPACKE=OFF`) plus a `plasma` column when built with `ENABLE_PLASMA=ON`, with `tile_size = problem_size` and `n_tiles = 1`, so its runtime files merge cleanly with the tiled benchmarks on the `problem_size` key: -``` -threads;problem_size;tile_size;n_tiles;lapacke;plasma -128;65280;65280;1;5.21;68.12 -``` - -The same lines are also printed to stdout. - ## Repository structure ``` @@ -242,7 +230,7 @@ When `ENABLE_LAPACKE=OFF`, `adapter_cblas_fp64.cpp` and `validate.cpp` are still ## Formatting -A repository-wide [`.clang-format`](.clang-format) governs all three subtrees. The top-level [`CMakeLists.txt`](CMakeLists.txt) wires up `clang-format` and `cmake-format` targets via [Format.cmake](https://github.com/TheLartians/Format.cmake); configure once from the repo root and use the targets: +A repository-wide [`.clang-format`](.clang-format) governs all subtrees. The top-level [`CMakeLists.txt`](CMakeLists.txt) wires up `clang-format` and `cmake-format` targets via [Format.cmake](https://github.com/TheLartians/Format.cmake); configure once from the repo root and use the targets: ```bash cmake -B build-fmt @@ -250,11 +238,11 @@ cmake --build build-fmt --target check-clang-format # CI-style check cmake --build build-fmt --target fix-clang-format # apply formatting ``` -Each subproject (`openmp/`, `hpx/`, `reference/`) is its own standalone CMake project with its own dependencies, so the top-level `CMakeLists.txt` only handles formatting — actual builds still happen from inside each subdirectory via its `compile.sh`. +Each subproject (`openmp/`, `hpx/`, `reference/`) is its own standalone CMake project with its own dependencies, so the top-level `CMakeLists.txt` only handles formatting. 
The actual builds still happen from inside each subdirectory via its `compile.sh`. ## Contributing -We would be happy to expand Cholesky-Bench to additional asynchronous many-task (AMT) runtimes. If you have an implementation you would like to add, feel free to open a pull request. +We would be happy to expand Cholesky-Bench to additional asynchronous many-task (AMT) runtimes. If you would like to add an implementation, feel free to open a pull request. ## How to cite diff --git a/hpx/CMakeLists.txt b/hpx/CMakeLists.txt index 356c529..1cc87f5 100644 --- a/hpx/CMakeLists.txt +++ b/hpx/CMakeLists.txt @@ -22,9 +22,6 @@ if(ENABLE_VALIDATION AND DISABLE_COMPUTATION) "residual validation needs a real factorization to check against.") endif() -# clang-format / cmake-format integration is hoisted to the top-level -# CMakeLists.txt; configure from the repo root to use it. - if(NOT CMAKE_SKIP_INSTALL_RULES) # Our installs follow the standard GNU directory layout. This include needs to # come first since we need the CMAKE_INSTALL_* in the CMakeLists.txt of each diff --git a/openmp/CMakeLists.txt b/openmp/CMakeLists.txt index aba403a..038bb9e 100644 --- a/openmp/CMakeLists.txt +++ b/openmp/CMakeLists.txt @@ -26,9 +26,6 @@ if(ENABLE_VALIDATION AND DISABLE_COMPUTATION) "residual validation needs a real factorization to check against.") endif() -# clang-format / cmake-format integration is hoisted to the top-level -# CMakeLists.txt; configure from the repo root to use it. - if(NOT CMAKE_SKIP_INSTALL_RULES) # Our installs follow the standard GNU directory layout. 
This include needs to # come first since we need the CMAKE_INSTALL_* in the CMakeLists.txt of each diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index 3f50b6b..69996cd 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -19,8 +19,6 @@ option( ENABLE_VALIDATION "Compute ||A - L*L^T||_F / ||A||_F after each factorization (off by default)" OFF) -# clang-format / cmake-format integration is hoisted to the top-level -# CMakeLists.txt; configure from the repo root to use it. if(NOT CMAKE_SKIP_INSTALL_RULES) include(GNUInstallDirs)