diff --git a/hpx/.clang-format b/.clang-format similarity index 100% rename from hpx/.clang-format rename to .clang-format diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index cd6047c..0c82399 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -19,14 +19,14 @@ jobs: - name: Install cmakelang run: pip3 install cmakelang - - name: Configure dummy project - run: cd hpx && cmake -B build-fmt -DBUILD_CORE=OFF -DCLANG_FORMAT_PROGRAM=$(which clang-format-18) + - name: Configure top-level format project + run: cmake -B build-fmt -DCLANG_FORMAT_PROGRAM=$(which clang-format-18) - name: Check code formatting id: clangformat run: | set +e - cd hpx && cmake --build build-fmt --target check-clang-format + cmake --build build-fmt --target check-clang-format status=$? if [ $status -ne 0 ]; then echo "Formatting errors found!" @@ -37,14 +37,14 @@ jobs: fi - name: Check CMake formatting - # Let's run the CMake formatting checks even if our code is mis-formatted. + # Run CMake formatting checks even if the C++ check failed. if: success() || steps.clangformat.conclusion == 'failure' - # Note that diff generation for cmake-format is somewhat broken in the upstream project. - # Diffs always end up with incorrect paths so manual fixes would be necessary, which we sidestep - # by re-formatting in-place and then using `git diff`. + # Diff generation for cmake-format is somewhat broken upstream (paths + # come out wrong), so we sidestep it by fixing in place and using + # `git diff` to produce the patch. run: | set +e - cd hpx && cmake --build build-fmt --target check-cmake-format + cmake --build build-fmt --target check-cmake-format status=$? if [ $status -ne 0 ]; then echo "Formatting errors found!" 
@@ -61,5 +61,5 @@ jobs: with: name: Formatting fix .patch files path: | - hpx/clang-format.patch - hpx/cmake-format.patch + clang-format.patch + cmake-format.patch diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..4184e03 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,14 @@ +cmake_minimum_required(VERSION 3.14) +# Top-level coordinator for *source formatting only*. Each subdirectory owns its +# own standalone CMake project (with its own dependencies and its own +# compile.sh); this file exists so the clang-format / cmake-format integration +# can be configured once for the whole repository. +project(cholesky_bench LANGUAGES NONE) + +include(FetchContent) +FetchContent_Declare( + format + GIT_REPOSITORY https://github.com/TheLartians/Format.cmake.git + GIT_TAG v1.8.1 + QUIET) +FetchContent_MakeAvailable(format) diff --git a/README.md b/README.md index 7f06a60..c2b694a 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Cholesky-Bench -Cholesky-Bench benchmarks right-looking tiled Cholesky factorization from fork-join to asynchronous tasks across several parallelism models, currently comparing OpenMP and HPX implementations side by side. +Cholesky-Bench benchmarks right-looking tiled Cholesky factorization from fork-join to asynchronous tasks across several parallelism models, currently comparing OpenMP and HPX implementations side by side. A non-tiled parallel reference is also included as a baseline. ## Variants @@ -24,27 +24,49 @@ Cholesky-Bench benchmarks right-looking tiled Cholesky factorization from fork-j | `loop_two` | Collapsed fork-join with dynamic schedule for trailing-update | | `async_void` | Fully asynchronous tasking with dataflow using `hpx::shared_future` | +### Reference (`reference/`) + +| Mode | Description | +|------|-------------| +| `lapacke` | Single multithreaded `LAPACKE_dpotrf` call on the full matrix; no tiling. 
Parallelism is delegated entirely to a threaded BLAS (OpenBLAS built with `threads=openmp`, or threaded Intel oneMKL via `ENABLE_MKL=ON`). Enabled by default; disable with `ENABLE_LAPACKE=OFF`. | +| `plasma` | Single `plasma_dpotrf` call on the full matrix (PLASMA's high-level synchronous API). PLASMA does its own tiled, OpenMP-task-based parallel Cholesky internally; tile size is left at PLASMA's built-in default. Built only when `ENABLE_PLASMA=ON`. | + +This directory is the natural baseline for the OpenMP and HPX tiled implementations: the `lapacke` mode isolates the contribution of vendor-provided dense-LA parallelism, and the `plasma` mode adds a tiled-parallel competitor that uses the same OpenMP runtime as the in-house variants. + +#### PLASMA descriptor int32 overflow + +PLASMA 24.8.7's `plasma_desc_*_create()` routines compute their tile-storage size as `int * int` before casting to `size_t`, which silently overflows once the padded triangular tile-area exceeds `INT32_MAX`. With the default `nb=256`, the boundary is at `N=65280` (`mt=255`). + +The benchmark handles this transparently: + +- For sweep sizes `N` in `(65280, 65536]`, **only `plasma` is silently clamped down to 65280** for that iteration; `lapacke` runs at the full `N`. The `problem_size` column reports the original `N`, so `plasma`'s timing in this range corresponds to the 65280 compute even though the row is labelled with the input size. +- For `N > 65536` `plasma` records `nan`. `lapacke` is unaffected by the int32 ceiling and continues normally. + ## Dependencies -Both implementations share the same sequential BLAS backend and are built with CMake (≥ 3.23) and C++20. +All three implementations are built with CMake (≥ 3.23) and C++20. The OpenMP and HPX directories link against a *sequential* BLAS (parallelism is at the tile level); the `reference/` directory links against a *threaded* BLAS instead. 
-| Dependency | OpenMP | HPX | -|---|---|---| -| OpenBLAS 0.3.28 | ✓ (default) | ✓ (default) | -| Intel oneMKL | optional (`ENABLE_MKL=ON`) | optional (`ENABLE_MKL=ON`) | -| HPX 1.11.0 + jemalloc | — | ✓ | -| GCC 14.2.0 | ✓ | ✓ | -| LLVM/Clang 22.1.2 | optional | — | +| Dependency | OpenMP | HPX | Reference | +|---|---|---|---| +| OpenBLAS 0.3.28 (sequential) | ✓ (default) | ✓ (default) | — | +| OpenBLAS 0.3.28 (`threads=openmp`) | — | — | ✓ (default) | +| Intel oneMKL (sequential) | optional (`ENABLE_MKL=ON`) | optional (`ENABLE_MKL=ON`) | — | +| Intel oneMKL (`intel_thread`) | — | — | optional (`ENABLE_MKL=ON`) | +| PLASMA 24.8.7 | — | — | optional (`ENABLE_PLASMA=ON`) | +| HPX 1.11.0 + jemalloc | — | ✓ | — | +| GCC 14.2.0 | ✓ | ✓ | ✓ | +| LLVM/Clang 22.1.2 | optional | — | — | -Dependencies are managed via [Spack](https://spack.io/). The compile scripts auto-detect the host system and load the correct Spack environment. +Dependencies are managed via [Spack](https://spack.io/). ## Build -From within the `openmp/` or `hpx/` directory, run: +From within the `openmp/`, `hpx/`, or `reference/` directory, run: ```bash -./compile.sh [gcc|llvm] # OpenMP: gcc (default) or llvm -./compile.sh # HPX: always gcc +./compile.sh [gcc|llvm] # OpenMP: gcc (default) or llvm +./compile.sh # HPX: always gcc +./compile.sh # Reference: always gcc ``` The script clears and recreates the `build/` directory, then runs CMake in Release mode followed by a parallel make. @@ -55,10 +77,12 @@ These can be set as environment variables before calling `compile.sh`: | Option | Default | Description | |--------|---------|-------------| -| `ENABLE_VALIDATION` | `OFF` | After each factorization, compute the relative residual ‖A − LL^T‖_F / ‖A‖_F and warn if it exceeds 1e-10. Mutually exclusive with `DISABLE_COMPUTATION`. | -| `DISABLE_COMPUTATION` | `OFF` | Replace all BLAS/tile-generation calls with no-ops. The task graph and loops remain intact, so scheduling overhead can be measured in isolation. 
| -| `ENABLE_DYNAMIC_SCHEDULE` | `OFF` | *(OpenMP only)* Use `schedule(dynamic,1)` on the trailing-update worksharing loops in `for_collapse`. Requires the LLVM toolchain; rejected at compile time with GCC. | -| `ENABLE_MKL` | `OFF` | Link against Intel oneMKL instead of OpenBLAS. | +| `ENABLE_VALIDATION` | `OFF` | After each factorization, compute the relative residual ‖A − LL^T‖_F / ‖A‖_F and warn if it exceeds 1e-10. In `openmp/` and `hpx/`, mutually exclusive with `DISABLE_COMPUTATION`. | +| `DISABLE_COMPUTATION` | `OFF` | *(`openmp/` and `hpx/` only)* Replace all BLAS/tile-generation calls with no-ops. The task graph and loops remain intact, so scheduling overhead can be measured in isolation. | +| `ENABLE_DYNAMIC_SCHEDULE` | `OFF` | *(`openmp/` only)* Use `schedule(dynamic,1)` on the trailing-update worksharing loops in `for_collapse`. Requires the LLVM toolchain; rejected at compile time with GCC. | +| `ENABLE_MKL` | `OFF` | Link against Intel oneMKL instead of OpenBLAS. In `openmp/` and `hpx/` this is the *sequential* MKL; in `reference/` it is the *threaded* MKL. | +| `ENABLE_PLASMA` | `OFF` | *(`reference/` only)* Also build the PLASMA `plasma_dpotrf` variant. Adds a `plasma` column alongside `lapacke` in the runtime output. | +| `ENABLE_LAPACKE` | `ON` | *(`reference/` only)* Run the `lapacke` mode at runtime. Set `OFF` to skip it (e.g. when only `plasma` is wanted). Linking is unchanged either way — PLASMA and validation still need cblas/lapacke symbols. 
| **Examples:** @@ -71,6 +95,15 @@ ENABLE_DYNAMIC_SCHEDULE=ON ./compile.sh llvm # HPX: measure pure scheduling overhead DISABLE_COMPUTATION=ON ./compile.sh + +# Reference: threaded MKL baseline +ENABLE_MKL=ON ./compile.sh + +# Reference: also build the PLASMA tiled-Cholesky variant +ENABLE_PLASMA=ON ./compile.sh + +# Reference: PLASMA only, skip the LAPACKE_dpotrf column at runtime +ENABLE_LAPACKE=OFF ENABLE_PLASMA=ON ./compile.sh ``` ## Run @@ -89,16 +122,22 @@ OMP_NUM_THREADS=128 OMP_PROC_BIND=close OMP_PLACES=cores \ --hpx:threads=128 \ --loop=1 --size_start=1024 --size_stop=65536 \ --tiles_start=64 --tiles_stop=64 + +# Reference (parallel BLAS, no tiling) +OMP_NUM_THREADS=128 OMP_PROC_BIND=close OMP_PLACES=cores \ + ./build/cholesky_reference \ + --loop 1 --size_start 1024 --size_stop 65536 ``` ### Via SLURM -Both directories contain a `run.sh` that is a ready-to-submit SLURM batch script (128 CPUs, exclusive node, 144-hour wall time): +All three directories contain a `run.sh` that is a ready-to-submit SLURM batch script (128 CPUs, exclusive node, 144-hour wall time): ```bash -sbatch openmp/run.sh # gcc runtime (default) -sbatch openmp/run.sh llvm # llvm runtime +sbatch openmp/run.sh # gcc runtime (default) +sbatch openmp/run.sh llvm # llvm runtime sbatch hpx/run.sh +sbatch reference/run.sh # gcc runtime; defaults to N=65280 (see PLASMA boundary note) ``` ### Command-line arguments @@ -107,7 +146,7 @@ sbatch hpx/run.sh |----------|---------|-------------| | `--loop` / `--loop=` | 1 | Number of timed repetitions per configuration | | `--size_start` / `--size_stop` | 32 / 128 | Problem size range (doubled each step) | -| `--tiles_start` / `--tiles_stop` | 16 / 32 | Tile count range (doubled each step) | +| `--tiles_start` / `--tiles_stop` | 16 / 32 | Tile count range (doubled each step). Accepted but ignored by the `reference/` binary, which has no tiling axis. 
| ## Output @@ -116,24 +155,21 @@ Results are appended to a text file in the working directory: ``` runtimes_openmp_cholesky_.txt runtimes_hpx_cholesky_.txt +runtimes_reference_cholesky_.txt ``` The suffix encodes which dimension is swept: `tile_` if tiles vary, `size_` if size varies, followed by the loop count. The file uses `;`-separated columns: -``` -threads;problem_size;tile_size;n_tiles;for_collapse;for_naive;task_naive;task_depend -128;65536;1024;64;3.14;3.21;2.98;2.87 -``` - -The same lines are also printed to stdout. +The `reference/` binary reports a `lapacke` column (suppressed by `ENABLE_LAPACKE=OFF`) plus a `plasma` column when built with `ENABLE_PLASMA=ON`, with `tile_size = problem_size` and `n_tiles = 1`, so its runtime files merge cleanly with the tiled benchmarks on the `problem_size` key: ## Repository structure ``` . +├── .clang-format # repo-wide style; governs all three subtrees +├── CMakeLists.txt # top-level coordinator (formatting only; LANGUAGES NONE) ├── openmp/ │ ├── CMakeLists.txt -│ ├── CMakePresets.json │ ├── compile.sh # build script (gcc or llvm) │ ├── run.sh # SLURM job script │ ├── main.cpp @@ -150,9 +186,26 @@ The same lines are also printed to stdout. │ ├── tile_generation.cpp │ ├── validate.cpp │ └── adapter_cblas_fp64.cpp -└── hpx/ +├── hpx/ +│ ├── CMakeLists.txt +│ ├── compile.sh # build script (gcc only) +│ ├── run.sh # SLURM job script +│ ├── main.cpp +│ └── core/ +│ ├── include/ +│ │ ├── cholesky_factor.hpp +│ │ ├── functions.hpp +│ │ ├── tile_generation.hpp +│ │ ├── validate.hpp +│ │ └── adapter_cblas_fp64.hpp +│ └── src/ +│ ├── cholesky_factor.cpp +│ ├── functions.cpp +│ ├── tile_generation.cpp +│ ├── validate.cpp +│ └── adapter_cblas_fp64.cpp +└── reference/ ├── CMakeLists.txt - ├── CMakePresets.json ├── compile.sh # build script (gcc only) ├── run.sh # SLURM job script ├── main.cpp @@ -160,20 +213,36 @@ The same lines are also printed to stdout. 
├── include/ │ ├── cholesky_factor.hpp │ ├── functions.hpp - │ ├── tile_generation.hpp + │ ├── matrix_generation.hpp + │ ├── adapter_plasma_fp64.hpp # only used when ENABLE_PLASMA=ON │ ├── validate.hpp │ └── adapter_cblas_fp64.hpp └── src/ ├── cholesky_factor.cpp ├── functions.cpp - ├── tile_generation.cpp + ├── matrix_generation.cpp + ├── adapter_plasma_fp64.cpp # only built when ENABLE_PLASMA=ON ├── validate.cpp └── adapter_cblas_fp64.cpp ``` +When `ENABLE_LAPACKE=OFF`, `adapter_cblas_fp64.cpp` is still compiled and linked, as is `validate.cpp` when `ENABLE_VALIDATION=ON` (they share cblas/lapacke symbols with PLASMA's BLAS dependency); only the runtime dispatch of the `lapacke` mode is skipped. + +## Formatting + +A repository-wide [`.clang-format`](.clang-format) governs all subtrees. The top-level [`CMakeLists.txt`](CMakeLists.txt) wires up `clang-format` and `cmake-format` targets via [Format.cmake](https://github.com/TheLartians/Format.cmake); configure once from the repo root and use the targets: + +```bash +cmake -B build-fmt +cmake --build build-fmt --target check-clang-format # CI-style check +cmake --build build-fmt --target fix-clang-format # apply formatting +``` + +Each subproject (`openmp/`, `hpx/`, `reference/`) is its own standalone CMake project with its own dependencies, so the top-level `CMakeLists.txt` only handles formatting. The actual builds still happen from inside each subdirectory via its `compile.sh`. + ## Contributing -We would be happy to expand Cholesky-Bench to additional asynchronous many-task (AMT) runtimes. If you have an implementation you would like to add, feel free to open a pull request. +We would be happy to expand Cholesky-Bench to additional asynchronous many-task (AMT) runtimes. If you would like to add an implementation, feel free to open a pull request. 
## How to cite diff --git a/hpx/CMakeLists.txt b/hpx/CMakeLists.txt index b2b7fdc..1cc87f5 100644 --- a/hpx/CMakeLists.txt +++ b/hpx/CMakeLists.txt @@ -15,9 +15,6 @@ option( DISABLE_COMPUTATION "Replace all BLAS/LAPACK calls and tile generation with no-ops; keeps the dataflow graph intact so HPX scheduling overhead can be measured in isolation" OFF) -option(ENABLE_FORMAT_TARGETS "Enable clang-format / cmake-format targets" - ${PROJECT_IS_TOP_LEVEL}) - if(ENABLE_VALIDATION AND DISABLE_COMPUTATION) message( FATAL_ERROR @@ -25,19 +22,6 @@ if(ENABLE_VALIDATION AND DISABLE_COMPUTATION) "residual validation needs a real factorization to check against.") endif() -if(ENABLE_FORMAT_TARGETS) - find_package(format QUIET) - if(NOT format_FOUND) - include(FetchContent) - FetchContent_Declare( - format - GIT_REPOSITORY https://github.com/TheLartians/Format.cmake.git - GIT_TAG v1.8.1 - QUIET) - FetchContent_MakeAvailable(format) - endif() -endif() - if(NOT CMAKE_SKIP_INSTALL_RULES) # Our installs follow the standard GNU directory layout. 
This include needs to # come first since we need the CMAKE_INSTALL_* in the CMakeLists.txt of each diff --git a/hpx/CMakePresets.json b/hpx/CMakePresets.json deleted file mode 100644 index f3839f8..0000000 --- a/hpx/CMakePresets.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "version": 6, - "cmakeMinimumRequired": { - "major": 3, - "minor": 22, - "patch": 0 - }, - "configurePresets": [ - { - "name": "clang-tidy", - "hidden": true, - "cacheVariables": { - "CMAKE_CXX_CLANG_TIDY": "clang-tidy;--header-filter=^${sourceDir}/" - } - } - ] -} diff --git a/hpx/core/include/adapter_cblas_fp64.hpp b/hpx/core/include/adapter_cblas_fp64.hpp index 5440833..91ce5c9 100644 --- a/hpx/core/include/adapter_cblas_fp64.hpp +++ b/hpx/core/include/adapter_cblas_fp64.hpp @@ -126,7 +126,7 @@ void gemm(const vector &A, * @param dep_future dependency future to wait on before executing * @param A matrix to be factorized (mutated in-place) * @param N matrix dimension - * @return void future signalling completion + * @return void future signaling completion */ void_future potrf_f(void_future dep_future, vector &A, const int N); @@ -140,7 +140,7 @@ void_future potrf_f(void_future dep_future, vector &A, const int N); * @param M second dimension * @param transpose_L transpose flag for L * @param side_L side flag for L - * @return void future signalling completion + * @return void future signaling completion */ void_future trsm_f(void_future dep_L, void_future dep_A, @@ -158,7 +158,7 @@ void_future trsm_f(void_future dep_L, * @param A base matrix (mutated in-place) * @param B symmetric update matrix * @param N matrix dimension - * @return void future signalling completion + * @return void future signaling completion */ void_future syrk_f(void_future dep_A, void_future dep_B, vector &A, const vector &B, const int N); @@ -175,7 +175,7 @@ void_future syrk_f(void_future dep_A, void_future dep_B, vector &A, const vector * @param K third matrix dimension * @param transpose_A transpose flag for A * @param 
transpose_B transpose flag for B - * @return void future signalling completion + * @return void future signaling completion */ void_future gemm_f(void_future dep_A, diff --git a/openmp/.clang-format b/openmp/.clang-format deleted file mode 100644 index e8d875c..0000000 --- a/openmp/.clang-format +++ /dev/null @@ -1,174 +0,0 @@ ---- -Language: Cpp -AccessModifierOffset: -2 -AlignAfterOpenBracket: Align -AlignArrayOfStructures: None -AlignConsecutiveAssignments: None -AlignConsecutiveBitFields: None -AlignConsecutiveDeclarations: None -AlignConsecutiveMacros: None -AlignConsecutiveShortCaseStatements: - Enabled: true - AcrossEmptyLines: false - AcrossComments: false - AlignCaseColons: false -AlignEscapedNewlines: Right -AlignOperands: Align -AlignTrailingComments: - Kind: Always -AllowAllArgumentsOnNextLine: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowBreakBeforeNoexceptSpecifier: OnlyWithParen -AllowShortBlocksOnASingleLine: Empty -AllowShortCaseLabelsOnASingleLine: true -AllowShortEnumsOnASingleLine: true -AllowShortFunctionsOnASingleLine: All -AllowShortIfStatementsOnASingleLine: Never -AllowShortLambdasOnASingleLine: All -AllowShortLoopsOnASingleLine: true -AlwaysBreakAfterDefinitionReturnType: None -AlwaysBreakBeforeMultilineStrings: false -AlwaysBreakAfterReturnType: None -AlwaysBreakTemplateDeclarations: Yes -BinPackArguments: false -BinPackParameters: false -BitFieldColonSpacing: Both -BraceWrapping: - AfterCaseLabel: false - AfterClass: true - AfterControlStatement: Always - AfterEnum: false - AfterFunction: true - AfterNamespace: true - AfterObjCDeclaration: true - AfterStruct: true - AfterUnion: true - AfterExternBlock: false - BeforeCatch: true - BeforeElse: true - BeforeLambdaBody: true - BeforeWhile: false - IndentBraces: false - SplitEmptyFunction: false - SplitEmptyRecord: false - SplitEmptyNamespace: false -BreakAfterAttributes: Never -BreakAfterJavaFieldAnnotations: false -BreakBeforeBinaryOperators: NonAssignment -BreakBeforeBraces: 
Custom -BreakBeforeConceptDeclarations: Always -BreakBeforeInlineASMColon: OnlyMultiline -BreakBeforeTernaryOperators: true -BreakConstructorInitializers: AfterColon -BreakInheritanceList: AfterComma -BreakStringLiterals: true -ColumnLimit: 120 -CommentPragmas: '^ IWYU pragma:' -CompactNamespaces: false -ConstructorInitializerIndentWidth: 4 -ContinuationIndentWidth: 4 -Cpp11BracedListStyle: false -DerivePointerAlignment: false -DisableFormat: false -EmptyLineAfterAccessModifier: Never -EmptyLineBeforeAccessModifier: LogicalBlock -FixNamespaceComments: true -ForEachMacros: [ 'foreach', 'Q_FOREACH', 'BOOST_FOREACH' ] -IfMacros: [ ] -IncludeBlocks: Regroup -IncludeCategories: - - Regex: '^"gprat/' - Priority: 1 - - Regex: '^"(tests|bindings)/' - Priority: 2 - - Regex: '^"(fmt|catch2|pybind)' - Priority: 3 - - Regex: '^.*' - Priority: 4 -IncludeIsMainRegex: '(Test)?$' -IncludeIsMainSourceRegex: '(\.cu|\.hip)' -IndentAccessModifiers: false -IndentCaseBlocks: true -IndentCaseLabels: true -IndentExternBlock: NoIndent -IndentGotoLabels: false -IndentPPDirectives: None -IndentRequiresClause: false -IndentWidth: 4 -IndentWrappedFunctionNames: false -InsertBraces: true -InsertNewlineAtEOF: true -InsertTrailingCommas: None -IntegerLiteralSeparator: - Binary: 8 - Decimal: 3 - DecimalMinDigits: 5 - Hex: -1 -KeepEmptyLinesAtEOF: false -KeepEmptyLinesAtTheStartOfBlocks: false -LambdaBodyIndentation: Signature -LineEnding: DeriveLF -MacroBlockBegin: '' -MacroBlockEnd: '' -Macros: [ ] -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -NamespaceMacros: [ ] -PPIndentWidth: -1 -PackConstructorInitializers: Never -PenaltyBreakAssignment: 2 -PenaltyBreakBeforeFirstCallParameter: 19 -PenaltyBreakComment: 300 -PenaltyBreakFirstLessLess: 120 -PenaltyBreakOpenParenthesis: 0 -PenaltyBreakString: 1000 -PenaltyBreakTemplateDeclaration: 10 -PenaltyExcessCharacter: 1000000 -PenaltyIndentedWhitespace: 1 -PenaltyReturnTypeOnItsOwnLine: 60 -PointerAlignment: Right -QualifierAlignment: Custom 
-QualifierOrder: [ 'inline', 'constexpr', 'static', 'friend', 'restrict', 'const', 'volatile', 'type' ] -ReferenceAlignment: Pointer -ReflowComments: true -RemoveBracesLLVM: false -RemoveParentheses: Leave -RemoveSemicolon: true -RequiresClausePosition: OwnLine -RequiresExpressionIndentation: OuterScope -SeparateDefinitionBlocks: Always -ShortNamespaceLines: 1 -SortIncludes: CaseInsensitive -SortUsingDeclarations: LexicographicNumeric -SpaceAfterCStyleCast: true -SpaceAfterLogicalNot: false -SpaceAfterTemplateKeyword: true -SpaceAroundPointerQualifiers: Default -SpaceBeforeAssignmentOperators: true -SpaceBeforeCaseColon: false -SpaceBeforeCpp11BracedList: false -SpaceBeforeCtorInitializerColon: true -SpaceBeforeInheritanceColon: true -SpaceBeforeJsonColon: false -SpaceBeforeParens: ControlStatements -SpaceBeforeRangeBasedForLoopColon: true -SpaceBeforeSquareBrackets: false -SpaceInEmptyBlock: true -SpacesBeforeTrailingComments: 2 -SpacesInAngles: false -SpacesInContainerLiterals: true -SpacesInLineCommentPrefix: - Minimum: 1 - Maximum: 1 -SpacesInParens: Never -SpacesInSquareBrackets: false -Standard: c++17 -StatementAttributeLikeMacros: [ ] -StatementMacros: [ 'Q_UNUSED', 'QT_REQUIRE_VERSION' ] -TabWidth: 4 -TypeNames: [ ] -TypenameMacros: [ ] -UseTab: Never -WhitespaceSensitiveMacros: [ 'STRINGIZE', 'PP_STRINGIZE', 'BOOST_PP_STRINGIZE' ] -... - diff --git a/openmp/CMakeLists.txt b/openmp/CMakeLists.txt index f506c0e..038bb9e 100644 --- a/openmp/CMakeLists.txt +++ b/openmp/CMakeLists.txt @@ -19,9 +19,6 @@ option( ENABLE_DYNAMIC_SCHEDULE "Use schedule(dynamic, 1) on the trailing-update worksharing loops in for_collapse. OFF by default so GCC builds compile out of the box. Turn ON for LLVM builds where the dynamic schedule is supported and gives better load balancing." 
OFF) -option(ENABLE_FORMAT_TARGETS "Enable clang-format / cmake-format targets" - ${PROJECT_IS_TOP_LEVEL}) - if(ENABLE_VALIDATION AND DISABLE_COMPUTATION) message( FATAL_ERROR @@ -29,19 +26,6 @@ if(ENABLE_VALIDATION AND DISABLE_COMPUTATION) "residual validation needs a real factorization to check against.") endif() -if(ENABLE_FORMAT_TARGETS) - find_package(format QUIET) - if(NOT format_FOUND) - include(FetchContent) - FetchContent_Declare( - format - GIT_REPOSITORY https://github.com/TheLartians/Format.cmake.git - GIT_TAG v1.8.1 - QUIET) - FetchContent_MakeAvailable(format) - endif() -endif() - if(NOT CMAKE_SKIP_INSTALL_RULES) # Our installs follow the standard GNU directory layout. This include needs to # come first since we need the CMAKE_INSTALL_* in the CMakeLists.txt of each diff --git a/openmp/CMakePresets.json b/openmp/CMakePresets.json deleted file mode 100644 index f3839f8..0000000 --- a/openmp/CMakePresets.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "version": 6, - "cmakeMinimumRequired": { - "major": 3, - "minor": 22, - "patch": 0 - }, - "configurePresets": [ - { - "name": "clang-tidy", - "hidden": true, - "cacheVariables": { - "CMAKE_CXX_CLANG_TIDY": "clang-tidy;--header-filter=^${sourceDir}/" - } - } - ] -} diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt new file mode 100644 index 0000000..69996cd --- /dev/null +++ b/reference/CMakeLists.txt @@ -0,0 +1,94 @@ +cmake_minimum_required(VERSION 3.23) +project(cholesky_reference) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# What to build? +option(BUILD_CORE "Build the core library" ON) +option(ENABLE_MKL "Enable Intel oneMKL support (threaded)" OFF) +option( + ENABLE_PLASMA + "Build the PLASMA tiled-Cholesky reference variant in addition to the LAPACKE_dpotrf one" + OFF) +option( + ENABLE_LAPACKE + "Run the LAPACKE_dpotrf reference mode at runtime (on by default). Linking is unchanged either way (PLASMA and validation still need cblas/lapacke)." 
+ ON) +option( + ENABLE_VALIDATION + "Compute ||A - L*L^T||_F / ||A||_F after each factorization (off by default)" + OFF) + +if(NOT CMAKE_SKIP_INSTALL_RULES) + include(GNUInstallDirs) +endif() + +if(BUILD_CORE) + if(ENABLE_MKL) + set(MKL_INTERFACE_FULL "intel_lp64") + set(MKL_THREADING "intel_thread") + find_package(MKL CONFIG REQUIRED) + + if(MKL_FOUND) + message(STATUS "Intel oneMKL Library found (threaded: ${MKL_THREADING})") + else() + message(FATAL_ERROR "No BLAS Library found") + endif() + else() + find_library(OpenBLAS_LIB NAMES openblas REQUIRED) + + if(OpenBLAS_LIB) + message(STATUS "OpenBLAS Library found at ${OpenBLAS_LIB}") + find_path( + OpenBLAS_INCLUDE_DIR + NAMES cblas.h + PATH_SUFFIXES openblas) + if(NOT OpenBLAS_INCLUDE_DIR) + message(FATAL_ERROR "OpenBLAS include directory not found") + endif() + + message(STATUS "OpenBLAS include dir: ${OpenBLAS_INCLUDE_DIR}") + else() + message(FATAL_ERROR "No BLAS Library found") + endif() + endif() + + find_package(OpenMP REQUIRED) + + if(ENABLE_PLASMA) + find_path(PLASMA_INCLUDE_DIR plasma.h) + if(NOT PLASMA_INCLUDE_DIR) + message(FATAL_ERROR "ENABLE_PLASMA=ON but plasma.h was not found") + endif() + find_library(PLASMA_LIB NAMES plasma REQUIRED) + find_library(PLASMA_CORE_BLAS_LIB NAMES coreblas plasma_core_blas) + message(STATUS "PLASMA include dir: ${PLASMA_INCLUDE_DIR}") + message(STATUS "PLASMA library: ${PLASMA_LIB}") + if(PLASMA_CORE_BLAS_LIB) + message(STATUS "PLASMA coreblas library: ${PLASMA_CORE_BLAS_LIB}") + endif() + endif() + + add_subdirectory(core) + + # Add the executable + add_executable(cholesky_reference main.cpp) + + # Link the libraries + target_link_libraries(cholesky_reference PUBLIC Cholesky::core + OpenMP::OpenMP_CXX) + + if(ENABLE_VALIDATION) + target_compile_definitions(cholesky_reference PRIVATE ENABLE_VALIDATION) + endif() + + if(ENABLE_PLASMA) + target_compile_definitions(cholesky_reference PRIVATE ENABLE_PLASMA) + target_include_directories(cholesky_reference PRIVATE 
${PLASMA_INCLUDE_DIR}) + endif() + + if(ENABLE_LAPACKE) + target_compile_definitions(cholesky_reference PRIVATE ENABLE_LAPACKE) + endif() +endif() diff --git a/reference/compile.sh b/reference/compile.sh new file mode 100755 index 0000000..b5e66c0 --- /dev/null +++ b/reference/compile.sh @@ -0,0 +1,118 @@ +#!/bin/bash +# Usage: compile.sh +# +# Builds the parallel-BLAS reference benchmark: a single, threaded +# LAPACKE_dpotrf call on the full matrix, used as a baseline against the +# tiled fork-join and tasking implementations. +# +# CMake project options can be overridden via environment variables +# (defaults match the project's CMakeLists.txt defaults): +# ENABLE_MKL ON|OFF (default OFF) - link threaded Intel oneMKL +# instead of threaded OpenBLAS +# ENABLE_PLASMA ON|OFF (default OFF) - also build the PLASMA +# plasma_dpotrf variant (extra +# 'plasma' column in the output) +# ENABLE_LAPACKE ON|OFF (default ON) - run the LAPACKE_dpotrf +# reference mode at runtime +# ENABLE_VALIDATION ON|OFF (default OFF) - residual check after each +# factorization +# +# Examples: +# ./compile.sh +# ENABLE_MKL=ON ./compile.sh +# ENABLE_PLASMA=ON ./compile.sh +# ENABLE_LAPACKE=OFF ENABLE_PLASMA=ON ./compile.sh +# ENABLE_VALIDATION=ON ./compile.sh +################################################################################ +set -e # Exit immediately if a command exits with a non-zero status. + +################################################################################ +# CMake project options (env-var overridable; defaults match CMakeLists.txt) +################################################################################ +: "${ENABLE_MKL:=OFF}" +: "${ENABLE_PLASMA:=OFF}" +: "${ENABLE_LAPACKE:=ON}" +: "${ENABLE_VALIDATION:=OFF}" + +for var in ENABLE_MKL ENABLE_PLASMA ENABLE_LAPACKE ENABLE_VALIDATION; do + case "${!var}" in + ON | OFF) ;; + *) + echo "Error: $var must be ON or OFF (got '${!var}')." 
>&2 + exit 1 + ;; + esac +done + +################################################################################ +# Toolchain selection +################################################################################ +select_toolchain() { + module load gcc/14.2.0 + export CC=gcc + export CXX=g++ +} + +################################################################################ +# Configurations +# +# The reference benchmark uses a *threaded* BLAS because it operates on the +# full matrix as a single tile and does not parallelize at the tile level. +################################################################################ +if command -v spack &>/dev/null; then + echo "Spack command found. Loading libraries." + # Get current hostname + HOSTNAME=$(hostname -s) + + if [[ "$HOSTNAME" == "ipvs-epyc1" || "$HOSTNAME" == "ipvs-epyc2" ]]; then + # Compiler + select_toolchain + if [[ "$ENABLE_MKL" == "OFF" ]]; then + # OpenBLAS built with OpenMP threading + spack load openblas@0.3.28%gcc@14.2.0 threads=openmp ilp64=true + fi + if [[ "$ENABLE_PLASMA" == "ON" ]]; then + spack load plasma%gcc@14.2.0 ^openblas@0.3.28%gcc@14.2.0 threads=openmp + fi + + elif [[ "$HOSTNAME" == "nasrin0" || "$HOSTNAME" == "nasrin1" ]]; then + # Compiler + select_toolchain + if [[ "$ENABLE_MKL" == "OFF" ]]; then + # OpenBLAS built with OpenMP threading + spack load openblas@0.3.28%gcc@14.2.0 arch=linux-almalinux9-zen3 threads=openmp ilp64=true + fi + if [[ "$ENABLE_PLASMA" == "ON" ]]; then + spack load plasma%gcc@14.2.0 arch=linux-almalinux9-zen3 openblas@0.3.28%gcc@14.2.0 threads=openmp + fi + + else + echo "Hostname is $HOSTNAME — no action taken." + fi +else + echo "Spack command not found. Exiting." 
+fi
+
+################################################################################
+# Compile code
+################################################################################
+rm -rf build && mkdir build && cd build
+
+echo "CMake options:"
+echo " ENABLE_MKL = $ENABLE_MKL"
+echo " ENABLE_PLASMA = $ENABLE_PLASMA"
+echo " ENABLE_LAPACKE = $ENABLE_LAPACKE"
+echo " ENABLE_VALIDATION = $ENABLE_VALIDATION"
+
+cmake -DCMAKE_BUILD_TYPE=Release \
+      -DENABLE_MKL="$ENABLE_MKL" \
+      -DENABLE_PLASMA="$ENABLE_PLASMA" \
+      -DENABLE_LAPACKE="$ENABLE_LAPACKE" \
+      -DENABLE_VALIDATION="$ENABLE_VALIDATION" \
+      ..
+make -j VERBOSE=1
+cd ..
+
+# Launch Example
+# OMP_NUM_THREADS=128 OMP_PROC_BIND=close OMP_PLACES=cores \
+# ./build/cholesky_reference --size_start 1024 --size_stop 65536 --loop 1
diff --git a/reference/core/CMakeLists.txt b/reference/core/CMakeLists.txt
new file mode 100644
index 0000000..b74c17f
--- /dev/null
+++ b/reference/core/CMakeLists.txt
@@ -0,0 +1,71 @@
+set(SOURCE_FILES src/matrix_generation.cpp src/functions.cpp
+                 src/cholesky_factor.cpp src/adapter_cblas_fp64.cpp)
+
+if(ENABLE_VALIDATION)
+  list(APPEND SOURCE_FILES src/validate.cpp)
+endif()
+
+if(ENABLE_PLASMA)
+  list(APPEND SOURCE_FILES src/adapter_plasma_fp64.cpp)
+endif()
+
+add_library(cholesky_core STATIC ${SOURCE_FILES})
+
+set_property(TARGET cholesky_core PROPERTY EXPORT_NAME core)
+add_library(Cholesky::core ALIAS cholesky_core)
+
+# Add the headers as PRIVATE sources here so they show up in project files.
+# Can't use PUBLIC etc., see: https://stackoverflow.com/a/62465051
+file(GLOB_RECURSE header_files CONFIGURE_DEPENDS include/*.hpp)
+target_sources(cholesky_core PRIVATE ${header_files})
+
+# Link OpenMP libraries (used by the parallel matrix generator)
+target_link_libraries(cholesky_core PUBLIC OpenMP::OpenMP_CXX)
+
+# Include directories
+target_include_directories(
+  cholesky_core PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>")
+
+# Link BLAS
+if(ENABLE_MKL)
+  # Link threaded Intel oneMKL
+  target_link_libraries(cholesky_core PUBLIC MKL::mkl_intel_lp64 MKL::mkl_core
+                        MKL::MKL MKL::mkl_intel_thread)
+else()
+  # Link threaded OpenBLAS
+  target_link_libraries(cholesky_core PUBLIC ${OpenBLAS_LIB})
+  target_include_directories(cholesky_core PUBLIC ${OpenBLAS_INCLUDE_DIR})
+endif()
+
+if(ENABLE_MKL)
+  target_compile_definitions(cholesky_core PUBLIC ENABLE_MKL)
+endif()
+
+if(ENABLE_PLASMA)
+  target_compile_definitions(cholesky_core PUBLIC ENABLE_PLASMA)
+  target_include_directories(cholesky_core PUBLIC ${PLASMA_INCLUDE_DIR})
+  target_link_libraries(cholesky_core PUBLIC ${PLASMA_LIB})
+  if(PLASMA_CORE_BLAS_LIB)
+    target_link_libraries(cholesky_core PUBLIC ${PLASMA_CORE_BLAS_LIB})
+  endif()
+endif()
+
+target_compile_features(cholesky_core PUBLIC cxx_std_17)
+
+set_property(TARGET cholesky_core PROPERTY POSITION_INDEPENDENT_CODE ON)
+
+if(NOT CMAKE_SKIP_INSTALL_RULES)
+  install(
+    DIRECTORY include/
+    DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
+    COMPONENT Development)
+
+  install(
+    TARGETS cholesky_core
+    EXPORT CholeskyTargets
+    RUNTIME COMPONENT Runtime
+    LIBRARY COMPONENT Runtime NAMELINK_COMPONENT Development
+    ARCHIVE COMPONENT Development
+    INCLUDES
+    DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
+endif()
diff --git a/reference/core/include/adapter_cblas_fp64.hpp b/reference/core/include/adapter_cblas_fp64.hpp
new file mode 100644
index 0000000..11a79e4
--- /dev/null
+++ b/reference/core/include/adapter_cblas_fp64.hpp
@@ -0,0 +1,22 @@
+#ifndef CPU_ADAPTER_CBLAS_FP64_H
+#define CPU_ADAPTER_CBLAS_FP64_H
+
+#pragma once
+
+#include <vector>
+
+using vector = std::vector<double>;
+
+// LAPACK level 3 operations
+
+/**
+ * @brief FP64 In-place Cholesky decomposition of A using a threaded
+ * LAPACKE_dpotrf2 call.
+ *
+ * @param A row-major matrix of size N*N to be factorized in place
+ * @param N matrix dimension
+ * @throws std::runtime_error if LAPACKE_dpotrf2 returns a nonzero info code
+ */
+void lapacke_potrf(vector &A, const int N);
+
+#endif  // end of CPU_ADAPTER_CBLAS_FP64_H
diff --git a/reference/core/include/adapter_plasma_fp64.hpp b/reference/core/include/adapter_plasma_fp64.hpp
new file mode 100644
index 0000000..3edd661
--- /dev/null
+++ b/reference/core/include/adapter_plasma_fp64.hpp
@@ -0,0 +1,22 @@
+#ifndef CPU_ADAPTER_PLASMA_FP64_H
+#define CPU_ADAPTER_PLASMA_FP64_H
+
+#pragma once
+
+#include <vector>
+
+namespace cpu
+{
+
+/**
+ * @brief PLASMA tiled Cholesky on a row-major N x N buffer using the
+ * high-level synchronous API (plasma_dpotrf).
+ *
+ * Throws @c std::runtime_error before calling PLASMA when the descriptor
+ * size computation inside plasma_desc_*_create() would overflow int32.
+ *
+ */
+void plasma_potrf(std::vector<double> &A, int N);
+
+}  // end of namespace cpu
+#endif  // end of CPU_ADAPTER_PLASMA_FP64_H
diff --git a/reference/core/include/cholesky_factor.hpp b/reference/core/include/cholesky_factor.hpp
new file mode 100644
index 0000000..9bcf784
--- /dev/null
+++ b/reference/core/include/cholesky_factor.hpp
@@ -0,0 +1,43 @@
+#ifndef CPU_CHOLESKY_FACTOR_H
+#define CPU_CHOLESKY_FACTOR_H
+
+#pragma once
+
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace cpu
+{
+
+/**
+ * @brief Reference Cholesky variants.
+ *
+ * - lapacke : threaded LAPACKE_dpotrf2 call
+ * - plasma : plasma_dpotrf call (PLASMA's high-level
+ *            synchronous Cholesky over the OpenMP runtime).
+ */
+enum class Variant { lapacke, plasma };
+
+inline Variant to_variant(const std::string &s)
+{
+    if (s == "lapacke")
+    {
+        return Variant::lapacke;
+    }
+    if (s == "plasma")
+    {
+        return Variant::plasma;
+    }
+    throw std::invalid_argument("Unknown Variant: " + s);
+}
+
+/**
+ * @brief Run the requested reference variant on the full row-major N x N
+ *        matrix. Factorization is in place; @p matrix holds the lower
+ *        triangular factor L on return.
+ */
+void parallel_cholesky(Variant variant, std::vector<double> &matrix, int N);
+
+}  // end of namespace cpu
+#endif  // end of CPU_CHOLESKY_FACTOR_H
diff --git a/reference/core/include/functions.hpp b/reference/core/include/functions.hpp
new file mode 100644
index 0000000..f7e74ba
--- /dev/null
+++ b/reference/core/include/functions.hpp
@@ -0,0 +1,27 @@
+#ifndef CPU_FUNCTIONS_H
+#define CPU_FUNCTIONS_H
+
+#pragma once
+
+#include <cstddef>
+#include <string>
+#include <vector>
+
+namespace cpu
+{
+
+/**
+ * @brief Time a single call to the requested reference variant
+ *        ('lapacke' or 'plasma') on the @p matrix buffer (row-major, N x N).
+ *        The buffer is factorized in place.
+ *
+ * @param matrix row-major matrix; on return contains the lower-triangular factor L
+ * @param N matrix dimension
+ * @param variant which reference path to time
+ * @return wall-clock elapsed time in seconds
+ * @throws std::invalid_argument if @p variant is unknown or was not compiled in
+ */
+double cholesky(std::vector<double> &matrix, std::size_t N, const std::string &variant);
+
+}  // namespace cpu
+#endif  // end of CPU_FUNCTIONS_H
diff --git a/reference/core/include/matrix_generation.hpp b/reference/core/include/matrix_generation.hpp
new file mode 100644
index 0000000..967398b
--- /dev/null
+++ b/reference/core/include/matrix_generation.hpp
@@ -0,0 +1,22 @@
+#ifndef MATRIX_GENERATION_H
+#define MATRIX_GENERATION_H
+
+#pragma once
+
+#include <cstddef>
+#include <vector>
+
+/**
+ * @brief Generate a deterministic, dense, row-major SPD matrix of size N x N.
+ *
+ * Entries are uniform on [0, 1) using a per-row seed; the diagonal is shifted
+ * by +N to guarantee strict diagonal dominance and therefore symmetric
+ * positive definiteness. The result is stored as a single contiguous
+ * std::vector<double> of length N*N in row-major order.
+ *
+ * @param N matrix dimension
+ * @return owning row-major buffer of length N*N
+ */
+std::vector<double> gen_matrix(std::size_t N);
+
+#endif
diff --git a/reference/core/include/validate.hpp b/reference/core/include/validate.hpp
new file mode 100644
index 0000000..4c666d0
--- /dev/null
+++ b/reference/core/include/validate.hpp
@@ -0,0 +1,29 @@
+#ifndef CPU_VALIDATE_H
+#define CPU_VALIDATE_H
+
+#pragma once
+
+#include <cstddef>
+#include <vector>
+
+namespace cpu
+{
+
+/**
+ * @brief Compute the relative Cholesky residual ||A - L * L^T||_F / ||A||_F
+ *        for the dense, row-major reference factorization.
+ *
+ * The original A is regenerated on the fly with the same deterministic seed
+ * used by gen_matrix, so the caller does not have to keep a copy of A. The
+ * check itself allocates three temporary N x N buffers.
+ *
+ * @param N matrix dimension (must match the factorization)
+ * @param L row-major buffer of length N*N holding the factor in its lower triangle
+ *          (e.g. from LAPACKE_dpotrf2 with uplo='L'; the strictly upper triangle is ignored)
+ * @return relative Frobenius residual
+ */
+double cholesky_residual(std::size_t N, const std::vector<double> &L);
+
+}  // namespace cpu
+
+#endif  // end of CPU_VALIDATE_H
diff --git a/reference/core/src/adapter_cblas_fp64.cpp b/reference/core/src/adapter_cblas_fp64.cpp
new file mode 100644
index 0000000..264d442
--- /dev/null
+++ b/reference/core/src/adapter_cblas_fp64.cpp
@@ -0,0 +1,25 @@
+#include "adapter_cblas_fp64.hpp"
+
+#ifdef ENABLE_MKL
+// MKL CBLAS / LAPACKE
+#include "mkl_cblas.h"
+#include "mkl_lapacke.h"
+#else
+#include "cblas.h"
+#include "lapacke.h"
+#endif
+
+#include <stdexcept>
+#include <string>
+
+void lapacke_potrf(vector &A, const int N)
+{
+    // LAPACKE reports failure through its return value: info < 0 flags an
+    // illegal argument, info > 0 a leading minor that is not positive definite.
+    // Surface both instead of handing back a partially factorized buffer.
+    const auto info = LAPACKE_dpotrf2(LAPACK_ROW_MAJOR, 'L', N, A.data(), N);
+    if (info != 0)
+    {
+        throw std::runtime_error("LAPACKE_dpotrf2 failed with info=" + std::to_string(info));
+    }
+}
diff --git a/reference/core/src/adapter_plasma_fp64.cpp b/reference/core/src/adapter_plasma_fp64.cpp
new file mode 100644
index 0000000..06147ad
--- /dev/null
+++ b/reference/core/src/adapter_plasma_fp64.cpp
@@ -0,0 +1,30 @@
+#include "adapter_plasma_fp64.hpp"
+
+#include <plasma.h>
+#include <stdexcept>
+#include <string>
+
+namespace cpu
+{
+
+void plasma_potrf(std::vector<double> &A, int N)
+{
+    constexpr int k_plasma_max_n = 65'280;
+    if (N > k_plasma_max_n)
+    {
+        throw std::runtime_error(
+            "plasma_dpotrf: skipped to avoid PLASMA descriptor int32 overflow at N=" + std::to_string(N) +
+            " (max supported with default nb=256: " + std::to_string(k_plasma_max_n) + ")");
+    }
+
+    // PLASMA is column-major. Our buffer is row-major and the matrix is
+    // symmetric, so we can pass it through unchanged and ask PLASMA to write
+    // its result into the upper triangle of its column-major view
+    const int info = plasma_dpotrf(PlasmaUpper, N, A.data(), N);
+    if (info != 0)
+    {
+        throw std::runtime_error("plasma_dpotrf failed with info=" + std::to_string(info));
+    }
+}
+
+}  // end of namespace cpu
diff --git a/reference/core/src/cholesky_factor.cpp b/reference/core/src/cholesky_factor.cpp
new file mode 100644
index 0000000..3a20132
--- /dev/null
+++ b/reference/core/src/cholesky_factor.cpp
@@ -0,0 +1,29 @@
+#include "cholesky_factor.hpp"
+
+#include "adapter_cblas_fp64.hpp"
+#ifdef ENABLE_PLASMA
+#include "adapter_plasma_fp64.hpp"
+#endif
+
+#include <stdexcept>
+
+namespace cpu
+{
+
+void parallel_cholesky(Variant variant, std::vector<double> &matrix, int N)
+{
+    switch (variant)
+    {
+        case Variant::lapacke: lapacke_potrf(matrix, N); return;
+
+        case Variant::plasma:
+#ifdef ENABLE_PLASMA
+            plasma_potrf(matrix, N);
+            return;
+#else
+            throw std::invalid_argument("Variant 'plasma' requested but the binary was built without ENABLE_PLASMA=ON");
+#endif
+    }
+}
+
+}  // end of namespace cpu
diff --git a/reference/core/src/functions.cpp b/reference/core/src/functions.cpp
new file mode 100644
index 0000000..1f15f26
--- /dev/null
+++ b/reference/core/src/functions.cpp
@@ -0,0 +1,25 @@
+#include "functions.hpp"
+
+#include "cholesky_factor.hpp"
+#include <chrono>
+
+namespace cpu
+{
+
+double cholesky(std::vector<double> &matrix, std::size_t N, const std::string &variant)
+{
+    const Variant v = to_variant(variant);
+    // steady_clock is monotonic, so the measured interval cannot be skewed by
+    // system clock adjustments while the factorization runs.
+    const auto start = std::chrono::steady_clock::now();
+    ///////////////////////////////////////////////////////////////////////////
+    // Launch Cholesky decomposition: A = L * L^T
+    parallel_cholesky(v, matrix, static_cast<int>(N));
+    ///////////////////////////////////////////////////////////////////////////
+    const auto stop = std::chrono::steady_clock::now();
+    // Convert through duration<double> (seconds) rather than assuming that the
+    // clock's native tick is exactly one nanosecond.
+    return std::chrono::duration<double>(stop - start).count();
+}
+
+}  // end of namespace cpu
diff --git a/reference/core/src/matrix_generation.cpp b/reference/core/src/matrix_generation.cpp
new file mode 100644
index 0000000..a67ff5a
--- /dev/null
+++ b/reference/core/src/matrix_generation.cpp
@@ -0,0 +1,28 @@
+#include "matrix_generation.hpp"
+
+#include <random>
+#include <vector>
+
+std::vector<double> gen_matrix(std::size_t N)
+{
+    std::vector<double> A(N * N);
+
+    // The matrix is built row by row in parallel. Each row uses its own RNG
+    // seeded by the row index, so the matrix is deterministic and
+    // reproducible regardless of the number of threads.
+#pragma omp parallel for schedule(static)
+    for (std::size_t i = 0; i < N; ++i)
+    {
+        std::mt19937 generator(static_cast<std::mt19937::result_type>(i + 1));
+        std::uniform_real_distribution<double> distribute(0.0, 1.0);
+        for (std::size_t j = 0; j <= i; ++j)
+        {
+            const double v = distribute(generator);
+            A[i * N + j] = v;
+            A[j * N + i] = v;
+        }
+        A[i * N + i] += static_cast<double>(N);
+    }
+
+    return A;
+}
diff --git a/reference/core/src/validate.cpp b/reference/core/src/validate.cpp
new file mode 100644
index 0000000..8b1f647
--- /dev/null
+++ b/reference/core/src/validate.cpp
@@ -0,0 +1,68 @@
+#include "validate.hpp"
+
+#include "matrix_generation.hpp"
+
+#ifdef ENABLE_MKL
+#include "mkl_cblas.h"
+#else
+#include "cblas.h"
+#endif
+
+#include <cmath>
+#include <cstddef>
+#include <vector>
+
+namespace cpu
+{
+
+double cholesky_residual(std::size_t N, const std::vector<double> &L)
+{
+    // Build a working copy of L with its strictly upper triangle zeroed out.
+    std::vector<double> Lwork(L);
+    for (std::size_t i = 0; i < N; ++i)
+    {
+        for (std::size_t j = i + 1; j < N; ++j)
+        {
+            Lwork[i * N + j] = 0.0;
+        }
+    }
+
+    // Compute LLt = L * L^T (full N x N) with a single dgemm.
+    std::vector<double> LLt(N * N, 0.0);
+    cblas_dgemm(
+        CblasRowMajor,
+        CblasNoTrans,
+        CblasTrans,
+        static_cast<int>(N),
+        static_cast<int>(N),
+        static_cast<int>(N),
+        1.0,
+        Lwork.data(),
+        static_cast<int>(N),
+        Lwork.data(),
+        static_cast<int>(N),
+        0.0,
+        LLt.data(),
+        static_cast<int>(N));
+
+    // Regenerate the original matrix A deterministically and accumulate Frobenius
+    // norms of (A - LLt) and A.
+    const std::vector<double> A = gen_matrix(N);
+
+    double r_norm_sq = 0.0;
+    double a_norm_sq = 0.0;
+    for (std::size_t idx = 0; idx < A.size(); ++idx)
+    {
+        const double d = A[idx] - LLt[idx];
+        r_norm_sq += d * d;
+        a_norm_sq += A[idx] * A[idx];
+    }
+
+    if (a_norm_sq == 0.0)
+    {
+        return 0.0;
+    }
+    return std::sqrt(r_norm_sq / a_norm_sq);
+}
+
+}  // namespace cpu
diff --git a/reference/main.cpp b/reference/main.cpp
new file mode 100644
index 0000000..3c824c9
--- /dev/null
+++ b/reference/main.cpp
@@ -0,0 +1,164 @@
+#include "functions.hpp"
+#include "matrix_generation.hpp"
+#ifdef ENABLE_VALIDATION
+#include "validate.hpp"
+#endif
+#ifdef ENABLE_PLASMA
+#include <plasma.h>
+#endif
+#include <cstddef>
+#include <exception>
+#include <fstream>
+#include <iostream>
+#include <limits>
+#include <omp.h>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+int main(int argc, char *argv[])
+{
+    ///////////////////////////////////////////////////////////////////////////
+    // cmdline arguments
+    std::size_t loop = 1;
+    std::size_t size_start = 32, size_stop = 128;
+
+    for (int i = 1; i < argc; ++i)
+    {
+        std::string arg = argv[i];
+        if (arg == "--loop" && i + 1 < argc)
+        {
+            loop = std::stoul(argv[++i]);
+        }
+        else if (arg == "--size_start" && i + 1 < argc)
+        {
+            size_start = std::stoul(argv[++i]);
+        }
+        else if (arg == "--size_stop" && i + 1 < argc)
+        {
+            size_stop = std::stoul(argv[++i]);
+        }
+        else if ((arg == "--tiles_start" || arg == "--tiles_stop") && i + 1 < argc)
+        {
+            // Accept-and-ignore for CLI parity with the tiled variants.
+            ++i;
+        }
+    }
+    // A zero start size would never grow under the multiplicative step used
+    // below, so the size sweep would loop forever.
+    if (size_start == 0)
+    {
+        std::cerr << "Error: --size_start must be greater than zero" << std::endl;
+        return 1;
+    }
+    ///////////////////////////////////////////////////////////////////////////
+    // configuration
+    const std::size_t LOOP = loop;
+
+    const std::size_t START_SIZE = size_start;
+    const std::size_t STOP_SIZE = size_stop;
+    const std::size_t STEP_SIZE = 2;
+
+    // print and write results
+    bool HEADER_FLAG = true;
+    std::string runtime_file_path = "runtimes_reference_cholesky_";
+    if (START_SIZE != STOP_SIZE)
+    {
+        runtime_file_path += std::string("size_");
+    }
+    runtime_file_path += std::to_string(LOOP) + std::string(".txt");
+    std::ofstream runtime_file;
+    runtime_file.open(runtime_file_path, std::ios_base::app);
+
+#ifdef ENABLE_PLASMA
+    if (plasma_init() != 0)
+    {
+        throw std::runtime_error("plasma_init() failed");
+    }
+#endif
+
+    for (std::size_t size = START_SIZE; size <= STOP_SIZE; size = size * STEP_SIZE)
+    {
+        for (std::size_t l = 0; l < LOOP; l++)
+        {
+            std::string header = "threads;problem_size;tile_size;n_tiles";
+            std::string values = std::to_string(omp_get_max_threads());
+            values += std::string(";") + std::to_string(size);
+            values += std::string(";") + std::to_string(size);
+            values += std::string(";") + std::to_string(1);
+            ///////////////////////////////////////////////////////////////////
+            // Reference modes:
+            std::vector<std::string> modes = {};
+#ifdef ENABLE_LAPACKE
+            modes.push_back("lapacke");
+#endif
+#ifdef ENABLE_PLASMA
+            modes.push_back("plasma");
+#endif
+
+            for (const auto &mode : modes)
+            {
+                header += ";" + mode;
+                std::size_t mode_size = size;
+
+                // PLASMA's triangular descriptor allocation overflows int32 for
+                // N > 65280 with the default nb=256. For input sizes in
+                // (65280, 65536] we silently clamp PLASMA's working size down to
+                // 65280; the problem_size column still reports the requested size.
+                if (mode == "plasma" && mode_size > 65'280 && mode_size <= 65'536)
+                {
+                    mode_size = 65'280;
+                }
+
+                std::vector<double> matrix = gen_matrix(mode_size);
+                // NaN guard
+                double cholesky_cpu = std::numeric_limits<double>::quiet_NaN();
+                try
+                {
+                    cholesky_cpu = cpu::cholesky(matrix, mode_size, mode);
+                }
+                catch (const std::exception &e)
+                {
+                    std::cerr << "Error: variant '" << mode << "' failed at size=" << mode_size << ": " << e.what()
+                              << ". Recording NaN and continuing." << std::endl;
+                    values += ";nan";
+                    continue;
+                }
+
+                values += ";" + std::to_string(cholesky_cpu);
+
+#ifdef ENABLE_VALIDATION
+                // Validate by computing relative residual ||A - L L^T||_F / ||A||_F
+                constexpr double residual_tol = 1e-10;
+                const double residual = cpu::cholesky_residual(mode_size, matrix);
+                std::cout << "[validate] mode=" << mode << " size=" << mode_size << " residual=" << residual
+                          << std::endl;
+                if (!(residual <= residual_tol))  // catches NaN too
+                {
+                    std::cerr << "Validation warning: variant '" << mode << "' residual " << residual
+                              << " exceeds tolerance " << residual_tol << " (size=" << mode_size << ")" << std::endl;
+                }
+#endif
+            }
+            ///////////////////////////////////////////////////////////////////
+            // print/write header only once
+            if (HEADER_FLAG)
+            {
+                HEADER_FLAG = false;
+                std::cout << header << std::endl;
+                runtime_file << header << std::endl;
+            }
+            // print/write runtimes
+            std::cout << values << std::endl;
+            runtime_file << values << std::endl;
+        }
+    }
+
+    runtime_file.close();
+
+#ifdef ENABLE_PLASMA
+    plasma_finalize();
+#endif
+
+    return 0;
+}
diff --git a/reference/run.sh b/reference/run.sh
new file mode 100755
index 0000000..0b5c772
--- /dev/null
+++ b/reference/run.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+#SBATCH --job-name=cholesky_reference
+#SBATCH --output=logs/cholesky_reference_%j.out
+#SBATCH --error=logs/cholesky_reference_%j.err
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=128
+#SBATCH --time=144:00:00
+#SBATCH --exclusive
+#
+# Usage: run.sh
+#
+# Submit example (from the reference/ directory; Slurm does not create the
+# logs/ directory for --output/--error, so create it first):
+#   mkdir -p logs && sbatch run.sh
+
+set -e # Exit immediately if a command exits with a non-zero status.
+
+################################################################################
+# Toolchain runtime selection
+################################################################################
+module load gcc/14.2.0
+
+# Project root: the current working directory, which for an sbatch job is by
+# default the directory the job was submitted from (build/ must live there).
+SCRIPT_DIR="$(pwd)"
+
+# OpenMP settings: follow the Slurm allocation when available and fall back to
+# the 128 threads requested in the #SBATCH header above.
+export OMP_NUM_THREADS="${SLURM_CPUS_PER_TASK:-128}"
+export OMP_PROC_BIND=close
+export OMP_PLACES=cores
+
+# Threaded MKL (ENABLE_MKL=ON builds) gets the same thread count unless the
+# caller already exported MKL_NUM_THREADS.
+export MKL_NUM_THREADS=${MKL_NUM_THREADS:-$OMP_NUM_THREADS}
+
+echo "Running with gcc runtime"
+
+# Run executable
+srun --cpu-bind=cores "$SCRIPT_DIR/build/cholesky_reference" \
+    --loop 20 \
+    --size_start 1024 \
+    --size_stop 65536