From 4df38516281858214f7aa4752d172bf4b75a7d4d Mon Sep 17 00:00:00 2001
From: constracktor <74077030+constracktor@users.noreply.github.com>
Date: Wed, 29 Apr 2026 10:19:43 +0200
Subject: [PATCH 01/13] Initial reference draft

---
 reference/.clang-format                       | 174 ++++++++++++++++++
 reference/CMakeLists.txt                      |  91 +++++++++
 reference/CMakePresets.json                   |  17 ++
 reference/compile.sh                          |  99 ++++++++++
 reference/core/CMakeLists.txt                 |  62 +++++++
 reference/core/include/adapter_cblas_fp64.hpp |  23 +++
 reference/core/include/cholesky_factor.hpp    |  20 ++
 reference/core/include/functions.hpp          |  23 +++
 reference/core/include/matrix_generation.hpp  |  26 +++
 reference/core/include/validate.hpp           |  28 +++
 reference/core/src/adapter_cblas_fp64.cpp     |  19 ++
 reference/core/src/cholesky_factor.cpp        |  15 ++
 reference/core/src/functions.cpp              |  20 ++
 reference/core/src/matrix_generation.cpp      |  32 ++++
 reference/core/src/validate.cpp               |  72 ++++++++
 reference/main.cpp                            | 119 ++++++++++++
 reference/run.sh                              |  47 +++++
 17 files changed, 887 insertions(+)
 create mode 100644 reference/.clang-format
 create mode 100644 reference/CMakeLists.txt
 create mode 100644 reference/CMakePresets.json
 create mode 100755 reference/compile.sh
 create mode 100644 reference/core/CMakeLists.txt
 create mode 100644 reference/core/include/adapter_cblas_fp64.hpp
 create mode 100644 reference/core/include/cholesky_factor.hpp
 create mode 100644 reference/core/include/functions.hpp
 create mode 100644 reference/core/include/matrix_generation.hpp
 create mode 100644 reference/core/include/validate.hpp
 create mode 100644 reference/core/src/adapter_cblas_fp64.cpp
 create mode 100644 reference/core/src/cholesky_factor.cpp
 create mode 100644 reference/core/src/functions.cpp
 create mode 100644 reference/core/src/matrix_generation.cpp
 create mode 100644 reference/core/src/validate.cpp
 create mode 100644 reference/main.cpp
 create mode 100755 reference/run.sh

diff --git a/reference/.clang-format b/reference/.clang-format
new file mode 100644
index 0000000..e8d875c
--- /dev/null
+++ b/reference/.clang-format
@@ -0,0 +1,174 @@
+---
+Language: Cpp
+AccessModifierOffset: -2
+AlignAfterOpenBracket: Align
+AlignArrayOfStructures: None
+AlignConsecutiveAssignments: None
+AlignConsecutiveBitFields: None
+AlignConsecutiveDeclarations: None
+AlignConsecutiveMacros: None
+AlignConsecutiveShortCaseStatements:
+  Enabled: true
+  AcrossEmptyLines: false
+  AcrossComments: false
+  AlignCaseColons: false
+AlignEscapedNewlines: Right
+AlignOperands: Align
+AlignTrailingComments:
+  Kind: Always
+AllowAllArgumentsOnNextLine: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowBreakBeforeNoexceptSpecifier: OnlyWithParen
+AllowShortBlocksOnASingleLine: Empty
+AllowShortCaseLabelsOnASingleLine: true
+AllowShortEnumsOnASingleLine: true
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLambdasOnASingleLine: All
+AllowShortLoopsOnASingleLine: true
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakAfterReturnType: None
+AlwaysBreakTemplateDeclarations: Yes
+BinPackArguments: false
+BinPackParameters: false
+BitFieldColonSpacing: Both
+BraceWrapping:
+  AfterCaseLabel: false
+  AfterClass: true
+  AfterControlStatement: Always
+  AfterEnum: false
+  AfterFunction: true
+  AfterNamespace: true
+  AfterObjCDeclaration: true
+  AfterStruct: true
+  AfterUnion: true
+  AfterExternBlock: false
+  BeforeCatch: true
+  BeforeElse: true
+  BeforeLambdaBody: true
+  BeforeWhile: false
+  IndentBraces: false
+  SplitEmptyFunction: false
+  SplitEmptyRecord: false
+  SplitEmptyNamespace: false
+BreakAfterAttributes: Never
+BreakAfterJavaFieldAnnotations: false
+BreakBeforeBinaryOperators: NonAssignment
+BreakBeforeBraces: Custom
+BreakBeforeConceptDeclarations: Always
+BreakBeforeInlineASMColon: OnlyMultiline
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializers: AfterColon
+BreakInheritanceList: AfterComma
+BreakStringLiterals: true
+ColumnLimit: 120
+CommentPragmas: '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: false
+DerivePointerAlignment: false
+DisableFormat: false
+EmptyLineAfterAccessModifier: Never
+EmptyLineBeforeAccessModifier: LogicalBlock
+FixNamespaceComments: true
+ForEachMacros: [ 'foreach', 'Q_FOREACH', 'BOOST_FOREACH' ]
+IfMacros: [ ]
+IncludeBlocks: Regroup
+IncludeCategories:
+  - Regex: '^"gprat/'
+    Priority: 1
+  - Regex: '^"(tests|bindings)/'
+    Priority: 2
+  - Regex: '^"(fmt|catch2|pybind)'
+    Priority: 3
+  - Regex: '^.*'
+    Priority: 4
+IncludeIsMainRegex: '(Test)?$'
+IncludeIsMainSourceRegex: '(\.cu|\.hip)'
+IndentAccessModifiers: false
+IndentCaseBlocks: true
+IndentCaseLabels: true
+IndentExternBlock: NoIndent
+IndentGotoLabels: false
+IndentPPDirectives: None
+IndentRequiresClause: false
+IndentWidth: 4
+IndentWrappedFunctionNames: false
+InsertBraces: true
+InsertNewlineAtEOF: true
+InsertTrailingCommas: None
+IntegerLiteralSeparator:
+  Binary: 8
+  Decimal: 3
+  DecimalMinDigits: 5
+  Hex: -1
+KeepEmptyLinesAtEOF: false
+KeepEmptyLinesAtTheStartOfBlocks: false
+LambdaBodyIndentation: Signature
+LineEnding: DeriveLF
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+Macros: [ ]
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+NamespaceMacros: [ ]
+PPIndentWidth: -1
+PackConstructorInitializers: Never
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakOpenParenthesis: 0
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyIndentedWhitespace: 1
+PenaltyReturnTypeOnItsOwnLine: 60
+PointerAlignment: Right
+QualifierAlignment: Custom
+QualifierOrder: [ 'inline', 'constexpr', 'static', 'friend', 'restrict', 'const', 'volatile', 'type' ]
+ReferenceAlignment: Pointer
+ReflowComments: true
+RemoveBracesLLVM: false
+RemoveParentheses: Leave
+RemoveSemicolon: true
+RequiresClausePosition: OwnLine
+RequiresExpressionIndentation: OuterScope
+SeparateDefinitionBlocks: Always
+ShortNamespaceLines: 1
+SortIncludes: CaseInsensitive
+SortUsingDeclarations: LexicographicNumeric
+SpaceAfterCStyleCast: true
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: true
+SpaceAroundPointerQualifiers: Default
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCaseColon: false
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeJsonColon: false
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceBeforeSquareBrackets: false
+SpaceInEmptyBlock: true
+SpacesBeforeTrailingComments: 2
+SpacesInAngles: false
+SpacesInContainerLiterals: true
+SpacesInLineCommentPrefix:
+  Minimum: 1
+  Maximum: 1
+SpacesInParens: Never
+SpacesInSquareBrackets: false
+Standard: c++17
+StatementAttributeLikeMacros: [ ]
+StatementMacros: [ 'Q_UNUSED', 'QT_REQUIRE_VERSION' ]
+TabWidth: 4
+TypeNames: [ ]
+TypenameMacros: [ ]
+UseTab: Never
+WhitespaceSensitiveMacros: [ 'STRINGIZE', 'PP_STRINGIZE', 'BOOST_PP_STRINGIZE' ]
+...
+
diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt
new file mode 100644
index 0000000..e63612f
--- /dev/null
+++ b/reference/CMakeLists.txt
@@ -0,0 +1,91 @@
+cmake_minimum_required(VERSION 3.23)
+project(cholesky_reference)
+
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# What to build?
+option(BUILD_CORE "Build the core library" ON)
+option(ENABLE_MKL "Enable Intel oneMKL support (threaded)" OFF)
+option(
+  ENABLE_VALIDATION
+  "Compute ||A - L*L^T||_F / ||A||_F after each factorization (off by default)"
+  OFF)
+option(ENABLE_FORMAT_TARGETS "Enable clang-format / cmake-format targets"
+       ${PROJECT_IS_TOP_LEVEL})
+
+if(ENABLE_FORMAT_TARGETS)
+  find_package(format QUIET)
+  if(NOT format_FOUND)
+    include(FetchContent)
+    FetchContent_Declare(
+      format
+      GIT_REPOSITORY https://github.com/TheLartians/Format.cmake.git
+      GIT_TAG v1.8.1
+      QUIET)
+    FetchContent_MakeAvailable(format)
+  endif()
+endif()
+
+if(NOT CMAKE_SKIP_INSTALL_RULES)
+  # Our installs follow the standard GNU directory layout. This include needs to
+  # come first since we need the CMAKE_INSTALL_* in the CMakeLists.txt of each
+  # target.
+  include(GNUInstallDirs)
+endif()
+
+if(BUILD_CORE)
+  if(ENABLE_MKL)
+    # Threaded Intel oneMKL: ask MKL to use its OpenMP runtime ('intel_thread').
+    # This is the only difference from the OpenMP/HPX builds, which pin
+    # MKL_THREADING=sequential because they parallelise at the tile level.
+    # Here the parallelism lives inside dpotrf itself, so we want the
+    # vendor-threaded backend.
+    set(MKL_INTERFACE_FULL "intel_lp64")
+    set(MKL_THREADING "intel_thread")
+    find_package(MKL CONFIG REQUIRED)
+
+    if(MKL_FOUND)
+      message(STATUS "Intel oneMKL Library found (threaded: ${MKL_THREADING})")
+    else()
+      message(FATAL_ERROR "No BLAS Library found")
+    endif()
+  else()
+    # Threaded OpenBLAS. The library name is the same as the sequential one,
+    # but the Spack environment loaded by compile.sh selects an OpenBLAS built
+    # with threads=openmp.
+    find_library(OpenBLAS_LIB NAMES openblas REQUIRED)
+
+    if(OpenBLAS_LIB)
+      message(STATUS "OpenBLAS Library found at ${OpenBLAS_LIB}")
+      find_path(
+        OpenBLAS_INCLUDE_DIR
+        NAMES cblas.h
+        PATH_SUFFIXES openblas)
+      if(NOT OpenBLAS_INCLUDE_DIR)
+        message(FATAL_ERROR "OpenBLAS include directory not found")
+      endif()
+
+      message(STATUS "OpenBLAS include dir: ${OpenBLAS_INCLUDE_DIR}")
+    else()
+      message(FATAL_ERROR "No BLAS Library found")
+    endif()
+  endif()
+
+  # OpenMP is required for the matrix-generation parallel loop and to pick up
+  # the OpenMP runtime that threaded OpenBLAS / threaded MKL share.
+  find_package(OpenMP REQUIRED)
+
+  add_subdirectory(core)
+
+  # Benchmark driver executable, built from main.cpp.
+  add_executable(cholesky_reference main.cpp)
+
+  # PRIVATE: an executable has no consumers, so nothing needs to propagate.
+  target_link_libraries(cholesky_reference PRIVATE Cholesky::core
+                                                   OpenMP::OpenMP_CXX)
+
+  if(ENABLE_VALIDATION)
+    target_compile_definitions(cholesky_reference PRIVATE ENABLE_VALIDATION)
+  endif()
+endif()
diff --git a/reference/CMakePresets.json b/reference/CMakePresets.json
new file mode 100644
index 0000000..f3839f8
--- /dev/null
+++ b/reference/CMakePresets.json
@@ -0,0 +1,17 @@
+{
+  "version": 6,
+  "cmakeMinimumRequired": {
+    "major": 3,
+    "minor": 25,
+    "patch": 0
+  },
+  "configurePresets": [
+    {
+      "name": "clang-tidy",
+      "hidden": true,
+      "cacheVariables": {
+        "CMAKE_CXX_CLANG_TIDY": "clang-tidy;--header-filter=^${sourceDir}/"
+      }
+    }
+  ]
+}
diff --git a/reference/compile.sh b/reference/compile.sh
new file mode 100755
index 0000000..f896d8c
--- /dev/null
+++ b/reference/compile.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+# Usage: compile.sh
+#
+# Builds the parallel-BLAS reference benchmark: a single threaded
+# LAPACKE_dpotrf call on the full matrix, used as a baseline against the
+# tiled OpenMP / HPX implementations. GCC only.
+#
+# CMake project options can be overridden via environment variables
+# (defaults match the project's CMakeLists.txt defaults):
+#   ENABLE_MKL          ON|OFF  (default OFF) - link threaded Intel oneMKL
+#                                               instead of threaded OpenBLAS
+#   ENABLE_VALIDATION   ON|OFF  (default OFF) - residual check after each
+#                                               factorisation
+#
+# Examples:
+#   ./compile.sh
+#   ENABLE_MKL=ON ./compile.sh
+#   ENABLE_VALIDATION=ON ./compile.sh
+################################################################################
+set -e # Exit immediately if a command exits with a non-zero status.
+
+################################################################################
+# CMake project options (env-var overridable; defaults match CMakeLists.txt)
+################################################################################
+: "${ENABLE_MKL:=OFF}"
+: "${ENABLE_VALIDATION:=OFF}"
+
+for var in ENABLE_MKL ENABLE_VALIDATION; do
+  case "${!var}" in
+  ON | OFF) ;;
+  *)
+    echo "Error: $var must be ON or OFF (got '${!var}')." >&2
+    exit 1
+    ;;
+  esac
+done
+
+################################################################################
+# Toolchain selection (gcc only)
+################################################################################
+select_toolchain() {
+  module load gcc/14.2.0
+  export CC=gcc
+  export CXX=g++
+}
+
+################################################################################
+# Configurations
+#
+# The reference benchmark uses *threaded* OpenBLAS / MKL — that is the whole
+# point of this directory. The OpenMP and HPX builds, by contrast, pin the
+# BLAS to its sequential variant because they parallelise at the tile level.
+################################################################################
+if command -v spack &>/dev/null; then
+  echo "Spack command found. Loading libraries (gcc)"
+  # Get current hostname
+  HOSTNAME=$(hostname -s)
+
+  if [[ "$HOSTNAME" == "ipvs-epyc1" ]]; then
+    # Compiler
+    select_toolchain
+    if [[ "$ENABLE_MKL" == "OFF" ]]; then
+      # OpenBLAS built with OpenMP threading
+      spack load openblas@0.3.28%gcc@14.2.0 threads=openmp
+    fi
+
+  elif [[ "$HOSTNAME" == "nasrin0" || "$HOSTNAME" == "nasrin1" ]]; then
+    # Compiler
+    select_toolchain
+    if [[ "$ENABLE_MKL" == "OFF" ]]; then
+      # OpenBLAS built with OpenMP threading
+      spack load openblas@0.3.28%gcc@14.2.0 arch=linux-almalinux9-zen3 threads=openmp
+    fi
+
+  else
+    echo "Hostname is $HOSTNAME — no action taken."
+  fi
+else
+  echo "Spack command not found. Continuing with the compiler and BLAS from the current environment."
+fi
+
+################################################################################
+# Compile code
+################################################################################
+rm -rf build && mkdir build && cd build
+
+echo "CMake options:"
+echo "  ENABLE_MKL          = $ENABLE_MKL"
+echo "  ENABLE_VALIDATION   = $ENABLE_VALIDATION"
+
+cmake -DCMAKE_BUILD_TYPE=Release \
+  -DENABLE_MKL="$ENABLE_MKL" \
+  -DENABLE_VALIDATION="$ENABLE_VALIDATION" \
+  ..
+make -j VERBOSE=1
+cd ..
+
+# Launch Example
+# OMP_NUM_THREADS=128 OMP_PROC_BIND=close OMP_PLACES=cores ./build/cholesky_reference --size_start 65536 --size_stop 65536 --loop 20
diff --git a/reference/core/CMakeLists.txt b/reference/core/CMakeLists.txt
new file mode 100644
index 0000000..0453e4f
--- /dev/null
+++ b/reference/core/CMakeLists.txt
@@ -0,0 +1,62 @@
+set(SOURCE_FILES src/matrix_generation.cpp src/functions.cpp
+                 src/cholesky_factor.cpp src/adapter_cblas_fp64.cpp)
+
+if(ENABLE_VALIDATION)
+  list(APPEND SOURCE_FILES src/validate.cpp)
+endif()
+
+add_library(cholesky_core STATIC ${SOURCE_FILES})
+
+set_property(TARGET cholesky_core PROPERTY EXPORT_NAME core)
+add_library(Cholesky::core ALIAS cholesky_core)
+
+# Add them as PRIVATE sources here so they show up in project files Can't use
+# PUBLIC etc., see: https://stackoverflow.com/a/62465051
+file(GLOB_RECURSE header_files CONFIGURE_DEPENDS include/*.hpp)
+target_sources(cholesky_core PRIVATE ${header_files})
+
+# Link OpenMP libraries (used by the parallel matrix generator)
+target_link_libraries(cholesky_core PUBLIC OpenMP::OpenMP_CXX)
+
+# Include directories
+target_include_directories(
+  cholesky_core PUBLIC "$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/core/include>")
+
+# Link BLAS
+if(ENABLE_MKL)
+  # Link threaded Intel oneMKL
+  target_link_libraries(
+    cholesky_core PUBLIC MKL::mkl_intel_lp64 MKL::mkl_core MKL::MKL
+                         MKL::mkl_intel_thread)
+else()
+  # Link threaded OpenBLAS (the library name is the same; threading is
+  # determined by the OpenBLAS build that compile.sh's Spack env selects).
+  target_link_libraries(cholesky_core PUBLIC ${OpenBLAS_LIB})
+  target_include_directories(cholesky_core PUBLIC ${OpenBLAS_INCLUDE_DIR})
+endif()
+
+if(ENABLE_MKL)
+  target_compile_definitions(cholesky_core PUBLIC ENABLE_MKL)
+endif()
+
+target_compile_features(cholesky_core PUBLIC cxx_std_17)
+
+set_property(TARGET cholesky_core PROPERTY POSITION_INDEPENDENT_CODE ON)
+
+if(NOT CMAKE_SKIP_INSTALL_RULES)
+  # We need to manually install those into CMAKE_INSTALL_INCLUDEDIR. Below
+  # install(TARGETS ...) only setups the paths for the exported targets.
+  install(
+    DIRECTORY include/
+    DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
+    COMPONENT Development)
+
+  install(
+    TARGETS cholesky_core
+    EXPORT CholeskyTargets
+    RUNTIME COMPONENT Runtime
+    LIBRARY COMPONENT Runtime NAMELINK_COMPONENT Development
+    ARCHIVE COMPONENT Development
+    INCLUDES
+    DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
+endif()
diff --git a/reference/core/include/adapter_cblas_fp64.hpp b/reference/core/include/adapter_cblas_fp64.hpp
new file mode 100644
index 0000000..139945c
--- /dev/null
+++ b/reference/core/include/adapter_cblas_fp64.hpp
@@ -0,0 +1,23 @@
+#ifndef CPU_ADAPTER_CBLAS_FP64_H
+#define CPU_ADAPTER_CBLAS_FP64_H
+
+#pragma once
+
+#include <vector>
+
+using vector = std::vector<double>;
+
+// LAPACK level 3 operations
+
+/**
+ * @brief FP64 in-place Cholesky factorisation of A with one LAPACKE_dpotrf2
+ *        call on the whole matrix (no tiling), using the lower triangle
+ *        ('L'). Reference implementation that the OpenMP and HPX tiled
+ *        variants are compared against.
+ *
+ * @param A row-major N*N matrix; its lower triangle is overwritten with L
+ * @param N matrix dimension (also used as the leading dimension)
+ */
+void potrf(vector &A, const int N);
+
+#endif  // end of CPU_ADAPTER_CBLAS_FP64_H
diff --git a/reference/core/include/cholesky_factor.hpp b/reference/core/include/cholesky_factor.hpp
new file mode 100644
index 0000000..f54237d
--- /dev/null
+++ b/reference/core/include/cholesky_factor.hpp
@@ -0,0 +1,20 @@
+#ifndef CPU_CHOLESKY_FACTOR_H
+#define CPU_CHOLESKY_FACTOR_H
+
+#pragma once
+
+#include <vector>
+
+namespace cpu
+{
+
+/**
+ * @brief Run a single, threaded LAPACKE_dpotrf on the full N x N row-major
+ *        matrix @p A. This is the reference (non-tiled) parallel BLAS
+ *        Cholesky factorisation that the OpenMP / HPX tiled variants are
+ *        benchmarked against.
+ */
+void parallel_blas_cholesky(std::vector<double> &A, int N);
+
+}  // end of namespace cpu
+#endif  // end of CPU_CHOLESKY_FACTOR_H
diff --git a/reference/core/include/functions.hpp b/reference/core/include/functions.hpp
new file mode 100644
index 0000000..63614d8
--- /dev/null
+++ b/reference/core/include/functions.hpp
@@ -0,0 +1,23 @@
+#ifndef CPU_FUNCTIONS_H
+#define CPU_FUNCTIONS_H
+
+#pragma once
+
+#include <cstddef>
+#include <vector>
+
+namespace cpu
+{
+
+/**
+ * @brief Time a single threaded LAPACKE_dpotrf call on the @p A buffer
+ *        (row-major, N x N). The buffer is factorised in place.
+ *
+ * @param A row-major matrix; on return contains the lower-triangular factor L
+ * @param N matrix dimension
+ * @return wall-clock elapsed time in seconds
+ */
+double cholesky(std::vector<double> &A, std::size_t N);
+
+}  // namespace cpu
+#endif  // end of CPU_FUNCTIONS_H
diff --git a/reference/core/include/matrix_generation.hpp b/reference/core/include/matrix_generation.hpp
new file mode 100644
index 0000000..22a3206
--- /dev/null
+++ b/reference/core/include/matrix_generation.hpp
@@ -0,0 +1,26 @@
+#ifndef MATRIX_GENERATION_H
+#define MATRIX_GENERATION_H
+
+#pragma once
+
+#include <cstddef>
+#include <vector>
+
+/**
+ * @brief Generate a deterministic, dense, row-major SPD matrix of size N x N.
+ *
+ * Entries are uniform on [0, 1) using a per-row seed; the diagonal is shifted
+ * by +N to guarantee strict diagonal dominance and therefore symmetric
+ * positive definiteness. The result is stored as a single contiguous
+ * std::vector<double> of length N*N in row-major order, ready to be passed to
+ * LAPACKE_dpotrf.
+ *
+ * Generation is parallelised with OpenMP across rows so it does not dominate
+ * the timed factorisation phase.
+ *
+ * @param N matrix dimension
+ * @return owning row-major buffer of length N*N
+ */
+std::vector<double> gen_matrix(std::size_t N);
+
+#endif
diff --git a/reference/core/include/validate.hpp b/reference/core/include/validate.hpp
new file mode 100644
index 0000000..6cf829c
--- /dev/null
+++ b/reference/core/include/validate.hpp
@@ -0,0 +1,28 @@
+#ifndef CPU_VALIDATE_H
+#define CPU_VALIDATE_H
+
+#pragma once
+
+#include <cstddef>
+#include <vector>
+
+namespace cpu
+{
+
+/**
+ * @brief Compute the relative Cholesky residual ||A - L * L^T||_F / ||A||_F
+ *        for the dense, row-major reference factorisation.
+ *
+ * The original A is regenerated with gen_matrix(N), so the caller need not
+ * keep a copy; the check itself allocates three further N x N buffers.
+ *
+ * @param N matrix dimension (must match the factorisation)
+ * @param L row-major buffer of length N*N holding the factor returned by
+ *          LAPACKE_dpotrf2 with uplo='L' (the upper triangle is masked out)
+ * @return relative Frobenius residual, or 0 when ||A||_F is exactly 0
+ */
+double cholesky_residual(std::size_t N, const std::vector<double> &L);
+
+}  // namespace cpu
+
+#endif  // end of CPU_VALIDATE_H
diff --git a/reference/core/src/adapter_cblas_fp64.cpp b/reference/core/src/adapter_cblas_fp64.cpp
new file mode 100644
index 0000000..566290f
--- /dev/null
+++ b/reference/core/src/adapter_cblas_fp64.cpp
@@ -0,0 +1,19 @@
+#include "adapter_cblas_fp64.hpp"
+
+#ifdef ENABLE_MKL
+// MKL CBLAS / LAPACKE
+#include "mkl_cblas.h"
+#include "mkl_lapacke.h"
+#else
+#include "cblas.h"
+#include "lapacke.h"
+#endif
+
+void potrf(vector &A, const int N)
+{
+    // One LAPACKE call on the full matrix (not "single-threaded"). dpotrf2 is
+    // the recursive variant; per the original author it matches the kernel the
+    // OpenMP / HPX variants run on their diagonal tiles. NOTE(review): the
+    // info code returned by LAPACKE_dpotrf2 is discarded here.
+    LAPACKE_dpotrf2(LAPACK_ROW_MAJOR, 'L', N, A.data(), N);
+}
diff --git a/reference/core/src/cholesky_factor.cpp b/reference/core/src/cholesky_factor.cpp
new file mode 100644
index 0000000..4fd93fb
--- /dev/null
+++ b/reference/core/src/cholesky_factor.cpp
@@ -0,0 +1,15 @@
+#include "cholesky_factor.hpp"
+
+#include "adapter_cblas_fp64.hpp"
+
+namespace cpu
+{
+
+void parallel_blas_cholesky(std::vector<double> &A, int N)
+{
+    // The whole factorisation is one threaded LAPACKE call; the BLAS library
+    // takes care of dispatching work across the available threads.
+    potrf(A, N);
+}
+
+}  // end of namespace cpu
diff --git a/reference/core/src/functions.cpp b/reference/core/src/functions.cpp
new file mode 100644
index 0000000..abe4cb0
--- /dev/null
+++ b/reference/core/src/functions.cpp
@@ -0,0 +1,20 @@
+#include "functions.hpp"
+
+#include "cholesky_factor.hpp"
+#include <chrono>
+
+namespace cpu
+{
+
+double cholesky(std::vector<double> &A, std::size_t N)
+{
+    // steady_clock is monotonic, so the interval cannot be skewed by wall-clock adjustments.
+    const auto start = std::chrono::steady_clock::now();
+    // Launch Cholesky decomposition A = L * L^T: one LAPACKE call on the full matrix.
+    parallel_blas_cholesky(A, static_cast<int>(N));
+    const auto stop = std::chrono::steady_clock::now();
+    // duration<double> is in seconds whatever the clock's native tick period is.
+    return std::chrono::duration<double>(stop - start).count();
+}
+
+}  // end of namespace cpu
diff --git a/reference/core/src/matrix_generation.cpp b/reference/core/src/matrix_generation.cpp
new file mode 100644
index 0000000..b0db740
--- /dev/null
+++ b/reference/core/src/matrix_generation.cpp
@@ -0,0 +1,32 @@
+#include "matrix_generation.hpp"
+
+#include <random>
+#include <vector>
+
+std::vector<double> gen_matrix(std::size_t N)
+{
+    // Dense N x N storage, row-major.
+    std::vector<double> A(N * N);
+
+    // Rows are filled concurrently. Every row owns an engine seeded from its
+    // index, which makes the result independent of the thread count. Only the
+    // lower triangle (j <= i) is drawn; each draw is also written to the
+    // transposed position so that A stays symmetric. Adding N to the diagonal
+    // afterwards makes A strictly diagonally dominant and hence SPD; this
+    // corresponds to the +N*n_tiles shift of the tiled variants at n_tiles == 1.
+#pragma omp parallel for schedule(static)
+    for (std::size_t row = 0; row < N; ++row)
+    {
+        std::uniform_real_distribution<double> uniform(0.0, 1.0);
+        std::mt19937 engine(static_cast<std::mt19937::result_type>(row + 1));
+        double *const row_ptr = A.data() + row * N;
+        for (std::size_t col = 0; col <= row; ++col)
+        {
+            const double sample = uniform(engine);
+            row_ptr[col] = sample;
+            A[col * N + row] = sample;
+        }
+        row_ptr[row] += static_cast<double>(N);
+    }
+    return A;
+}
diff --git a/reference/core/src/validate.cpp b/reference/core/src/validate.cpp
new file mode 100644
index 0000000..5a43cc8
--- /dev/null
+++ b/reference/core/src/validate.cpp
@@ -0,0 +1,72 @@
+#include "validate.hpp"
+
+#include "matrix_generation.hpp"
+
+#ifdef ENABLE_MKL
+#include "mkl_cblas.h"
+#else
+#include "cblas.h"
+#endif
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <vector>
+
+namespace cpu
+{
+
+double cholesky_residual(std::size_t N, const std::vector<double> &L)
+{
+    // Build a working copy of L with its strictly upper triangle zeroed out.
+    // dpotrf with uplo='L' leaves the upper triangle untouched (it still
+    // contains the original A values), so we must mask it before forming
+    // L * L^T with a plain dgemm.
+    std::vector<double> Lwork(L);
+    for (std::size_t i = 0; i < N; ++i)
+    {
+        for (std::size_t j = i + 1; j < N; ++j)
+        {
+            Lwork[i * N + j] = 0.0;
+        }
+    }
+
+    // Compute LLt = L * L^T (full N x N) with a single dgemm.
+    std::vector<double> LLt(N * N, 0.0);
+    cblas_dgemm(
+        CblasRowMajor,
+        CblasNoTrans,
+        CblasTrans,
+        static_cast<int>(N),
+        static_cast<int>(N),
+        static_cast<int>(N),
+        1.0,
+        Lwork.data(),
+        static_cast<int>(N),
+        Lwork.data(),
+        static_cast<int>(N),
+        0.0,
+        LLt.data(),
+        static_cast<int>(N));
+
+    // Regenerate the original A deterministically and accumulate Frobenius
+    // norms of (A - LLt) and A.
+    const std::vector<double> A = gen_matrix(N);
+
+    double r_norm_sq = 0.0;
+    double a_norm_sq = 0.0;
+    for (std::size_t idx = 0; idx < A.size(); ++idx)
+    {
+        const double d = A[idx] - LLt[idx];
+        r_norm_sq += d * d;
+        a_norm_sq += A[idx] * A[idx];
+    }
+
+    if (a_norm_sq == 0.0)
+    {
+        return 0.0;
+    }
+    return std::sqrt(r_norm_sq / a_norm_sq);
+}
+
+}  // namespace cpu
diff --git a/reference/main.cpp b/reference/main.cpp
new file mode 100644
index 0000000..0d58b8f
--- /dev/null
+++ b/reference/main.cpp
@@ -0,0 +1,119 @@
+#include "functions.hpp"
+#include "matrix_generation.hpp"
+#ifdef ENABLE_VALIDATION
+#include "validate.hpp"
+#endif
+#include <cstddef>
+#include <fstream>
+#include <iostream>
+#include <omp.h>
+#include <string>
+#include <vector>
+
+int main(int argc, char *argv[])
+{
+    ///////////////////////////////////////////////////////////////////////////
+    // cmdline arguments
+    //
+    // The reference benchmark makes one threaded LAPACKE_dpotrf2 call on the
+    // full matrix, so there is no tiling axis. We still accept --tiles_start
+    // / --tiles_stop for CLI compatibility with the openmp/ and hpx/ binaries
+    // (they are silently ignored), which keeps any shared driver script
+    // unchanged.
+    std::size_t loop = 1;
+    std::size_t size_start = 32, size_stop = 128;
+
+    for (int i = 1; i < argc; ++i)
+    {
+        std::string arg = argv[i];
+        if (arg == "--loop" && i + 1 < argc)
+        {
+            loop = std::stoul(argv[++i]);
+        }
+        else if (arg == "--size_start" && i + 1 < argc)
+        {
+            size_start = std::stoul(argv[++i]);
+        }
+        else if (arg == "--size_stop" && i + 1 < argc)
+        {
+            size_stop = std::stoul(argv[++i]);
+        }
+        else if ((arg == "--tiles_start" || arg == "--tiles_stop") && i + 1 < argc)
+        {
+            // Accept-and-ignore for CLI parity with the tiled variants.
+            ++i;
+        }
+    }
+    ///////////////////////////////////////////////////////////////////////////
+    // configuration
+    const std::size_t LOOP = loop;
+    // Sizes sweep geometrically (x STEP_SIZE); the loop below rejects size 0, which would never advance.
+    const std::size_t START_SIZE = size_start;
+    const std::size_t STOP_SIZE = size_stop;
+    const std::size_t STEP_SIZE = 2;
+
+    // print and write results
+    bool HEADER_FLAG = true;
+    std::string runtime_file_path = "runtimes_reference_cholesky_";
+    if (START_SIZE != STOP_SIZE)
+    {
+        runtime_file_path += std::string("size_");
+    }
+    runtime_file_path += std::to_string(LOOP) + std::string(".txt");
+    std::ofstream runtime_file;
+    runtime_file.open(runtime_file_path, std::ios_base::app);
+
+    for (std::size_t size = START_SIZE; size != 0 && size <= STOP_SIZE; size = size * STEP_SIZE)
+    {
+        for (std::size_t l = 0; l < LOOP; l++)
+        {
+            // header for output file -- columns mirror the openmp/hpx output so
+            // results from all three benchmarks can be merged on (problem_size).
+            // The reference has no tiling, so tile_size == problem_size and
+            // n_tiles == 1.
+            std::string header = "threads;problem_size;tile_size;n_tiles";
+            std::string values = std::to_string(omp_get_max_threads());
+            values += std::string(";") + std::to_string(size);
+            values += std::string(";") + std::to_string(size);
+            values += std::string(";") + std::to_string(1);
+            ///////////////////////////////////////////////////////////////////
+            // Single mode: parallel-BLAS reference dpotrf on the full matrix.
+            std::vector<std::string> modes = { "reference" };
+
+            for (const auto &mode : modes)
+            {
+                auto A = gen_matrix(size);
+                auto cholesky_cpu = cpu::cholesky(A, size);
+
+                header += ";" + mode;
+                values += ";" + std::to_string(cholesky_cpu);
+
+#ifdef ENABLE_VALIDATION
+                // Validate by computing relative residual ||A - L L^T||_F / ||A||_F
+                constexpr double residual_tol = 1e-10;
+                const double residual = cpu::cholesky_residual(size, A);
+                std::cout << "[validate] mode=" << mode << " size=" << size << " residual=" << residual << std::endl;
+                if (!(residual <= residual_tol))  // catches NaN too
+                {
+                    std::cerr << "Validation warning: variant '" << mode << "' residual " << residual
+                              << " exceeds tolerance " << residual_tol << " (size=" << size << ")" << std::endl;
+                }
+#endif
+            }
+            ///////////////////////////////////////////////////////////////////
+            // print/write header only once
+            if (HEADER_FLAG)
+            {
+                HEADER_FLAG = false;
+                std::cout << header << std::endl;
+                runtime_file << header << std::endl;
+            }
+            // print/write runtimes
+            std::cout << values << std::endl;
+            runtime_file << values << std::endl;
+        }
+    }
+
+    runtime_file.close();
+    return 0;
+}
diff --git a/reference/run.sh b/reference/run.sh
new file mode 100755
index 0000000..0600513
--- /dev/null
+++ b/reference/run.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+#SBATCH --job-name=cholesky_reference
+#SBATCH --output=logs/cholesky_reference_%j.out
+#SBATCH --error=logs/cholesky_reference_%j.err
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=128
+#SBATCH --time=144:00:00
+#SBATCH --exclusive
+#
+# Usage: run.sh
+#
+# Submit example:
+#   sbatch run.sh
+#
+# Runs the parallel-BLAS reference benchmark — one multi-threaded
+# LAPACKE_dpotrf call on the full matrix — as a baseline for the OpenMP and
+# HPX tiled implementations. GCC only.
+
+set -e # Exit immediately if a command exits with a non-zero status.
+
+################################################################################
+# Toolchain runtime selection (gcc only)
+################################################################################
+module load gcc/14.2.0
+
+# Base directory is the current working directory, so submit from the directory containing build/
+SCRIPT_DIR="$(pwd)"
+
+# OpenMP settings — the threaded BLAS picks these up to spread dpotrf across
+# all the cores. Both OpenBLAS (threads=openmp) and threaded MKL respect the
+# standard OMP_* environment.
+export OMP_NUM_THREADS=128
+export OMP_PROC_BIND=close
+export OMP_PLACES=cores
+
+# Give threaded MKL the same thread count as OpenMP (unless MKL_NUM_THREADS is
+# already set) if ENABLE_MKL=ON was used at build time. Harmless with OpenBLAS.
+export MKL_NUM_THREADS=${MKL_NUM_THREADS:-$OMP_NUM_THREADS}
+
+echo "Running with gcc runtime"
+
+# Run executable
+srun --cpu-bind=cores "$SCRIPT_DIR/build/cholesky_reference" \
+  --loop 20 \
+  --size_start 65536 \
+  --size_stop 65536

From bd93795b50a01d710d815dbf6aeef943d62d9f6b Mon Sep 17 00:00:00 2001
From: constracktor <74077030+constracktor@users.noreply.github.com>
Date: Wed, 29 Apr 2026 12:59:49 +0200
Subject: [PATCH 02/13] Adjust OpenBLAS

---
 reference/compile.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/reference/compile.sh b/reference/compile.sh
index f896d8c..d0d2727 100755
--- a/reference/compile.sh
+++ b/reference/compile.sh
@@ -56,12 +56,12 @@ if command -v spack &>/dev/null; then
   # Get current hostname
   HOSTNAME=$(hostname -s)
 
-  if [[ "$HOSTNAME" == "ipvs-epyc1" ]]; then
+  if [[ "$HOSTNAME" == "ipvs-epyc1" || "$HOSTNAME" == "ipvs-epyc2" ]]; then
     # Compiler
     select_toolchain
     if [[ "$ENABLE_MKL" == "OFF" ]]; then
       # OpenBLAS built with OpenMP threading
-      spack load openblas@0.3.28%gcc@14.2.0 threads=openmp
+      spack load openblas@0.3.28%gcc@14.2.0 threads=openmp ilp64=true
     fi
 
   elif [[ "$HOSTNAME" == "nasrin0" || "$HOSTNAME" == "nasrin1" ]]; then

From 8de7ed7e7dae6cec90e721d9516bd5c922eec85a Mon Sep 17 00:00:00 2001
From: constracktor <74077030+constracktor@users.noreply.github.com>
Date: Wed, 29 Apr 2026 13:01:33 +0200
Subject: [PATCH 03/13] Adjust README

---
 README.md | 101 +++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 78 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index 7f06a60..0287f4b 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Cholesky-Bench
 
-Cholesky-Bench benchmarks right-looking tiled Cholesky factorization from fork-join to asynchronous tasks across several parallelism models, currently comparing OpenMP and HPX implementations side by side.
+Cholesky-Bench benchmarks right-looking tiled Cholesky factorization from fork-join to asynchronous tasks across several parallelism models, currently comparing OpenMP and HPX implementations side by side. A non-tiled parallel-BLAS reference is also included as a baseline.
 
 ## Variants
 
@@ -24,27 +24,40 @@ Cholesky-Bench benchmarks right-looking tiled Cholesky factorization from fork-j
 | `loop_two` | Collapsed fork-join with dynamic schedule for trailing-update |
 | `async_void` |  Fully asynchronous tasking with dataflow using `hpx::shared_future<void>` |
 
+### Reference (`reference/`)
+
+| Mode | Description |
+|------|-------------|
+| `reference` | Single threaded `LAPACKE_dpotrf` call on the full matrix; no tiling. Parallelism is delegated entirely to a threaded BLAS (OpenBLAS built with `threads=openmp`, or threaded Intel oneMKL via `ENABLE_MKL=ON`). |
+| `plasma` | Single `plasma_dpotrf` call on the full matrix. PLASMA does its own tiled, OpenMP-task-based parallel Cholesky internally; tile size is left at PLASMA's built-in default. Built only when `ENABLE_PLASMA=ON`. |
+
+This directory is the natural baseline for the OpenMP and HPX tiled implementations: the `reference` mode isolates the contribution of vendor-provided dense-LA parallelism, and the `plasma` mode adds a true tiled-parallel competitor that uses the same OpenMP runtime as the in-house variants.
+
 ## Dependencies
 
-Both implementations share the same sequential BLAS backend and are built with CMake (≥ 3.23) and C++20.
+All three implementations are built with CMake (≥ 3.23) and C++20. The OpenMP and HPX directories link against a *sequential* BLAS (parallelism is at the tile level); the `reference/` directory links against a *threaded* BLAS instead.
 
-| Dependency | OpenMP | HPX |
-|---|---|---|
-| OpenBLAS 0.3.28 | ✓ (default) | ✓ (default) |
-| Intel oneMKL | optional (`ENABLE_MKL=ON`) | optional (`ENABLE_MKL=ON`) |
-| HPX 1.11.0 + jemalloc | — | ✓ |
-| GCC 14.2.0 | ✓ | ✓ |
-| LLVM/Clang 22.1.2 | optional | — |
+| Dependency | OpenMP | HPX | Reference |
+|---|---|---|---|
+| OpenBLAS 0.3.28 (sequential) | ✓ (default) | ✓ (default) | — |
+| OpenBLAS 0.3.28 (`threads=openmp`) | — | — | ✓ (default) |
+| Intel oneMKL (sequential) | optional (`ENABLE_MKL=ON`) | optional (`ENABLE_MKL=ON`) | — |
+| Intel oneMKL (`intel_thread`) | — | — | optional (`ENABLE_MKL=ON`) |
+| PLASMA | — | — | optional (`ENABLE_PLASMA=ON`) |
+| HPX 1.11.0 + jemalloc | — | ✓ | — |
+| GCC 14.2.0 | ✓ | ✓ | ✓ |
+| LLVM/Clang 22.1.2 | optional | — | — |
 
 Dependencies are managed via [Spack](https://spack.io/). The compile scripts auto-detect the host system and load the correct Spack environment.
 
 ## Build
 
-From within the `openmp/` or `hpx/` directory, run:
+From within the `openmp/`, `hpx/`, or `reference/` directory, run:
 
 ```bash
-./compile.sh [gcc|llvm]   # OpenMP: gcc (default) or llvm
-./compile.sh              # HPX: always gcc
+./compile.sh [gcc|llvm]   # OpenMP:    gcc (default) or llvm
+./compile.sh              # HPX:       always gcc
+./compile.sh              # Reference: always gcc
 ```
 
 The script clears and recreates the `build/` directory, then runs CMake in Release mode followed by a parallel make.
@@ -55,10 +68,11 @@ These can be set as environment variables before calling `compile.sh`:
 
 | Option | Default | Description |
 |--------|---------|-------------|
-| `ENABLE_VALIDATION` | `OFF` | After each factorization, compute the relative residual ‖A − LL^T‖_F / ‖A‖_F and warn if it exceeds 1e-10. Mutually exclusive with `DISABLE_COMPUTATION`. |
-| `DISABLE_COMPUTATION` | `OFF` | Replace all BLAS/tile-generation calls with no-ops. The task graph and loops remain intact, so scheduling overhead can be measured in isolation. |
-| `ENABLE_DYNAMIC_SCHEDULE` | `OFF` | *(OpenMP only)* Use `schedule(dynamic,1)` on the trailing-update worksharing loops in `for_collapse`. Requires the LLVM toolchain; rejected at compile time with GCC. |
-| `ENABLE_MKL` | `OFF` | Link against Intel oneMKL instead of OpenBLAS. |
+| `ENABLE_VALIDATION` | `OFF` | After each factorization, compute the relative residual ‖A − LL^T‖_F / ‖A‖_F and warn if it exceeds 1e-10. In `openmp/` and `hpx/`, mutually exclusive with `DISABLE_COMPUTATION`. |
+| `DISABLE_COMPUTATION` | `OFF` | *(`openmp/` and `hpx/` only)* Replace all BLAS/tile-generation calls with no-ops. The task graph and loops remain intact, so scheduling overhead can be measured in isolation. |
+| `ENABLE_DYNAMIC_SCHEDULE` | `OFF` | *(`openmp/` only)* Use `schedule(dynamic,1)` on the trailing-update worksharing loops in `for_collapse`. Requires the LLVM toolchain; rejected at compile time with GCC. |
+| `ENABLE_MKL` | `OFF` | Link against Intel oneMKL instead of OpenBLAS. In `openmp/` and `hpx/` this is the *sequential* MKL; in `reference/` it is the *threaded* MKL. |
+| `ENABLE_PLASMA` | `OFF` | *(`reference/` only)* Also build the PLASMA tiled-Cholesky variant. Adds a `plasma` column alongside `reference` in the runtime output. |
 
 **Examples:**
 
@@ -71,6 +85,12 @@ ENABLE_DYNAMIC_SCHEDULE=ON ./compile.sh llvm
 
 # HPX: measure pure scheduling overhead
 DISABLE_COMPUTATION=ON ./compile.sh
+
+# Reference: threaded MKL baseline
+ENABLE_MKL=ON ./compile.sh
+
+# Reference: also build the PLASMA tiled-Cholesky variant
+ENABLE_PLASMA=ON ./compile.sh
 ```
 
 ## Run
@@ -89,16 +109,22 @@ OMP_NUM_THREADS=128 OMP_PROC_BIND=close OMP_PLACES=cores \
   --hpx:threads=128 \
   --loop=1 --size_start=1024 --size_stop=65536 \
   --tiles_start=64 --tiles_stop=64
+
+# Reference (parallel BLAS, no tiling)
+OMP_NUM_THREADS=128 OMP_PROC_BIND=close OMP_PLACES=cores \
+  ./build/cholesky_reference \
+  --loop 1 --size_start 1024 --size_stop 65536
 ```
 
 ### Via SLURM
 
-Both directories contain a `run.sh` that is a ready-to-submit SLURM batch script (128 CPUs, exclusive node, 144-hour wall time):
+All three directories contain a `run.sh` that is a ready-to-submit SLURM batch script (128 CPUs, exclusive node, 144-hour wall time):
 
 ```bash
-sbatch openmp/run.sh          # gcc runtime (default)
-sbatch openmp/run.sh llvm     # llvm runtime
+sbatch openmp/run.sh             # gcc runtime (default)
+sbatch openmp/run.sh llvm        # llvm runtime
 sbatch hpx/run.sh
+sbatch reference/run.sh
 ```
 
 ### Command-line arguments
@@ -107,7 +133,7 @@ sbatch hpx/run.sh
 |----------|---------|-------------|
 | `--loop` / `--loop=` | 1 | Number of timed repetitions per configuration |
 | `--size_start` / `--size_stop` | 32 / 128 | Problem size range (doubled each step) |
-| `--tiles_start` / `--tiles_stop` | 16 / 32 | Tile count range (doubled each step) |
+| `--tiles_start` / `--tiles_stop` | 16 / 32 | Tile count range (doubled each step). Accepted but ignored by the `reference/` binary, which has no tiling axis. |
 
 ## Output
 
@@ -116,6 +142,7 @@ Results are appended to a text file in the working directory:
 ```
 runtimes_openmp_cholesky_<suffix>.txt
 runtimes_hpx_cholesky_<suffix>.txt
+runtimes_reference_cholesky_<suffix>.txt
 ```
 
 The suffix encodes which dimension is swept: `tile_` if tiles vary, `size_` if size varies, followed by the loop count. The file uses `;`-separated columns:
@@ -125,6 +152,13 @@ threads;problem_size;tile_size;n_tiles;for_collapse;for_naive;task_naive;task_de
 128;65536;1024;64;3.14;3.21;2.98;2.87
 ```
 
+The `reference/` binary reports a `reference` column (and a `plasma` column when built with `ENABLE_PLASMA=ON`), with `tile_size = problem_size` and `n_tiles = 1`, so its runtime files merge cleanly with the tiled benchmarks on the `problem_size` key:
+
+```
+threads;problem_size;tile_size;n_tiles;reference;plasma
+128;65536;65536;1;2.71;2.45
+```
+
 The same lines are also printed to stdout.
 
 ## Repository structure
@@ -150,7 +184,26 @@ The same lines are also printed to stdout.
 │           ├── tile_generation.cpp
 │           ├── validate.cpp
 │           └── adapter_cblas_fp64.cpp
-└── hpx/
+├── hpx/
+│   ├── CMakeLists.txt
+│   ├── CMakePresets.json
+│   ├── compile.sh          # build script (gcc only)
+│   ├── run.sh              # SLURM job script
+│   ├── main.cpp
+│   └── core/
+│       ├── include/
+│       │   ├── cholesky_factor.hpp
+│       │   ├── functions.hpp
+│       │   ├── tile_generation.hpp
+│       │   ├── validate.hpp
+│       │   └── adapter_cblas_fp64.hpp
+│       └── src/
+│           ├── cholesky_factor.cpp
+│           ├── functions.cpp
+│           ├── tile_generation.cpp
+│           ├── validate.cpp
+│           └── adapter_cblas_fp64.cpp
+└── reference/
     ├── CMakeLists.txt
     ├── CMakePresets.json
     ├── compile.sh          # build script (gcc only)
@@ -160,13 +213,15 @@ The same lines are also printed to stdout.
         ├── include/
         │   ├── cholesky_factor.hpp
         │   ├── functions.hpp
-        │   ├── tile_generation.hpp
+        │   ├── matrix_generation.hpp
+        │   ├── plasma_factor.hpp     # only used when ENABLE_PLASMA=ON
         │   ├── validate.hpp
         │   └── adapter_cblas_fp64.hpp
         └── src/
             ├── cholesky_factor.cpp
             ├── functions.cpp
-            ├── tile_generation.cpp
+            ├── matrix_generation.cpp
+            ├── plasma_factor.cpp     # only built when ENABLE_PLASMA=ON
             ├── validate.cpp
             └── adapter_cblas_fp64.cpp
 ```

From d1edae32b93dadbfe319c0035179e633f003f1a2 Mon Sep 17 00:00:00 2001
From: constracktor <74077030+constracktor@users.noreply.github.com>
Date: Wed, 29 Apr 2026 13:50:42 +0200
Subject: [PATCH 04/13] Add PLASMA draft

---
 reference/CMakeLists.txt                   | 26 +++++++++++++++++
 reference/compile.sh                       | 15 +++++++++-
 reference/core/CMakeLists.txt              | 13 +++++++++
 reference/core/include/cholesky_factor.hpp | 34 ++++++++++++++++++----
 reference/core/include/functions.hpp       | 13 +++++----
 reference/core/include/plasma_factor.hpp   | 26 +++++++++++++++++
 reference/core/src/cholesky_factor.cpp     | 27 ++++++++++++++---
 reference/core/src/functions.cpp           |  7 +++--
 reference/core/src/plasma_factor.cpp       | 25 ++++++++++++++++
 reference/main.cpp                         | 27 +++++++++++++++--
 10 files changed, 193 insertions(+), 20 deletions(-)
 create mode 100644 reference/core/include/plasma_factor.hpp
 create mode 100644 reference/core/src/plasma_factor.cpp

diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt
index e63612f..084b4fb 100644
--- a/reference/CMakeLists.txt
+++ b/reference/CMakeLists.txt
@@ -7,6 +7,10 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 # What to build?
 option(BUILD_CORE "Build the core library" ON)
 option(ENABLE_MKL "Enable Intel oneMKL support (threaded)" OFF)
+option(
+  ENABLE_PLASMA
+  "Build the PLASMA tiled-Cholesky reference variant in addition to the LAPACKE_dpotrf one"
+  OFF)
 option(
   ENABLE_VALIDATION
   "Compute ||A - L*L^T||_F / ||A||_F after each factorization (off by default)"
@@ -76,6 +80,23 @@ if(BUILD_CORE)
   # the OpenMP runtime that threaded OpenBLAS / threaded MKL share.
   find_package(OpenMP REQUIRED)
 
+  if(ENABLE_PLASMA)
+    # PLASMA exposes its own tiled parallel Cholesky (plasma_dpotrf). Spack
+    # installs it as a single shared library plus a coreblas helper; the main
+    # library is required, while the coreblas helper is linked only if found.
+    find_path(PLASMA_INCLUDE_DIR plasma.h)
+    if(NOT PLASMA_INCLUDE_DIR)
+      message(FATAL_ERROR "ENABLE_PLASMA=ON but plasma.h was not found")
+    endif()
+    find_library(PLASMA_LIB NAMES plasma REQUIRED)
+    find_library(PLASMA_CORE_BLAS_LIB NAMES coreblas plasma_core_blas)
+    message(STATUS "PLASMA include dir: ${PLASMA_INCLUDE_DIR}")
+    message(STATUS "PLASMA library: ${PLASMA_LIB}")
+    if(PLASMA_CORE_BLAS_LIB)
+      message(STATUS "PLASMA coreblas library: ${PLASMA_CORE_BLAS_LIB}")
+    endif()
+  endif()
+
   add_subdirectory(core)
 
   # Add the executable
@@ -88,4 +109,9 @@ if(BUILD_CORE)
   if(ENABLE_VALIDATION)
     target_compile_definitions(cholesky_reference PRIVATE ENABLE_VALIDATION)
   endif()
+
+  if(ENABLE_PLASMA)
+    target_compile_definitions(cholesky_reference PRIVATE ENABLE_PLASMA)
+    target_include_directories(cholesky_reference PRIVATE ${PLASMA_INCLUDE_DIR})
+  endif()
 endif()
diff --git a/reference/compile.sh b/reference/compile.sh
index d0d2727..0fa48b7 100755
--- a/reference/compile.sh
+++ b/reference/compile.sh
@@ -9,12 +9,16 @@
 # (defaults match the project's CMakeLists.txt defaults):
 #   ENABLE_MKL          ON|OFF  (default OFF) - link threaded Intel oneMKL
 #                                               instead of threaded OpenBLAS
+#   ENABLE_PLASMA       ON|OFF  (default OFF) - also build the PLASMA tiled
+#                                               Cholesky variant (extra
+#                                               'plasma' column in the output)
 #   ENABLE_VALIDATION   ON|OFF  (default OFF) - residual check after each
 #                                               factorisation
 #
 # Examples:
 #   ./compile.sh
 #   ENABLE_MKL=ON ./compile.sh
+#   ENABLE_PLASMA=ON ./compile.sh
 #   ENABLE_VALIDATION=ON ./compile.sh
 ################################################################################
 set -e # Exit immediately if a command exits with a non-zero status.
@@ -23,9 +27,10 @@ set -e # Exit immediately if a command exits with a non-zero status.
 # CMake project options (env-var overridable; defaults match CMakeLists.txt)
 ################################################################################
 : "${ENABLE_MKL:=OFF}"
+: "${ENABLE_PLASMA:=OFF}"
 : "${ENABLE_VALIDATION:=OFF}"
 
-for var in ENABLE_MKL ENABLE_VALIDATION; do
+for var in ENABLE_MKL ENABLE_PLASMA ENABLE_VALIDATION; do
   case "${!var}" in
   ON | OFF) ;;
   *)
@@ -63,6 +68,9 @@ if command -v spack &>/dev/null; then
       # OpenBLAS built with OpenMP threading
       spack load openblas@0.3.28%gcc@14.2.0 threads=openmp ilp64=true
     fi
+    if [[ "$ENABLE_PLASMA" == "ON" ]]; then
+      spack load plasma%gcc@14.2.0
+    fi
 
   elif [[ "$HOSTNAME" == "nasrin0" || "$HOSTNAME" == "nasrin1" ]]; then
     # Compiler
@@ -71,6 +79,9 @@ if command -v spack &>/dev/null; then
       # OpenBLAS built with OpenMP threading
       spack load openblas@0.3.28%gcc@14.2.0 arch=linux-almalinux9-zen3 threads=openmp
     fi
+    if [[ "$ENABLE_PLASMA" == "ON" ]]; then
+      spack load plasma%gcc@14.2.0 arch=linux-almalinux9-zen3
+    fi
 
   else
     echo "Hostname is $HOSTNAME — no action taken."
@@ -86,10 +97,12 @@ rm -rf build && mkdir build && cd build
 
 echo "CMake options:"
 echo "  ENABLE_MKL          = $ENABLE_MKL"
+echo "  ENABLE_PLASMA       = $ENABLE_PLASMA"
 echo "  ENABLE_VALIDATION   = $ENABLE_VALIDATION"
 
 cmake -DCMAKE_BUILD_TYPE=Release \
   -DENABLE_MKL="$ENABLE_MKL" \
+  -DENABLE_PLASMA="$ENABLE_PLASMA" \
   -DENABLE_VALIDATION="$ENABLE_VALIDATION" \
   ..
 make -j VERBOSE=1
diff --git a/reference/core/CMakeLists.txt b/reference/core/CMakeLists.txt
index 0453e4f..f7b6535 100644
--- a/reference/core/CMakeLists.txt
+++ b/reference/core/CMakeLists.txt
@@ -5,6 +5,10 @@ if(ENABLE_VALIDATION)
   list(APPEND SOURCE_FILES src/validate.cpp)
 endif()
 
+if(ENABLE_PLASMA)
+  list(APPEND SOURCE_FILES src/plasma_factor.cpp)
+endif()
+
 add_library(cholesky_core STATIC ${SOURCE_FILES})
 
 set_property(TARGET cholesky_core PROPERTY EXPORT_NAME core)
@@ -39,6 +43,15 @@ if(ENABLE_MKL)
   target_compile_definitions(cholesky_core PUBLIC ENABLE_MKL)
 endif()
 
+if(ENABLE_PLASMA)
+  target_compile_definitions(cholesky_core PUBLIC ENABLE_PLASMA)
+  target_include_directories(cholesky_core PUBLIC ${PLASMA_INCLUDE_DIR})
+  target_link_libraries(cholesky_core PUBLIC ${PLASMA_LIB})
+  if(PLASMA_CORE_BLAS_LIB)
+    target_link_libraries(cholesky_core PUBLIC ${PLASMA_CORE_BLAS_LIB})
+  endif()
+endif()
+
 target_compile_features(cholesky_core PUBLIC cxx_std_17)
 
 set_property(TARGET cholesky_core PROPERTY POSITION_INDEPENDENT_CODE ON)
diff --git a/reference/core/include/cholesky_factor.hpp b/reference/core/include/cholesky_factor.hpp
index f54237d..8ec3c4a 100644
--- a/reference/core/include/cholesky_factor.hpp
+++ b/reference/core/include/cholesky_factor.hpp
@@ -3,18 +3,42 @@
 
 #pragma once
 
+#include <stdexcept>
+#include <string>
 #include <vector>
 
 namespace cpu
 {
 
 /**
- * @brief Run a single, threaded LAPACKE_dpotrf on the full N x N row-major
- *        matrix @p A. This is the reference (non-tiled) parallel BLAS
- *        Cholesky factorisation that the OpenMP / HPX tiled variants are
- *        benchmarked against.
+ * @brief Reference Cholesky variants.
+ *
+ *   - reference : single threaded LAPACKE_dpotrf2 call (no tiling; parallelism
+ *                 lives entirely inside the threaded BLAS).
+ *   - plasma    : single plasma_dpotrf call (PLASMA's own tiled parallel
+ *                 Cholesky over the OpenMP runtime).
  */
-void parallel_blas_cholesky(std::vector<double> &A, int N);
+enum class Variant { reference, plasma };
+
+inline Variant to_variant(const std::string &s)
+{
+    if (s == "reference")
+    {
+        return Variant::reference;
+    }
+    if (s == "plasma")
+    {
+        return Variant::plasma;
+    }
+    throw std::invalid_argument("Unknown Variant: " + s);
+}
+
+/**
+ * @brief Run the requested reference variant on the full row-major N x N
+ *        matrix @p A. Factorisation is in place; @p A holds the lower
+ *        triangular factor L on return.
+ */
+void parallel_blas_cholesky(Variant variant, std::vector<double> &A, int N);
 
 }  // end of namespace cpu
 #endif  // end of CPU_CHOLESKY_FACTOR_H
diff --git a/reference/core/include/functions.hpp b/reference/core/include/functions.hpp
index 63614d8..0740b4d 100644
--- a/reference/core/include/functions.hpp
+++ b/reference/core/include/functions.hpp
@@ -4,20 +4,23 @@
 #pragma once
 
 #include <cstddef>
+#include <string>
 #include <vector>
 
 namespace cpu
 {
 
 /**
- * @brief Time a single threaded LAPACKE_dpotrf call on the @p A buffer
- *        (row-major, N x N). The buffer is factorised in place.
+ * @brief Time a single call to the requested reference variant
+ *        ('reference' or 'plasma') on the @p A buffer (row-major, N x N).
+ *        The buffer is factorised in place.
  *
- * @param A row-major matrix; on return contains the lower-triangular factor L
- * @param N matrix dimension
+ * @param A       row-major matrix; on return contains the lower-triangular factor L
+ * @param N       matrix dimension
+ * @param variant which reference path to time
  * @return wall-clock elapsed time in seconds
  */
-double cholesky(std::vector<double> &A, std::size_t N);
+double cholesky(std::vector<double> &A, std::size_t N, const std::string &variant);
 
 }  // namespace cpu
 #endif  // end of CPU_FUNCTIONS_H
diff --git a/reference/core/include/plasma_factor.hpp b/reference/core/include/plasma_factor.hpp
new file mode 100644
index 0000000..cfd85d7
--- /dev/null
+++ b/reference/core/include/plasma_factor.hpp
@@ -0,0 +1,26 @@
+#ifndef CPU_PLASMA_FACTOR_H
+#define CPU_PLASMA_FACTOR_H
+
+#pragma once
+
+#include <vector>
+
+namespace cpu
+{
+
+/**
+ * @brief PLASMA tiled Cholesky on a row-major N x N buffer.
+ *
+ * PLASMA's high-level API is column-major, so we ask for @c PlasmaUpper:
+ * the upper triangle in PLASMA's column-major view aliases the lower
+ * triangle in our row-major view, which is the layout the validation
+ * routine expects (and which matches the LAPACKE_dpotrf2 reference).
+ *
+ * Caller is responsible for having invoked plasma_init() at startup; that
+ * cost is intentionally amortised over all timed calls and stays out of the
+ * timed region.
+ */
+void plasma_cholesky(std::vector<double> &A, int N);
+
+}  // end of namespace cpu
+#endif  // end of CPU_PLASMA_FACTOR_H
diff --git a/reference/core/src/cholesky_factor.cpp b/reference/core/src/cholesky_factor.cpp
index 4fd93fb..677feba 100644
--- a/reference/core/src/cholesky_factor.cpp
+++ b/reference/core/src/cholesky_factor.cpp
@@ -1,15 +1,34 @@
 #include "cholesky_factor.hpp"
 
 #include "adapter_cblas_fp64.hpp"
+#ifdef ENABLE_PLASMA
+#include "plasma_factor.hpp"
+#endif
+
+#include <stdexcept>
 
 namespace cpu
 {
 
-void parallel_blas_cholesky(std::vector<double> &A, int N)
+void parallel_blas_cholesky(Variant variant, std::vector<double> &A, int N)
 {
-    // The whole factorisation is one threaded LAPACKE call; the BLAS library
-    // takes care of dispatching work across the available threads.
-    potrf(A, N);
+    switch (variant)
+    {
+        case Variant::reference:
+            // Single threaded LAPACKE call on the full matrix; the BLAS
+            // library dispatches work across the available threads.
+            potrf(A, N);
+            return;
+
+        case Variant::plasma:
+#ifdef ENABLE_PLASMA
+            plasma_cholesky(A, N);
+            return;
+#else
+            throw std::invalid_argument(
+                "Variant 'plasma' requested but the binary was built without ENABLE_PLASMA=ON");
+#endif
+    }
 }
 
 }  // end of namespace cpu
diff --git a/reference/core/src/functions.cpp b/reference/core/src/functions.cpp
index abe4cb0..e2986ea 100644
--- a/reference/core/src/functions.cpp
+++ b/reference/core/src/functions.cpp
@@ -6,12 +6,13 @@
 namespace cpu
 {
 
-double cholesky(std::vector<double> &A, std::size_t N)
+double cholesky(std::vector<double> &A, std::size_t N, const std::string &variant)
 {
+    const Variant v = to_variant(variant);
     auto start = std::chrono::high_resolution_clock::now();
     ///////////////////////////////////////////////////////////////////////////
-    // Launch Cholesky decomposition: A = L * L^T (single threaded LAPACKE call)
-    parallel_blas_cholesky(A, static_cast<int>(N));
+    // Launch Cholesky decomposition: A = L * L^T (single dispatched call)
+    parallel_blas_cholesky(v, A, static_cast<int>(N));
     ///////////////////////////////////////////////////////////////////////////
     auto stop = std::chrono::high_resolution_clock::now();
     return (stop - start).count() / 1e9;
diff --git a/reference/core/src/plasma_factor.cpp b/reference/core/src/plasma_factor.cpp
new file mode 100644
index 0000000..2618c38
--- /dev/null
+++ b/reference/core/src/plasma_factor.cpp
@@ -0,0 +1,25 @@
+#include "plasma_factor.hpp"
+
+#include <plasma.h>
+
+#include <stdexcept>
+#include <string>
+
+namespace cpu
+{
+
+void plasma_cholesky(std::vector<double> &A, int N)
+{
+    // PLASMA is column-major. Our buffer is row-major and the matrix is
+    // symmetric, so we can pass it through unchanged and ask PLASMA to write
+    // its result into the upper triangle of its column-major view -- that
+    // upper triangle aliases the lower triangle of our row-major view, which
+    // is the layout the validator (and the LAPACKE reference path) expects.
+    const int info = plasma_dpotrf(PlasmaUpper, N, A.data(), N);
+    if (info != 0)
+    {
+        throw std::runtime_error("plasma_dpotrf failed with info=" + std::to_string(info));
+    }
+}
+
+}  // end of namespace cpu
diff --git a/reference/main.cpp b/reference/main.cpp
index 0d58b8f..96d52e4 100644
--- a/reference/main.cpp
+++ b/reference/main.cpp
@@ -3,10 +3,14 @@
 #ifdef ENABLE_VALIDATION
 #include "validate.hpp"
 #endif
+#ifdef ENABLE_PLASMA
+#include <plasma.h>
+#endif
 #include <cstddef>
 #include <fstream>
 #include <iostream>
 #include <omp.h>
+#include <stdexcept>
 #include <string>
 #include <vector>
 
@@ -63,6 +67,15 @@ int main(int argc, char *argv[])
     std::ofstream runtime_file;
     runtime_file.open(runtime_file_path, std::ios_base::app);
 
+#ifdef ENABLE_PLASMA
+    // PLASMA spins up its own context and worker pool; do this once so the
+    // cost is not folded into any timed factorisation.
+    if (plasma_init() != 0)
+    {
+        throw std::runtime_error("plasma_init() failed");
+    }
+#endif
+
     for (std::size_t size = START_SIZE; size <= STOP_SIZE; size = size * STEP_SIZE)
     {
         for (std::size_t l = 0; l < LOOP; l++)
@@ -77,13 +90,18 @@ int main(int argc, char *argv[])
             values += std::string(";") + std::to_string(size);
             values += std::string(";") + std::to_string(1);
             ///////////////////////////////////////////////////////////////////
-            // Single mode: parallel-BLAS reference dpotrf on the full matrix.
+            // Reference modes:
+            //   reference -> single threaded LAPACKE_dpotrf2 on the full matrix
+            //   plasma    -> single plasma_dpotrf (added when ENABLE_PLASMA=ON)
             std::vector<std::string> modes = { "reference" };
+#ifdef ENABLE_PLASMA
+            modes.push_back("plasma");
+#endif
 
             for (const auto &mode : modes)
             {
                 auto A = gen_matrix(size);
-                auto cholesky_cpu = cpu::cholesky(A, size);
+                auto cholesky_cpu = cpu::cholesky(A, size, mode);
 
                 header += ";" + mode;
                 values += ";" + std::to_string(cholesky_cpu);
@@ -115,5 +133,10 @@ int main(int argc, char *argv[])
     }
 
     runtime_file.close();
+
+#ifdef ENABLE_PLASMA
+    plasma_finalize();
+#endif
+
     return 0;
 }

From eb3e93b2847c44a24707804c9cf4b63a8b8fb424 Mon Sep 17 00:00:00 2001
From: constracktor <74077030+constracktor@users.noreply.github.com>
Date: Wed, 29 Apr 2026 23:48:33 +0200
Subject: [PATCH 05/13] Add plasma guard and tile variant

---
 README.md                                  | 22 +++--
 reference/core/include/cholesky_factor.hpp | 16 ++--
 reference/core/include/plasma_factor.hpp   | 28 ++++++-
 reference/core/src/cholesky_factor.cpp     |  9 ++
 reference/core/src/plasma_factor.cpp       | 98 ++++++++++++++++++++++
 reference/main.cpp                         | 53 ++++++++++--
 6 files changed, 209 insertions(+), 17 deletions(-)

diff --git a/README.md b/README.md
index 0287f4b..2b0ebff 100644
--- a/README.md
+++ b/README.md
@@ -29,9 +29,21 @@ Cholesky-Bench benchmarks right-looking tiled Cholesky factorization from fork-j
 | Mode | Description |
 |------|-------------|
 | `reference` | Single threaded `LAPACKE_dpotrf` call on the full matrix; no tiling. Parallelism is delegated entirely to a threaded BLAS (OpenBLAS built with `threads=openmp`, or threaded Intel oneMKL via `ENABLE_MKL=ON`). |
-| `plasma` | Single `plasma_dpotrf` call on the full matrix. PLASMA does its own tiled, OpenMP-task-based parallel Cholesky internally; tile size is left at PLASMA's built-in default. Built only when `ENABLE_PLASMA=ON`. |
+| `plasma` | Single `plasma_dpotrf` call on the full matrix (PLASMA's high-level synchronous API). PLASMA does its own tiled, OpenMP-task-based parallel Cholesky internally; tile size is left at PLASMA's built-in default. Built only when `ENABLE_PLASMA=ON`. |
+| `plasma_tile` | `plasma_omp_dpotrf` over a manually-built `plasma_desc_t` (PLASMA's asynchronous tile interface). Allocates the tile-layout backing store in user code (so PLASMA's `_create` routines never run) and wraps it via `plasma_desc_general_init`, which avoids the int32 overflow that bounds the `plasma` mode. Built only when `ENABLE_PLASMA=ON`. |
 
-This directory is the natural baseline for the OpenMP and HPX tiled implementations: the `reference` mode isolates the contribution of vendor-provided dense-LA parallelism, and the `plasma` mode adds a true tiled-parallel competitor that uses the same OpenMP runtime as the in-house variants.
+This directory is the natural baseline for the OpenMP and HPX tiled implementations: the `reference` mode isolates the contribution of vendor-provided dense-LA parallelism, and the two `plasma*` modes add true tiled-parallel competitors that use the same OpenMP runtime as the in-house variants.
+
+#### PLASMA descriptor int32 overflow
+
+PLASMA 24.8.7's `plasma_desc_*_create()` routines compute their tile-storage size as `int * int` before casting to `size_t`, which silently overflows once the padded tile-area exceeds `INT32_MAX`:
+
+| Path | Behaviour past the boundary (default `nb=256`) |
+|------|------------------------------------------------|
+| `plasma` (high-level, triangular descriptor) | Skipped for `N > 65280`. The benchmark detects the overflow condition before invoking PLASMA and records `nan` for that cell instead of triggering PLASMA's multi-line `malloc() failed` diagnostic. |
+| `plasma_tile` (tile API, user-allocated buffer) | Continues to run. The tile path allocates its own tile-layout backing store with `size_t` arithmetic and wraps it via `plasma_desc_general_init`, so no `_create`/malloc happens inside PLASMA at all. The int32 ceiling does not apply. |
+
+Patching `(size_t)` casts into `control/descriptor.c` in the spack PLASMA package removes the ceiling for the high-level path too and the guard becomes a no-op.
 
 ## Dependencies
 
@@ -152,11 +164,11 @@ threads;problem_size;tile_size;n_tiles;for_collapse;for_naive;task_naive;task_de
 128;65536;1024;64;3.14;3.21;2.98;2.87
 ```
 
-The `reference/` binary reports a `reference` column (and a `plasma` column when built with `ENABLE_PLASMA=ON`), with `tile_size = problem_size` and `n_tiles = 1`, so its runtime files merge cleanly with the tiled benchmarks on the `problem_size` key:
+The `reference/` binary reports a `reference` column (and `plasma` + `plasma_tile` columns when built with `ENABLE_PLASMA=ON`), with `tile_size = problem_size` and `n_tiles = 1`, so its runtime files merge cleanly with the tiled benchmarks on the `problem_size` key:
 
 ```
-threads;problem_size;tile_size;n_tiles;reference;plasma
-128;65536;65536;1;2.71;2.45
+threads;problem_size;tile_size;n_tiles;reference;plasma;plasma_tile
+128;65280;65280;1;2.71;68.12;71.30
 ```
 
 The same lines are also printed to stdout.
diff --git a/reference/core/include/cholesky_factor.hpp b/reference/core/include/cholesky_factor.hpp
index 8ec3c4a..f7f1b2f 100644
--- a/reference/core/include/cholesky_factor.hpp
+++ b/reference/core/include/cholesky_factor.hpp
@@ -13,12 +13,14 @@ namespace cpu
 /**
  * @brief Reference Cholesky variants.
  *
- *   - reference : single threaded LAPACKE_dpotrf2 call (no tiling; parallelism
- *                 lives entirely inside the threaded BLAS).
- *   - plasma    : single plasma_dpotrf call (PLASMA's own tiled parallel
- *                 Cholesky over the OpenMP runtime).
+ *   - reference   : single threaded LAPACKE_dpotrf2 call (no tiling;
+ *                   parallelism lives entirely inside the threaded BLAS).
+ *   - plasma      : single plasma_dpotrf call (PLASMA's high-level
+ *                   synchronous Cholesky over the OpenMP runtime).
+ *   - plasma_tile : plasma_omp_dpotrf called over a manually-built tile
+ *                   descriptor (PLASMA's asynchronous tile interface).
  */
-enum class Variant { reference, plasma };
+enum class Variant { reference, plasma, plasma_tile };
 
 inline Variant to_variant(const std::string &s)
 {
@@ -30,6 +32,10 @@ inline Variant to_variant(const std::string &s)
     {
         return Variant::plasma;
     }
+    if (s == "plasma_tile")
+    {
+        return Variant::plasma_tile;
+    }
     throw std::invalid_argument("Unknown Variant: " + s);
 }
 
diff --git a/reference/core/include/plasma_factor.hpp b/reference/core/include/plasma_factor.hpp
index cfd85d7..a07edbe 100644
--- a/reference/core/include/plasma_factor.hpp
+++ b/reference/core/include/plasma_factor.hpp
@@ -9,7 +9,8 @@ namespace cpu
 {
 
 /**
- * @brief PLASMA tiled Cholesky on a row-major N x N buffer.
+ * @brief PLASMA tiled Cholesky on a row-major N x N buffer using the
+ *        high-level synchronous API (plasma_dpotrf).
  *
  * PLASMA's high-level API is column-major, so we ask for @c PlasmaUpper:
  * the upper triangle in PLASMA's column-major view aliases the lower
@@ -19,8 +20,33 @@ namespace cpu
  * Caller is responsible for having invoked plasma_init() at startup; that
  * cost is intentionally amortised over all timed calls and stays out of the
  * timed region.
+ *
+ * Throws @c std::runtime_error before calling PLASMA if the descriptor
+ * size computation inside plasma_desc_triangular_create() would overflow
+ * int32 (PLASMA 24.8.7 still does this multiplication in @c int). This
+ * keeps PLASMA's own multi-line error spam off stderr when the surrounding
+ * sweep walks past N=65280.
  */
 void plasma_cholesky(std::vector<double> &A, int N);
 
+/**
+ * @brief PLASMA tiled Cholesky on a row-major N x N buffer using the
+ *        asynchronous tile interface (plasma_omp_dpotrf).
+ *
+ * Allocates the tile-layout backing store ourselves with size_t
+ * arithmetic, then wraps it in a @c plasma_desc_t via
+ * plasma_desc_general_init -- which performs no malloc and therefore
+ * sidesteps PLASMA 24.8.7's int32 overflow inside the create routines.
+ * This means the tile path is expected to keep working past N>65280
+ * where the high-level @c plasma_cholesky aborts.
+ *
+ * After the descriptor is set up, PLASMA's tile-API routines translate
+ * our row-major buffer into tile layout (plasma_omp_dge2desc), run the
+ * tiled factorisation (plasma_omp_dpotrf with PlasmaUpper), and
+ * translate back (plasma_omp_ddesc2ge). The output layout matches the
+ * high-level path: row-major lower triangle holds L.
+ */
+void plasma_tile_cholesky(std::vector<double> &A, int N);
+
 }  // end of namespace cpu
 #endif  // end of CPU_PLASMA_FACTOR_H
diff --git a/reference/core/src/cholesky_factor.cpp b/reference/core/src/cholesky_factor.cpp
index 677feba..81f054e 100644
--- a/reference/core/src/cholesky_factor.cpp
+++ b/reference/core/src/cholesky_factor.cpp
@@ -28,6 +28,15 @@ void parallel_blas_cholesky(Variant variant, std::vector<double> &A, int N)
             throw std::invalid_argument(
                 "Variant 'plasma' requested but the binary was built without ENABLE_PLASMA=ON");
 #endif
+
+        case Variant::plasma_tile:
+#ifdef ENABLE_PLASMA
+            plasma_tile_cholesky(A, N);
+            return;
+#else
+            throw std::invalid_argument(
+                "Variant 'plasma_tile' requested but the binary was built without ENABLE_PLASMA=ON");
+#endif
     }
 }
 
diff --git a/reference/core/src/plasma_factor.cpp b/reference/core/src/plasma_factor.cpp
index 2618c38..d7fae86 100644
--- a/reference/core/src/plasma_factor.cpp
+++ b/reference/core/src/plasma_factor.cpp
@@ -2,14 +2,56 @@
 
 #include <plasma.h>
 
+#include <climits>
+#include <cstddef>
 #include <stdexcept>
 #include <string>
+#include <vector>
 
 namespace cpu
 {
+namespace
+{
+
+// PLASMA's default tile size for fp64 (typical 24.x default). We hardcode
+// this rather than calling plasma_get(PlasmaNb, ...) so the overflow guard
+// below stays portable across PLASMA versions. If you tune via
+// plasma_set(PlasmaNb, ...) at startup, keep this matching.
+constexpr int kPlasmaDefaultNb = 256;
+
+// Pre-flight: would PLASMA's int32 multiplication for descriptor sizing
+// overflow? PLASMA 24.8.7's plasma_desc_*_create routines compute the
+// total tile-layout backing-store size as int*int and then cast to size_t,
+// so the malloc gets a sign-extended-negative argument and fails for any
+// padded total > INT32_MAX. We replicate the math here and throw before
+// invoking PLASMA, which avoids the multi-line PLASMA ERROR diagnostic on
+// stderr and keeps the surrounding sweep clean.
+//
+// Only used for the high-level path. The tile path bypasses _create entirely
+// by allocating its tile buffer in user code, so it does not need this.
+void guard_descriptor_overflow(int N, int nb, bool triangular, const char *which)
+{
+    const long long mt = (N + nb - 1) / nb;
+    const long long padded =
+        triangular ? (mt * (mt + 1) / 2) * static_cast<long long>(nb) * nb
+                   : mt * mt * static_cast<long long>(nb) * nb;
+    if (padded > static_cast<long long>(INT_MAX))
+    {
+        throw std::runtime_error(
+            std::string(which) + ": skipped to avoid PLASMA descriptor int32 overflow at N=" + std::to_string(N)
+            + " (nb=" + std::to_string(nb) + ", mt=" + std::to_string(mt)
+            + ", padded elements=" + std::to_string(padded) + " > INT32_MAX)");
+    }
+}
+
+}  // anonymous namespace
 
 void plasma_cholesky(std::vector<double> &A, int N)
 {
+    // High-level plasma_dpotrf allocates a triangular tile descriptor
+    // internally; overflow check uses the triangular size formula.
+    guard_descriptor_overflow(N, kPlasmaDefaultNb, /*triangular=*/true, "plasma_dpotrf");
+
     // PLASMA is column-major. Our buffer is row-major and the matrix is
     // symmetric, so we can pass it through unchanged and ask PLASMA to write
     // its result into the upper triangle of its column-major view -- that
@@ -22,4 +64,60 @@ void plasma_cholesky(std::vector<double> &A, int N)
     }
 }
 
+void plasma_tile_cholesky(std::vector<double> &A, int N)
+{
+    // The tile path sidesteps PLASMA 24.8.7's int32 overflow in
+    // plasma_desc_general_create() by *allocating the tile-layout backing
+    // store ourselves* and handing PLASMA a descriptor that merely wraps
+    // it. plasma_desc_general_init does no malloc, so the buggy
+    // multiplication is never reached. Our std::vector handles size_t
+    // arithmetic correctly and frees the buffer on scope exit.
+    //
+    // PLASMA may still hit additional int math on its internal tile-offset
+    // computations during execution; if so, plasma_omp_dpotrf will mark
+    // the sequence with a non-zero status, we'll throw, and main.cpp's
+    // try/catch will record nan for this cell. But the malloc-overflow
+    // failure that hits at N>~46080 with the create path is gone.
+    const int nb = kPlasmaDefaultNb;
+    const long long mt_ll = (N + nb - 1) / nb;
+    const int mt = static_cast<int>(mt_ll);
+    const int lm = mt * nb;  // padded leading dimension; fits int32 even for huge N
+    const std::size_t tile_buf_elements = static_cast<std::size_t>(lm) * static_cast<std::size_t>(lm);
+    std::vector<double> tile_buf(tile_buf_elements);
+
+    plasma_desc_t descA;
+    int retval = plasma_desc_general_init(PlasmaRealDouble, tile_buf.data(), nb, nb, lm, lm, 0, 0, N, N, &descA);
+    if (retval != PlasmaSuccess)
+    {
+        throw std::runtime_error("plasma_desc_general_init failed with retval=" + std::to_string(retval));
+    }
+
+    // PLASMA 24.8.7's tile interface uses stack-allocated sequence/request
+    // structs (no plasma_sequence_create/destroy, no PlasmaRequestInitializer
+    // macro). Zero-init lands status=0=PlasmaSuccess, which is the expected
+    // pre-call state for both structs.
+    plasma_sequence_t sequence{};
+    plasma_request_t request{};
+
+    // Translate row-major buffer -> tile descriptor, factor in place on the
+    // descriptor, translate back. Same PlasmaUpper convention as the
+    // high-level path, so the resulting layout (row-major lower triangle = L)
+    // matches what the validator expects.
+#pragma omp parallel
+#pragma omp master
+    {
+        plasma_omp_dge2desc(A.data(), N, descA, &sequence, &request);
+        plasma_omp_dpotrf(PlasmaUpper, descA, &sequence, &request);
+        plasma_omp_ddesc2ge(descA, A.data(), N, &sequence, &request);
+    }
+
+    // No plasma_desc_destroy: the descriptor never owned the buffer (we did),
+    // and tile_buf goes out of scope here. No sequence destroy: stack-alloc.
+
+    if (sequence.status != PlasmaSuccess)
+    {
+        throw std::runtime_error("plasma tile sequence failed with status=" + std::to_string(sequence.status));
+    }
+}
+
 }  // end of namespace cpu
diff --git a/reference/main.cpp b/reference/main.cpp
index 96d52e4..f6b8c17 100644
--- a/reference/main.cpp
+++ b/reference/main.cpp
@@ -6,9 +6,12 @@
 #ifdef ENABLE_PLASMA
 #include <plasma.h>
 #endif
+#include <cmath>
 #include <cstddef>
+#include <exception>
 #include <fstream>
 #include <iostream>
+#include <limits>
 #include <omp.h>
 #include <stdexcept>
 #include <string>
@@ -91,19 +94,57 @@ int main(int argc, char *argv[])
             values += std::string(";") + std::to_string(1);
             ///////////////////////////////////////////////////////////////////
             // Reference modes:
-            //   reference -> single threaded LAPACKE_dpotrf2 on the full matrix
-            //   plasma    -> single plasma_dpotrf (added when ENABLE_PLASMA=ON)
-            std::vector<std::string> modes = { "reference" };
+            //   reference   -> single threaded LAPACKE_dpotrf2 on the full
+            //                  matrix (currently disabled; uncomment the
+            //                  initializer below to re-enable)
+            //   plasma      -> single plasma_dpotrf (high-level synchronous
+            //                  PLASMA API; added when ENABLE_PLASMA=ON)
+            //   plasma_tile -> plasma_omp_dpotrf over a manually-built
+            //                  plasma_desc_t (PLASMA's asynchronous tile
+            //                  interface; added when ENABLE_PLASMA=ON)
+            std::vector<std::string> modes = {
+                // "reference",
+            };
 #ifdef ENABLE_PLASMA
             modes.push_back("plasma");
+            modes.push_back("plasma_tile");
 #endif
 
             for (const auto &mode : modes)
             {
-                auto A = gen_matrix(size);
-                auto cholesky_cpu = cpu::cholesky(A, size, mode);
-
                 header += ";" + mode;
+
+                // We let one mode fail (e.g. PLASMA running out of memory at
+                // very large N -- its high-level wrapper allocates an extra
+                // tiled triangular copy on top of the input buffer) without
+                // killing the whole sweep. The failed cell is recorded as NaN
+                // and we continue with the next mode and size.
+                std::vector<double> A;
+                try
+                {
+                    A = gen_matrix(size);
+                }
+                catch (const std::exception &e)
+                {
+                    std::cerr << "Error: gen_matrix(size=" << size << ") threw '" << e.what()
+                              << "'. Recording NaN for variant '" << mode << "'." << std::endl;
+                    values += ";nan";
+                    continue;
+                }
+
+                double cholesky_cpu = std::numeric_limits<double>::quiet_NaN();
+                try
+                {
+                    cholesky_cpu = cpu::cholesky(A, size, mode);
+                }
+                catch (const std::exception &e)
+                {
+                    std::cerr << "Error: variant '" << mode << "' failed at size=" << size << ": " << e.what()
+                              << ". Recording NaN and continuing." << std::endl;
+                    values += ";nan";
+                    continue;
+                }
+
                 values += ";" + std::to_string(cholesky_cpu);
 
 #ifdef ENABLE_VALIDATION

From e545c35b8a3b499cf5734bd82353fda0c77d1150 Mon Sep 17 00:00:00 2001
From: constracktor <74077030+constracktor@users.noreply.github.com>
Date: Thu, 30 Apr 2026 10:00:37 +0200
Subject: [PATCH 06/13] Improved plasma tiled

---
 README.md                                | 12 +++---
 reference/core/include/plasma_factor.hpp | 26 +++++++-----
 reference/core/src/plasma_factor.cpp     | 51 ++++++++++++++----------
 3 files changed, 53 insertions(+), 36 deletions(-)

diff --git a/README.md b/README.md
index 2b0ebff..aeb7bdb 100644
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@ Cholesky-Bench benchmarks right-looking tiled Cholesky factorization from fork-j
 |------|-------------|
 | `reference` | Single threaded `LAPACKE_dpotrf` call on the full matrix; no tiling. Parallelism is delegated entirely to a threaded BLAS (OpenBLAS built with `threads=openmp`, or threaded Intel oneMKL via `ENABLE_MKL=ON`). |
 | `plasma` | Single `plasma_dpotrf` call on the full matrix (PLASMA's high-level synchronous API). PLASMA does its own tiled, OpenMP-task-based parallel Cholesky internally; tile size is left at PLASMA's built-in default. Built only when `ENABLE_PLASMA=ON`. |
-| `plasma_tile` | `plasma_omp_dpotrf` over a manually-built `plasma_desc_t` (PLASMA's asynchronous tile interface). Allocates the tile-layout backing store in user code (so PLASMA's `_create` routines never run) and wraps it via `plasma_desc_general_init`, which avoids the int32 overflow that bounds the `plasma` mode. Built only when `ENABLE_PLASMA=ON`. |
+| `plasma_tile` | `plasma_omp_dpotrf` over a manually-built `plasma_desc_t` (PLASMA's asynchronous tile interface). Allocates an *uninitialised* general (full N×N) tile-layout backing store in user code and wraps it via `plasma_desc_general_init` — PLASMA's `_create` routines never run. Leaving the buffer uninitialised lets `plasma_omp_dge2desc` first-touch each tile from its consuming core, so pages land on the right NUMA node and a chunk of the runtime gap with `plasma` closes. Built only when `ENABLE_PLASMA=ON`. |
 
 This directory is the natural baseline for the OpenMP and HPX tiled implementations: the `reference` mode isolates the contribution of vendor-provided dense-LA parallelism, and the two `plasma*` modes add true tiled-parallel competitors that use the same OpenMP runtime as the in-house variants.
 
@@ -38,12 +38,12 @@ This directory is the natural baseline for the OpenMP and HPX tiled implementati
 
 PLASMA 24.8.7's `plasma_desc_*_create()` routines compute their tile-storage size as `int * int` before casting to `size_t`, which silently overflows once the padded tile-area exceeds `INT32_MAX`:
 
-| Path | Behaviour past the boundary (default `nb=256`) |
-|------|------------------------------------------------|
-| `plasma` (high-level, triangular descriptor) | Skipped for `N > 65280`. The benchmark detects the overflow condition before invoking PLASMA and records `nan` for that cell instead of triggering PLASMA's multi-line `malloc() failed` diagnostic. |
-| `plasma_tile` (tile API, user-allocated buffer) | Continues to run. The tile path allocates its own tile-layout backing store with `size_t` arithmetic and wraps it via `plasma_desc_general_init`, so no `_create`/malloc happens inside PLASMA at all. The int32 ceiling does not apply. |
+| Path | Boundary (default `nb=256`) | Behaviour past the boundary |
+|------|------------------------------|------------------------------|
+| `plasma` (high-level, triangular descriptor) | `N > 65280` | Skipped before invoking PLASMA. Records `nan` instead of triggering PLASMA's multi-line `malloc() failed` diagnostic. |
+| `plasma_tile` (tile API, user-allocated general buffer) | `N > 46080` | Skipped before invoking PLASMA. The user-allocated buffer avoids `_create`'s malloc-overflow, but PLASMA does additional int32 tile-offset arithmetic *during execution* of `plasma_omp_dpotrf`, which segfaults past this boundary. The guard makes the failure clean. |
 
-Patching `(size_t)` casts into `control/descriptor.c` in the spack PLASMA package removes the ceiling for the high-level path too and the guard becomes a no-op.
+Patching `(size_t)` casts into `control/descriptor.c` and the tile-offset code in the spack PLASMA package removes both ceilings, and the guards become no-ops.
 
 ## Dependencies
 
diff --git a/reference/core/include/plasma_factor.hpp b/reference/core/include/plasma_factor.hpp
index a07edbe..f7639ea 100644
--- a/reference/core/include/plasma_factor.hpp
+++ b/reference/core/include/plasma_factor.hpp
@@ -33,18 +33,26 @@ void plasma_cholesky(std::vector<double> &A, int N);
  * @brief PLASMA tiled Cholesky on a row-major N x N buffer using the
  *        asynchronous tile interface (plasma_omp_dpotrf).
  *
- * Allocates the tile-layout backing store ourselves with size_t
- * arithmetic, then wraps it in a @c plasma_desc_t via
+ * Allocates an *uninitialised* general (full N x N) tile-layout backing
+ * store ourselves and wraps it in a @c plasma_desc_t via
  * plasma_desc_general_init -- which performs no malloc and therefore
  * sidesteps PLASMA 24.8.7's int32 overflow inside the create routines.
- * This means the tile path is expected to keep working past N>65280
- * where the high-level @c plasma_cholesky aborts.
  *
- * After the descriptor is set up, PLASMA's tile-API routines translate
- * our row-major buffer into tile layout (plasma_omp_dge2desc), run the
- * tiled factorisation (plasma_omp_dpotrf with PlasmaUpper), and
- * translate back (plasma_omp_ddesc2ge). The output layout matches the
- * high-level path: row-major lower triangle holds L.
+ * Leaving the buffer uninitialised lets plasma_omp_dge2desc first-touch
+ * each tile from its consuming core, so pages land on the right NUMA
+ * node instead of all on the main thread's. That is the optimisation
+ * that closes part of the runtime gap with @c plasma_cholesky; the
+ * remainder of the gap is the wider working-set of the general
+ * descriptor (full N*N tile area vs the high-level path's triangular
+ * mt*(mt+1)/2 area), which would only be recovered by switching to
+ * @c plasma_desc_triangular_init -- attempted but found incompatible
+ * with the dge2desc/ddesc2ge translation routines in PLASMA 24.8.7.
+ *
+ * Note: PLASMA does int32 tile-offset arithmetic during execution as
+ * well, so the tile path is also bounded by an int32 overflow guard
+ * (general formula). Past the bound this function throws and
+ * @c main.cpp's catch handler records @c nan rather than letting PLASMA
+ * segfault.
  */
 void plasma_tile_cholesky(std::vector<double> &A, int N);
 
diff --git a/reference/core/src/plasma_factor.cpp b/reference/core/src/plasma_factor.cpp
index d7fae86..897a05d 100644
--- a/reference/core/src/plasma_factor.cpp
+++ b/reference/core/src/plasma_factor.cpp
@@ -4,6 +4,7 @@
 
 #include <climits>
 #include <cstddef>
+#include <memory>
 #include <stdexcept>
 #include <string>
 #include <vector>
@@ -27,8 +28,11 @@ constexpr int kPlasmaDefaultNb = 256;
 // invoking PLASMA, which avoids the multi-line PLASMA ERROR diagnostic on
 // stderr and keeps the surrounding sweep clean.
 //
-// Only used for the high-level path. The tile path bypasses _create entirely
-// by allocating its tile buffer in user code, so it does not need this.
+// Used for both paths. The high-level path needs it because of the malloc
+// inside _create; the tile path needs it because PLASMA also does int32
+// tile-offset arithmetic *during execution* (segfaults at N>~46080 with the
+// general descriptor and default nb), even though we allocate the buffer
+// ourselves and bypass _create entirely.
 void guard_descriptor_overflow(int N, int nb, bool triangular, const char *which)
 {
     const long long mt = (N + nb - 1) / nb;
@@ -66,36 +70,44 @@ void plasma_cholesky(std::vector<double> &A, int N)
 
 void plasma_tile_cholesky(std::vector<double> &A, int N)
 {
-    // The tile path sidesteps PLASMA 24.8.7's int32 overflow in
-    // plasma_desc_general_create() by *allocating the tile-layout backing
-    // store ourselves* and handing PLASMA a descriptor that merely wraps
-    // it. plasma_desc_general_init does no malloc, so the buggy
-    // multiplication is never reached. Our std::vector handles size_t
-    // arithmetic correctly and frees the buffer on scope exit.
+    // Pre-flight: PLASMA does int32 tile-offset arithmetic during execution
+    // (not just inside _create), so the general descriptor still hits an
+    // overflow ceiling at N>~46080 with the default nb. Without this guard
+    // plasma_omp_dpotrf segfaults rather than failing cleanly.
+    guard_descriptor_overflow(N, kPlasmaDefaultNb, /*triangular=*/false, "plasma_omp_dpotrf");
+
+    // The tile path bypasses PLASMA's _create allocator (which has the
+    // int32-multiply malloc bug) by allocating the tile-layout backing
+    // store ourselves and wrapping it with plasma_desc_general_init. _init
+    // performs no malloc, so the buggy multiplication is never reached.
     //
-    // PLASMA may still hit additional int math on its internal tile-offset
-    // computations during execution; if so, plasma_omp_dpotrf will mark
-    // the sequence with a non-zero status, we'll throw, and main.cpp's
-    // try/catch will record nan for this cell. But the malloc-overflow
-    // failure that hits at N>~46080 with the create path is gone.
+    // The buffer is *uninitialised* (new double[N], not value-initialised
+    // with std::vector). Two reasons: (1) skips a multi-GB zero-init pass
+    // run on the main thread, and (2) lets plasma_omp_dge2desc first-touch
+    // each tile from its consuming core, so pages land on the right NUMA
+    // node instead of all on the main thread's node. That's what shaves
+    // time off the general-descriptor tile path here.
+
     const int nb = kPlasmaDefaultNb;
     const long long mt_ll = (N + nb - 1) / nb;
     const int mt = static_cast<int>(mt_ll);
     const int lm = mt * nb;  // padded leading dimension; fits int32 even for huge N
+
     const std::size_t tile_buf_elements = static_cast<std::size_t>(lm) * static_cast<std::size_t>(lm);
-    std::vector<double> tile_buf(tile_buf_elements);
+
+    std::unique_ptr<double[]> tile_buf(new double[tile_buf_elements]);
 
     plasma_desc_t descA;
-    int retval = plasma_desc_general_init(PlasmaRealDouble, tile_buf.data(), nb, nb, lm, lm, 0, 0, N, N, &descA);
+    int retval =
+        plasma_desc_general_init(PlasmaRealDouble, tile_buf.get(), nb, nb, lm, lm, 0, 0, N, N, &descA);
     if (retval != PlasmaSuccess)
     {
         throw std::runtime_error("plasma_desc_general_init failed with retval=" + std::to_string(retval));
     }
 
     // PLASMA 24.8.7's tile interface uses stack-allocated sequence/request
-    // structs (no plasma_sequence_create/destroy, no PlasmaRequestInitializer
-    // macro). Zero-init lands status=0=PlasmaSuccess, which is the expected
-    // pre-call state for both structs.
+    // structs. Zero-init lands status=0=PlasmaSuccess, the expected
+    // pre-call state.
     plasma_sequence_t sequence{};
     plasma_request_t request{};
 
@@ -111,9 +123,6 @@ void plasma_tile_cholesky(std::vector<double> &A, int N)
         plasma_omp_ddesc2ge(descA, A.data(), N, &sequence, &request);
     }
 
-    // No plasma_desc_destroy: the descriptor never owned the buffer (we did),
-    // and tile_buf goes out of scope here. No sequence destroy: stack-alloc.
-
     if (sequence.status != PlasmaSuccess)
     {
         throw std::runtime_error("plasma tile sequence failed with status=" + std::to_string(sequence.status));

From 2a9d90ba87019e6587f48da30cc61cc0db671f40 Mon Sep 17 00:00:00 2001
From: constracktor <74077030+constracktor@users.noreply.github.com>
Date: Thu, 30 Apr 2026 10:48:05 +0200
Subject: [PATCH 07/13] Remove tiled plasma

---
 README.md                                  |  31 +++---
 reference/CMakeLists.txt                   |   8 ++
 reference/compile.sh                       |  15 ++-
 reference/core/include/cholesky_factor.hpp |  16 +--
 reference/core/include/plasma_factor.hpp   |  38 ++-----
 reference/core/src/cholesky_factor.cpp     |   9 --
 reference/core/src/plasma_factor.cpp       | 120 +++------------------
 reference/main.cpp                         |  34 +++---
 8 files changed, 85 insertions(+), 186 deletions(-)

diff --git a/README.md b/README.md
index aeb7bdb..3307869 100644
--- a/README.md
+++ b/README.md
@@ -28,22 +28,21 @@ Cholesky-Bench benchmarks right-looking tiled Cholesky factorization from fork-j
 
 | Mode | Description |
 |------|-------------|
-| `reference` | Single threaded `LAPACKE_dpotrf` call on the full matrix; no tiling. Parallelism is delegated entirely to a threaded BLAS (OpenBLAS built with `threads=openmp`, or threaded Intel oneMKL via `ENABLE_MKL=ON`). |
+| `reference` | A single `LAPACKE_dpotrf` call on the full matrix; no tiling. Parallelism is delegated entirely to a threaded BLAS (OpenBLAS built with `threads=openmp`, or threaded Intel oneMKL via `ENABLE_MKL=ON`). Enabled by default; disable with `DISABLE_BLAS_REFERENCE=ON`. |
 | `plasma` | Single `plasma_dpotrf` call on the full matrix (PLASMA's high-level synchronous API). PLASMA does its own tiled, OpenMP-task-based parallel Cholesky internally; tile size is left at PLASMA's built-in default. Built only when `ENABLE_PLASMA=ON`. |
-| `plasma_tile` | `plasma_omp_dpotrf` over a manually-built `plasma_desc_t` (PLASMA's asynchronous tile interface). Allocates an *uninitialised* general (full N×N) tile-layout backing store in user code and wraps it via `plasma_desc_general_init` — PLASMA's `_create` routines never run. Leaving the buffer uninitialised lets `plasma_omp_dge2desc` first-touch each tile from its consuming core, so pages land on the right NUMA node and a chunk of the runtime gap with `plasma` closes. Built only when `ENABLE_PLASMA=ON`. |
 
-This directory is the natural baseline for the OpenMP and HPX tiled implementations: the `reference` mode isolates the contribution of vendor-provided dense-LA parallelism, and the two `plasma*` modes add true tiled-parallel competitors that use the same OpenMP runtime as the in-house variants.
+This directory is the natural baseline for the OpenMP and HPX tiled implementations: the `reference` mode isolates the contribution of vendor-provided dense-LA parallelism, and the `plasma` mode adds a tiled-parallel competitor that uses the same OpenMP runtime as the in-house variants.
 
 #### PLASMA descriptor int32 overflow
 
-PLASMA 24.8.7's `plasma_desc_*_create()` routines compute their tile-storage size as `int * int` before casting to `size_t`, which silently overflows once the padded tile-area exceeds `INT32_MAX`:
+PLASMA 24.8.7's `plasma_desc_*_create()` routines compute their tile-storage size as `int * int` before casting to `size_t`, which silently overflows once the padded triangular tile-area exceeds `INT32_MAX`. With the default `nb=256`, the boundary is at `N=65280` (`mt=255`).
 
-| Path | Boundary (default `nb=256`) | Behaviour past the boundary |
-|------|------------------------------|------------------------------|
-| `plasma` (high-level, triangular descriptor) | `N > 65280` | Skipped before invoking PLASMA. Records `nan` instead of triggering PLASMA's multi-line `malloc() failed` diagnostic. |
-| `plasma_tile` (tile API, user-allocated general buffer) | `N > 46080` | Skipped before invoking PLASMA. The user-allocated buffer avoids `_create`'s malloc-overflow, but PLASMA does additional int32 tile-offset arithmetic *during execution* of `plasma_omp_dpotrf`, which segfaults past this boundary. The guard makes the failure clean. |
+The benchmark handles this transparently:
 
-Patching `(size_t)` casts into `control/descriptor.c` and the tile-offset code in the spack PLASMA package removes both ceilings, and the guards become no-ops.
+- For sweep sizes `N` in `(65280, 65536]` the working size is **clamped to 65280** for the whole row (both `reference` and `plasma` run at 65280, and the `problem_size` column reports 65280). This keeps the largest practical PLASMA point on the curve without touching the underlying PLASMA build.
+- For `N > 65536` `plasma` records `nan`. `reference` (LAPACKE) is unaffected by the int32 ceiling and continues normally.
+
+Patching `(size_t)` casts into `control/descriptor.c` in the spack PLASMA package removes the ceiling and the clamp + guard become no-ops.
 
 ## Dependencies
 
@@ -84,7 +83,8 @@ These can be set as environment variables before calling `compile.sh`:
 | `DISABLE_COMPUTATION` | `OFF` | *(`openmp/` and `hpx/` only)* Replace all BLAS/tile-generation calls with no-ops. The task graph and loops remain intact, so scheduling overhead can be measured in isolation. |
 | `ENABLE_DYNAMIC_SCHEDULE` | `OFF` | *(`openmp/` only)* Use `schedule(dynamic,1)` on the trailing-update worksharing loops in `for_collapse`. Requires the LLVM toolchain; rejected at compile time with GCC. |
 | `ENABLE_MKL` | `OFF` | Link against Intel oneMKL instead of OpenBLAS. In `openmp/` and `hpx/` this is the *sequential* MKL; in `reference/` it is the *threaded* MKL. |
-| `ENABLE_PLASMA` | `OFF` | *(`reference/` only)* Also build the PLASMA tiled-Cholesky variant. Adds a `plasma` column alongside `reference` in the runtime output. |
+| `ENABLE_PLASMA` | `OFF` | *(`reference/` only)* Also build the PLASMA `plasma_dpotrf` variant. Adds a `plasma` column alongside `reference` in the runtime output. |
+| `DISABLE_BLAS_REFERENCE` | `OFF` | *(`reference/` only)* Skip the LAPACKE_dpotrf reference mode at runtime, so only `plasma` runs (when `ENABLE_PLASMA=ON`). Linking is unchanged — PLASMA and validation still need cblas/lapacke symbols. |
 
 **Examples:**
 
@@ -103,6 +103,9 @@ ENABLE_MKL=ON ./compile.sh
 
 # Reference: also build the PLASMA tiled-Cholesky variant
 ENABLE_PLASMA=ON ./compile.sh
+
+# Reference: PLASMA only, skip the LAPACKE_dpotrf column at runtime
+DISABLE_BLAS_REFERENCE=ON ENABLE_PLASMA=ON ./compile.sh
 ```
 
 ## Run
@@ -164,11 +167,11 @@ threads;problem_size;tile_size;n_tiles;for_collapse;for_naive;task_naive;task_de
 128;65536;1024;64;3.14;3.21;2.98;2.87
 ```
 
-The `reference/` binary reports a `reference` column (and `plasma` + `plasma_tile` columns when built with `ENABLE_PLASMA=ON`), with `tile_size = problem_size` and `n_tiles = 1`, so its runtime files merge cleanly with the tiled benchmarks on the `problem_size` key:
+The `reference/` binary reports a `reference` column (suppressed by `DISABLE_BLAS_REFERENCE=ON`) plus a `plasma` column when built with `ENABLE_PLASMA=ON`, with `tile_size = problem_size` and `n_tiles = 1`, so its runtime files merge cleanly with the tiled benchmarks on the `problem_size` key:
 
 ```
-threads;problem_size;tile_size;n_tiles;reference;plasma;plasma_tile
-128;65280;65280;1;2.71;68.12;71.30
+threads;problem_size;tile_size;n_tiles;reference;plasma
+128;65280;65280;1;2.71;68.12
 ```
 
 The same lines are also printed to stdout.
@@ -238,6 +241,8 @@ The same lines are also printed to stdout.
             └── adapter_cblas_fp64.cpp
 ```
 
+When `DISABLE_BLAS_REFERENCE=ON`, `adapter_cblas_fp64.cpp` and `validate.cpp` are still compiled and linked (they share cblas/lapacke symbols with PLASMA's BLAS dependency); only the runtime dispatch of the `reference` mode is skipped.
+
 ## Contributing
 
 We would be happy to expand Cholesky-Bench to additional asynchronous many-task (AMT) runtimes. If you have an implementation you would like to add, feel free to open a pull request.
diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt
index 084b4fb..111fae1 100644
--- a/reference/CMakeLists.txt
+++ b/reference/CMakeLists.txt
@@ -11,6 +11,10 @@ option(
   ENABLE_PLASMA
   "Build the PLASMA tiled-Cholesky reference variant in addition to the LAPACKE_dpotrf one"
   OFF)
+option(
+  DISABLE_BLAS_REFERENCE
+  "Skip the LAPACKE_dpotrf reference mode at runtime. Linking is unchanged (PLASMA and validation still need cblas/lapacke)."
+  OFF)
 option(
   ENABLE_VALIDATION
   "Compute ||A - L*L^T||_F / ||A||_F after each factorization (off by default)"
@@ -114,4 +118,8 @@ if(BUILD_CORE)
     target_compile_definitions(cholesky_reference PRIVATE ENABLE_PLASMA)
     target_include_directories(cholesky_reference PRIVATE ${PLASMA_INCLUDE_DIR})
   endif()
+
+  if(DISABLE_BLAS_REFERENCE)
+    target_compile_definitions(cholesky_reference PRIVATE DISABLE_BLAS_REFERENCE)
+  endif()
 endif()
diff --git a/reference/compile.sh b/reference/compile.sh
index 0fa48b7..3bffd73 100755
--- a/reference/compile.sh
+++ b/reference/compile.sh
@@ -12,6 +12,9 @@
 #   ENABLE_PLASMA       ON|OFF  (default OFF) - also build the PLASMA tiled
 #                                               Cholesky variant (extra
 #                                               'plasma' column in the output)
+#   DISABLE_BLAS_REFERENCE    ON|OFF  (default OFF) - skip the LAPACKE_dpotrf
+#                                               reference mode at runtime
+#                                               (linking unchanged)
 #   ENABLE_VALIDATION   ON|OFF  (default OFF) - residual check after each
 #                                               factorisation
 #
@@ -19,6 +22,7 @@
 #   ./compile.sh
 #   ENABLE_MKL=ON ./compile.sh
 #   ENABLE_PLASMA=ON ./compile.sh
+#   DISABLE_BLAS_REFERENCE=ON ENABLE_PLASMA=ON ./compile.sh
 #   ENABLE_VALIDATION=ON ./compile.sh
 ################################################################################
 set -e # Exit immediately if a command exits with a non-zero status.
@@ -28,9 +32,10 @@ set -e # Exit immediately if a command exits with a non-zero status.
 ################################################################################
 : "${ENABLE_MKL:=OFF}"
 : "${ENABLE_PLASMA:=OFF}"
+: "${DISABLE_BLAS_REFERENCE:=OFF}"
 : "${ENABLE_VALIDATION:=OFF}"
 
-for var in ENABLE_MKL ENABLE_PLASMA ENABLE_VALIDATION; do
+for var in ENABLE_MKL ENABLE_PLASMA DISABLE_BLAS_REFERENCE ENABLE_VALIDATION; do
   case "${!var}" in
   ON | OFF) ;;
   *)
@@ -96,13 +101,15 @@ fi
 rm -rf build && mkdir build && cd build
 
 echo "CMake options:"
-echo "  ENABLE_MKL          = $ENABLE_MKL"
-echo "  ENABLE_PLASMA       = $ENABLE_PLASMA"
-echo "  ENABLE_VALIDATION   = $ENABLE_VALIDATION"
+echo "  ENABLE_MKL             = $ENABLE_MKL"
+echo "  ENABLE_PLASMA          = $ENABLE_PLASMA"
+echo "  DISABLE_BLAS_REFERENCE = $DISABLE_BLAS_REFERENCE"
+echo "  ENABLE_VALIDATION      = $ENABLE_VALIDATION"
 
 cmake -DCMAKE_BUILD_TYPE=Release \
   -DENABLE_MKL="$ENABLE_MKL" \
   -DENABLE_PLASMA="$ENABLE_PLASMA" \
+  -DDISABLE_BLAS_REFERENCE="$DISABLE_BLAS_REFERENCE" \
   -DENABLE_VALIDATION="$ENABLE_VALIDATION" \
   ..
 make -j VERBOSE=1
diff --git a/reference/core/include/cholesky_factor.hpp b/reference/core/include/cholesky_factor.hpp
index f7f1b2f..5828475 100644
--- a/reference/core/include/cholesky_factor.hpp
+++ b/reference/core/include/cholesky_factor.hpp
@@ -13,14 +13,12 @@ namespace cpu
 /**
  * @brief Reference Cholesky variants.
  *
- *   - reference   : single threaded LAPACKE_dpotrf2 call (no tiling;
- *                   parallelism lives entirely inside the threaded BLAS).
- *   - plasma      : single plasma_dpotrf call (PLASMA's high-level
- *                   synchronous Cholesky over the OpenMP runtime).
- *   - plasma_tile : plasma_omp_dpotrf called over a manually-built tile
- *                   descriptor (PLASMA's asynchronous tile interface).
+ *   - reference : single threaded LAPACKE_dpotrf2 call (no tiling;
+ *                 parallelism lives entirely inside the threaded BLAS).
+ *   - plasma    : single plasma_dpotrf call (PLASMA's high-level
+ *                 synchronous Cholesky over the OpenMP runtime).
  */
-enum class Variant { reference, plasma, plasma_tile };
+enum class Variant { reference, plasma };
 
 inline Variant to_variant(const std::string &s)
 {
@@ -32,10 +30,6 @@ inline Variant to_variant(const std::string &s)
     {
         return Variant::plasma;
     }
-    if (s == "plasma_tile")
-    {
-        return Variant::plasma_tile;
-    }
     throw std::invalid_argument("Unknown Variant: " + s);
 }
 
diff --git a/reference/core/include/plasma_factor.hpp b/reference/core/include/plasma_factor.hpp
index f7639ea..d15868e 100644
--- a/reference/core/include/plasma_factor.hpp
+++ b/reference/core/include/plasma_factor.hpp
@@ -21,40 +21,14 @@ namespace cpu
  * cost is intentionally amortised over all timed calls and stays out of the
  * timed region.
  *
- * Throws @c std::runtime_error before calling PLASMA if the descriptor
- * size computation inside plasma_desc_triangular_create() would overflow
- * int32 (PLASMA 24.8.7 still does this multiplication in @c int). This
- * keeps PLASMA's own multi-line error spam off stderr when the surrounding
- * sweep walks past N=65280.
+ * Throws @c std::runtime_error before calling PLASMA when the descriptor
+ * size computation inside plasma_desc_*_create() would overflow int32
+ * (PLASMA 24.8.7 still does this multiplication in @c int). With the
+ * default @c nb=256 the boundary is at @c N=65280; main.cpp transparently
+ * clamps any iteration size in @c (65280, 65536] down to 65280, so this
+ * guard fires only for @c N>65536 (which then becomes a @c nan cell).
  */
 void plasma_cholesky(std::vector<double> &A, int N);
 
-/**
- * @brief PLASMA tiled Cholesky on a row-major N x N buffer using the
- *        asynchronous tile interface (plasma_omp_dpotrf).
- *
- * Allocates an *uninitialised* general (full N x N) tile-layout backing
- * store ourselves and wraps it in a @c plasma_desc_t via
- * plasma_desc_general_init -- which performs no malloc and therefore
- * sidesteps PLASMA 24.8.7's int32 overflow inside the create routines.
- *
- * Leaving the buffer uninitialised lets plasma_omp_dge2desc first-touch
- * each tile from its consuming core, so pages land on the right NUMA
- * node instead of all on the main thread's. That is the optimisation
- * that closes part of the runtime gap with @c plasma_cholesky; the
- * remainder of the gap is the wider working-set of the general
- * descriptor (full N*N tile area vs the high-level path's triangular
- * mt*(mt+1)/2 area), which would only be recovered by switching to
- * @c plasma_desc_triangular_init -- attempted but found incompatible
- * with the dge2desc/ddesc2ge translation routines in PLASMA 24.8.7.
- *
- * Note: PLASMA does int32 tile-offset arithmetic during execution as
- * well, so the tile path is also bounded by an int32 overflow guard
- * (general formula). Past the bound this function throws and
- * @c main.cpp's catch handler records @c nan rather than letting PLASMA
- * segfault.
- */
-void plasma_tile_cholesky(std::vector<double> &A, int N);
-
 }  // end of namespace cpu
 #endif  // end of CPU_PLASMA_FACTOR_H
diff --git a/reference/core/src/cholesky_factor.cpp b/reference/core/src/cholesky_factor.cpp
index 81f054e..677feba 100644
--- a/reference/core/src/cholesky_factor.cpp
+++ b/reference/core/src/cholesky_factor.cpp
@@ -28,15 +28,6 @@ void parallel_blas_cholesky(Variant variant, std::vector<double> &A, int N)
             throw std::invalid_argument(
                 "Variant 'plasma' requested but the binary was built without ENABLE_PLASMA=ON");
 #endif
-
-        case Variant::plasma_tile:
-#ifdef ENABLE_PLASMA
-            plasma_tile_cholesky(A, N);
-            return;
-#else
-            throw std::invalid_argument(
-                "Variant 'plasma_tile' requested but the binary was built without ENABLE_PLASMA=ON");
-#endif
     }
 }
 
diff --git a/reference/core/src/plasma_factor.cpp b/reference/core/src/plasma_factor.cpp
index 897a05d..323e5ed 100644
--- a/reference/core/src/plasma_factor.cpp
+++ b/reference/core/src/plasma_factor.cpp
@@ -2,59 +2,32 @@
 
 #include <plasma.h>
 
-#include <climits>
-#include <cstddef>
-#include <memory>
 #include <stdexcept>
 #include <string>
-#include <vector>
 
 namespace cpu
 {
-namespace
-{
-
-// PLASMA's default tile size for fp64 (typical 24.x default). We hardcode
-// this rather than calling plasma_get(PlasmaNb, ...) so the overflow guard
-// below stays portable across PLASMA versions. If you tune via
-// plasma_set(PlasmaNb, ...) at startup, keep this matching.
-constexpr int kPlasmaDefaultNb = 256;
 
-// Pre-flight: would PLASMA's int32 multiplication for descriptor sizing
-// overflow? PLASMA 24.8.7's plasma_desc_*_create routines compute the
-// total tile-layout backing-store size as int*int and then cast to size_t,
-// so the malloc gets a sign-extended-negative argument and fails for any
-// padded total >= INT32_MAX. We replicate the math here and throw before
-// invoking PLASMA, which avoids the multi-line PLASMA ERROR diagnostic on
-// stderr and keeps the surrounding sweep clean.
-//
-// Used for both paths. The high-level path needs it because of the malloc
-// inside _create; the tile path needs it because PLASMA also does int32
-// tile-offset arithmetic *during execution* (segfaults at N>~46080 with the
-// general descriptor and default nb), even though we allocate the buffer
-// ourselves and bypass _create entirely.
-void guard_descriptor_overflow(int N, int nb, bool triangular, const char *which)
+void plasma_cholesky(std::vector<double> &A, int N)
 {
-    const long long mt = (N + nb - 1) / nb;
-    const long long padded =
-        triangular ? (mt * (mt + 1) / 2) * static_cast<long long>(nb) * nb
-                   : mt * mt * static_cast<long long>(nb) * nb;
-    if (padded > static_cast<long long>(INT_MAX))
+    // PLASMA 24.8.7's plasma_desc_*_create routines compute their tile-storage
+    // size as int*int and then cast to size_t, so the malloc gets a
+    // sign-extended-negative argument and fails for any padded total
+    // >= INT32_MAX. With the default nb=256 the triangular padded element
+    // count first crosses INT32_MAX at N=65281 (mt=256), so any N>65280 hits
+    // the bug. Guard before invoking PLASMA so the multi-line PLASMA ERROR
+    // diagnostic does not reach stderr.
+    //
+    // main.cpp transparently clamps iteration sizes in (65280, 65536] down to
+    // 65280, so in practice this guard only fires for N>65536 -- which then
+    // becomes a nan cell via main.cpp's per-mode catch handler.
+    constexpr int kPlasmaMaxN = 65280;
+    if (N > kPlasmaMaxN)
     {
         throw std::runtime_error(
-            std::string(which) + ": skipped to avoid PLASMA descriptor int32 overflow at N=" + std::to_string(N)
-            + " (nb=" + std::to_string(nb) + ", mt=" + std::to_string(mt)
-            + ", padded elements=" + std::to_string(padded) + " > INT32_MAX)");
+            "plasma_dpotrf: skipped to avoid PLASMA descriptor int32 overflow at N=" + std::to_string(N)
+            + " (max supported with default nb=256: " + std::to_string(kPlasmaMaxN) + ")");
     }
-}
-
-}  // anonymous namespace
-
-void plasma_cholesky(std::vector<double> &A, int N)
-{
-    // High-level plasma_dpotrf allocates a triangular tile descriptor
-    // internally; overflow check uses the triangular size formula.
-    guard_descriptor_overflow(N, kPlasmaDefaultNb, /*triangular=*/true, "plasma_dpotrf");
 
     // PLASMA is column-major. Our buffer is row-major and the matrix is
     // symmetric, so we can pass it through unchanged and ask PLASMA to write
@@ -68,65 +41,4 @@ void plasma_cholesky(std::vector<double> &A, int N)
     }
 }
 
-void plasma_tile_cholesky(std::vector<double> &A, int N)
-{
-    // Pre-flight: PLASMA does int32 tile-offset arithmetic during execution
-    // (not just inside _create), so the general descriptor still hits an
-    // overflow ceiling at N>~46080 with the default nb. Without this guard
-    // plasma_omp_dpotrf segfaults rather than failing cleanly.
-    guard_descriptor_overflow(N, kPlasmaDefaultNb, /*triangular=*/false, "plasma_omp_dpotrf");
-
-    // The tile path bypasses PLASMA's _create allocator (which has the
-    // int32-multiply malloc bug) by allocating the tile-layout backing
-    // store ourselves and wrapping it with plasma_desc_general_init. _init
-    // performs no malloc, so the buggy multiplication is never reached.
-    //
-    // The buffer is *uninitialised* (new double[N], not value-initialised
-    // with std::vector). Two reasons: (1) skips a multi-GB zero-init pass
-    // run on the main thread, and (2) lets plasma_omp_dge2desc first-touch
-    // each tile from its consuming core, so pages land on the right NUMA
-    // node instead of all on the main thread's node. That's what shaves
-    // time off the general-descriptor tile path here.
-
-    const int nb = kPlasmaDefaultNb;
-    const long long mt_ll = (N + nb - 1) / nb;
-    const int mt = static_cast<int>(mt_ll);
-    const int lm = mt * nb;  // padded leading dimension; fits int32 even for huge N
-
-    const std::size_t tile_buf_elements = static_cast<std::size_t>(lm) * static_cast<std::size_t>(lm);
-
-    std::unique_ptr<double[]> tile_buf(new double[tile_buf_elements]);
-
-    plasma_desc_t descA;
-    int retval =
-        plasma_desc_general_init(PlasmaRealDouble, tile_buf.get(), nb, nb, lm, lm, 0, 0, N, N, &descA);
-    if (retval != PlasmaSuccess)
-    {
-        throw std::runtime_error("plasma_desc_general_init failed with retval=" + std::to_string(retval));
-    }
-
-    // PLASMA 24.8.7's tile interface uses stack-allocated sequence/request
-    // structs. Zero-init lands status=0=PlasmaSuccess, the expected
-    // pre-call state.
-    plasma_sequence_t sequence{};
-    plasma_request_t request{};
-
-    // Translate row-major buffer -> tile descriptor, factor in place on the
-    // descriptor, translate back. Same PlasmaUpper convention as the
-    // high-level path, so the resulting layout (row-major lower triangle = L)
-    // matches what the validator expects.
-#pragma omp parallel
-#pragma omp master
-    {
-        plasma_omp_dge2desc(A.data(), N, descA, &sequence, &request);
-        plasma_omp_dpotrf(PlasmaUpper, descA, &sequence, &request);
-        plasma_omp_ddesc2ge(descA, A.data(), N, &sequence, &request);
-    }
-
-    if (sequence.status != PlasmaSuccess)
-    {
-        throw std::runtime_error("plasma tile sequence failed with status=" + std::to_string(sequence.status));
-    }
-}
-
 }  // end of namespace cpu
diff --git a/reference/main.cpp b/reference/main.cpp
index f6b8c17..ef17ccc 100644
--- a/reference/main.cpp
+++ b/reference/main.cpp
@@ -79,8 +79,19 @@ int main(int argc, char *argv[])
     }
 #endif
 
-    for (std::size_t size = START_SIZE; size <= STOP_SIZE; size = size * STEP_SIZE)
+    for (std::size_t input_size = START_SIZE; input_size <= STOP_SIZE; input_size = input_size * STEP_SIZE)
     {
+        // PLASMA 24.8.7's triangular descriptor allocation overflows int32 for
+        // N>65280 with the default nb=256. For sweep sizes in (65280, 65536]
+        // we transparently clamp the working size down to 65280 so the row
+        // still produces a real plasma timing instead of a nan. Sizes beyond
+        // 65536 fall through and the per-mode catch handler records nan.
+        std::size_t size = input_size;
+        if (size > 65280 && size <= 65536)
+        {
+            size = 65280;
+        }
+
         for (std::size_t l = 0; l < LOOP; l++)
         {
             // header for output file -- columns mirror the openmp/hpx output so
@@ -94,20 +105,17 @@ int main(int argc, char *argv[])
             values += std::string(";") + std::to_string(1);
             ///////////////////////////////////////////////////////////////////
             // Reference modes:
-            //   reference   -> single threaded LAPACKE_dpotrf2 on the full
-            //                  matrix (currently disabled; uncomment the
-            //                  initializer below to re-enable)
-            //   plasma      -> single plasma_dpotrf (high-level synchronous
-            //                  PLASMA API; added when ENABLE_PLASMA=ON)
-            //   plasma_tile -> plasma_omp_dpotrf over a manually-built
-            //                  plasma_desc_t (PLASMA's asynchronous tile
-            //                  interface; added when ENABLE_PLASMA=ON)
-            std::vector<std::string> modes = {
-                // "reference",
-            };
+            //   reference -> single threaded LAPACKE_dpotrf2 on the full
+            //                matrix. Enabled by default; disable at build
+            //                time with DISABLE_BLAS_REFERENCE=ON.
+            //   plasma    -> single plasma_dpotrf (high-level synchronous
+            //                PLASMA API). Built only when ENABLE_PLASMA=ON.
+            std::vector<std::string> modes = {};
+#ifndef DISABLE_BLAS_REFERENCE
+            modes.push_back("reference");
+#endif
 #ifdef ENABLE_PLASMA
             modes.push_back("plasma");
-            modes.push_back("plasma_tile");
 #endif
 
             for (const auto &mode : modes)

From 417878c8c53805254b1d051f30dcf1044d019eec Mon Sep 17 00:00:00 2001
From: constracktor <74077030+constracktor@users.noreply.github.com>
Date: Thu, 30 Apr 2026 20:52:41 +0200
Subject: [PATCH 08/13] Cleanup

---
 README.md                                     |  8 +--
 hpx/core/include/adapter_cblas_fp64.hpp       |  8 +--
 reference/CMakeLists.txt                      | 16 ------
 reference/compile.sh                          | 41 ++++++++-------
 reference/core/CMakeLists.txt                 |  7 +--
 reference/core/include/adapter_cblas_fp64.hpp | 10 ++--
 .../core/include/adapter_plasma_fp64.hpp      | 22 ++++++++
 reference/core/include/cholesky_factor.hpp    |  9 ++--
 reference/core/include/functions.hpp          |  8 +--
 reference/core/include/matrix_generation.hpp  |  6 +--
 reference/core/include/plasma_factor.hpp      | 34 -------------
 reference/core/include/validate.hpp           |  4 +-
 reference/core/src/adapter_cblas_fp64.cpp     |  9 +---
 reference/core/src/adapter_plasma_fp64.cpp    | 30 +++++++++++
 reference/core/src/cholesky_factor.cpp        | 15 ++----
 reference/core/src/functions.cpp              |  6 +--
 reference/core/src/matrix_generation.cpp      |  6 +--
 reference/core/src/plasma_factor.cpp          | 44 ----------------
 reference/core/src/validate.cpp               |  6 +--
 reference/main.cpp                            | 51 +++----------------
 reference/run.sh                              | 14 ++---
 21 files changed, 120 insertions(+), 234 deletions(-)
 create mode 100644 reference/core/include/adapter_plasma_fp64.hpp
 delete mode 100644 reference/core/include/plasma_factor.hpp
 create mode 100644 reference/core/src/adapter_plasma_fp64.cpp
 delete mode 100644 reference/core/src/plasma_factor.cpp

diff --git a/README.md b/README.md
index 3307869..28fdb26 100644
--- a/README.md
+++ b/README.md
@@ -139,7 +139,7 @@ All three directories contain a `run.sh` that is a ready-to-submit SLURM batch s
 sbatch openmp/run.sh             # gcc runtime (default)
 sbatch openmp/run.sh llvm        # llvm runtime
 sbatch hpx/run.sh
-sbatch reference/run.sh
+sbatch reference/run.sh          # gcc runtime; defaults to N=65280 (see PLASMA boundary note)
 ```
 
 ### Command-line arguments
@@ -171,7 +171,7 @@ The `reference/` binary reports a `reference` column (suppressed by `DISABLE_BLA
 
 ```
 threads;problem_size;tile_size;n_tiles;reference;plasma
-128;65280;65280;1;2.71;68.12
+128;65280;65280;1;5.21;68.12
 ```
 
 The same lines are also printed to stdout.
@@ -229,14 +229,14 @@ The same lines are also printed to stdout.
         │   ├── cholesky_factor.hpp
         │   ├── functions.hpp
         │   ├── matrix_generation.hpp
-        │   ├── plasma_factor.hpp     # only used when ENABLE_PLASMA=ON
+        │   ├── adapter_plasma_fp64.hpp  # only used when ENABLE_PLASMA=ON
         │   ├── validate.hpp
         │   └── adapter_cblas_fp64.hpp
         └── src/
             ├── cholesky_factor.cpp
             ├── functions.cpp
             ├── matrix_generation.cpp
-            ├── plasma_factor.cpp     # only built when ENABLE_PLASMA=ON
+            ├── adapter_plasma_fp64.cpp  # only built when ENABLE_PLASMA=ON
             ├── validate.cpp
             └── adapter_cblas_fp64.cpp
 ```
diff --git a/hpx/core/include/adapter_cblas_fp64.hpp b/hpx/core/include/adapter_cblas_fp64.hpp
index 5440833..91ce5c9 100644
--- a/hpx/core/include/adapter_cblas_fp64.hpp
+++ b/hpx/core/include/adapter_cblas_fp64.hpp
@@ -126,7 +126,7 @@ void gemm(const vector &A,
  * @param dep_future dependency future to wait on before executing
  * @param A matrix to be factorized (mutated in-place)
  * @param N matrix dimension
- * @return void future signalling completion
+ * @return void future signaling completion
  */
 void_future potrf_f(void_future dep_future, vector &A, const int N);
 
@@ -140,7 +140,7 @@ void_future potrf_f(void_future dep_future, vector &A, const int N);
  * @param M second dimension
  * @param transpose_L transpose flag for L
  * @param side_L side flag for L
- * @return void future signalling completion
+ * @return void future signaling completion
  */
 void_future trsm_f(void_future dep_L,
                    void_future dep_A,
@@ -158,7 +158,7 @@ void_future trsm_f(void_future dep_L,
  * @param A base matrix (mutated in-place)
  * @param B symmetric update matrix
  * @param N matrix dimension
- * @return void future signalling completion
+ * @return void future signaling completion
  */
 void_future syrk_f(void_future dep_A, void_future dep_B, vector &A, const vector &B, const int N);
 
@@ -175,7 +175,7 @@ void_future syrk_f(void_future dep_A, void_future dep_B, vector &A, const vector
  * @param K third matrix dimension
  * @param transpose_A transpose flag for A
  * @param transpose_B transpose flag for B
- * @return void future signalling completion
+ * @return void future signaling completion
  */
 void_future
 gemm_f(void_future dep_A,
diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt
index 111fae1..a9b5b90 100644
--- a/reference/CMakeLists.txt
+++ b/reference/CMakeLists.txt
@@ -36,19 +36,11 @@ if(ENABLE_FORMAT_TARGETS)
 endif()
 
 if(NOT CMAKE_SKIP_INSTALL_RULES)
-  # Our installs follow the standard GNU directory layout. This include needs to
-  # come first since we need the CMAKE_INSTALL_* in the CMakeLists.txt of each
-  # target.
   include(GNUInstallDirs)
 endif()
 
 if(BUILD_CORE)
   if(ENABLE_MKL)
-    # Threaded Intel oneMKL: ask MKL to use its OpenMP runtime ('intel_thread').
-    # This is the only difference from the OpenMP/HPX builds, which pin
-    # MKL_THREADING=sequential because they parallelise at the tile level.
-    # Here the parallelism lives inside dpotrf itself, so we want the
-    # vendor-threaded backend.
     set(MKL_INTERFACE_FULL "intel_lp64")
     set(MKL_THREADING "intel_thread")
     find_package(MKL CONFIG REQUIRED)
@@ -59,9 +51,6 @@ if(BUILD_CORE)
       message(FATAL_ERROR "No BLAS Library found")
     endif()
   else()
-    # Threaded OpenBLAS. The library name is the same as the sequential one,
-    # but the Spack environment loaded by compile.sh selects an OpenBLAS built
-    # with threads=openmp.
     find_library(OpenBLAS_LIB NAMES openblas REQUIRED)
 
     if(OpenBLAS_LIB)
@@ -80,14 +69,9 @@ if(BUILD_CORE)
     endif()
   endif()
 
-  # OpenMP is required for the matrix-generation parallel loop and to pick up
-  # the OpenMP runtime that threaded OpenBLAS / threaded MKL share.
   find_package(OpenMP REQUIRED)
 
   if(ENABLE_PLASMA)
-    # PLASMA exposes its own tiled parallel Cholesky (plasma_dpotrf). Spack
-    # installs it as a single shared library plus a coreblas helper; we look
-    # for both and link whichever is present.
     find_path(PLASMA_INCLUDE_DIR plasma.h)
     if(NOT PLASMA_INCLUDE_DIR)
       message(FATAL_ERROR "ENABLE_PLASMA=ON but plasma.h was not found")
diff --git a/reference/compile.sh b/reference/compile.sh
index 3bffd73..2f20af3 100755
--- a/reference/compile.sh
+++ b/reference/compile.sh
@@ -1,22 +1,21 @@
 #!/bin/bash
 # Usage: compile.sh
 #
-# Builds the parallel-BLAS reference benchmark: a single threaded
+# Builds the parallel-BLAS reference benchmark: a single-tile, parallel
 # LAPACKE_dpotrf call on the full matrix, used as a baseline against the
-# tiled OpenMP / HPX implementations. GCC only.
+# tiled fork-join and tasking implementations.
 #
 # CMake project options can be overridden via environment variables
 # (defaults match the project's CMakeLists.txt defaults):
-#   ENABLE_MKL          ON|OFF  (default OFF) - link threaded Intel oneMKL
-#                                               instead of threaded OpenBLAS
-#   ENABLE_PLASMA       ON|OFF  (default OFF) - also build the PLASMA tiled
-#                                               Cholesky variant (extra
-#                                               'plasma' column in the output)
-#   DISABLE_BLAS_REFERENCE    ON|OFF  (default OFF) - skip the LAPACKE_dpotrf
-#                                               reference mode at runtime
-#                                               (linking unchanged)
-#   ENABLE_VALIDATION   ON|OFF  (default OFF) - residual check after each
-#                                               factorisation
+#   ENABLE_MKL             ON|OFF  (default OFF) - link threaded Intel oneMKL
+#                                                  instead of threaded OpenBLAS
+#   ENABLE_PLASMA          ON|OFF  (default OFF) - also build the PLASMA
+#                                                  plasma_dpotrf variant (extra
+#                                                  'plasma' column in the output)
+#   DISABLE_BLAS_REFERENCE ON|OFF  (default OFF) - skip the LAPACKE_dpotrf
+#                                                  reference at runtime
+#   ENABLE_VALIDATION      ON|OFF  (default OFF) - residual check after each
+#                                                  factorization
 #
 # Examples:
 #   ./compile.sh
@@ -46,7 +45,7 @@ for var in ENABLE_MKL ENABLE_PLASMA DISABLE_BLAS_REFERENCE ENABLE_VALIDATION; do
 done
 
 ################################################################################
-# Toolchain selection (gcc only)
+# Toolchain selection
 ################################################################################
 select_toolchain() {
   module load gcc/14.2.0
@@ -57,12 +56,11 @@ select_toolchain() {
 ################################################################################
 # Configurations
 #
-# The reference benchmark uses *threaded* OpenBLAS / MKL — that is the whole
-# point of this directory. The OpenMP and HPX builds, by contrast, pin the
-# BLAS to its sequential variant because they parallelise at the tile level.
+# The reference benchmark uses *threaded* BLAS because it operates on a single
+# tile and does not parallelize at the tile level.
 ################################################################################
 if command -v spack &>/dev/null; then
-  echo "Spack command found. Loading libraries (gcc)"
+  echo "Spack command found. Loading libraries."
   # Get current hostname
   HOSTNAME=$(hostname -s)
 
@@ -74,7 +72,7 @@ if command -v spack &>/dev/null; then
       spack load openblas@0.3.28%gcc@14.2.0 threads=openmp ilp64=true
     fi
     if [[ "$ENABLE_PLASMA" == "ON" ]]; then
-      spack load plasma%gcc@14.2.0
+      spack load plasma%gcc@14.2.0 ^openblas@0.3.28%gcc@14.2.0 threads=openmp
     fi
 
   elif [[ "$HOSTNAME" == "nasrin0" || "$HOSTNAME" == "nasrin1" ]]; then
@@ -82,10 +80,10 @@ if command -v spack &>/dev/null; then
     select_toolchain
     if [[ "$ENABLE_MKL" == "OFF" ]]; then
       # OpenBLAS built with OpenMP threading
-      spack load openblas@0.3.28%gcc@14.2.0 arch=linux-almalinux9-zen3 threads=openmp
+      spack load openblas@0.3.28%gcc@14.2.0 arch=linux-almalinux9-zen3 threads=openmp ilp64=true
     fi
     if [[ "$ENABLE_PLASMA" == "ON" ]]; then
-      spack load plasma%gcc@14.2.0 arch=linux-almalinux9-zen3
+      spack load plasma%gcc@14.2.0 arch=linux-almalinux9-zen3 ^openblas@0.3.28%gcc@14.2.0 threads=openmp
     fi
 
   else
@@ -116,4 +114,5 @@ make -j VERBOSE=1
 cd ..
 
 # Launch Example
-# OMP_NUM_THREADS=128 OMP_PROC_BIND=close OMP_PLACES=cores ./build/cholesky_reference --size_start 65536 --size_stop 65536 --loop 20
+# OMP_NUM_THREADS=128 OMP_PROC_BIND=close OMP_PLACES=cores \
+# ./build/cholesky_reference --size_start 1024 --size_stop 65536 --loop 1
diff --git a/reference/core/CMakeLists.txt b/reference/core/CMakeLists.txt
index f7b6535..eea5cbf 100644
--- a/reference/core/CMakeLists.txt
+++ b/reference/core/CMakeLists.txt
@@ -6,7 +6,7 @@ if(ENABLE_VALIDATION)
 endif()
 
 if(ENABLE_PLASMA)
-  list(APPEND SOURCE_FILES src/plasma_factor.cpp)
+  list(APPEND SOURCE_FILES src/adapter_plasma_fp64.cpp)
 endif()
 
 add_library(cholesky_core STATIC ${SOURCE_FILES})
@@ -33,8 +33,7 @@ if(ENABLE_MKL)
     cholesky_core PUBLIC MKL::mkl_intel_lp64 MKL::mkl_core MKL::MKL
                          MKL::mkl_intel_thread)
 else()
-  # Link threaded OpenBLAS (the library name is the same; threading is
-  # determined by the OpenBLAS build that compile.sh's Spack env selects).
+  # Link threaded OpenBLAS
   target_link_libraries(cholesky_core PUBLIC ${OpenBLAS_LIB})
   target_include_directories(cholesky_core PUBLIC ${OpenBLAS_INCLUDE_DIR})
 endif()
@@ -57,8 +56,6 @@ target_compile_features(cholesky_core PUBLIC cxx_std_17)
 set_property(TARGET cholesky_core PROPERTY POSITION_INDEPENDENT_CODE ON)
 
 if(NOT CMAKE_SKIP_INSTALL_RULES)
-  # We need to manually install those into CMAKE_INSTALL_INCLUDEDIR. Below
-  # install(TARGETS ...) only setups the paths for the exported targets.
   install(
     DIRECTORY include/
     DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
diff --git a/reference/core/include/adapter_cblas_fp64.hpp b/reference/core/include/adapter_cblas_fp64.hpp
index 139945c..11a79e4 100644
--- a/reference/core/include/adapter_cblas_fp64.hpp
+++ b/reference/core/include/adapter_cblas_fp64.hpp
@@ -10,14 +10,12 @@ using vector = std::vector<double>;
 // LAPACK level 3 operations
 
 /**
- * @brief FP64 In-place Cholesky decomposition of A using a single, threaded
- *        LAPACKE_dpotrf call (no tiling). This is the parallel-BLAS reference
- *        implementation that the OpenMP and HPX tiled variants are compared
- *        against.
+ * @brief FP64 In-place Cholesky decomposition of A using a threaded
+ *        LAPACKE_dpotrf2 call.
  *
- * @param A row-major matrix of size N*N to be factorised in place
+ * @param A row-major matrix of size N*N to be factorized in place
  * @param N matrix dimension
  */
-void potrf(vector &A, const int N);
+void lapacke_potrf(vector &A, const int N);
 
 #endif  // end of CPU_ADAPTER_CBLAS_FP64_H
diff --git a/reference/core/include/adapter_plasma_fp64.hpp b/reference/core/include/adapter_plasma_fp64.hpp
new file mode 100644
index 0000000..3edd661
--- /dev/null
+++ b/reference/core/include/adapter_plasma_fp64.hpp
@@ -0,0 +1,22 @@
+#ifndef CPU_ADAPTER_PLASMA_FP64_H
+#define CPU_ADAPTER_PLASMA_FP64_H
+
+#pragma once
+
+#include <vector>
+
+namespace cpu
+{
+
+/**
+ * @brief PLASMA tiled Cholesky on a row-major N x N buffer using the
+ *        high-level synchronous API (plasma_dpotrf).
+ *
+ * @throws std::runtime_error before calling PLASMA when the descriptor size
+ *         computation inside plasma_desc_*_create() would overflow int32,
+ *         and after the call when plasma_dpotrf returns a nonzero info.
+ */
+void plasma_potrf(std::vector<double> &A, int N);
+
+}  // end of namespace cpu
+#endif  // end of CPU_ADAPTER_PLASMA_FP64_H
diff --git a/reference/core/include/cholesky_factor.hpp b/reference/core/include/cholesky_factor.hpp
index 5828475..e1e48e2 100644
--- a/reference/core/include/cholesky_factor.hpp
+++ b/reference/core/include/cholesky_factor.hpp
@@ -13,9 +13,8 @@ namespace cpu
 /**
  * @brief Reference Cholesky variants.
  *
- *   - reference : single threaded LAPACKE_dpotrf2 call (no tiling;
- *                 parallelism lives entirely inside the threaded BLAS).
- *   - plasma    : single plasma_dpotrf call (PLASMA's high-level
+ *   - reference : threaded LAPACKE_dpotrf2 call
+ *   - plasma    : plasma_dpotrf call (PLASMA's high-level
  *                 synchronous Cholesky over the OpenMP runtime).
  */
 enum class Variant { reference, plasma };
@@ -35,10 +34,10 @@ inline Variant to_variant(const std::string &s)
 
 /**
  * @brief Run the requested reference variant on the full row-major N x N
- *        matrix @p A. Factorisation is in place; @p A holds the lower
+ *        matrix. Factorization is in place; @p matrix holds the lower
  *        triangular factor L on return.
  */
-void parallel_blas_cholesky(Variant variant, std::vector<double> &A, int N);
+void parallel_cholesky(Variant variant, std::vector<double> &matrix, int N);
 
 }  // end of namespace cpu
 #endif  // end of CPU_CHOLESKY_FACTOR_H
diff --git a/reference/core/include/functions.hpp b/reference/core/include/functions.hpp
index 0740b4d..f7e74ba 100644
--- a/reference/core/include/functions.hpp
+++ b/reference/core/include/functions.hpp
@@ -12,15 +12,15 @@ namespace cpu
 
 /**
  * @brief Time a single call to the requested reference variant
- *        ('reference' or 'plasma') on the @p A buffer (row-major, N x N).
- *        The buffer is factorised in place.
+ *        ('reference' or 'plasma') on the @p matrix buffer (row-major, N x N).
+ *        The buffer is factorized in place.
  *
- * @param A       row-major matrix; on return contains the lower-triangular factor L
+ * @param matrix  row-major matrix; on return contains the lower-triangular factor L
  * @param N       matrix dimension
  * @param variant which reference path to time
  * @return wall-clock elapsed time in seconds
  */
-double cholesky(std::vector<double> &A, std::size_t N, const std::string &variant);
+double cholesky(std::vector<double> &matrix, std::size_t N, const std::string &variant);
 
 }  // namespace cpu
 #endif  // end of CPU_FUNCTIONS_H
diff --git a/reference/core/include/matrix_generation.hpp b/reference/core/include/matrix_generation.hpp
index 22a3206..967398b 100644
--- a/reference/core/include/matrix_generation.hpp
+++ b/reference/core/include/matrix_generation.hpp
@@ -12,11 +12,7 @@
  * Entries are uniform on [0, 1) using a per-row seed; the diagonal is shifted
  * by +N to guarantee strict diagonal dominance and therefore symmetric
  * positive definiteness. The result is stored as a single contiguous
- * std::vector<double> of length N*N in row-major order, ready to be passed to
- * LAPACKE_dpotrf.
- *
- * Generation is parallelised with OpenMP across rows so it does not dominate
- * the timed factorisation phase.
+ * std::vector<double> of length N*N in row-major order.
  *
  * @param N matrix dimension
  * @return owning row-major buffer of length N*N
diff --git a/reference/core/include/plasma_factor.hpp b/reference/core/include/plasma_factor.hpp
deleted file mode 100644
index d15868e..0000000
--- a/reference/core/include/plasma_factor.hpp
+++ /dev/null
@@ -1,34 +0,0 @@
-#ifndef CPU_PLASMA_FACTOR_H
-#define CPU_PLASMA_FACTOR_H
-
-#pragma once
-
-#include <vector>
-
-namespace cpu
-{
-
-/**
- * @brief PLASMA tiled Cholesky on a row-major N x N buffer using the
- *        high-level synchronous API (plasma_dpotrf).
- *
- * PLASMA's high-level API is column-major, so we ask for @c PlasmaUpper:
- * the upper triangle in PLASMA's column-major view aliases the lower
- * triangle in our row-major view, which is the layout the validation
- * routine expects (and which matches the LAPACKE_dpotrf2 reference).
- *
- * Caller is responsible for having invoked plasma_init() at startup; that
- * cost is intentionally amortised over all timed calls and stays out of the
- * timed region.
- *
- * Throws @c std::runtime_error before calling PLASMA when the descriptor
- * size computation inside plasma_desc_*_create() would overflow int32
- * (PLASMA 24.8.7 still does this multiplication in @c int). With the
- * default @c nb=256 the boundary is at @c N=65280; main.cpp transparently
- * clamps any iteration size in @c (65280, 65536] down to 65280, so this
- * guard fires only for @c N>65536 (which then becomes a @c nan cell).
- */
-void plasma_cholesky(std::vector<double> &A, int N);
-
-}  // end of namespace cpu
-#endif  // end of CPU_PLASMA_FACTOR_H
diff --git a/reference/core/include/validate.hpp b/reference/core/include/validate.hpp
index 6cf829c..4c666d0 100644
--- a/reference/core/include/validate.hpp
+++ b/reference/core/include/validate.hpp
@@ -11,12 +11,12 @@ namespace cpu
 
 /**
  * @brief Compute the relative Cholesky residual ||A - L * L^T||_F / ||A||_F
- *        for the dense, row-major reference factorisation.
+ *        for the dense, row-major reference factorization.
  *
  * The original A is regenerated on the fly with the same deterministic seed
  * used by gen_matrix, so no extra storage is needed.
  *
- * @param N matrix dimension (must match the factorisation)
+ * @param N matrix dimension (must match the factorization)
  * @param L row-major buffer of length N*N holding the factor returned by
  *          LAPACKE_dpotrf with uplo='L' (only the lower triangle is read)
  * @return relative Frobenius residual
diff --git a/reference/core/src/adapter_cblas_fp64.cpp b/reference/core/src/adapter_cblas_fp64.cpp
index 566290f..264d442 100644
--- a/reference/core/src/adapter_cblas_fp64.cpp
+++ b/reference/core/src/adapter_cblas_fp64.cpp
@@ -9,11 +9,4 @@
 #include "lapacke.h"
 #endif
 
-void potrf(vector &A, const int N)
-{
-    // Single threaded LAPACKE call on the full matrix. dpotrf2 is the
-    // recursive variant, which is what the OpenMP / HPX variants use on
-    // their diagonal tiles, so picking it here keeps the underlying kernel
-    // identical and isolates the parallelism source as the only difference.
-    LAPACKE_dpotrf2(LAPACK_ROW_MAJOR, 'L', N, A.data(), N);
-}
+void lapacke_potrf(vector &A, const int N) { LAPACKE_dpotrf2(LAPACK_ROW_MAJOR, 'L', N, A.data(), N); }
diff --git a/reference/core/src/adapter_plasma_fp64.cpp b/reference/core/src/adapter_plasma_fp64.cpp
new file mode 100644
index 0000000..06147ad
--- /dev/null
+++ b/reference/core/src/adapter_plasma_fp64.cpp
@@ -0,0 +1,30 @@
+#include "adapter_plasma_fp64.hpp"
+
+#include <plasma.h>
+#include <stdexcept>
+#include <string>
+
+namespace cpu
+{
+
+void plasma_potrf(std::vector<double> &A, int N)
+{
+    constexpr int k_plasma_max_n = 65'280;
+    if (N > k_plasma_max_n)  // larger N would overflow the int32 descriptor size inside PLASMA
+    {
+        throw std::runtime_error(
+            "plasma_dpotrf: skipped to avoid PLASMA descriptor int32 overflow at N=" + std::to_string(N)
+            + " (max supported with default nb=256: " + std::to_string(k_plasma_max_n) + ")");
+    }
+
+    // PLASMA is column-major. Our buffer is row-major and the matrix is
+    // symmetric, so we pass it through unchanged and request PlasmaUpper: the
+    // column-major upper triangle aliases the row-major lower triangle.
+    const int info = plasma_dpotrf(PlasmaUpper, N, A.data(), N);
+    if (info != 0)
+    {
+        throw std::runtime_error("plasma_dpotrf failed with info=" + std::to_string(info));
+    }
+}
+
+}  // end of namespace cpu
diff --git a/reference/core/src/cholesky_factor.cpp b/reference/core/src/cholesky_factor.cpp
index 677feba..25bbe20 100644
--- a/reference/core/src/cholesky_factor.cpp
+++ b/reference/core/src/cholesky_factor.cpp
@@ -2,7 +2,7 @@
 
 #include "adapter_cblas_fp64.hpp"
 #ifdef ENABLE_PLASMA
-#include "plasma_factor.hpp"
+#include "adapter_plasma_fp64.hpp"
 #endif
 
 #include <stdexcept>
@@ -10,23 +10,18 @@
 namespace cpu
 {
 
-void parallel_blas_cholesky(Variant variant, std::vector<double> &A, int N)
+void parallel_cholesky(Variant variant, std::vector<double> &matrix, int N)
 {
     switch (variant)
     {
-        case Variant::reference:
-            // Single threaded LAPACKE call on the full matrix; the BLAS
-            // library dispatches work across the available threads.
-            potrf(A, N);
-            return;
+        case Variant::reference: lapacke_potrf(matrix, N); return;
 
         case Variant::plasma:
 #ifdef ENABLE_PLASMA
-            plasma_cholesky(A, N);
+            plasma_potrf(matrix, N);
             return;
 #else
-            throw std::invalid_argument(
-                "Variant 'plasma' requested but the binary was built without ENABLE_PLASMA=ON");
+            throw std::invalid_argument("Variant 'plasma' requested but the binary was built without ENABLE_PLASMA=ON");
 #endif
     }
 }
diff --git a/reference/core/src/functions.cpp b/reference/core/src/functions.cpp
index e2986ea..1f15f26 100644
--- a/reference/core/src/functions.cpp
+++ b/reference/core/src/functions.cpp
@@ -6,13 +6,13 @@
 namespace cpu
 {
 
-double cholesky(std::vector<double> &A, std::size_t N, const std::string &variant)
+double cholesky(std::vector<double> &matrix, std::size_t N, const std::string &variant)
 {
     const Variant v = to_variant(variant);
     auto start = std::chrono::high_resolution_clock::now();
     ///////////////////////////////////////////////////////////////////////////
-    // Launch Cholesky decomposition: A = L * L^T (single dispatched call)
-    parallel_blas_cholesky(v, A, static_cast<int>(N));
+    // Launch Cholesky decomposition: A = L * L^T
+    parallel_cholesky(v, matrix, static_cast<int>(N));
     ///////////////////////////////////////////////////////////////////////////
     auto stop = std::chrono::high_resolution_clock::now();
     return (stop - start).count() / 1e9;
diff --git a/reference/core/src/matrix_generation.cpp b/reference/core/src/matrix_generation.cpp
index b0db740..a67ff5a 100644
--- a/reference/core/src/matrix_generation.cpp
+++ b/reference/core/src/matrix_generation.cpp
@@ -5,15 +5,11 @@
 
 std::vector<double> gen_matrix(std::size_t N)
 {
-    // Row-major dense buffer
     std::vector<double> A(N * N);
 
     // The matrix is built row by row in parallel. Each row uses its own RNG
     // seeded by the row index, so the matrix is deterministic and
-    // reproducible regardless of the number of threads. Off-diagonal entries
-    // are mirrored to keep A symmetric; the diagonal is shifted by +N to
-    // guarantee strict diagonal dominance (and therefore SPD), mirroring the
-    // +N*n_tiles shift used by the tiled variants when n_tiles == 1.
+    // reproducible regardless of the number of threads.
 #pragma omp parallel for schedule(static)
     for (std::size_t i = 0; i < N; ++i)
     {
diff --git a/reference/core/src/plasma_factor.cpp b/reference/core/src/plasma_factor.cpp
deleted file mode 100644
index 323e5ed..0000000
--- a/reference/core/src/plasma_factor.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-#include "plasma_factor.hpp"
-
-#include <plasma.h>
-
-#include <stdexcept>
-#include <string>
-
-namespace cpu
-{
-
-void plasma_cholesky(std::vector<double> &A, int N)
-{
-    // PLASMA 24.8.7's plasma_desc_*_create routines compute their tile-storage
-    // size as int*int and then cast to size_t, so the malloc gets a
-    // sign-extended-negative argument and fails for any padded total
-    // >= INT32_MAX. With the default nb=256 the triangular padded element
-    // count first crosses INT32_MAX at N=65281 (mt=256), so any N>65280 hits
-    // the bug. Guard before invoking PLASMA so the multi-line PLASMA ERROR
-    // diagnostic does not reach stderr.
-    //
-    // main.cpp transparently clamps iteration sizes in (65280, 65536] down to
-    // 65280, so in practice this guard only fires for N>65536 -- which then
-    // becomes a nan cell via main.cpp's per-mode catch handler.
-    constexpr int kPlasmaMaxN = 65280;
-    if (N > kPlasmaMaxN)
-    {
-        throw std::runtime_error(
-            "plasma_dpotrf: skipped to avoid PLASMA descriptor int32 overflow at N=" + std::to_string(N)
-            + " (max supported with default nb=256: " + std::to_string(kPlasmaMaxN) + ")");
-    }
-
-    // PLASMA is column-major. Our buffer is row-major and the matrix is
-    // symmetric, so we can pass it through unchanged and ask PLASMA to write
-    // its result into the upper triangle of its column-major view -- that
-    // upper triangle aliases the lower triangle of our row-major view, which
-    // is the layout the validator (and the LAPACKE reference path) expects.
-    const int info = plasma_dpotrf(PlasmaUpper, N, A.data(), N);
-    if (info != 0)
-    {
-        throw std::runtime_error("plasma_dpotrf failed with info=" + std::to_string(info));
-    }
-}
-
-}  // end of namespace cpu
diff --git a/reference/core/src/validate.cpp b/reference/core/src/validate.cpp
index 5a43cc8..8b1f647 100644
--- a/reference/core/src/validate.cpp
+++ b/reference/core/src/validate.cpp
@@ -8,7 +8,6 @@
 #include "cblas.h"
 #endif
 
-#include <algorithm>
 #include <cmath>
 #include <cstddef>
 #include <vector>
@@ -19,9 +18,6 @@ namespace cpu
 double cholesky_residual(std::size_t N, const std::vector<double> &L)
 {
     // Build a working copy of L with its strictly upper triangle zeroed out.
-    // dpotrf with uplo='L' leaves the upper triangle untouched (it still
-    // contains the original A values), so we must mask it before forming
-    // L * L^T with a plain dgemm.
     std::vector<double> Lwork(L);
     for (std::size_t i = 0; i < N; ++i)
     {
@@ -49,7 +45,7 @@ double cholesky_residual(std::size_t N, const std::vector<double> &L)
         LLt.data(),
         static_cast<int>(N));
 
-    // Regenerate the original A deterministically and accumulate Frobenius
+    // Regenerate the original matrix A deterministically and accumulate Frobenius
     // norms of (A - LLt) and A.
     const std::vector<double> A = gen_matrix(N);
 
diff --git a/reference/main.cpp b/reference/main.cpp
index ef17ccc..b4c8585 100644
--- a/reference/main.cpp
+++ b/reference/main.cpp
@@ -6,7 +6,6 @@
 #ifdef ENABLE_PLASMA
 #include <plasma.h>
 #endif
-#include <cmath>
 #include <cstddef>
 #include <exception>
 #include <fstream>
@@ -21,12 +20,6 @@ int main(int argc, char *argv[])
 {
     ///////////////////////////////////////////////////////////////////////////
     // cmdline arguments
-    //
-    // The reference benchmark calls a single threaded LAPACKE_dpotrf on the
-    // full matrix, so there is no tiling axis. We still accept --tiles_start
-    // / --tiles_stop for CLI compatibility with the openmp/ and hpx/ binaries
-    // (they are silently ignored), which keeps any shared driver script
-    // unchanged.
     std::size_t loop = 1;
     std::size_t size_start = 32, size_stop = 128;
 
@@ -71,8 +64,6 @@ int main(int argc, char *argv[])
     runtime_file.open(runtime_file_path, std::ios_base::app);
 
 #ifdef ENABLE_PLASMA
-    // PLASMA spins up its own context and worker pool; do this once so the
-    // cost is not folded into any timed factorisation.
     if (plasma_init() != 0)
     {
         throw std::runtime_error("plasma_init() failed");
@@ -81,23 +72,18 @@ int main(int argc, char *argv[])
 
     for (std::size_t input_size = START_SIZE; input_size <= STOP_SIZE; input_size = input_size * STEP_SIZE)
     {
-        // PLASMA 24.8.7's triangular descriptor allocation overflows int32 for
+        // PLASMA's triangular descriptor allocation overflows int32 for
         // N>65280 with the default nb=256. For sweep sizes in (65280, 65536]
-        // we transparently clamp the working size down to 65280 so the row
-        // still produces a real plasma timing instead of a nan. Sizes beyond
+        // we transparently clamp the working size down to 65280. Sizes beyond
         // 65536 fall through and the per-mode catch handler records nan.
         std::size_t size = input_size;
-        if (size > 65280 && size <= 65536)
+        if (size > 65'280 && size <= 65'536)
         {
-            size = 65280;
+            size = 65'280;
         }
 
         for (std::size_t l = 0; l < LOOP; l++)
         {
-            // header for output file -- columns mirror the openmp/hpx output so
-            // results from all three benchmarks can be merged on (problem_size).
-            // The reference has no tiling, so tile_size == problem_size and
-            // n_tiles == 1.
             std::string header = "threads;problem_size;tile_size;n_tiles";
             std::string values = std::to_string(omp_get_max_threads());
             values += std::string(";") + std::to_string(size);
@@ -105,11 +91,6 @@ int main(int argc, char *argv[])
             values += std::string(";") + std::to_string(1);
             ///////////////////////////////////////////////////////////////////
             // Reference modes:
-            //   reference -> single threaded LAPACKE_dpotrf2 on the full
-            //                matrix. Enabled by default; disable at build
-            //                time with DISABLE_BLAS_REFERENCE=ON.
-            //   plasma    -> single plasma_dpotrf (high-level synchronous
-            //                PLASMA API). Built only when ENABLE_PLASMA=ON.
             std::vector<std::string> modes = {};
 #ifndef DISABLE_BLAS_REFERENCE
             modes.push_back("reference");
@@ -122,28 +103,12 @@ int main(int argc, char *argv[])
             {
                 header += ";" + mode;
 
-                // We let one mode fail (e.g. PLASMA running out of memory at
-                // very large N -- its high-level wrapper allocates an extra
-                // tiled triangular copy on top of the input buffer) without
-                // killing the whole sweep. The failed cell is recorded as NaN
-                // and we continue with the next mode and size.
-                std::vector<double> A;
-                try
-                {
-                    A = gen_matrix(size);
-                }
-                catch (const std::exception &e)
-                {
-                    std::cerr << "Error: gen_matrix(size=" << size << ") threw '" << e.what()
-                              << "'. Recording NaN for variant '" << mode << "'." << std::endl;
-                    values += ";nan";
-                    continue;
-                }
-
+                std::vector<double> matrix = gen_matrix(size);
+                // Start from NaN so a variant that throws is still reported (as nan)
                 double cholesky_cpu = std::numeric_limits<double>::quiet_NaN();
                 try
                 {
-                    cholesky_cpu = cpu::cholesky(A, size, mode);
+                    cholesky_cpu = cpu::cholesky(matrix, size, mode);
                 }
                 catch (const std::exception &e)
                 {
@@ -158,7 +123,7 @@ int main(int argc, char *argv[])
 #ifdef ENABLE_VALIDATION
                 // Validate by computing relative residual ||A - L L^T||_F / ||A||_F
                 constexpr double residual_tol = 1e-10;
-                const double residual = cpu::cholesky_residual(size, A);
+                const double residual = cpu::cholesky_residual(size, matrix);
                 std::cout << "[validate] mode=" << mode << " size=" << size << " residual=" << residual << std::endl;
                 if (!(residual <= residual_tol))  // catches NaN too
                 {
diff --git a/reference/run.sh b/reference/run.sh
index 0600513..0b5c772 100755
--- a/reference/run.sh
+++ b/reference/run.sh
@@ -12,30 +12,24 @@
 #
 # Submit example:
 #   sbatch run.sh
-#
-# Runs the parallel-BLAS reference benchmark — a single threaded
-# LAPACKE_dpotrf call on the full matrix — as a baseline for the OpenMP and
-# HPX tiled implementations. GCC only.
 
 set -e # Exit immediately if a command exits with a non-zero status.
 
 ################################################################################
-# Toolchain runtime selection (gcc only)
+# Toolchain runtime selection
 ################################################################################
 module load gcc/14.2.0
 
 # Resolve directory where the script is located
 SCRIPT_DIR="$(pwd)"
 
-# OpenMP settings — the threaded BLAS picks these up to spread dpotrf across
-# all the cores. Both OpenBLAS (threads=openmp) and threaded MKL respect the
-# standard OMP_* environment.
+# OpenMP settings
 export OMP_NUM_THREADS=128
 export OMP_PROC_BIND=close
 export OMP_PLACES=cores
 
 # Make sure threaded MKL uses the OpenMP runtime if ENABLE_MKL=ON was used at
-# build time. Harmless when linking OpenBLAS.
+# build time.
 export MKL_NUM_THREADS=${MKL_NUM_THREADS:-$OMP_NUM_THREADS}
 
 echo "Running with gcc runtime"
@@ -43,5 +37,5 @@ echo "Running with gcc runtime"
 # Run executable
 srun --cpu-bind=cores "$SCRIPT_DIR/build/cholesky_reference" \
   --loop 20 \
-  --size_start 65536 \
+  --size_start 1024 \
   --size_stop 65536

From f2742da81087189a47ae31865ac1f6c19fcd4b8d Mon Sep 17 00:00:00 2001
From: constracktor <74077030+constracktor@users.noreply.github.com>
Date: Fri, 1 May 2026 00:01:24 +0200
Subject: [PATCH 09/13] Rename to lapacke

---
 README.md                                  | 18 +++++++++---------
 reference/core/include/cholesky_factor.hpp | 12 ++++++------
 reference/core/src/cholesky_factor.cpp     |  2 +-
 reference/main.cpp                         |  2 +-
 4 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/README.md b/README.md
index 28fdb26..d2f9494 100644
--- a/README.md
+++ b/README.md
@@ -28,10 +28,10 @@ Cholesky-Bench benchmarks right-looking tiled Cholesky factorization from fork-j
 
 | Mode | Description |
 |------|-------------|
-| `reference` | Single threaded `LAPACKE_dpotrf` call on the full matrix; no tiling. Parallelism is delegated entirely to a threaded BLAS (OpenBLAS built with `threads=openmp`, or threaded Intel oneMKL via `ENABLE_MKL=ON`). Enabled by default; disable with `DISABLE_BLAS_REFERENCE=ON`. |
+| `lapacke` | A single, threaded `LAPACKE_dpotrf2` call on the full matrix; no tiling. Parallelism is delegated entirely to a threaded BLAS (OpenBLAS built with `threads=openmp`, or threaded Intel oneMKL via `ENABLE_MKL=ON`). Enabled by default; disable with `DISABLE_BLAS_REFERENCE=ON`. |
 | `plasma` | Single `plasma_dpotrf` call on the full matrix (PLASMA's high-level synchronous API). PLASMA does its own tiled, OpenMP-task-based parallel Cholesky internally; tile size is left at PLASMA's built-in default. Built only when `ENABLE_PLASMA=ON`. |
 
-This directory is the natural baseline for the OpenMP and HPX tiled implementations: the `reference` mode isolates the contribution of vendor-provided dense-LA parallelism, and the `plasma` mode adds a tiled-parallel competitor that uses the same OpenMP runtime as the in-house variants.
+This directory is the natural baseline for the OpenMP and HPX tiled implementations: the `lapacke` mode isolates the contribution of vendor-provided dense-LA parallelism, and the `plasma` mode adds a tiled-parallel competitor that uses the same OpenMP runtime as the in-house variants.
 
 #### PLASMA descriptor int32 overflow
 
@@ -39,8 +39,8 @@ PLASMA 24.8.7's `plasma_desc_*_create()` routines compute their tile-storage siz
 
 The benchmark handles this transparently:
 
-- For sweep sizes `N` in `(65280, 65536]` the working size is **clamped to 65280** for the whole row (both `reference` and `plasma` run at 65280, and the `problem_size` column reports 65280). This keeps the largest practical PLASMA point on the curve without touching the underlying PLASMA build.
-- For `N > 65536` `plasma` records `nan`. `reference` (LAPACKE) is unaffected by the int32 ceiling and continues normally.
+- For sweep sizes `N` in `(65280, 65536]` the working size is **clamped to 65280** for the whole row (both `lapacke` and `plasma` run at 65280, and the `problem_size` column reports 65280). This keeps the largest practical PLASMA point on the curve without touching the underlying PLASMA build.
+- For `N > 65536` `plasma` records `nan`. `lapacke` is unaffected by the int32 ceiling and continues normally.
 
 Patching `(size_t)` casts into `control/descriptor.c` in the spack PLASMA package removes the ceiling and the clamp + guard become no-ops.
 
@@ -83,8 +83,8 @@ These can be set as environment variables before calling `compile.sh`:
 | `DISABLE_COMPUTATION` | `OFF` | *(`openmp/` and `hpx/` only)* Replace all BLAS/tile-generation calls with no-ops. The task graph and loops remain intact, so scheduling overhead can be measured in isolation. |
 | `ENABLE_DYNAMIC_SCHEDULE` | `OFF` | *(`openmp/` only)* Use `schedule(dynamic,1)` on the trailing-update worksharing loops in `for_collapse`. Requires the LLVM toolchain; rejected at compile time with GCC. |
 | `ENABLE_MKL` | `OFF` | Link against Intel oneMKL instead of OpenBLAS. In `openmp/` and `hpx/` this is the *sequential* MKL; in `reference/` it is the *threaded* MKL. |
-| `ENABLE_PLASMA` | `OFF` | *(`reference/` only)* Also build the PLASMA `plasma_dpotrf` variant. Adds a `plasma` column alongside `reference` in the runtime output. |
-| `DISABLE_BLAS_REFERENCE` | `OFF` | *(`reference/` only)* Skip the LAPACKE_dpotrf reference mode at runtime, so only `plasma` runs (when `ENABLE_PLASMA=ON`). Linking is unchanged — PLASMA and validation still need cblas/lapacke symbols. |
+| `ENABLE_PLASMA` | `OFF` | *(`reference/` only)* Also build the PLASMA `plasma_dpotrf` variant. Adds a `plasma` column alongside `lapacke` in the runtime output. |
+| `DISABLE_BLAS_REFERENCE` | `OFF` | *(`reference/` only)* Skip the `lapacke` mode at runtime, so only `plasma` runs (when `ENABLE_PLASMA=ON`). Linking is unchanged — PLASMA and validation still need cblas/lapacke symbols. |
 
 **Examples:**
 
@@ -167,10 +167,10 @@ threads;problem_size;tile_size;n_tiles;for_collapse;for_naive;task_naive;task_de
 128;65536;1024;64;3.14;3.21;2.98;2.87
 ```
 
-The `reference/` binary reports a `reference` column (suppressed by `DISABLE_BLAS_REFERENCE=ON`) plus a `plasma` column when built with `ENABLE_PLASMA=ON`, with `tile_size = problem_size` and `n_tiles = 1`, so its runtime files merge cleanly with the tiled benchmarks on the `problem_size` key:
+The `reference/` binary reports a `lapacke` column (suppressed by `DISABLE_BLAS_REFERENCE=ON`) plus a `plasma` column when built with `ENABLE_PLASMA=ON`, with `tile_size = problem_size` and `n_tiles = 1`, so its runtime files merge cleanly with the tiled benchmarks on the `problem_size` key:
 
 ```
-threads;problem_size;tile_size;n_tiles;reference;plasma
+threads;problem_size;tile_size;n_tiles;lapacke;plasma
 128;65280;65280;1;5.21;68.12
 ```
 
@@ -241,7 +241,7 @@ The same lines are also printed to stdout.
             └── adapter_cblas_fp64.cpp
 ```
 
-When `DISABLE_BLAS_REFERENCE=ON`, `adapter_cblas_fp64.cpp` and `validate.cpp` are still compiled and linked (they share cblas/lapacke symbols with PLASMA's BLAS dependency); only the runtime dispatch of the `reference` mode is skipped.
+When `DISABLE_BLAS_REFERENCE=ON`, `adapter_cblas_fp64.cpp` and `validate.cpp` are still compiled and linked (they share cblas/lapacke symbols with PLASMA's BLAS dependency); only the runtime dispatch of the `lapacke` mode is skipped.
 
 ## Contributing
 
diff --git a/reference/core/include/cholesky_factor.hpp b/reference/core/include/cholesky_factor.hpp
index e1e48e2..9bcf784 100644
--- a/reference/core/include/cholesky_factor.hpp
+++ b/reference/core/include/cholesky_factor.hpp
@@ -13,17 +13,17 @@ namespace cpu
 /**
  * @brief Reference Cholesky variants.
  *
- *   - reference : threaded LAPACKE_dpotrf2 call
- *   - plasma    : plasma_dpotrf call (PLASMA's high-level
- *                 synchronous Cholesky over the OpenMP runtime).
+ *   - lapacke : threaded LAPACKE_dpotrf2 call
+ *   - plasma  : plasma_dpotrf call (PLASMA's high-level
+ *               synchronous Cholesky over the OpenMP runtime).
  */
-enum class Variant { reference, plasma };
+enum class Variant { lapacke, plasma };
 
 inline Variant to_variant(const std::string &s)
 {
-    if (s == "reference")
+    if (s == "lapacke")
     {
-        return Variant::reference;
+        return Variant::lapacke;
     }
     if (s == "plasma")
     {
diff --git a/reference/core/src/cholesky_factor.cpp b/reference/core/src/cholesky_factor.cpp
index 25bbe20..3a20132 100644
--- a/reference/core/src/cholesky_factor.cpp
+++ b/reference/core/src/cholesky_factor.cpp
@@ -14,7 +14,7 @@ void parallel_cholesky(Variant variant, std::vector<double> &matrix, int N)
 {
     switch (variant)
     {
-        case Variant::reference: lapacke_potrf(matrix, N); return;
+        case Variant::lapacke: lapacke_potrf(matrix, N); return;
 
         case Variant::plasma:
 #ifdef ENABLE_PLASMA
diff --git a/reference/main.cpp b/reference/main.cpp
index b4c8585..2aceb09 100644
--- a/reference/main.cpp
+++ b/reference/main.cpp
@@ -93,7 +93,7 @@ int main(int argc, char *argv[])
             // Reference modes:
             std::vector<std::string> modes = {};
 #ifndef DISABLE_BLAS_REFERENCE
-            modes.push_back("reference");
+            modes.push_back("lapacke");
 #endif
 #ifdef ENABLE_PLASMA
             modes.push_back("plasma");

From 493d11e6c4bba03a2940438457789a068a0c435b Mon Sep 17 00:00:00 2001
From: constracktor <74077030+constracktor@users.noreply.github.com>
Date: Fri, 1 May 2026 10:35:31 +0200
Subject: [PATCH 10/13] Cropped mode for Plasma only

---
 README.md          |  8 +++-----
 reference/main.cpp | 35 ++++++++++++++++++-----------------
 2 files changed, 21 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index d2f9494..1e21cfa 100644
--- a/README.md
+++ b/README.md
@@ -39,11 +39,9 @@ PLASMA 24.8.7's `plasma_desc_*_create()` routines compute their tile-storage siz
 
 The benchmark handles this transparently:
 
-- For sweep sizes `N` in `(65280, 65536]` the working size is **clamped to 65280** for the whole row (both `lapacke` and `plasma` run at 65280, and the `problem_size` column reports 65280). This keeps the largest practical PLASMA point on the curve without touching the underlying PLASMA build.
+- For sweep sizes `N` in `(65280, 65536]`, **only `plasma` is silently clamped down to 65280** for that iteration; `lapacke` runs at the full `N`. The `problem_size` column reports the original `N`, so `plasma`'s timing in this range corresponds to the 65280 compute even though the row is labelled with the input size.
 - For `N > 65536` `plasma` records `nan`. `lapacke` is unaffected by the int32 ceiling and continues normally.
 
-Patching `(size_t)` casts into `control/descriptor.c` in the spack PLASMA package removes the ceiling and the clamp + guard become no-ops.
-
 ## Dependencies
 
 All three implementations are built with CMake (≥ 3.23) and C++20. The OpenMP and HPX directories link against a *sequential* BLAS (parallelism is at the tile level); the `reference/` directory links against a *threaded* BLAS instead.
@@ -54,12 +52,12 @@ All three implementations are built with CMake (≥ 3.23) and C++20. The OpenMP
 | OpenBLAS 0.3.28 (`threads=openmp`) | — | — | ✓ (default) |
 | Intel oneMKL (sequential) | optional (`ENABLE_MKL=ON`) | optional (`ENABLE_MKL=ON`) | — |
 | Intel oneMKL (`intel_thread`) | — | — | optional (`ENABLE_MKL=ON`) |
-| PLASMA | — | — | optional (`ENABLE_PLASMA=ON`) |
+| PLASMA 24.8.7 | — | — | optional (`ENABLE_PLASMA=ON`) |
 | HPX 1.11.0 + jemalloc | — | ✓ | — |
 | GCC 14.2.0 | ✓ | ✓ | ✓ |
 | LLVM/Clang 22.1.2 | optional | — | — |
 
-Dependencies are managed via [Spack](https://spack.io/). The compile scripts auto-detect the host system and load the correct Spack environment.
+Dependencies are managed via [Spack](https://spack.io/).
 
 ## Build
 
diff --git a/reference/main.cpp b/reference/main.cpp
index 2aceb09..38b768a 100644
--- a/reference/main.cpp
+++ b/reference/main.cpp
@@ -70,18 +70,8 @@ int main(int argc, char *argv[])
     }
 #endif
 
-    for (std::size_t input_size = START_SIZE; input_size <= STOP_SIZE; input_size = input_size * STEP_SIZE)
+    for (std::size_t size = START_SIZE; size <= STOP_SIZE; size = size * STEP_SIZE)
     {
-        // PLASMA's triangular descriptor allocation overflows int32 for
-        // N>65280 with the default nb=256. For sweep sizes in (65280, 65536]
-        // we transparently clamp the working size down to 65280. Sizes beyond
-        // 65536 fall through and the per-mode catch handler records nan.
-        std::size_t size = input_size;
-        if (size > 65'280 && size <= 65'536)
-        {
-            size = 65'280;
-        }
-
         for (std::size_t l = 0; l < LOOP; l++)
         {
             std::string header = "threads;problem_size;tile_size;n_tiles";
@@ -102,17 +92,27 @@ int main(int argc, char *argv[])
             for (const auto &mode : modes)
             {
                 header += ";" + mode;
+                std::size_t mode_size = size;
+
+                // PLASMA's triangular descriptor allocation
+                // overflows int32 for N>65280 with the default nb=256. For
+                // input sizes in (65280, 65536] we silently clamp PLASMA's
+                // working size down to 65280; all other modes keep the full size.
+                if (mode == "plasma" && mode_size > 65'280 && mode_size <= 65'536)
+                {
+                    mode_size = 65'280;
+                }
 
-                std::vector<double> matrix = gen_matrix(size);
+                std::vector<double> matrix = gen_matrix(mode_size);
                 // NaN guard
                 double cholesky_cpu = std::numeric_limits<double>::quiet_NaN();
                 try
                 {
-                    cholesky_cpu = cpu::cholesky(matrix, size, mode);
+                    cholesky_cpu = cpu::cholesky(matrix, mode_size, mode);
                 }
                 catch (const std::exception &e)
                 {
-                    std::cerr << "Error: variant '" << mode << "' failed at size=" << size << ": " << e.what()
+                    std::cerr << "Error: variant '" << mode << "' failed at size=" << mode_size << ": " << e.what()
                               << ". Recording NaN and continuing." << std::endl;
                     values += ";nan";
                     continue;
@@ -123,12 +123,13 @@ int main(int argc, char *argv[])
 #ifdef ENABLE_VALIDATION
                 // Validate by computing relative residual ||A - L L^T||_F / ||A||_F
                 constexpr double residual_tol = 1e-10;
-                const double residual = cpu::cholesky_residual(size, matrix);
-                std::cout << "[validate] mode=" << mode << " size=" << size << " residual=" << residual << std::endl;
+                const double residual = cpu::cholesky_residual(mode_size, matrix);
+                std::cout << "[validate] mode=" << mode << " size=" << mode_size << " residual=" << residual
+                          << std::endl;
                 if (!(residual <= residual_tol))  // catches NaN too
                 {
                     std::cerr << "Validation warning: variant '" << mode << "' residual " << residual
-                              << " exceeds tolerance " << residual_tol << " (size=" << size << ")" << std::endl;
+                              << " exceeds tolerance " << residual_tol << " (size=" << mode_size << ")" << std::endl;
                 }
 #endif
             }

From 879774dcc367b304f5504d6dfe16031e99f38920 Mon Sep 17 00:00:00 2001
From: constracktor <74077030+constracktor@users.noreply.github.com>
Date: Fri, 1 May 2026 10:43:09 +0200
Subject: [PATCH 11/13] Add enable lapacke

---
 README.md                | 10 +++++-----
 reference/CMakeLists.txt | 10 +++++-----
 reference/compile.sh     | 20 ++++++++++----------
 reference/main.cpp       |  2 +-
 4 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index 1e21cfa..c3eecfc 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@ Cholesky-Bench benchmarks right-looking tiled Cholesky factorization from fork-j
 
 | Mode | Description |
 |------|-------------|
-| `lapacke` | Single threaded `LAPACKE_dpotrf` call on the full matrix; no tiling. Parallelism is delegated entirely to a threaded BLAS (OpenBLAS built with `threads=openmp`, or threaded Intel oneMKL via `ENABLE_MKL=ON`). Enabled by default; disable with `DISABLE_BLAS_REFERENCE=ON`. |
+| `lapacke` | Single `LAPACKE_dpotrf` call on the full matrix; no tiling. Parallelism is delegated entirely to a threaded BLAS (OpenBLAS built with `threads=openmp`, or threaded Intel oneMKL via `ENABLE_MKL=ON`). Enabled by default; disable with `ENABLE_LAPACKE=OFF`. |
 | `plasma` | Single `plasma_dpotrf` call on the full matrix (PLASMA's high-level synchronous API). PLASMA does its own tiled, OpenMP-task-based parallel Cholesky internally; tile size is left at PLASMA's built-in default. Built only when `ENABLE_PLASMA=ON`. |
 
 This directory is the natural baseline for the OpenMP and HPX tiled implementations: the `lapacke` mode isolates the contribution of vendor-provided dense-LA parallelism, and the `plasma` mode adds a tiled-parallel competitor that uses the same OpenMP runtime as the in-house variants.
@@ -82,7 +82,7 @@ These can be set as environment variables before calling `compile.sh`:
 | `ENABLE_DYNAMIC_SCHEDULE` | `OFF` | *(`openmp/` only)* Use `schedule(dynamic,1)` on the trailing-update worksharing loops in `for_collapse`. Requires the LLVM toolchain; rejected at compile time with GCC. |
 | `ENABLE_MKL` | `OFF` | Link against Intel oneMKL instead of OpenBLAS. In `openmp/` and `hpx/` this is the *sequential* MKL; in `reference/` it is the *threaded* MKL. |
 | `ENABLE_PLASMA` | `OFF` | *(`reference/` only)* Also build the PLASMA `plasma_dpotrf` variant. Adds a `plasma` column alongside `lapacke` in the runtime output. |
-| `DISABLE_BLAS_REFERENCE` | `OFF` | *(`reference/` only)* Skip the `lapacke` mode at runtime, so only `plasma` runs (when `ENABLE_PLASMA=ON`). Linking is unchanged — PLASMA and validation still need cblas/lapacke symbols. |
+| `ENABLE_LAPACKE` | `ON` | *(`reference/` only)* Run the `lapacke` mode at runtime. Set `OFF` to skip it (e.g. when only `plasma` is wanted). Linking is unchanged either way — PLASMA and validation still need cblas/lapacke symbols. |
 
 **Examples:**
 
@@ -103,7 +103,7 @@ ENABLE_MKL=ON ./compile.sh
 ENABLE_PLASMA=ON ./compile.sh
 
 # Reference: PLASMA only, skip the LAPACKE_dpotrf column at runtime
-DISABLE_BLAS_REFERENCE=ON ENABLE_PLASMA=ON ./compile.sh
+ENABLE_LAPACKE=OFF ENABLE_PLASMA=ON ./compile.sh
 ```
 
 ## Run
@@ -165,7 +165,7 @@ threads;problem_size;tile_size;n_tiles;for_collapse;for_naive;task_naive;task_de
 128;65536;1024;64;3.14;3.21;2.98;2.87
 ```
 
-The `reference/` binary reports a `lapacke` column (suppressed by `DISABLE_BLAS_REFERENCE=ON`) plus a `plasma` column when built with `ENABLE_PLASMA=ON`, with `tile_size = problem_size` and `n_tiles = 1`, so its runtime files merge cleanly with the tiled benchmarks on the `problem_size` key:
+The `reference/` binary reports a `lapacke` column (suppressed by `ENABLE_LAPACKE=OFF`) plus a `plasma` column when built with `ENABLE_PLASMA=ON`, with `tile_size = problem_size` and `n_tiles = 1`, so its runtime files merge cleanly with the tiled benchmarks on the `problem_size` key:
 
 ```
 threads;problem_size;tile_size;n_tiles;lapacke;plasma
@@ -239,7 +239,7 @@ The same lines are also printed to stdout.
             └── adapter_cblas_fp64.cpp
 ```
 
-When `DISABLE_BLAS_REFERENCE=ON`, `adapter_cblas_fp64.cpp` and `validate.cpp` are still compiled and linked (they share cblas/lapacke symbols with PLASMA's BLAS dependency); only the runtime dispatch of the `lapacke` mode is skipped.
+When `ENABLE_LAPACKE=OFF`, `adapter_cblas_fp64.cpp` and `validate.cpp` are still compiled and linked (they share cblas/lapacke symbols with PLASMA's BLAS dependency); only the runtime dispatch of the `lapacke` mode is skipped.
 
 ## Contributing
 
diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt
index a9b5b90..e8045b8 100644
--- a/reference/CMakeLists.txt
+++ b/reference/CMakeLists.txt
@@ -12,9 +12,9 @@ option(
   "Build the PLASMA tiled-Cholesky reference variant in addition to the LAPACKE_dpotrf one"
   OFF)
 option(
-  DISABLE_BLAS_REFERENCE
-  "Skip the LAPACKE_dpotrf reference mode at runtime. Linking is unchanged (PLASMA and validation still need cblas/lapacke)."
-  OFF)
+  ENABLE_LAPACKE
+  "Run the LAPACKE_dpotrf reference mode at runtime (on by default). Linking is unchanged either way (PLASMA and validation still need cblas/lapacke)."
+  ON)
 option(
   ENABLE_VALIDATION
   "Compute ||A - L*L^T||_F / ||A||_F after each factorization (off by default)"
@@ -103,7 +103,7 @@ if(BUILD_CORE)
     target_include_directories(cholesky_reference PRIVATE ${PLASMA_INCLUDE_DIR})
   endif()
 
-  if(DISABLE_BLAS_REFERENCE)
-    target_compile_definitions(cholesky_reference PRIVATE DISABLE_BLAS_REFERENCE)
+  if(ENABLE_LAPACKE)
+    target_compile_definitions(cholesky_reference PRIVATE ENABLE_LAPACKE)
   endif()
 endif()
diff --git a/reference/compile.sh b/reference/compile.sh
index 2f20af3..b5e66c0 100755
--- a/reference/compile.sh
+++ b/reference/compile.sh
@@ -12,8 +12,8 @@
 #   ENABLE_PLASMA          ON|OFF  (default OFF) - also build the PLASMA
 #                                                  plasma_dpotrf variant (extra
 #                                                  'plasma' column in the output)
-#   DISABLE_BLAS_REFERENCE ON|OFF  (default OFF) - skip the LAPACKE_dpotrf
-#                                                  reference at runtime
+#   ENABLE_LAPACKE         ON|OFF  (default ON)  - run the LAPACKE_dpotrf
+#                                                  reference mode at runtime
 #   ENABLE_VALIDATION      ON|OFF  (default OFF) - residual check after each
 #                                                  factorization
 #
@@ -21,7 +21,7 @@
 #   ./compile.sh
 #   ENABLE_MKL=ON ./compile.sh
 #   ENABLE_PLASMA=ON ./compile.sh
-#   DISABLE_BLAS_REFERENCE=ON ENABLE_PLASMA=ON ./compile.sh
+#   ENABLE_LAPACKE=OFF ENABLE_PLASMA=ON ./compile.sh
 #   ENABLE_VALIDATION=ON ./compile.sh
 ################################################################################
 set -e # Exit immediately if a command exits with a non-zero status.
@@ -31,10 +31,10 @@ set -e # Exit immediately if a command exits with a non-zero status.
 ################################################################################
 : "${ENABLE_MKL:=OFF}"
 : "${ENABLE_PLASMA:=OFF}"
-: "${DISABLE_BLAS_REFERENCE:=OFF}"
+: "${ENABLE_LAPACKE:=ON}"
 : "${ENABLE_VALIDATION:=OFF}"
 
-for var in ENABLE_MKL ENABLE_PLASMA DISABLE_BLAS_REFERENCE ENABLE_VALIDATION; do
+for var in ENABLE_MKL ENABLE_PLASMA ENABLE_LAPACKE ENABLE_VALIDATION; do
   case "${!var}" in
   ON | OFF) ;;
   *)
@@ -99,15 +99,15 @@ fi
 rm -rf build && mkdir build && cd build
 
 echo "CMake options:"
-echo "  ENABLE_MKL             = $ENABLE_MKL"
-echo "  ENABLE_PLASMA          = $ENABLE_PLASMA"
-echo "  DISABLE_BLAS_REFERENCE = $DISABLE_BLAS_REFERENCE"
-echo "  ENABLE_VALIDATION      = $ENABLE_VALIDATION"
+echo "  ENABLE_MKL        = $ENABLE_MKL"
+echo "  ENABLE_PLASMA     = $ENABLE_PLASMA"
+echo "  ENABLE_LAPACKE    = $ENABLE_LAPACKE"
+echo "  ENABLE_VALIDATION = $ENABLE_VALIDATION"
 
 cmake -DCMAKE_BUILD_TYPE=Release \
   -DENABLE_MKL="$ENABLE_MKL" \
   -DENABLE_PLASMA="$ENABLE_PLASMA" \
-  -DDISABLE_BLAS_REFERENCE="$DISABLE_BLAS_REFERENCE" \
+  -DENABLE_LAPACKE="$ENABLE_LAPACKE" \
   -DENABLE_VALIDATION="$ENABLE_VALIDATION" \
   ..
 make -j VERBOSE=1
diff --git a/reference/main.cpp b/reference/main.cpp
index 38b768a..3c824c9 100644
--- a/reference/main.cpp
+++ b/reference/main.cpp
@@ -82,7 +82,7 @@ int main(int argc, char *argv[])
             ///////////////////////////////////////////////////////////////////
             // Reference modes:
             std::vector<std::string> modes = {};
-#ifndef DISABLE_BLAS_REFERENCE
+#ifdef ENABLE_LAPACKE
             modes.push_back("lapacke");
 #endif
 #ifdef ENABLE_PLASMA

From 9e44539f7830b96820b972eead77b3c92d630a32 Mon Sep 17 00:00:00 2001
From: constracktor <74077030+constracktor@users.noreply.github.com>
Date: Fri, 1 May 2026 11:12:32 +0200
Subject: [PATCH 12/13] Unify formatting

---
 hpx/.clang-format => .clang-format |   0
 .github/workflows/lint.yml         |  24 ++--
 CMakeLists.txt                     |  14 +++
 README.md                          |  17 ++-
 hpx/CMakeLists.txt                 |  17 +--
 hpx/CMakePresets.json              |  17 ---
 openmp/.clang-format               | 174 -----------------------------
 openmp/CMakeLists.txt              |  17 +--
 openmp/CMakePresets.json           |  17 ---
 reference/.clang-format            | 174 -----------------------------
 reference/CMakeLists.txt           |  17 +--
 reference/CMakePresets.json        |  17 ---
 reference/core/CMakeLists.txt      |   5 +-
 13 files changed, 50 insertions(+), 460 deletions(-)
 rename hpx/.clang-format => .clang-format (100%)
 create mode 100644 CMakeLists.txt
 delete mode 100644 hpx/CMakePresets.json
 delete mode 100644 openmp/.clang-format
 delete mode 100644 openmp/CMakePresets.json
 delete mode 100644 reference/.clang-format
 delete mode 100644 reference/CMakePresets.json

diff --git a/hpx/.clang-format b/.clang-format
similarity index 100%
rename from hpx/.clang-format
rename to .clang-format
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index cd6047c..54ed8f7 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -19,14 +19,18 @@ jobs:
     - name: Install cmakelang
       run: pip3 install cmakelang
 
-    - name: Configure dummy project
-      run: cd hpx && cmake -B build-fmt -DBUILD_CORE=OFF -DCLANG_FORMAT_PROGRAM=$(which clang-format-18)
+    - name: Configure top-level format project
+      # The repo-root CMakeLists.txt is a format-only coordinator that pulls
+      # in TheLartians/Format.cmake and exposes the *-clang-format and
+      # *-cmake-format targets. The actual builds live in openmp/, hpx/,
+      # and reference/ and are not configured here.
+      run: cmake -B build-fmt -DCLANG_FORMAT_PROGRAM=$(which clang-format-18)
 
     - name: Check code formatting
       id: clangformat
       run: |
         set +e
-        cd hpx && cmake --build build-fmt --target check-clang-format
+        cmake --build build-fmt --target check-clang-format
         status=$?
         if [ $status -ne 0 ]; then
           echo "Formatting errors found!"
@@ -37,14 +41,14 @@ jobs:
         fi
 
     - name: Check CMake formatting
-      # Let's run the CMake formatting checks even if our code is mis-formatted.
+      # Run CMake formatting checks even if the C++ check failed.
       if: success() || steps.clangformat.conclusion == 'failure'
-      # Note that diff generation for cmake-format is somewhat broken in the upstream project.
-      # Diffs always end up with incorrect paths so manual fixes would be necessary, which we sidestep
-      # by re-formatting in-place and then using `git diff`.
+      # Diff generation for cmake-format is somewhat broken upstream (paths
+      # come out wrong), so we sidestep it by fixing in place and using
+      # `git diff` to produce the patch.
       run: |
         set +e
-        cd hpx && cmake --build build-fmt --target check-cmake-format
+        cmake --build build-fmt --target check-cmake-format
         status=$?
         if [ $status -ne 0 ]; then
           echo "Formatting errors found!"
@@ -61,5 +65,5 @@ jobs:
       with:
         name: Formatting fix .patch files
         path: |
-          hpx/clang-format.patch
-          hpx/cmake-format.patch
+          clang-format.patch
+          cmake-format.patch
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..4184e03
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,14 @@
+cmake_minimum_required(VERSION 3.14)
+# Top-level coordinator for *source formatting only*. Each subdirectory owns its
+# own standalone CMake project (with its own dependencies and its own
+# compile.sh); this file exists so the clang-format / cmake-format integration
+# can be configured once for the whole repository.
+project(cholesky_bench LANGUAGES NONE)
+
+include(FetchContent)
+FetchContent_Declare(
+  format
+  GIT_REPOSITORY https://github.com/TheLartians/Format.cmake.git
+  GIT_TAG v1.8.1
+  QUIET)
+FetchContent_MakeAvailable(format)
diff --git a/README.md b/README.md
index c3eecfc..1096d29 100644
--- a/README.md
+++ b/README.md
@@ -178,9 +178,10 @@ The same lines are also printed to stdout.
 
 ```
 .
+├── .clang-format           # repo-wide style; governs all three subtrees
+├── CMakeLists.txt          # top-level coordinator (formatting only; LANGUAGES NONE)
 ├── openmp/
 │   ├── CMakeLists.txt
-│   ├── CMakePresets.json
 │   ├── compile.sh          # build script (gcc or llvm)
 │   ├── run.sh              # SLURM job script
 │   ├── main.cpp
@@ -199,7 +200,6 @@ The same lines are also printed to stdout.
 │           └── adapter_cblas_fp64.cpp
 ├── hpx/
 │   ├── CMakeLists.txt
-│   ├── CMakePresets.json
 │   ├── compile.sh          # build script (gcc only)
 │   ├── run.sh              # SLURM job script
 │   ├── main.cpp
@@ -218,7 +218,6 @@ The same lines are also printed to stdout.
 │           └── adapter_cblas_fp64.cpp
 └── reference/
     ├── CMakeLists.txt
-    ├── CMakePresets.json
     ├── compile.sh          # build script (gcc only)
     ├── run.sh              # SLURM job script
     ├── main.cpp
@@ -241,6 +240,18 @@ The same lines are also printed to stdout.
 
 When `ENABLE_LAPACKE=OFF`, `adapter_cblas_fp64.cpp` and `validate.cpp` are still compiled and linked (they share cblas/lapacke symbols with PLASMA's BLAS dependency); only the runtime dispatch of the `lapacke` mode is skipped.
 
+## Formatting
+
+A repository-wide [`.clang-format`](.clang-format) governs all three subtrees. The top-level [`CMakeLists.txt`](CMakeLists.txt) wires up `clang-format` and `cmake-format` targets via [Format.cmake](https://github.com/TheLartians/Format.cmake); configure once from the repo root and use the targets:
+
+```bash
+cmake -B build-fmt
+cmake --build build-fmt --target check-clang-format   # CI-style check
+cmake --build build-fmt --target fix-clang-format     # apply formatting
+```
+
+Each subproject (`openmp/`, `hpx/`, `reference/`) is its own standalone CMake project with its own dependencies, so the top-level `CMakeLists.txt` only handles formatting — actual builds still happen from inside each subdirectory via its `compile.sh`.
+
 ## Contributing
 
 We would be happy to expand Cholesky-Bench to additional asynchronous many-task (AMT) runtimes. If you have an implementation you would like to add, feel free to open a pull request.
diff --git a/hpx/CMakeLists.txt b/hpx/CMakeLists.txt
index b2b7fdc..356c529 100644
--- a/hpx/CMakeLists.txt
+++ b/hpx/CMakeLists.txt
@@ -15,9 +15,6 @@ option(
   DISABLE_COMPUTATION
   "Replace all BLAS/LAPACK calls and tile generation with no-ops; keeps the dataflow graph intact so HPX scheduling overhead can be measured in isolation"
   OFF)
-option(ENABLE_FORMAT_TARGETS "Enable clang-format / cmake-format targets"
-       ${PROJECT_IS_TOP_LEVEL})
-
 if(ENABLE_VALIDATION AND DISABLE_COMPUTATION)
   message(
     FATAL_ERROR
@@ -25,18 +22,8 @@ if(ENABLE_VALIDATION AND DISABLE_COMPUTATION)
       "residual validation needs a real factorization to check against.")
 endif()
 
-if(ENABLE_FORMAT_TARGETS)
-  find_package(format QUIET)
-  if(NOT format_FOUND)
-    include(FetchContent)
-    FetchContent_Declare(
-      format
-      GIT_REPOSITORY https://github.com/TheLartians/Format.cmake.git
-      GIT_TAG v1.8.1
-      QUIET)
-    FetchContent_MakeAvailable(format)
-  endif()
-endif()
+# clang-format / cmake-format integration is hoisted to the top-level
+# CMakeLists.txt; configure from the repo root to use it.
 
 if(NOT CMAKE_SKIP_INSTALL_RULES)
   # Our installs follow the standard GNU directory layout. This include needs to
diff --git a/hpx/CMakePresets.json b/hpx/CMakePresets.json
deleted file mode 100644
index f3839f8..0000000
--- a/hpx/CMakePresets.json
+++ /dev/null
@@ -1,17 +0,0 @@
-{
-  "version": 6,
-  "cmakeMinimumRequired": {
-    "major": 3,
-    "minor": 22,
-    "patch": 0
-  },
-  "configurePresets": [
-    {
-      "name": "clang-tidy",
-      "hidden": true,
-      "cacheVariables": {
-        "CMAKE_CXX_CLANG_TIDY": "clang-tidy;--header-filter=^${sourceDir}/"
-      }
-    }
-  ]
-}
diff --git a/openmp/.clang-format b/openmp/.clang-format
deleted file mode 100644
index e8d875c..0000000
--- a/openmp/.clang-format
+++ /dev/null
@@ -1,174 +0,0 @@
----
-Language: Cpp
-AccessModifierOffset: -2
-AlignAfterOpenBracket: Align
-AlignArrayOfStructures: None
-AlignConsecutiveAssignments: None
-AlignConsecutiveBitFields: None
-AlignConsecutiveDeclarations: None
-AlignConsecutiveMacros: None
-AlignConsecutiveShortCaseStatements:
-  Enabled: true
-  AcrossEmptyLines: false
-  AcrossComments: false
-  AlignCaseColons: false
-AlignEscapedNewlines: Right
-AlignOperands: Align
-AlignTrailingComments:
-  Kind: Always
-AllowAllArgumentsOnNextLine: true
-AllowAllParametersOfDeclarationOnNextLine: true
-AllowBreakBeforeNoexceptSpecifier: OnlyWithParen
-AllowShortBlocksOnASingleLine: Empty
-AllowShortCaseLabelsOnASingleLine: true
-AllowShortEnumsOnASingleLine: true
-AllowShortFunctionsOnASingleLine: All
-AllowShortIfStatementsOnASingleLine: Never
-AllowShortLambdasOnASingleLine: All
-AllowShortLoopsOnASingleLine: true
-AlwaysBreakAfterDefinitionReturnType: None
-AlwaysBreakBeforeMultilineStrings: false
-AlwaysBreakAfterReturnType: None
-AlwaysBreakTemplateDeclarations: Yes
-BinPackArguments: false
-BinPackParameters: false
-BitFieldColonSpacing: Both
-BraceWrapping:
-  AfterCaseLabel: false
-  AfterClass: true
-  AfterControlStatement: Always
-  AfterEnum: false
-  AfterFunction: true
-  AfterNamespace: true
-  AfterObjCDeclaration: true
-  AfterStruct: true
-  AfterUnion: true
-  AfterExternBlock: false
-  BeforeCatch: true
-  BeforeElse: true
-  BeforeLambdaBody: true
-  BeforeWhile: false
-  IndentBraces: false
-  SplitEmptyFunction: false
-  SplitEmptyRecord: false
-  SplitEmptyNamespace: false
-BreakAfterAttributes: Never
-BreakAfterJavaFieldAnnotations: false
-BreakBeforeBinaryOperators: NonAssignment
-BreakBeforeBraces: Custom
-BreakBeforeConceptDeclarations: Always
-BreakBeforeInlineASMColon: OnlyMultiline
-BreakBeforeTernaryOperators: true
-BreakConstructorInitializers: AfterColon
-BreakInheritanceList: AfterComma
-BreakStringLiterals: true
-ColumnLimit: 120
-CommentPragmas: '^ IWYU pragma:'
-CompactNamespaces: false
-ConstructorInitializerIndentWidth: 4
-ContinuationIndentWidth: 4
-Cpp11BracedListStyle: false
-DerivePointerAlignment: false
-DisableFormat: false
-EmptyLineAfterAccessModifier: Never
-EmptyLineBeforeAccessModifier: LogicalBlock
-FixNamespaceComments: true
-ForEachMacros: [ 'foreach', 'Q_FOREACH', 'BOOST_FOREACH' ]
-IfMacros: [ ]
-IncludeBlocks: Regroup
-IncludeCategories:
-  - Regex: '^"gprat/'
-    Priority: 1
-  - Regex: '^"(tests|bindings)/'
-    Priority: 2
-  - Regex: '^"(fmt|catch2|pybind)'
-    Priority: 3
-  - Regex: '^.*'
-    Priority: 4
-IncludeIsMainRegex: '(Test)?$'
-IncludeIsMainSourceRegex: '(\.cu|\.hip)'
-IndentAccessModifiers: false
-IndentCaseBlocks: true
-IndentCaseLabels: true
-IndentExternBlock: NoIndent
-IndentGotoLabels: false
-IndentPPDirectives: None
-IndentRequiresClause: false
-IndentWidth: 4
-IndentWrappedFunctionNames: false
-InsertBraces: true
-InsertNewlineAtEOF: true
-InsertTrailingCommas: None
-IntegerLiteralSeparator:
-  Binary: 8
-  Decimal: 3
-  DecimalMinDigits: 5
-  Hex: -1
-KeepEmptyLinesAtEOF: false
-KeepEmptyLinesAtTheStartOfBlocks: false
-LambdaBodyIndentation: Signature
-LineEnding: DeriveLF
-MacroBlockBegin: ''
-MacroBlockEnd: ''
-Macros: [ ]
-MaxEmptyLinesToKeep: 1
-NamespaceIndentation: None
-NamespaceMacros: [ ]
-PPIndentWidth: -1
-PackConstructorInitializers: Never
-PenaltyBreakAssignment: 2
-PenaltyBreakBeforeFirstCallParameter: 19
-PenaltyBreakComment: 300
-PenaltyBreakFirstLessLess: 120
-PenaltyBreakOpenParenthesis: 0
-PenaltyBreakString: 1000
-PenaltyBreakTemplateDeclaration: 10
-PenaltyExcessCharacter: 1000000
-PenaltyIndentedWhitespace: 1
-PenaltyReturnTypeOnItsOwnLine: 60
-PointerAlignment: Right
-QualifierAlignment: Custom
-QualifierOrder: [ 'inline', 'constexpr', 'static', 'friend', 'restrict', 'const', 'volatile', 'type' ]
-ReferenceAlignment: Pointer
-ReflowComments: true
-RemoveBracesLLVM: false
-RemoveParentheses: Leave
-RemoveSemicolon: true
-RequiresClausePosition: OwnLine
-RequiresExpressionIndentation: OuterScope
-SeparateDefinitionBlocks: Always
-ShortNamespaceLines: 1
-SortIncludes: CaseInsensitive
-SortUsingDeclarations: LexicographicNumeric
-SpaceAfterCStyleCast: true
-SpaceAfterLogicalNot: false
-SpaceAfterTemplateKeyword: true
-SpaceAroundPointerQualifiers: Default
-SpaceBeforeAssignmentOperators: true
-SpaceBeforeCaseColon: false
-SpaceBeforeCpp11BracedList: false
-SpaceBeforeCtorInitializerColon: true
-SpaceBeforeInheritanceColon: true
-SpaceBeforeJsonColon: false
-SpaceBeforeParens: ControlStatements
-SpaceBeforeRangeBasedForLoopColon: true
-SpaceBeforeSquareBrackets: false
-SpaceInEmptyBlock: true
-SpacesBeforeTrailingComments: 2
-SpacesInAngles: false
-SpacesInContainerLiterals: true
-SpacesInLineCommentPrefix:
-  Minimum: 1
-  Maximum: 1
-SpacesInParens: Never
-SpacesInSquareBrackets: false
-Standard: c++17
-StatementAttributeLikeMacros: [ ]
-StatementMacros: [ 'Q_UNUSED', 'QT_REQUIRE_VERSION' ]
-TabWidth: 4
-TypeNames: [ ]
-TypenameMacros: [ ]
-UseTab: Never
-WhitespaceSensitiveMacros: [ 'STRINGIZE', 'PP_STRINGIZE', 'BOOST_PP_STRINGIZE' ]
-...
-
diff --git a/openmp/CMakeLists.txt b/openmp/CMakeLists.txt
index f506c0e..aba403a 100644
--- a/openmp/CMakeLists.txt
+++ b/openmp/CMakeLists.txt
@@ -19,9 +19,6 @@ option(
   ENABLE_DYNAMIC_SCHEDULE
   "Use schedule(dynamic, 1) on the trailing-update worksharing loops in for_collapse. OFF by default so GCC builds compile out of the box. Turn ON for LLVM builds where the dynamic schedule is supported and gives better load balancing."
   OFF)
-option(ENABLE_FORMAT_TARGETS "Enable clang-format / cmake-format targets"
-       ${PROJECT_IS_TOP_LEVEL})
-
 if(ENABLE_VALIDATION AND DISABLE_COMPUTATION)
   message(
     FATAL_ERROR
@@ -29,18 +26,8 @@ if(ENABLE_VALIDATION AND DISABLE_COMPUTATION)
       "residual validation needs a real factorization to check against.")
 endif()
 
-if(ENABLE_FORMAT_TARGETS)
-  find_package(format QUIET)
-  if(NOT format_FOUND)
-    include(FetchContent)
-    FetchContent_Declare(
-      format
-      GIT_REPOSITORY https://github.com/TheLartians/Format.cmake.git
-      GIT_TAG v1.8.1
-      QUIET)
-    FetchContent_MakeAvailable(format)
-  endif()
-endif()
+# clang-format / cmake-format integration is hoisted to the top-level
+# CMakeLists.txt; configure from the repo root to use it.
 
 if(NOT CMAKE_SKIP_INSTALL_RULES)
   # Our installs follow the standard GNU directory layout. This include needs to
diff --git a/openmp/CMakePresets.json b/openmp/CMakePresets.json
deleted file mode 100644
index f3839f8..0000000
--- a/openmp/CMakePresets.json
+++ /dev/null
@@ -1,17 +0,0 @@
-{
-  "version": 6,
-  "cmakeMinimumRequired": {
-    "major": 3,
-    "minor": 22,
-    "patch": 0
-  },
-  "configurePresets": [
-    {
-      "name": "clang-tidy",
-      "hidden": true,
-      "cacheVariables": {
-        "CMAKE_CXX_CLANG_TIDY": "clang-tidy;--header-filter=^${sourceDir}/"
-      }
-    }
-  ]
-}
diff --git a/reference/.clang-format b/reference/.clang-format
deleted file mode 100644
index e8d875c..0000000
--- a/reference/.clang-format
+++ /dev/null
@@ -1,174 +0,0 @@
----
-Language: Cpp
-AccessModifierOffset: -2
-AlignAfterOpenBracket: Align
-AlignArrayOfStructures: None
-AlignConsecutiveAssignments: None
-AlignConsecutiveBitFields: None
-AlignConsecutiveDeclarations: None
-AlignConsecutiveMacros: None
-AlignConsecutiveShortCaseStatements:
-  Enabled: true
-  AcrossEmptyLines: false
-  AcrossComments: false
-  AlignCaseColons: false
-AlignEscapedNewlines: Right
-AlignOperands: Align
-AlignTrailingComments:
-  Kind: Always
-AllowAllArgumentsOnNextLine: true
-AllowAllParametersOfDeclarationOnNextLine: true
-AllowBreakBeforeNoexceptSpecifier: OnlyWithParen
-AllowShortBlocksOnASingleLine: Empty
-AllowShortCaseLabelsOnASingleLine: true
-AllowShortEnumsOnASingleLine: true
-AllowShortFunctionsOnASingleLine: All
-AllowShortIfStatementsOnASingleLine: Never
-AllowShortLambdasOnASingleLine: All
-AllowShortLoopsOnASingleLine: true
-AlwaysBreakAfterDefinitionReturnType: None
-AlwaysBreakBeforeMultilineStrings: false
-AlwaysBreakAfterReturnType: None
-AlwaysBreakTemplateDeclarations: Yes
-BinPackArguments: false
-BinPackParameters: false
-BitFieldColonSpacing: Both
-BraceWrapping:
-  AfterCaseLabel: false
-  AfterClass: true
-  AfterControlStatement: Always
-  AfterEnum: false
-  AfterFunction: true
-  AfterNamespace: true
-  AfterObjCDeclaration: true
-  AfterStruct: true
-  AfterUnion: true
-  AfterExternBlock: false
-  BeforeCatch: true
-  BeforeElse: true
-  BeforeLambdaBody: true
-  BeforeWhile: false
-  IndentBraces: false
-  SplitEmptyFunction: false
-  SplitEmptyRecord: false
-  SplitEmptyNamespace: false
-BreakAfterAttributes: Never
-BreakAfterJavaFieldAnnotations: false
-BreakBeforeBinaryOperators: NonAssignment
-BreakBeforeBraces: Custom
-BreakBeforeConceptDeclarations: Always
-BreakBeforeInlineASMColon: OnlyMultiline
-BreakBeforeTernaryOperators: true
-BreakConstructorInitializers: AfterColon
-BreakInheritanceList: AfterComma
-BreakStringLiterals: true
-ColumnLimit: 120
-CommentPragmas: '^ IWYU pragma:'
-CompactNamespaces: false
-ConstructorInitializerIndentWidth: 4
-ContinuationIndentWidth: 4
-Cpp11BracedListStyle: false
-DerivePointerAlignment: false
-DisableFormat: false
-EmptyLineAfterAccessModifier: Never
-EmptyLineBeforeAccessModifier: LogicalBlock
-FixNamespaceComments: true
-ForEachMacros: [ 'foreach', 'Q_FOREACH', 'BOOST_FOREACH' ]
-IfMacros: [ ]
-IncludeBlocks: Regroup
-IncludeCategories:
-  - Regex: '^"gprat/'
-    Priority: 1
-  - Regex: '^"(tests|bindings)/'
-    Priority: 2
-  - Regex: '^"(fmt|catch2|pybind)'
-    Priority: 3
-  - Regex: '^.*'
-    Priority: 4
-IncludeIsMainRegex: '(Test)?$'
-IncludeIsMainSourceRegex: '(\.cu|\.hip)'
-IndentAccessModifiers: false
-IndentCaseBlocks: true
-IndentCaseLabels: true
-IndentExternBlock: NoIndent
-IndentGotoLabels: false
-IndentPPDirectives: None
-IndentRequiresClause: false
-IndentWidth: 4
-IndentWrappedFunctionNames: false
-InsertBraces: true
-InsertNewlineAtEOF: true
-InsertTrailingCommas: None
-IntegerLiteralSeparator:
-  Binary: 8
-  Decimal: 3
-  DecimalMinDigits: 5
-  Hex: -1
-KeepEmptyLinesAtEOF: false
-KeepEmptyLinesAtTheStartOfBlocks: false
-LambdaBodyIndentation: Signature
-LineEnding: DeriveLF
-MacroBlockBegin: ''
-MacroBlockEnd: ''
-Macros: [ ]
-MaxEmptyLinesToKeep: 1
-NamespaceIndentation: None
-NamespaceMacros: [ ]
-PPIndentWidth: -1
-PackConstructorInitializers: Never
-PenaltyBreakAssignment: 2
-PenaltyBreakBeforeFirstCallParameter: 19
-PenaltyBreakComment: 300
-PenaltyBreakFirstLessLess: 120
-PenaltyBreakOpenParenthesis: 0
-PenaltyBreakString: 1000
-PenaltyBreakTemplateDeclaration: 10
-PenaltyExcessCharacter: 1000000
-PenaltyIndentedWhitespace: 1
-PenaltyReturnTypeOnItsOwnLine: 60
-PointerAlignment: Right
-QualifierAlignment: Custom
-QualifierOrder: [ 'inline', 'constexpr', 'static', 'friend', 'restrict', 'const', 'volatile', 'type' ]
-ReferenceAlignment: Pointer
-ReflowComments: true
-RemoveBracesLLVM: false
-RemoveParentheses: Leave
-RemoveSemicolon: true
-RequiresClausePosition: OwnLine
-RequiresExpressionIndentation: OuterScope
-SeparateDefinitionBlocks: Always
-ShortNamespaceLines: 1
-SortIncludes: CaseInsensitive
-SortUsingDeclarations: LexicographicNumeric
-SpaceAfterCStyleCast: true
-SpaceAfterLogicalNot: false
-SpaceAfterTemplateKeyword: true
-SpaceAroundPointerQualifiers: Default
-SpaceBeforeAssignmentOperators: true
-SpaceBeforeCaseColon: false
-SpaceBeforeCpp11BracedList: false
-SpaceBeforeCtorInitializerColon: true
-SpaceBeforeInheritanceColon: true
-SpaceBeforeJsonColon: false
-SpaceBeforeParens: ControlStatements
-SpaceBeforeRangeBasedForLoopColon: true
-SpaceBeforeSquareBrackets: false
-SpaceInEmptyBlock: true
-SpacesBeforeTrailingComments: 2
-SpacesInAngles: false
-SpacesInContainerLiterals: true
-SpacesInLineCommentPrefix:
-  Minimum: 1
-  Maximum: 1
-SpacesInParens: Never
-SpacesInSquareBrackets: false
-Standard: c++17
-StatementAttributeLikeMacros: [ ]
-StatementMacros: [ 'Q_UNUSED', 'QT_REQUIRE_VERSION' ]
-TabWidth: 4
-TypeNames: [ ]
-TypenameMacros: [ ]
-UseTab: Never
-WhitespaceSensitiveMacros: [ 'STRINGIZE', 'PP_STRINGIZE', 'BOOST_PP_STRINGIZE' ]
-...
-
diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt
index e8045b8..3f50b6b 100644
--- a/reference/CMakeLists.txt
+++ b/reference/CMakeLists.txt
@@ -19,21 +19,8 @@ option(
   ENABLE_VALIDATION
   "Compute ||A - L*L^T||_F / ||A||_F after each factorization (off by default)"
   OFF)
-option(ENABLE_FORMAT_TARGETS "Enable clang-format / cmake-format targets"
-       ${PROJECT_IS_TOP_LEVEL})
-
-if(ENABLE_FORMAT_TARGETS)
-  find_package(format QUIET)
-  if(NOT format_FOUND)
-    include(FetchContent)
-    FetchContent_Declare(
-      format
-      GIT_REPOSITORY https://github.com/TheLartians/Format.cmake.git
-      GIT_TAG v1.8.1
-      QUIET)
-    FetchContent_MakeAvailable(format)
-  endif()
-endif()
+# clang-format / cmake-format integration is hoisted to the top-level
+# CMakeLists.txt; configure from the repo root to use it.
 
 if(NOT CMAKE_SKIP_INSTALL_RULES)
   include(GNUInstallDirs)
diff --git a/reference/CMakePresets.json b/reference/CMakePresets.json
deleted file mode 100644
index f3839f8..0000000
--- a/reference/CMakePresets.json
+++ /dev/null
@@ -1,17 +0,0 @@
-{
-  "version": 6,
-  "cmakeMinimumRequired": {
-    "major": 3,
-    "minor": 22,
-    "patch": 0
-  },
-  "configurePresets": [
-    {
-      "name": "clang-tidy",
-      "hidden": true,
-      "cacheVariables": {
-        "CMAKE_CXX_CLANG_TIDY": "clang-tidy;--header-filter=^${sourceDir}/"
-      }
-    }
-  ]
-}
diff --git a/reference/core/CMakeLists.txt b/reference/core/CMakeLists.txt
index eea5cbf..b74c17f 100644
--- a/reference/core/CMakeLists.txt
+++ b/reference/core/CMakeLists.txt
@@ -29,9 +29,8 @@ target_include_directories(
 # Link BLAS
 if(ENABLE_MKL)
   # Link threaded Intel oneMKL
-  target_link_libraries(
-    cholesky_core PUBLIC MKL::mkl_intel_lp64 MKL::mkl_core MKL::MKL
-                         MKL::mkl_intel_thread)
+  target_link_libraries(cholesky_core PUBLIC MKL::mkl_intel_lp64 MKL::mkl_core
+                                             MKL::MKL MKL::mkl_intel_thread)
 else()
   # Link threaded OpenBLAS
   target_link_libraries(cholesky_core PUBLIC ${OpenBLAS_LIB})

From ad29902d6471b63c18fa1acbbd2eb977ba916284 Mon Sep 17 00:00:00 2001
From: constracktor <74077030+constracktor@users.noreply.github.com>
Date: Fri, 1 May 2026 11:27:42 +0200
Subject: [PATCH 13/13] Final cleanup

---
 .github/workflows/lint.yml |  4 ----
 README.md                  | 20 ++++----------------
 hpx/CMakeLists.txt         |  3 ---
 openmp/CMakeLists.txt      |  3 ---
 reference/CMakeLists.txt   |  2 --
 5 files changed, 4 insertions(+), 28 deletions(-)

diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 54ed8f7..0c82399 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -20,10 +20,6 @@ jobs:
       run: pip3 install cmakelang
 
     - name: Configure top-level format project
-      # The repo-root CMakeLists.txt is a format-only coordinator that pulls
-      # in TheLartians/Format.cmake and exposes the *-clang-format and
-      # *-cmake-format targets. The actual builds live in openmp/, hpx/,
-      # and reference/ and are not configured here.
       run: cmake -B build-fmt -DCLANG_FORMAT_PROGRAM=$(which clang-format-18)
 
     - name: Check code formatting
diff --git a/README.md b/README.md
index 1096d29..c2b694a 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Cholesky-Bench
 
-Cholesky-Bench benchmarks right-looking tiled Cholesky factorization from fork-join to asynchronous tasks across several parallelism models, currently comparing OpenMP and HPX implementations side by side. A non-tiled parallel-BLAS reference is also included as a baseline.
+Cholesky-Bench benchmarks right-looking tiled Cholesky factorization from fork-join to asynchronous tasks across several parallelism models, currently comparing OpenMP and HPX implementations side by side. A non-tiled parallel reference is also included as a baseline.
 
 ## Variants
 
@@ -160,20 +160,8 @@ runtimes_reference_cholesky_<suffix>.txt
 
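-The suffix encodes which dimension is swept: `tile_` if tiles vary, `size_` if size varies, followed by the loop count. The file uses `;`-separated columns:
+The suffix encodes which dimension is swept: `tile_` if tiles vary, `size_` if size varies, followed by the loop count. The file uses `;`-separated columns.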
 
-```
-threads;problem_size;tile_size;n_tiles;for_collapse;for_naive;task_naive;task_depend
-128;65536;1024;64;3.14;3.21;2.98;2.87
-```
-
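-The `reference/` binary reports a `lapacke` column (suppressed by `ENABLE_LAPACKE=OFF`) plus a `plasma` column when built with `ENABLE_PLASMA=ON`, with `tile_size = problem_size` and `n_tiles = 1`, so its runtime files merge cleanly with the tiled benchmarks on the `problem_size` key:
+The `reference/` binary reports a `lapacke` column (suppressed by `ENABLE_LAPACKE=OFF`) plus a `plasma` column when built with `ENABLE_PLASMA=ON`, with `tile_size = problem_size` and `n_tiles = 1`, so its runtime files merge cleanly with the tiled benchmarks on the `problem_size` key.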
 
-```
-threads;problem_size;tile_size;n_tiles;lapacke;plasma
-128;65280;65280;1;5.21;68.12
-```
-
-The same lines are also printed to stdout.
-
 ## Repository structure
 
 ```
@@ -242,7 +230,7 @@ When `ENABLE_LAPACKE=OFF`, `adapter_cblas_fp64.cpp` and `validate.cpp` are still
 
 ## Formatting
 
-A repository-wide [`.clang-format`](.clang-format) governs all three subtrees. The top-level [`CMakeLists.txt`](CMakeLists.txt) wires up `clang-format` and `cmake-format` targets via [Format.cmake](https://github.com/TheLartians/Format.cmake); configure once from the repo root and use the targets:
+A repository-wide [`.clang-format`](.clang-format) governs all subtrees. The top-level [`CMakeLists.txt`](CMakeLists.txt) wires up `clang-format` and `cmake-format` targets via [Format.cmake](https://github.com/TheLartians/Format.cmake); configure once from the repo root and use the targets:
 
 ```bash
 cmake -B build-fmt
@@ -250,11 +238,11 @@ cmake --build build-fmt --target check-clang-format   # CI-style check
 cmake --build build-fmt --target fix-clang-format     # apply formatting
 ```
 
-Each subproject (`openmp/`, `hpx/`, `reference/`) is its own standalone CMake project with its own dependencies, so the top-level `CMakeLists.txt` only handles formatting — actual builds still happen from inside each subdirectory via its `compile.sh`.
+Each subproject (`openmp/`, `hpx/`, `reference/`) is its own standalone CMake project with its own dependencies, so the top-level `CMakeLists.txt` only handles formatting. The actual builds still happen from inside each subdirectory via its `compile.sh`.
 
 ## Contributing
 
-We would be happy to expand Cholesky-Bench to additional asynchronous many-task (AMT) runtimes. If you have an implementation you would like to add, feel free to open a pull request.
+We would be happy to expand Cholesky-Bench to additional asynchronous many-task (AMT) runtimes. If you would like to add an implementation, feel free to open a pull request.
 
 ## How to cite
 
diff --git a/hpx/CMakeLists.txt b/hpx/CMakeLists.txt
index 356c529..1cc87f5 100644
--- a/hpx/CMakeLists.txt
+++ b/hpx/CMakeLists.txt
@@ -22,9 +22,6 @@ if(ENABLE_VALIDATION AND DISABLE_COMPUTATION)
       "residual validation needs a real factorization to check against.")
 endif()
 
-# clang-format / cmake-format integration is hoisted to the top-level
-# CMakeLists.txt; configure from the repo root to use it.
-
 if(NOT CMAKE_SKIP_INSTALL_RULES)
   # Our installs follow the standard GNU directory layout. This include needs to
   # come first since we need the CMAKE_INSTALL_* in the CMakeLists.txt of each
diff --git a/openmp/CMakeLists.txt b/openmp/CMakeLists.txt
index aba403a..038bb9e 100644
--- a/openmp/CMakeLists.txt
+++ b/openmp/CMakeLists.txt
@@ -26,9 +26,6 @@ if(ENABLE_VALIDATION AND DISABLE_COMPUTATION)
       "residual validation needs a real factorization to check against.")
 endif()
 
-# clang-format / cmake-format integration is hoisted to the top-level
-# CMakeLists.txt; configure from the repo root to use it.
-
 if(NOT CMAKE_SKIP_INSTALL_RULES)
   # Our installs follow the standard GNU directory layout. This include needs to
   # come first since we need the CMAKE_INSTALL_* in the CMakeLists.txt of each
diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt
index 3f50b6b..69996cd 100644
--- a/reference/CMakeLists.txt
+++ b/reference/CMakeLists.txt
@@ -19,8 +19,6 @@ option(
   ENABLE_VALIDATION
   "Compute ||A - L*L^T||_F / ||A||_F after each factorization (off by default)"
   OFF)
-# clang-format / cmake-format integration is hoisted to the top-level
-# CMakeLists.txt; configure from the repo root to use it.
 
 if(NOT CMAKE_SKIP_INSTALL_RULES)
   include(GNUInstallDirs)