constracktor · constracktor · May 1, 2026 · Apr 29, 2026 · Apr 29, 2026 · Apr 29, 2026
diff --git a/hpx/.clang-format → .clang-format b/hpx/.clang-format → .clang-format
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -19,14 +19,14 @@ jobs:
     - name: Install cmakelang
       run: pip3 install cmakelang
 
-    - name: Configure dummy project
-      run: cd hpx && cmake -B build-fmt -DBUILD_CORE=OFF -DCLANG_FORMAT_PROGRAM=$(which clang-format-18)
+    - name: Configure top-level format project
+      run: cmake -B build-fmt -DCLANG_FORMAT_PROGRAM=$(which clang-format-18)
 
     - name: Check code formatting
       id: clangformat
       run: |
         set +e
-        cd hpx && cmake --build build-fmt --target check-clang-format
+        cmake --build build-fmt --target check-clang-format
         status=$?
         if [ $status -ne 0 ]; then
           echo "Formatting errors found!"
@@ -37,14 +37,14 @@ jobs:
         fi
 
     - name: Check CMake formatting
-      # Let's run the CMake formatting checks even if our code is mis-formatted.
+      # Run CMake formatting checks even if the C++ check failed.
       if: success() || steps.clangformat.conclusion == 'failure'
-      # Note that diff generation for cmake-format is somewhat broken in the upstream project.
-      # Diffs always end up with incorrect paths so manual fixes would be necessary, which we sidestep
-      # by re-formatting in-place and then using `git diff`.
+      # Diff generation for cmake-format is somewhat broken upstream (paths
+      # come out wrong), so we sidestep it by fixing in place and using
+      # `git diff` to produce the patch.
       run: |
         set +e
-        cd hpx && cmake --build build-fmt --target check-cmake-format
+        cmake --build build-fmt --target check-cmake-format
         status=$?
         if [ $status -ne 0 ]; then
           echo "Formatting errors found!"
@@ -61,5 +61,5 @@ jobs:
       with:
         name: Formatting fix .patch files
         path: |
-          hpx/clang-format.patch
-          hpx/cmake-format.patch
+          clang-format.patch
+          cmake-format.patch
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -0,0 +1,14 @@
+cmake_minimum_required(VERSION 3.14)
+# Top-level coordinator for *source formatting only*. Each subdirectory owns its
+# own standalone CMake project (with its own dependencies and its own
+# compile.sh); this file exists so the clang-format / cmake-format integration
+# can be configured once for the whole repository.
+project(cholesky_bench LANGUAGES NONE)
+
+include(FetchContent)
+FetchContent_Declare(
+  format
+  GIT_REPOSITORY https://github.com/TheLartians/Format.cmake.git
+  GIT_TAG v1.8.1
+  QUIET)
+FetchContent_MakeAvailable(format)
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # Cholesky-Bench
 
-Cholesky-Bench benchmarks right-looking tiled Cholesky factorization from fork-join to asynchronous tasks across several parallelism models, currently comparing OpenMP and HPX implementations side by side.
+Cholesky-Bench benchmarks right-looking tiled Cholesky factorization from fork-join to asynchronous tasks across several parallelism models, currently comparing OpenMP and HPX implementations side by side. A non-tiled parallel reference is also included as a baseline.
 
 ## Variants
 
@@ -24,27 +24,49 @@ Cholesky-Bench benchmarks right-looking tiled Cholesky factorization from fork-j
 | `loop_two` | Collapsed fork-join with dynamic schedule for trailing-update |
 | `async_void` |  Fully asynchronous tasking with dataflow using `hpx::shared_future<void>` |
 
+### Reference (`reference/`)
+
+| Mode | Description |
+|------|-------------|
+| `lapacke` | Single threaded `LAPACKE_dpotrf` call on the full matrix; no tiling. Parallelism is delegated entirely to a threaded BLAS (OpenBLAS built with `threads=openmp`, or threaded Intel oneMKL via `ENABLE_MKL=ON`). Enabled by default; disable with `ENABLE_LAPACKE=OFF`. |
+| `plasma` | Single `plasma_dpotrf` call on the full matrix (PLASMA's high-level synchronous API). PLASMA does its own tiled, OpenMP-task-based parallel Cholesky internally; tile size is left at PLASMA's built-in default. Built only when `ENABLE_PLASMA=ON`. |
+
+This directory is the natural baseline for the OpenMP and HPX tiled implementations: the `lapacke` mode isolates the contribution of vendor-provided dense-LA parallelism, and the `plasma` mode adds a tiled-parallel competitor that uses the same OpenMP runtime as the in-house variants.
+
+#### PLASMA descriptor int32 overflow
+
+PLASMA 24.8.7's `plasma_desc_*_create()` routines compute their tile-storage size as `int * int` before casting to `size_t`, which silently overflows once the padded triangular tile-area exceeds `INT32_MAX`. With the default `nb=256`, the boundary is at `N=65280` (`mt=255`).
+
+The benchmark handles this transparently:
+
+- For sweep sizes `N` in `(65280, 65536]`, **only `plasma` is silently clamped down to 65280** for that iteration; `lapacke` runs at the full `N`. The `problem_size` column reports the original `N`, so `plasma`'s timing in this range corresponds to the 65280 compute even though the row is labelled with the input size.
+- For `N > 65536` `plasma` records `nan`. `lapacke` is unaffected by the int32 ceiling and continues normally.
+
 ## Dependencies
 
-Both implementations share the same sequential BLAS backend and are built with CMake (≥ 3.23) and C++20.
+All three implementations are built with CMake (≥ 3.23) and C++20. The OpenMP and HPX directories link against a *sequential* BLAS (parallelism is at the tile level); the `reference/` directory links against a *threaded* BLAS instead.
 
-| Dependency | OpenMP | HPX |
-|---|---|---|
-| OpenBLAS 0.3.28 | ✓ (default) | ✓ (default) |
-| Intel oneMKL | optional (`ENABLE_MKL=ON`) | optional (`ENABLE_MKL=ON`) |
-| HPX 1.11.0 + jemalloc | — | ✓ |
-| GCC 14.2.0 | ✓ | ✓ |
-| LLVM/Clang 22.1.2 | optional | — |
+| Dependency | OpenMP | HPX | Reference |
+|---|---|---|---|
+| OpenBLAS 0.3.28 (sequential) | ✓ (default) | ✓ (default) | — |
+| OpenBLAS 0.3.28 (`threads=openmp`) | — | — | ✓ (default) |
+| Intel oneMKL (sequential) | optional (`ENABLE_MKL=ON`) | optional (`ENABLE_MKL=ON`) | — |
+| Intel oneMKL (`intel_thread`) | — | — | optional (`ENABLE_MKL=ON`) |
+| PLASMA 24.8.7 | — | — | optional (`ENABLE_PLASMA=ON`) |
+| HPX 1.11.0 + jemalloc | — | ✓ | — |
+| GCC 14.2.0 | ✓ | ✓ | ✓ |
+| LLVM/Clang 22.1.2 | optional | — | — |
 
-Dependencies are managed via [Spack](https://spack.io/). The compile scripts auto-detect the host system and load the correct Spack environment.
+Dependencies are managed via [Spack](https://spack.io/).
 
 ## Build
 
-From within the `openmp/` or `hpx/` directory, run:
+From within the `openmp/`, `hpx/`, or `reference/` directory, run:
 
 ```bash
-./compile.sh [gcc|llvm]   # OpenMP: gcc (default) or llvm
-./compile.sh              # HPX: always gcc
+./compile.sh [gcc|llvm]   # OpenMP:    gcc (default) or llvm
+./compile.sh              # HPX:       always gcc
+./compile.sh              # Reference: always gcc
 ```
 
 The script clears and recreates the `build/` directory, then runs CMake in Release mode followed by a parallel make.
@@ -55,10 +77,12 @@ These can be set as environment variables before calling `compile.sh`:
 
 | Option | Default | Description |
 |--------|---------|-------------|
-| `ENABLE_VALIDATION` | `OFF` | After each factorization, compute the relative residual ‖A − LL^T‖_F / ‖A‖_F and warn if it exceeds 1e-10. Mutually exclusive with `DISABLE_COMPUTATION`. |
-| `DISABLE_COMPUTATION` | `OFF` | Replace all BLAS/tile-generation calls with no-ops. The task graph and loops remain intact, so scheduling overhead can be measured in isolation. |
-| `ENABLE_DYNAMIC_SCHEDULE` | `OFF` | *(OpenMP only)* Use `schedule(dynamic,1)` on the trailing-update worksharing loops in `for_collapse`. Requires the LLVM toolchain; rejected at compile time with GCC. |
-| `ENABLE_MKL` | `OFF` | Link against Intel oneMKL instead of OpenBLAS. |
+| `ENABLE_VALIDATION` | `OFF` | After each factorization, compute the relative residual ‖A − LL^T‖_F / ‖A‖_F and warn if it exceeds 1e-10. In `openmp/` and `hpx/`, mutually exclusive with `DISABLE_COMPUTATION`. |
+| `DISABLE_COMPUTATION` | `OFF` | *(`openmp/` and `hpx/` only)* Replace all BLAS/tile-generation calls with no-ops. The task graph and loops remain intact, so scheduling overhead can be measured in isolation. |
+| `ENABLE_DYNAMIC_SCHEDULE` | `OFF` | *(`openmp/` only)* Use `schedule(dynamic,1)` on the trailing-update worksharing loops in `for_collapse`. Requires the LLVM toolchain; rejected at compile time with GCC. |
+| `ENABLE_MKL` | `OFF` | Link against Intel oneMKL instead of OpenBLAS. In `openmp/` and `hpx/` this is the *sequential* MKL; in `reference/` it is the *threaded* MKL. |
+| `ENABLE_PLASMA` | `OFF` | *(`reference/` only)* Also build the PLASMA `plasma_dpotrf` variant. Adds a `plasma` column alongside `lapacke` in the runtime output. |
+| `ENABLE_LAPACKE` | `ON` | *(`reference/` only)* Run the `lapacke` mode at runtime. Set `OFF` to skip it (e.g. when only `plasma` is wanted). Linking is unchanged either way — PLASMA and validation still need cblas/lapacke symbols. |
 
 **Examples:**
 
@@ -71,6 +95,15 @@ ENABLE_DYNAMIC_SCHEDULE=ON ./compile.sh llvm
 
 # HPX: measure pure scheduling overhead
 DISABLE_COMPUTATION=ON ./compile.sh
+
+# Reference: threaded MKL baseline
+ENABLE_MKL=ON ./compile.sh
+
+# Reference: also build the PLASMA tiled-Cholesky variant
+ENABLE_PLASMA=ON ./compile.sh
+
+# Reference: PLASMA only, skip the LAPACKE_dpotrf column at runtime
+ENABLE_LAPACKE=OFF ENABLE_PLASMA=ON ./compile.sh
 ```
 
 ## Run
@@ -89,16 +122,22 @@ OMP_NUM_THREADS=128 OMP_PROC_BIND=close OMP_PLACES=cores \
   --hpx:threads=128 \
   --loop=1 --size_start=1024 --size_stop=65536 \
   --tiles_start=64 --tiles_stop=64
+
+# Reference (parallel BLAS, no tiling)
+OMP_NUM_THREADS=128 OMP_PROC_BIND=close OMP_PLACES=cores \
+  ./build/cholesky_reference \
+  --loop 1 --size_start 1024 --size_stop 65536
 ```
 
 ### Via SLURM
 
-Both directories contain a `run.sh` that is a ready-to-submit SLURM batch script (128 CPUs, exclusive node, 144-hour wall time):
+All three directories contain a `run.sh` that is a ready-to-submit SLURM batch script (128 CPUs, exclusive node, 144-hour wall time):
 
 ```bash
-sbatch openmp/run.sh          # gcc runtime (default)
-sbatch openmp/run.sh llvm     # llvm runtime
+sbatch openmp/run.sh             # gcc runtime (default)
+sbatch openmp/run.sh llvm        # llvm runtime
 sbatch hpx/run.sh
+sbatch reference/run.sh          # gcc runtime; defaults to N=65280 (see PLASMA boundary note)
 ```
 
 ### Command-line arguments
@@ -107,7 +146,7 @@ sbatch hpx/run.sh
 |----------|---------|-------------|
 | `--loop` / `--loop=` | 1 | Number of timed repetitions per configuration |
 | `--size_start` / `--size_stop` | 32 / 128 | Problem size range (doubled each step) |
-| `--tiles_start` / `--tiles_stop` | 16 / 32 | Tile count range (doubled each step) |
+| `--tiles_start` / `--tiles_stop` | 16 / 32 | Tile count range (doubled each step). Accepted but ignored by the `reference/` binary, which has no tiling axis. |
 
 ## Output
 
@@ -116,24 +155,21 @@ Results are appended to a text file in the working directory:
 ```
 runtimes_openmp_cholesky_<suffix>.txt
 runtimes_hpx_cholesky_<suffix>.txt
+runtimes_reference_cholesky_<suffix>.txt
 ```
 
 The suffix encodes which dimension is swept: `tile_` if tiles vary, `size_` if size varies, followed by the loop count. The file uses `;`-separated columns:
 
-```
-threads;problem_size;tile_size;n_tiles;for_collapse;for_naive;task_naive;task_depend
-128;65536;1024;64;3.14;3.21;2.98;2.87
-```
-
-The same lines are also printed to stdout.
+The `reference/` binary reports a `lapacke` column (suppressed by `ENABLE_LAPACKE=OFF`) plus a `plasma` column when built with `ENABLE_PLASMA=ON`, with `tile_size = problem_size` and `n_tiles = 1`, so its runtime files merge cleanly with the tiled benchmarks on the `problem_size` key:
 
 ## Repository structure
 
 ```
 .
+├── .clang-format           # repo-wide style; governs all three subtrees
+├── CMakeLists.txt          # top-level coordinator (formatting only; LANGUAGES NONE)
 ├── openmp/
 │   ├── CMakeLists.txt
-│   ├── CMakePresets.json
 │   ├── compile.sh          # build script (gcc or llvm)
 │   ├── run.sh              # SLURM job script
 │   ├── main.cpp
@@ -150,30 +186,63 @@ The same lines are also printed to stdout.
 │           ├── tile_generation.cpp
 │           ├── validate.cpp
 │           └── adapter_cblas_fp64.cpp
-└── hpx/
+├── hpx/
+│   ├── CMakeLists.txt
+│   ├── compile.sh          # build script (gcc only)
+│   ├── run.sh              # SLURM job script
+│   ├── main.cpp
+│   └── core/
+│       ├── include/
+│       │   ├── cholesky_factor.hpp
+│       │   ├── functions.hpp
+│       │   ├── tile_generation.hpp
+│       │   ├── validate.hpp
+│       │   └── adapter_cblas_fp64.hpp
+│       └── src/
+│           ├── cholesky_factor.cpp
+│           ├── functions.cpp
+│           ├── tile_generation.cpp
+│           ├── validate.cpp
+│           └── adapter_cblas_fp64.cpp
+└── reference/
     ├── CMakeLists.txt
-    ├── CMakePresets.json
     ├── compile.sh          # build script (gcc only)
     ├── run.sh              # SLURM job script
     ├── main.cpp
     └── core/
         ├── include/
         │   ├── cholesky_factor.hpp
         │   ├── functions.hpp
-        │   ├── tile_generation.hpp
+        │   ├── matrix_generation.hpp
+        │   ├── adapter_plasma_fp64.hpp  # only used when ENABLE_PLASMA=ON
         │   ├── validate.hpp
         │   └── adapter_cblas_fp64.hpp
         └── src/
             ├── cholesky_factor.cpp
             ├── functions.cpp
-            ├── tile_generation.cpp
+            ├── matrix_generation.cpp
+            ├── adapter_plasma_fp64.cpp  # only built when ENABLE_PLASMA=ON
             ├── validate.cpp
             └── adapter_cblas_fp64.cpp
 ```
 
+When `ENABLE_LAPACKE=OFF`, `adapter_cblas_fp64.cpp` and `validate.cpp` are still compiled and linked (they share cblas/lapacke symbols with PLASMA's BLAS dependency); only the runtime dispatch of the `lapacke` mode is skipped.
+
+## Formatting
+
+A repository-wide [`.clang-format`](.clang-format) governs all subtrees. The top-level [`CMakeLists.txt`](CMakeLists.txt) wires up `clang-format` and `cmake-format` targets via [Format.cmake](https://github.com/TheLartians/Format.cmake); configure once from the repo root and use the targets:
+
+```bash
+cmake -B build-fmt
+cmake --build build-fmt --target check-clang-format   # CI-style check
+cmake --build build-fmt --target fix-clang-format     # apply formatting
+```
+
+Each subproject (`openmp/`, `hpx/`, `reference/`) is its own standalone CMake project with its own dependencies, so the top-level `CMakeLists.txt` only handles formatting. The actual builds still happen from inside each subdirectory via its `compile.sh`.
+
 ## Contributing
 
-We would be happy to expand Cholesky-Bench to additional asynchronous many-task (AMT) runtimes. If you have an implementation you would like to add, feel free to open a pull request.
+We would be happy to expand Cholesky-Bench to additional asynchronous many-task (AMT) runtimes. If you would like to add an implementation, feel free to open a pull request.
 
 ## How to cite
 

diff --git a/hpx/CMakeLists.txt b/hpx/CMakeLists.txt
@@ -15,29 +15,13 @@ option(
   DISABLE_COMPUTATION
   "Replace all BLAS/LAPACK calls and tile generation with no-ops; keeps the dataflow graph intact so HPX scheduling overhead can be measured in isolation"
   OFF)
-option(ENABLE_FORMAT_TARGETS "Enable clang-format / cmake-format targets"
-       ${PROJECT_IS_TOP_LEVEL})
-
 if(ENABLE_VALIDATION AND DISABLE_COMPUTATION)
   message(
     FATAL_ERROR
       "ENABLE_VALIDATION and DISABLE_COMPUTATION are mutually exclusive: "
       "residual validation needs a real factorization to check against.")
 endif()
 
-if(ENABLE_FORMAT_TARGETS)
-  find_package(format QUIET)
-  if(NOT format_FOUND)
-    include(FetchContent)
-    FetchContent_Declare(
-      format
-      GIT_REPOSITORY https://github.com/TheLartians/Format.cmake.git
-      GIT_TAG v1.8.1
-      QUIET)
-    FetchContent_MakeAvailable(format)
-  endif()
-endif()
-
 if(NOT CMAKE_SKIP_INSTALL_RULES)
   # Our installs follow the standard GNU directory layout. This include needs to
   # come first since we need the CMAKE_INSTALL_* in the CMakeLists.txt of each

diff --git a/hpx/CMakePresets.json b/hpx/CMakePresets.json
diff --git a/hpx/core/include/adapter_cblas_fp64.hpp b/hpx/core/include/adapter_cblas_fp64.hpp
@@ -126,7 +126,7 @@ void gemm(const vector &A,
  * @param dep_future dependency future to wait on before executing
  * @param A matrix to be factorized (mutated in-place)
  * @param N matrix dimension
- * @return void future signalling completion
+ * @return void future signaling completion
  */
 void_future potrf_f(void_future dep_future, vector &A, const int N);
 
@@ -140,7 +140,7 @@ void_future potrf_f(void_future dep_future, vector &A, const int N);
  * @param M second dimension
  * @param transpose_L transpose flag for L
  * @param side_L side flag for L
- * @return void future signalling completion
+ * @return void future signaling completion
  */
 void_future trsm_f(void_future dep_L,
                    void_future dep_A,
@@ -158,7 +158,7 @@ void_future trsm_f(void_future dep_L,
  * @param A base matrix (mutated in-place)
  * @param B symmetric update matrix
  * @param N matrix dimension
- * @return void future signalling completion
+ * @return void future signaling completion
  */
 void_future syrk_f(void_future dep_A, void_future dep_B, vector &A, const vector &B, const int N);
 
@@ -175,7 +175,7 @@ void_future syrk_f(void_future dep_A, void_future dep_B, vector &A, const vector
  * @param K third matrix dimension
  * @param transpose_A transpose flag for A
  * @param transpose_B transpose flag for B
- * @return void future signalling completion
+ * @return void future signaling completion
  */
 void_future
 gemm_f(void_future dep_A,