diff --git a/hpx/.clang-format b/.clang-format
similarity index 100%
rename from hpx/.clang-format
rename to .clang-format
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index cd6047c..0c82399 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -19,14 +19,14 @@ jobs:
     - name: Install cmakelang
       run: pip3 install cmakelang
 
-    - name: Configure dummy project
-      run: cd hpx && cmake -B build-fmt -DBUILD_CORE=OFF -DCLANG_FORMAT_PROGRAM=$(which clang-format-18)
+    - name: Configure top-level format project
+      run: cmake -B build-fmt -DCLANG_FORMAT_PROGRAM=$(which clang-format-18)
 
     - name: Check code formatting
       id: clangformat
       run: |
         set +e
-        cd hpx && cmake --build build-fmt --target check-clang-format
+        cmake --build build-fmt --target check-clang-format
         status=$?
         if [ $status -ne 0 ]; then
           echo "Formatting errors found!"
@@ -37,14 +37,14 @@ jobs:
         fi
 
     - name: Check CMake formatting
-      # Let's run the CMake formatting checks even if our code is mis-formatted.
+      # Run CMake formatting checks even if the C++ check failed.
       if: success() || steps.clangformat.conclusion == 'failure'
-      # Note that diff generation for cmake-format is somewhat broken in the upstream project.
-      # Diffs always end up with incorrect paths so manual fixes would be necessary, which we sidestep
-      # by re-formatting in-place and then using `git diff`.
+      # Diff generation for cmake-format is somewhat broken upstream (paths
+      # come out wrong), so we sidestep it by fixing in place and using
+      # `git diff` to produce the patch.
       run: |
         set +e
-        cd hpx && cmake --build build-fmt --target check-cmake-format
+        cmake --build build-fmt --target check-cmake-format
         status=$?
         if [ $status -ne 0 ]; then
           echo "Formatting errors found!"
@@ -61,5 +61,5 @@ jobs:
       with:
         name: Formatting fix .patch files
         path: |
-          hpx/clang-format.patch
-          hpx/cmake-format.patch
+          clang-format.patch
+          cmake-format.patch
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..4184e03
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,14 @@
+cmake_minimum_required(VERSION 3.14)
+# Formatting-only coordinator: wires the clang-format / cmake-format targets
+# from Format.cmake once for the whole repository. LANGUAGES NONE because
+# nothing is compiled here; each subdirectory stays a standalone CMake project
+# (own dependencies, own compile.sh) and is built from inside that directory.
+project(cholesky_bench LANGUAGES NONE)
+
+include(FetchContent)
+FetchContent_Declare(
+  format
+  GIT_REPOSITORY https://github.com/TheLartians/Format.cmake.git
+  GIT_TAG v1.8.1
+  QUIET)
+FetchContent_MakeAvailable(format)
diff --git a/README.md b/README.md
index 7f06a60..c2b694a 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Cholesky-Bench
 
-Cholesky-Bench benchmarks right-looking tiled Cholesky factorization from fork-join to asynchronous tasks across several parallelism models, currently comparing OpenMP and HPX implementations side by side.
+Cholesky-Bench benchmarks right-looking tiled Cholesky factorization from fork-join to asynchronous tasks across several parallelism models, currently comparing OpenMP and HPX implementations side by side. A non-tiled parallel reference is also included as a baseline.
 
 ## Variants
 
@@ -24,27 +24,49 @@ Cholesky-Bench benchmarks right-looking tiled Cholesky factorization from fork-j
 | `loop_two` | Collapsed fork-join with dynamic schedule for trailing-update |
 | `async_void` |  Fully asynchronous tasking with dataflow using `hpx::shared_future<void>` |
 
+### Reference (`reference/`)
+
+| Mode | Description |
+|------|-------------|
+| `lapacke` | Single `LAPACKE_dpotrf` call on the full matrix; no tiling. Parallelism is delegated entirely to a threaded BLAS (OpenBLAS built with `threads=openmp`, or threaded Intel oneMKL via `ENABLE_MKL=ON`). Enabled by default; disable with `ENABLE_LAPACKE=OFF`. |
+| `plasma` | Single `plasma_dpotrf` call on the full matrix (PLASMA's high-level synchronous API). PLASMA does its own tiled, OpenMP-task-based parallel Cholesky internally; tile size is left at PLASMA's built-in default. Built only when `ENABLE_PLASMA=ON`. |
+
+This directory is the natural baseline for the OpenMP and HPX tiled implementations: the `lapacke` mode isolates the contribution of vendor-provided dense-LA parallelism, and the `plasma` mode adds a tiled-parallel competitor that uses the same OpenMP runtime as the in-house variants.
+
+#### PLASMA descriptor int32 overflow
+
+PLASMA 24.8.7's `plasma_desc_*_create()` routines compute their tile-storage size as `int * int` before casting to `size_t`, which silently overflows once the padded triangular tile-area exceeds `INT32_MAX`. With the default `nb=256`, the boundary is at `N=65280` (`mt=255`).
+
+The benchmark handles this transparently:
+
+- For sweep sizes `N` in `(65280, 65536]`, **only `plasma` is silently clamped down to 65280** for that iteration; `lapacke` runs at the full `N`. The `problem_size` column still reports the original `N`, so in this range the `plasma` timing is for a factorization of size 65280 even though the row is labelled with the requested size.
+- For `N > 65536` `plasma` records `nan`. `lapacke` is unaffected by the int32 ceiling and continues normally.
+
 ## Dependencies
 
-Both implementations share the same sequential BLAS backend and are built with CMake (≥ 3.23) and C++20.
+All three implementations are built with CMake (≥ 3.23) and C++20. The OpenMP and HPX directories link against a *sequential* BLAS (parallelism is at the tile level); the `reference/` directory links against a *threaded* BLAS instead.
 
-| Dependency | OpenMP | HPX |
-|---|---|---|
-| OpenBLAS 0.3.28 | ✓ (default) | ✓ (default) |
-| Intel oneMKL | optional (`ENABLE_MKL=ON`) | optional (`ENABLE_MKL=ON`) |
-| HPX 1.11.0 + jemalloc | — | ✓ |
-| GCC 14.2.0 | ✓ | ✓ |
-| LLVM/Clang 22.1.2 | optional | — |
+| Dependency | OpenMP | HPX | Reference |
+|---|---|---|---|
+| OpenBLAS 0.3.28 (sequential) | ✓ (default) | ✓ (default) | — |
+| OpenBLAS 0.3.28 (`threads=openmp`) | — | — | ✓ (default) |
+| Intel oneMKL (sequential) | optional (`ENABLE_MKL=ON`) | optional (`ENABLE_MKL=ON`) | — |
+| Intel oneMKL (`intel_thread`) | — | — | optional (`ENABLE_MKL=ON`) |
+| PLASMA 24.8.7 | — | — | optional (`ENABLE_PLASMA=ON`) |
+| HPX 1.11.0 + jemalloc | — | ✓ | — |
+| GCC 14.2.0 | ✓ | ✓ | ✓ |
+| LLVM/Clang 22.1.2 | optional | — | — |
 
-Dependencies are managed via [Spack](https://spack.io/). The compile scripts auto-detect the host system and load the correct Spack environment.
+Dependencies are managed via [Spack](https://spack.io/).
 
 ## Build
 
-From within the `openmp/` or `hpx/` directory, run:
+From within the `openmp/`, `hpx/`, or `reference/` directory, run:
 
 ```bash
-./compile.sh [gcc|llvm]   # OpenMP: gcc (default) or llvm
-./compile.sh              # HPX: always gcc
+./compile.sh [gcc|llvm]   # OpenMP:    gcc (default) or llvm
+./compile.sh              # HPX:       always gcc
+./compile.sh              # Reference: always gcc
 ```
 
 The script clears and recreates the `build/` directory, then runs CMake in Release mode followed by a parallel make.
@@ -55,10 +77,12 @@ These can be set as environment variables before calling `compile.sh`:
 
 | Option | Default | Description |
 |--------|---------|-------------|
-| `ENABLE_VALIDATION` | `OFF` | After each factorization, compute the relative residual ‖A − LL^T‖_F / ‖A‖_F and warn if it exceeds 1e-10. Mutually exclusive with `DISABLE_COMPUTATION`. |
-| `DISABLE_COMPUTATION` | `OFF` | Replace all BLAS/tile-generation calls with no-ops. The task graph and loops remain intact, so scheduling overhead can be measured in isolation. |
-| `ENABLE_DYNAMIC_SCHEDULE` | `OFF` | *(OpenMP only)* Use `schedule(dynamic,1)` on the trailing-update worksharing loops in `for_collapse`. Requires the LLVM toolchain; rejected at compile time with GCC. |
-| `ENABLE_MKL` | `OFF` | Link against Intel oneMKL instead of OpenBLAS. |
+| `ENABLE_VALIDATION` | `OFF` | After each factorization, compute the relative residual ‖A − LL^T‖_F / ‖A‖_F and warn if it exceeds 1e-10. In `openmp/` and `hpx/`, mutually exclusive with `DISABLE_COMPUTATION`. |
+| `DISABLE_COMPUTATION` | `OFF` | *(`openmp/` and `hpx/` only)* Replace all BLAS/tile-generation calls with no-ops. The task graph and loops remain intact, so scheduling overhead can be measured in isolation. |
+| `ENABLE_DYNAMIC_SCHEDULE` | `OFF` | *(`openmp/` only)* Use `schedule(dynamic,1)` on the trailing-update worksharing loops in `for_collapse`. Requires the LLVM toolchain; rejected at compile time with GCC. |
+| `ENABLE_MKL` | `OFF` | Link against Intel oneMKL instead of OpenBLAS. In `openmp/` and `hpx/` this is the *sequential* MKL; in `reference/` it is the *threaded* MKL. |
+| `ENABLE_PLASMA` | `OFF` | *(`reference/` only)* Also build the PLASMA `plasma_dpotrf` variant. Adds a `plasma` column alongside `lapacke` in the runtime output. |
+| `ENABLE_LAPACKE` | `ON` | *(`reference/` only)* Run the `lapacke` mode at runtime. Set `OFF` to skip it (e.g. when only `plasma` is wanted). Linking is unchanged either way — PLASMA and validation still need cblas/lapacke symbols. |
 
 **Examples:**
 
@@ -71,6 +95,15 @@ ENABLE_DYNAMIC_SCHEDULE=ON ./compile.sh llvm
 
 # HPX: measure pure scheduling overhead
 DISABLE_COMPUTATION=ON ./compile.sh
+
+# Reference: threaded MKL baseline
+ENABLE_MKL=ON ./compile.sh
+
+# Reference: also build the PLASMA tiled-Cholesky variant
+ENABLE_PLASMA=ON ./compile.sh
+
+# Reference: PLASMA only, skip the LAPACKE_dpotrf column at runtime
+ENABLE_LAPACKE=OFF ENABLE_PLASMA=ON ./compile.sh
 ```
 
 ## Run
@@ -89,16 +122,22 @@ OMP_NUM_THREADS=128 OMP_PROC_BIND=close OMP_PLACES=cores \
   --hpx:threads=128 \
   --loop=1 --size_start=1024 --size_stop=65536 \
   --tiles_start=64 --tiles_stop=64
+
+# Reference (parallel BLAS, no tiling)
+OMP_NUM_THREADS=128 OMP_PROC_BIND=close OMP_PLACES=cores \
+  ./build/cholesky_reference \
+  --loop 1 --size_start 1024 --size_stop 65536
 ```
 
 ### Via SLURM
 
-Both directories contain a `run.sh` that is a ready-to-submit SLURM batch script (128 CPUs, exclusive node, 144-hour wall time):
+All three directories contain a `run.sh` that is a ready-to-submit SLURM batch script (128 CPUs, exclusive node, 144-hour wall time):
 
 ```bash
-sbatch openmp/run.sh          # gcc runtime (default)
-sbatch openmp/run.sh llvm     # llvm runtime
+sbatch openmp/run.sh             # gcc runtime (default)
+sbatch openmp/run.sh llvm        # llvm runtime
 sbatch hpx/run.sh
+sbatch reference/run.sh          # gcc runtime; defaults to N=65280 (see PLASMA boundary note)
 ```
 
 ### Command-line arguments
@@ -107,7 +146,7 @@ sbatch hpx/run.sh
 |----------|---------|-------------|
 | `--loop` / `--loop=` | 1 | Number of timed repetitions per configuration |
 | `--size_start` / `--size_stop` | 32 / 128 | Problem size range (doubled each step) |
-| `--tiles_start` / `--tiles_stop` | 16 / 32 | Tile count range (doubled each step) |
+| `--tiles_start` / `--tiles_stop` | 16 / 32 | Tile count range (doubled each step). Accepted but ignored by the `reference/` binary, which has no tiling axis. |
 
 ## Output
 
@@ -116,24 +155,21 @@ Results are appended to a text file in the working directory:
 ```
 runtimes_openmp_cholesky_<suffix>.txt
 runtimes_hpx_cholesky_<suffix>.txt
+runtimes_reference_cholesky_<suffix>.txt
 ```
 
 The suffix encodes which dimension is swept: `tile_` if tiles vary, `size_` if size varies, followed by the loop count. The file uses `;`-separated columns:
 
-```
-threads;problem_size;tile_size;n_tiles;for_collapse;for_naive;task_naive;task_depend
-128;65536;1024;64;3.14;3.21;2.98;2.87
-```
-
-The same lines are also printed to stdout.
+The `reference/` binary reports a `lapacke` column (suppressed by `ENABLE_LAPACKE=OFF`) plus a `plasma` column when built with `ENABLE_PLASMA=ON`, with `tile_size = problem_size` and `n_tiles = 1`, so its runtime files merge cleanly with the tiled benchmarks on the `problem_size` key.
 
 ## Repository structure
 
 ```
 .
+├── .clang-format           # repo-wide style; governs all three subtrees
+├── CMakeLists.txt          # top-level coordinator (formatting only; LANGUAGES NONE)
 ├── openmp/
 │   ├── CMakeLists.txt
-│   ├── CMakePresets.json
 │   ├── compile.sh          # build script (gcc or llvm)
 │   ├── run.sh              # SLURM job script
 │   ├── main.cpp
@@ -150,9 +186,26 @@ The same lines are also printed to stdout.
 │           ├── tile_generation.cpp
 │           ├── validate.cpp
 │           └── adapter_cblas_fp64.cpp
-└── hpx/
+├── hpx/
+│   ├── CMakeLists.txt
+│   ├── compile.sh          # build script (gcc only)
+│   ├── run.sh              # SLURM job script
+│   ├── main.cpp
+│   └── core/
+│       ├── include/
+│       │   ├── cholesky_factor.hpp
+│       │   ├── functions.hpp
+│       │   ├── tile_generation.hpp
+│       │   ├── validate.hpp
+│       │   └── adapter_cblas_fp64.hpp
+│       └── src/
+│           ├── cholesky_factor.cpp
+│           ├── functions.cpp
+│           ├── tile_generation.cpp
+│           ├── validate.cpp
+│           └── adapter_cblas_fp64.cpp
+└── reference/
     ├── CMakeLists.txt
-    ├── CMakePresets.json
     ├── compile.sh          # build script (gcc only)
     ├── run.sh              # SLURM job script
     ├── main.cpp
@@ -160,20 +213,36 @@ The same lines are also printed to stdout.
         ├── include/
         │   ├── cholesky_factor.hpp
         │   ├── functions.hpp
-        │   ├── tile_generation.hpp
+        │   ├── matrix_generation.hpp
+        │   ├── adapter_plasma_fp64.hpp  # only used when ENABLE_PLASMA=ON
         │   ├── validate.hpp
         │   └── adapter_cblas_fp64.hpp
         └── src/
             ├── cholesky_factor.cpp
             ├── functions.cpp
-            ├── tile_generation.cpp
+            ├── matrix_generation.cpp
+            ├── adapter_plasma_fp64.cpp  # only built when ENABLE_PLASMA=ON
             ├── validate.cpp
             └── adapter_cblas_fp64.cpp
 ```
 
+When `ENABLE_LAPACKE=OFF`, `adapter_cblas_fp64.cpp` (and `validate.cpp`, when `ENABLE_VALIDATION=ON`) is still compiled and linked, because PLASMA and validation need the same cblas/lapacke symbols; only the runtime dispatch of the `lapacke` mode is skipped.
+
+## Formatting
+
+A repository-wide [`.clang-format`](.clang-format) governs all subtrees. The top-level [`CMakeLists.txt`](CMakeLists.txt) wires up `clang-format` and `cmake-format` targets via [Format.cmake](https://github.com/TheLartians/Format.cmake); configure once from the repo root and use the targets:
+
+```bash
+cmake -B build-fmt
+cmake --build build-fmt --target check-clang-format   # CI-style check
+cmake --build build-fmt --target fix-clang-format     # apply formatting
+```
+
+Each subproject (`openmp/`, `hpx/`, `reference/`) is its own standalone CMake project with its own dependencies, so the top-level `CMakeLists.txt` only handles formatting. The actual builds still happen from inside each subdirectory via its `compile.sh`.
+
 ## Contributing
 
-We would be happy to expand Cholesky-Bench to additional asynchronous many-task (AMT) runtimes. If you have an implementation you would like to add, feel free to open a pull request.
+We would be happy to expand Cholesky-Bench to additional asynchronous many-task (AMT) runtimes. If you would like to add an implementation, feel free to open a pull request.
 
 ## How to cite
 
diff --git a/hpx/CMakeLists.txt b/hpx/CMakeLists.txt
index b2b7fdc..1cc87f5 100644
--- a/hpx/CMakeLists.txt
+++ b/hpx/CMakeLists.txt
@@ -15,9 +15,6 @@ option(
   DISABLE_COMPUTATION
   "Replace all BLAS/LAPACK calls and tile generation with no-ops; keeps the dataflow graph intact so HPX scheduling overhead can be measured in isolation"
   OFF)
-option(ENABLE_FORMAT_TARGETS "Enable clang-format / cmake-format targets"
-       ${PROJECT_IS_TOP_LEVEL})
-
 if(ENABLE_VALIDATION AND DISABLE_COMPUTATION)
   message(
     FATAL_ERROR
@@ -25,19 +22,6 @@ if(ENABLE_VALIDATION AND DISABLE_COMPUTATION)
       "residual validation needs a real factorization to check against.")
 endif()
 
-if(ENABLE_FORMAT_TARGETS)
-  find_package(format QUIET)
-  if(NOT format_FOUND)
-    include(FetchContent)
-    FetchContent_Declare(
-      format
-      GIT_REPOSITORY https://github.com/TheLartians/Format.cmake.git
-      GIT_TAG v1.8.1
-      QUIET)
-    FetchContent_MakeAvailable(format)
-  endif()
-endif()
-
 if(NOT CMAKE_SKIP_INSTALL_RULES)
   # Our installs follow the standard GNU directory layout. This include needs to
   # come first since we need the CMAKE_INSTALL_* in the CMakeLists.txt of each
diff --git a/hpx/CMakePresets.json b/hpx/CMakePresets.json
deleted file mode 100644
index f3839f8..0000000
--- a/hpx/CMakePresets.json
+++ /dev/null
@@ -1,17 +0,0 @@
-{
-  "version": 6,
-  "cmakeMinimumRequired": {
-    "major": 3,
-    "minor": 22,
-    "patch": 0
-  },
-  "configurePresets": [
-    {
-      "name": "clang-tidy",
-      "hidden": true,
-      "cacheVariables": {
-        "CMAKE_CXX_CLANG_TIDY": "clang-tidy;--header-filter=^${sourceDir}/"
-      }
-    }
-  ]
-}
diff --git a/hpx/core/include/adapter_cblas_fp64.hpp b/hpx/core/include/adapter_cblas_fp64.hpp
index 5440833..91ce5c9 100644
--- a/hpx/core/include/adapter_cblas_fp64.hpp
+++ b/hpx/core/include/adapter_cblas_fp64.hpp
@@ -126,7 +126,7 @@ void gemm(const vector &A,
  * @param dep_future dependency future to wait on before executing
  * @param A matrix to be factorized (mutated in-place)
  * @param N matrix dimension
- * @return void future signalling completion
+ * @return void future signaling completion
  */
 void_future potrf_f(void_future dep_future, vector &A, const int N);
 
@@ -140,7 +140,7 @@ void_future potrf_f(void_future dep_future, vector &A, const int N);
  * @param M second dimension
  * @param transpose_L transpose flag for L
  * @param side_L side flag for L
- * @return void future signalling completion
+ * @return void future signaling completion
  */
 void_future trsm_f(void_future dep_L,
                    void_future dep_A,
@@ -158,7 +158,7 @@ void_future trsm_f(void_future dep_L,
  * @param A base matrix (mutated in-place)
  * @param B symmetric update matrix
  * @param N matrix dimension
- * @return void future signalling completion
+ * @return void future signaling completion
  */
 void_future syrk_f(void_future dep_A, void_future dep_B, vector &A, const vector &B, const int N);
 
@@ -175,7 +175,7 @@ void_future syrk_f(void_future dep_A, void_future dep_B, vector &A, const vector
  * @param K third matrix dimension
  * @param transpose_A transpose flag for A
  * @param transpose_B transpose flag for B
- * @return void future signalling completion
+ * @return void future signaling completion
  */
 void_future
 gemm_f(void_future dep_A,
diff --git a/openmp/.clang-format b/openmp/.clang-format
deleted file mode 100644
index e8d875c..0000000
--- a/openmp/.clang-format
+++ /dev/null
@@ -1,174 +0,0 @@
----
-Language: Cpp
-AccessModifierOffset: -2
-AlignAfterOpenBracket: Align
-AlignArrayOfStructures: None
-AlignConsecutiveAssignments: None
-AlignConsecutiveBitFields: None
-AlignConsecutiveDeclarations: None
-AlignConsecutiveMacros: None
-AlignConsecutiveShortCaseStatements:
-  Enabled: true
-  AcrossEmptyLines: false
-  AcrossComments: false
-  AlignCaseColons: false
-AlignEscapedNewlines: Right
-AlignOperands: Align
-AlignTrailingComments:
-  Kind: Always
-AllowAllArgumentsOnNextLine: true
-AllowAllParametersOfDeclarationOnNextLine: true
-AllowBreakBeforeNoexceptSpecifier: OnlyWithParen
-AllowShortBlocksOnASingleLine: Empty
-AllowShortCaseLabelsOnASingleLine: true
-AllowShortEnumsOnASingleLine: true
-AllowShortFunctionsOnASingleLine: All
-AllowShortIfStatementsOnASingleLine: Never
-AllowShortLambdasOnASingleLine: All
-AllowShortLoopsOnASingleLine: true
-AlwaysBreakAfterDefinitionReturnType: None
-AlwaysBreakBeforeMultilineStrings: false
-AlwaysBreakAfterReturnType: None
-AlwaysBreakTemplateDeclarations: Yes
-BinPackArguments: false
-BinPackParameters: false
-BitFieldColonSpacing: Both
-BraceWrapping:
-  AfterCaseLabel: false
-  AfterClass: true
-  AfterControlStatement: Always
-  AfterEnum: false
-  AfterFunction: true
-  AfterNamespace: true
-  AfterObjCDeclaration: true
-  AfterStruct: true
-  AfterUnion: true
-  AfterExternBlock: false
-  BeforeCatch: true
-  BeforeElse: true
-  BeforeLambdaBody: true
-  BeforeWhile: false
-  IndentBraces: false
-  SplitEmptyFunction: false
-  SplitEmptyRecord: false
-  SplitEmptyNamespace: false
-BreakAfterAttributes: Never
-BreakAfterJavaFieldAnnotations: false
-BreakBeforeBinaryOperators: NonAssignment
-BreakBeforeBraces: Custom
-BreakBeforeConceptDeclarations: Always
-BreakBeforeInlineASMColon: OnlyMultiline
-BreakBeforeTernaryOperators: true
-BreakConstructorInitializers: AfterColon
-BreakInheritanceList: AfterComma
-BreakStringLiterals: true
-ColumnLimit: 120
-CommentPragmas: '^ IWYU pragma:'
-CompactNamespaces: false
-ConstructorInitializerIndentWidth: 4
-ContinuationIndentWidth: 4
-Cpp11BracedListStyle: false
-DerivePointerAlignment: false
-DisableFormat: false
-EmptyLineAfterAccessModifier: Never
-EmptyLineBeforeAccessModifier: LogicalBlock
-FixNamespaceComments: true
-ForEachMacros: [ 'foreach', 'Q_FOREACH', 'BOOST_FOREACH' ]
-IfMacros: [ ]
-IncludeBlocks: Regroup
-IncludeCategories:
-  - Regex: '^"gprat/'
-    Priority: 1
-  - Regex: '^"(tests|bindings)/'
-    Priority: 2
-  - Regex: '^"(fmt|catch2|pybind)'
-    Priority: 3
-  - Regex: '^.*'
-    Priority: 4
-IncludeIsMainRegex: '(Test)?$'
-IncludeIsMainSourceRegex: '(\.cu|\.hip)'
-IndentAccessModifiers: false
-IndentCaseBlocks: true
-IndentCaseLabels: true
-IndentExternBlock: NoIndent
-IndentGotoLabels: false
-IndentPPDirectives: None
-IndentRequiresClause: false
-IndentWidth: 4
-IndentWrappedFunctionNames: false
-InsertBraces: true
-InsertNewlineAtEOF: true
-InsertTrailingCommas: None
-IntegerLiteralSeparator:
-  Binary: 8
-  Decimal: 3
-  DecimalMinDigits: 5
-  Hex: -1
-KeepEmptyLinesAtEOF: false
-KeepEmptyLinesAtTheStartOfBlocks: false
-LambdaBodyIndentation: Signature
-LineEnding: DeriveLF
-MacroBlockBegin: ''
-MacroBlockEnd: ''
-Macros: [ ]
-MaxEmptyLinesToKeep: 1
-NamespaceIndentation: None
-NamespaceMacros: [ ]
-PPIndentWidth: -1
-PackConstructorInitializers: Never
-PenaltyBreakAssignment: 2
-PenaltyBreakBeforeFirstCallParameter: 19
-PenaltyBreakComment: 300
-PenaltyBreakFirstLessLess: 120
-PenaltyBreakOpenParenthesis: 0
-PenaltyBreakString: 1000
-PenaltyBreakTemplateDeclaration: 10
-PenaltyExcessCharacter: 1000000
-PenaltyIndentedWhitespace: 1
-PenaltyReturnTypeOnItsOwnLine: 60
-PointerAlignment: Right
-QualifierAlignment: Custom
-QualifierOrder: [ 'inline', 'constexpr', 'static', 'friend', 'restrict', 'const', 'volatile', 'type' ]
-ReferenceAlignment: Pointer
-ReflowComments: true
-RemoveBracesLLVM: false
-RemoveParentheses: Leave
-RemoveSemicolon: true
-RequiresClausePosition: OwnLine
-RequiresExpressionIndentation: OuterScope
-SeparateDefinitionBlocks: Always
-ShortNamespaceLines: 1
-SortIncludes: CaseInsensitive
-SortUsingDeclarations: LexicographicNumeric
-SpaceAfterCStyleCast: true
-SpaceAfterLogicalNot: false
-SpaceAfterTemplateKeyword: true
-SpaceAroundPointerQualifiers: Default
-SpaceBeforeAssignmentOperators: true
-SpaceBeforeCaseColon: false
-SpaceBeforeCpp11BracedList: false
-SpaceBeforeCtorInitializerColon: true
-SpaceBeforeInheritanceColon: true
-SpaceBeforeJsonColon: false
-SpaceBeforeParens: ControlStatements
-SpaceBeforeRangeBasedForLoopColon: true
-SpaceBeforeSquareBrackets: false
-SpaceInEmptyBlock: true
-SpacesBeforeTrailingComments: 2
-SpacesInAngles: false
-SpacesInContainerLiterals: true
-SpacesInLineCommentPrefix:
-  Minimum: 1
-  Maximum: 1
-SpacesInParens: Never
-SpacesInSquareBrackets: false
-Standard: c++17
-StatementAttributeLikeMacros: [ ]
-StatementMacros: [ 'Q_UNUSED', 'QT_REQUIRE_VERSION' ]
-TabWidth: 4
-TypeNames: [ ]
-TypenameMacros: [ ]
-UseTab: Never
-WhitespaceSensitiveMacros: [ 'STRINGIZE', 'PP_STRINGIZE', 'BOOST_PP_STRINGIZE' ]
-...
-
diff --git a/openmp/CMakeLists.txt b/openmp/CMakeLists.txt
index f506c0e..038bb9e 100644
--- a/openmp/CMakeLists.txt
+++ b/openmp/CMakeLists.txt
@@ -19,9 +19,6 @@ option(
   ENABLE_DYNAMIC_SCHEDULE
   "Use schedule(dynamic, 1) on the trailing-update worksharing loops in for_collapse. OFF by default so GCC builds compile out of the box. Turn ON for LLVM builds where the dynamic schedule is supported and gives better load balancing."
   OFF)
-option(ENABLE_FORMAT_TARGETS "Enable clang-format / cmake-format targets"
-       ${PROJECT_IS_TOP_LEVEL})
-
 if(ENABLE_VALIDATION AND DISABLE_COMPUTATION)
   message(
     FATAL_ERROR
@@ -29,19 +26,6 @@ if(ENABLE_VALIDATION AND DISABLE_COMPUTATION)
       "residual validation needs a real factorization to check against.")
 endif()
 
-if(ENABLE_FORMAT_TARGETS)
-  find_package(format QUIET)
-  if(NOT format_FOUND)
-    include(FetchContent)
-    FetchContent_Declare(
-      format
-      GIT_REPOSITORY https://github.com/TheLartians/Format.cmake.git
-      GIT_TAG v1.8.1
-      QUIET)
-    FetchContent_MakeAvailable(format)
-  endif()
-endif()
-
 if(NOT CMAKE_SKIP_INSTALL_RULES)
   # Our installs follow the standard GNU directory layout. This include needs to
   # come first since we need the CMAKE_INSTALL_* in the CMakeLists.txt of each
diff --git a/openmp/CMakePresets.json b/openmp/CMakePresets.json
deleted file mode 100644
index f3839f8..0000000
--- a/openmp/CMakePresets.json
+++ /dev/null
@@ -1,17 +0,0 @@
-{
-  "version": 6,
-  "cmakeMinimumRequired": {
-    "major": 3,
-    "minor": 22,
-    "patch": 0
-  },
-  "configurePresets": [
-    {
-      "name": "clang-tidy",
-      "hidden": true,
-      "cacheVariables": {
-        "CMAKE_CXX_CLANG_TIDY": "clang-tidy;--header-filter=^${sourceDir}/"
-      }
-    }
-  ]
-}
diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt
new file mode 100644
index 0000000..69996cd
--- /dev/null
+++ b/reference/CMakeLists.txt
@@ -0,0 +1,94 @@
+cmake_minimum_required(VERSION 3.23)
+project(cholesky_reference)
+
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# What to build?
+option(BUILD_CORE "Build the core library" ON)
+option(ENABLE_MKL "Enable Intel oneMKL support (threaded)" OFF)
+option(
+  ENABLE_PLASMA
+  "Build the PLASMA tiled-Cholesky reference variant in addition to the LAPACKE_dpotrf one"
+  OFF)
+option(
+  ENABLE_LAPACKE
+  "Run the LAPACKE_dpotrf reference mode at runtime (on by default). Linking is unchanged either way (PLASMA and validation still need cblas/lapacke)."
+  ON)
+option(
+  ENABLE_VALIDATION
+  "Compute ||A - L*L^T||_F / ||A||_F after each factorization (off by default)"
+  OFF)
+
+if(NOT CMAKE_SKIP_INSTALL_RULES)
+  include(GNUInstallDirs)
+endif()
+
+if(BUILD_CORE)
+  if(ENABLE_MKL)
+    set(MKL_INTERFACE_FULL "intel_lp64")
+    set(MKL_THREADING "intel_thread")
+    find_package(MKL CONFIG REQUIRED)
+
+    if(MKL_FOUND)
+      message(STATUS "Intel oneMKL Library found (threaded: ${MKL_THREADING})")
+    else()
+      message(FATAL_ERROR "No BLAS Library found")
+    endif()
+  else()
+    find_library(OpenBLAS_LIB NAMES openblas REQUIRED)
+
+    if(OpenBLAS_LIB)
+      message(STATUS "OpenBLAS Library found at ${OpenBLAS_LIB}")
+      find_path(
+        OpenBLAS_INCLUDE_DIR
+        NAMES cblas.h
+        PATH_SUFFIXES openblas)
+      if(NOT OpenBLAS_INCLUDE_DIR)
+        message(FATAL_ERROR "OpenBLAS include directory not found")
+      endif()
+
+      message(STATUS "OpenBLAS include dir: ${OpenBLAS_INCLUDE_DIR}")
+    else()
+      message(FATAL_ERROR "No BLAS Library found")
+    endif()
+  endif()
+
+  find_package(OpenMP REQUIRED)
+
+  if(ENABLE_PLASMA)
+    find_path(PLASMA_INCLUDE_DIR plasma.h)
+    if(NOT PLASMA_INCLUDE_DIR)
+      message(FATAL_ERROR "ENABLE_PLASMA=ON but plasma.h was not found")
+    endif()
+    find_library(PLASMA_LIB NAMES plasma REQUIRED)
+    find_library(PLASMA_CORE_BLAS_LIB NAMES coreblas plasma_core_blas)
+    message(STATUS "PLASMA include dir: ${PLASMA_INCLUDE_DIR}")
+    message(STATUS "PLASMA library: ${PLASMA_LIB}")
+    if(PLASMA_CORE_BLAS_LIB)
+      message(STATUS "PLASMA coreblas library: ${PLASMA_CORE_BLAS_LIB}")
+    endif()
+  endif()
+
+  add_subdirectory(core)
+
+  # Add the executable
+  add_executable(cholesky_reference main.cpp)
+
+  # Link the libraries
+  target_link_libraries(cholesky_reference PUBLIC Cholesky::core
+                                                  OpenMP::OpenMP_CXX)
+
+  if(ENABLE_VALIDATION)
+    target_compile_definitions(cholesky_reference PRIVATE ENABLE_VALIDATION)
+  endif()
+
+  if(ENABLE_PLASMA)
+    target_compile_definitions(cholesky_reference PRIVATE ENABLE_PLASMA)
+    target_include_directories(cholesky_reference PRIVATE ${PLASMA_INCLUDE_DIR})
+  endif()
+
+  if(ENABLE_LAPACKE)
+    target_compile_definitions(cholesky_reference PRIVATE ENABLE_LAPACKE)
+  endif()
+endif()
diff --git a/reference/compile.sh b/reference/compile.sh
new file mode 100755
index 0000000..b5e66c0
--- /dev/null
+++ b/reference/compile.sh
@@ -0,0 +1,118 @@
+#!/bin/bash
+# Usage: compile.sh
+#
+# Builds the parallel-BLAS reference benchmark: a single LAPACKE_dpotrf call on
+# the full (untiled) matrix, parallelized inside the threaded BLAS and used as
+# a baseline against the tiled fork-join and tasking implementations.
+#
+# CMake project options can be overridden via environment variables
+# (defaults match the project's CMakeLists.txt defaults):
+#   ENABLE_MKL             ON|OFF  (default OFF) - link threaded Intel oneMKL
+#                                                  instead of threaded OpenBLAS
+#   ENABLE_PLASMA          ON|OFF  (default OFF) - also build the PLASMA
+#                                                  plasma_dpotrf variant (extra
+#                                                  'plasma' column in the output)
+#   ENABLE_LAPACKE         ON|OFF  (default ON)  - run the LAPACKE_dpotrf
+#                                                  reference mode at runtime
+#   ENABLE_VALIDATION      ON|OFF  (default OFF) - residual check after each
+#                                                  factorization
+#
+# Examples:
+#   ./compile.sh
+#   ENABLE_MKL=ON ./compile.sh
+#   ENABLE_PLASMA=ON ./compile.sh
+#   ENABLE_LAPACKE=OFF ENABLE_PLASMA=ON ./compile.sh
+#   ENABLE_VALIDATION=ON ./compile.sh
+################################################################################
+set -e # Exit immediately if a command exits with a non-zero status.
+
+################################################################################
+# CMake project options (env-var overridable; defaults match CMakeLists.txt)
+################################################################################
+: "${ENABLE_MKL:=OFF}"
+: "${ENABLE_PLASMA:=OFF}"
+: "${ENABLE_LAPACKE:=ON}"
+: "${ENABLE_VALIDATION:=OFF}"
+
+for var in ENABLE_MKL ENABLE_PLASMA ENABLE_LAPACKE ENABLE_VALIDATION; do
+  case "${!var}" in
+  ON | OFF) ;;
+  *)
+    echo "Error: $var must be ON or OFF (got '${!var}')." >&2
+    exit 1
+    ;;
+  esac
+done
+
+################################################################################
+# Toolchain selection
+################################################################################
+select_toolchain() {
+  module load gcc/14.2.0
+  export CC=gcc
+  export CXX=g++
+}
+
+################################################################################
+# Configurations
+#
+# The reference benchmark links a *threaded* BLAS: it factorizes the whole
+# matrix in one library call and does not parallelize at the tile level itself.
+################################################################################
+if command -v spack &>/dev/null; then
+  echo "Spack command found. Loading libraries."
+  # The short hostname selects the per-machine Spack specs below.
+  HOSTNAME=$(hostname -s)
+
+  if [[ "$HOSTNAME" == "ipvs-epyc1" || "$HOSTNAME" == "ipvs-epyc2" ]]; then
+    # Load gcc 14.2.0 and export CC/CXX (see select_toolchain).
+    select_toolchain
+    if [[ "$ENABLE_MKL" == "OFF" ]]; then
+      # OpenBLAS built with OpenMP threading. NOTE(review): ilp64=true here, but the PLASMA spec below does not pin ilp64 - confirm both resolve to the same OpenBLAS.
+      spack load openblas@0.3.28%gcc@14.2.0 threads=openmp ilp64=true
+    fi
+    if [[ "$ENABLE_PLASMA" == "ON" ]]; then
+      spack load plasma%gcc@14.2.0 ^openblas@0.3.28%gcc@14.2.0 threads=openmp
+    fi
+
+  elif [[ "$HOSTNAME" == "nasrin0" || "$HOSTNAME" == "nasrin1" ]]; then
+    # Load gcc 14.2.0 and export CC/CXX (see select_toolchain).
+    select_toolchain
+    if [[ "$ENABLE_MKL" == "OFF" ]]; then
+      # OpenBLAS built with OpenMP threading. NOTE(review): ilp64=true here, but the PLASMA spec below does not pin ilp64 - confirm both resolve to the same OpenBLAS.
+      spack load openblas@0.3.28%gcc@14.2.0 arch=linux-almalinux9-zen3 threads=openmp ilp64=true
+    fi
+    if [[ "$ENABLE_PLASMA" == "ON" ]]; then
+      spack load plasma%gcc@14.2.0 arch=linux-almalinux9-zen3 ^openblas@0.3.28%gcc@14.2.0 threads=openmp
+    fi
+
+  else
+    echo "Hostname is $HOSTNAME — no action taken."
+  fi
+else
+  echo "Spack command not found. Continuing with the libraries already in the environment."
+fi
+
+################################################################################
+# Compile code
+################################################################################
+rm -rf build && mkdir build && cd build
+
+echo "CMake options:"
+echo "  ENABLE_MKL        = $ENABLE_MKL"
+echo "  ENABLE_PLASMA     = $ENABLE_PLASMA"
+echo "  ENABLE_LAPACKE    = $ENABLE_LAPACKE"
+echo "  ENABLE_VALIDATION = $ENABLE_VALIDATION"
+
+cmake -DCMAKE_BUILD_TYPE=Release \
+  -DENABLE_MKL="$ENABLE_MKL" \
+  -DENABLE_PLASMA="$ENABLE_PLASMA" \
+  -DENABLE_LAPACKE="$ENABLE_LAPACKE" \
+  -DENABLE_VALIDATION="$ENABLE_VALIDATION" \
+  ..
+make -j VERBOSE=1
+cd ..
+
+# Launch Example
+# OMP_NUM_THREADS=128 OMP_PROC_BIND=close OMP_PLACES=cores \
+# ./build/cholesky_reference --size_start 1024 --size_stop 65536 --loop 1
diff --git a/reference/core/CMakeLists.txt b/reference/core/CMakeLists.txt
new file mode 100644
index 0000000..b74c17f
--- /dev/null
+++ b/reference/core/CMakeLists.txt
@@ -0,0 +1,71 @@
+set(SOURCE_FILES src/matrix_generation.cpp src/functions.cpp
+                 src/cholesky_factor.cpp src/adapter_cblas_fp64.cpp)
+
+if(ENABLE_VALIDATION)
+  list(APPEND SOURCE_FILES src/validate.cpp)
+endif()
+
+if(ENABLE_PLASMA)
+  list(APPEND SOURCE_FILES src/adapter_plasma_fp64.cpp)
+endif()
+
+add_library(cholesky_core STATIC ${SOURCE_FILES})
+
+set_property(TARGET cholesky_core PROPERTY EXPORT_NAME core)
+add_library(Cholesky::core ALIAS cholesky_core)
+
+# Add them as PRIVATE sources here so they show up in project files. Can't use
+# PUBLIC etc., see: https://stackoverflow.com/a/62465051
+file(GLOB_RECURSE header_files CONFIGURE_DEPENDS include/*.hpp)
+target_sources(cholesky_core PRIVATE ${header_files})
+
+# Link OpenMP libraries (used by the parallel matrix generator)
+target_link_libraries(cholesky_core PUBLIC OpenMP::OpenMP_CXX)
+
+# Include directories
+target_include_directories(
+  cholesky_core PUBLIC "$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/core/include>")
+
+# Link BLAS
+if(ENABLE_MKL)
+  # Link threaded Intel oneMKL
+  target_link_libraries(cholesky_core PUBLIC MKL::mkl_intel_lp64 MKL::mkl_core
+                                             MKL::MKL MKL::mkl_intel_thread)
+else()
+  # Link threaded OpenBLAS
+  target_link_libraries(cholesky_core PUBLIC ${OpenBLAS_LIB})
+  target_include_directories(cholesky_core PUBLIC ${OpenBLAS_INCLUDE_DIR})
+endif()
+
+if(ENABLE_MKL)
+  target_compile_definitions(cholesky_core PUBLIC ENABLE_MKL)
+endif()
+
+if(ENABLE_PLASMA)
+  target_compile_definitions(cholesky_core PUBLIC ENABLE_PLASMA)
+  target_include_directories(cholesky_core PUBLIC ${PLASMA_INCLUDE_DIR})
+  target_link_libraries(cholesky_core PUBLIC ${PLASMA_LIB})
+  if(PLASMA_CORE_BLAS_LIB)
+    target_link_libraries(cholesky_core PUBLIC ${PLASMA_CORE_BLAS_LIB})
+  endif()
+endif()
+
+target_compile_features(cholesky_core PUBLIC cxx_std_17)
+
+set_property(TARGET cholesky_core PROPERTY POSITION_INDEPENDENT_CODE ON)
+
+if(NOT CMAKE_SKIP_INSTALL_RULES)
+  install(
+    DIRECTORY include/
+    DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
+    COMPONENT Development)
+
+  install(
+    TARGETS cholesky_core
+    EXPORT CholeskyTargets
+    RUNTIME COMPONENT Runtime
+    LIBRARY COMPONENT Runtime NAMELINK_COMPONENT Development
+    ARCHIVE COMPONENT Development
+    INCLUDES
+    DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
+endif()
diff --git a/reference/core/include/adapter_cblas_fp64.hpp b/reference/core/include/adapter_cblas_fp64.hpp
new file mode 100644
index 0000000..11a79e4
--- /dev/null
+++ b/reference/core/include/adapter_cblas_fp64.hpp
@@ -0,0 +1,21 @@
+#ifndef CPU_ADAPTER_CBLAS_FP64_H
+#define CPU_ADAPTER_CBLAS_FP64_H
+
+#pragma once
+
+#include <vector>
+
+using vector = std::vector<double>;
+
+// LAPACK level 3 operations
+
+/**
+ * @brief FP64 In-place Cholesky decomposition of A using a threaded
+ *        LAPACKE_dpotrf2 call.
+ *
+ * @param A row-major matrix of size N*N to be factorized in place
+ * @param N matrix dimension
+ */
+void lapacke_potrf(vector &A, const int N);
+
+#endif  // end of CPU_ADAPTER_CBLAS_FP64_H
diff --git a/reference/core/include/adapter_plasma_fp64.hpp b/reference/core/include/adapter_plasma_fp64.hpp
new file mode 100644
index 0000000..3edd661
--- /dev/null
+++ b/reference/core/include/adapter_plasma_fp64.hpp
@@ -0,0 +1,22 @@
+#ifndef CPU_ADAPTER_PLASMA_FP64_H
+#define CPU_ADAPTER_PLASMA_FP64_H
+
+#pragma once
+
+#include <vector>
+
+namespace cpu
+{
+
+/**
+ * @brief PLASMA tiled Cholesky on a row-major N x N buffer using the
+ *        high-level synchronous API (plasma_dpotrf).
+ *
+ * Throws @c std::runtime_error before calling PLASMA when the descriptor
+ * size computation inside plasma_desc_*_create() would overflow int32.
+ *
+ */
+void plasma_potrf(std::vector<double> &A, int N);
+
+}  // end of namespace cpu
+#endif  // end of CPU_ADAPTER_PLASMA_FP64_H
diff --git a/reference/core/include/cholesky_factor.hpp b/reference/core/include/cholesky_factor.hpp
new file mode 100644
index 0000000..9bcf784
--- /dev/null
+++ b/reference/core/include/cholesky_factor.hpp
@@ -0,0 +1,43 @@
+#ifndef CPU_CHOLESKY_FACTOR_H
+#define CPU_CHOLESKY_FACTOR_H
+
+#pragma once
+
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace cpu
+{
+
+/**
+ * @brief Selectable reference Cholesky back ends.
+ *
+ *   - lapacke : threaded LAPACKE_dpotrf2 call
+ *   - plasma  : plasma_dpotrf call (PLASMA's high-level
+ *               synchronous Cholesky over the OpenMP runtime).
+ */
+enum class Variant { lapacke, plasma };
+
+inline Variant to_variant(const std::string &s)
+{
+    // Only two spellings are recognised; anything else is rejected with
+    // std::invalid_argument before a variant is chosen.
+    const bool is_lapacke = (s == "lapacke");
+    const bool is_plasma = (s == "plasma");
+    if (!(is_lapacke || is_plasma))
+    {
+        throw std::invalid_argument("Unknown Variant: " + s);
+    }
+    return is_lapacke ? Variant::lapacke : Variant::plasma;
+}
+
+/**
+ * @brief Factorize the full row-major N x N @p matrix in place with the
+ *        requested reference variant; on return @p matrix holds the lower
+ *        triangular factor L.
+ */
+void parallel_cholesky(Variant variant, std::vector<double> &matrix, int N);
+
+}  // end of namespace cpu
+#endif  // end of CPU_CHOLESKY_FACTOR_H
diff --git a/reference/core/include/functions.hpp b/reference/core/include/functions.hpp
new file mode 100644
index 0000000..f7e74ba
--- /dev/null
+++ b/reference/core/include/functions.hpp
@@ -0,0 +1,26 @@
+#ifndef CPU_FUNCTIONS_H
+#define CPU_FUNCTIONS_H
+
+#pragma once
+
+#include <cstddef>
+#include <string>
+#include <vector>
+
+namespace cpu
+{
+
+/**
+ * @brief Time a single call to the requested reference variant
+ *        ('lapacke' or 'plasma') on the @p matrix buffer (row-major, N x N).
+ *        The buffer is factorized in place.
+ *
+ * @param matrix  row-major matrix; on return contains the lower-triangular factor L
+ * @param N       matrix dimension
+ * @param variant which reference path to time
+ * @return wall-clock elapsed time in seconds
+ */
+double cholesky(std::vector<double> &matrix, std::size_t N, const std::string &variant);
+
+}  // namespace cpu
+#endif  // end of CPU_FUNCTIONS_H
diff --git a/reference/core/include/matrix_generation.hpp b/reference/core/include/matrix_generation.hpp
new file mode 100644
index 0000000..967398b
--- /dev/null
+++ b/reference/core/include/matrix_generation.hpp
@@ -0,0 +1,22 @@
+#ifndef MATRIX_GENERATION_H
+#define MATRIX_GENERATION_H
+
+#pragma once
+
+#include <cstddef>
+#include <vector>
+
+/**
+ * @brief Generate a deterministic, dense, row-major SPD matrix of size N x N.
+ *
+ * Entries are uniform on [0, 1) using a per-row seed; the diagonal is shifted
+ * by +N to guarantee strict diagonal dominance and therefore symmetric
+ * positive definiteness. The result is stored as a single contiguous
+ * std::vector<double> of length N*N in row-major order.
+ *
+ * @param N matrix dimension
+ * @return owning row-major buffer of length N*N
+ */
+std::vector<double> gen_matrix(std::size_t N);
+
+#endif
diff --git a/reference/core/include/validate.hpp b/reference/core/include/validate.hpp
new file mode 100644
index 0000000..4c666d0
--- /dev/null
+++ b/reference/core/include/validate.hpp
@@ -0,0 +1,28 @@
+#ifndef CPU_VALIDATE_H
+#define CPU_VALIDATE_H
+
+#pragma once
+
+#include <cstddef>
+#include <vector>
+
+namespace cpu
+{
+
+/**
+ * @brief Compute the relative Cholesky residual ||A - L * L^T||_F / ||A||_F
+ *        for the dense, row-major reference factorization.
+ *
+ * The original A is regenerated via gen_matrix, so the caller need not keep a
+ * copy of it; three temporary N*N buffers are allocated internally.
+ *
+ * @param N matrix dimension (must match the factorization)
+ * @param L row-major buffer of length N*N holding the in-place factorization
+ *          result (only the lower triangle is read; the rest is ignored)
+ * @return relative Frobenius residual
+ */
+double cholesky_residual(std::size_t N, const std::vector<double> &L);
+
+}  // namespace cpu
+
+#endif  // end of CPU_VALIDATE_H
diff --git a/reference/core/src/adapter_cblas_fp64.cpp b/reference/core/src/adapter_cblas_fp64.cpp
new file mode 100644
index 0000000..264d442
--- /dev/null
+++ b/reference/core/src/adapter_cblas_fp64.cpp
@@ -0,0 +1,29 @@
+#include "adapter_cblas_fp64.hpp"
+
+#ifdef ENABLE_MKL
+// MKL CBLAS / LAPACKE
+#include "mkl_cblas.h"
+#include "mkl_lapacke.h"
+#else
+#include "cblas.h"
+#include "lapacke.h"
+#endif
+
+#include <stdexcept>
+#include <string>
+
+/**
+ * FP64 in-place Cholesky factorization through LAPACKE_dpotrf2.
+ *
+ * A is passed as a row-major N x N buffer and the 'L' (lower) triangle is
+ * requested. Throws std::runtime_error when LAPACKE returns a non-zero info
+ * code, mirroring the error handling of the PLASMA adapter.
+ */
+void lapacke_potrf(vector &A, const int N)
+{
+    const auto info = LAPACKE_dpotrf2(LAPACK_ROW_MAJOR, 'L', N, A.data(), N);
+    if (info != 0)
+    {
+        throw std::runtime_error("LAPACKE_dpotrf2 failed with info=" + std::to_string(info));
+    }
+}
diff --git a/reference/core/src/adapter_plasma_fp64.cpp b/reference/core/src/adapter_plasma_fp64.cpp
new file mode 100644
index 0000000..06147ad
--- /dev/null
+++ b/reference/core/src/adapter_plasma_fp64.cpp
@@ -0,0 +1,30 @@
+#include "adapter_plasma_fp64.hpp"
+
+#include <plasma.h>
+#include <stdexcept>
+#include <string>
+
+namespace cpu
+{
+
+void plasma_potrf(std::vector<double> &A, int N)
+{
+    constexpr int k_plasma_max_n = 65'280;
+    if (N > k_plasma_max_n)
+    {
+        throw std::runtime_error(
+            "plasma_dpotrf: skipped to avoid PLASMA descriptor int32 overflow at N=" + std::to_string(N)
+            + " (max supported with default nb=256: " + std::to_string(k_plasma_max_n) + ")");
+    }
+
+    // PLASMA is column-major. Our buffer is row-major and the matrix is
+    // symmetric, so we pass it through unchanged and request the upper factor
+    // of the column-major view, i.e. the lower triangle of our row-major buffer.
+    const int info = plasma_dpotrf(PlasmaUpper, N, A.data(), N);
+    if (info != 0)
+    {
+        throw std::runtime_error("plasma_dpotrf failed with info=" + std::to_string(info));
+    }
+}
+
+}  // end of namespace cpu
diff --git a/reference/core/src/cholesky_factor.cpp b/reference/core/src/cholesky_factor.cpp
new file mode 100644
index 0000000..3a20132
--- /dev/null
+++ b/reference/core/src/cholesky_factor.cpp
@@ -0,0 +1,29 @@
+#include "cholesky_factor.hpp"
+
+#include "adapter_cblas_fp64.hpp"
+#ifdef ENABLE_PLASMA
+#include "adapter_plasma_fp64.hpp"
+#endif
+
+#include <stdexcept>
+
+namespace cpu
+{
+
+void parallel_cholesky(Variant variant, std::vector<double> &matrix, int N)
+{
+    switch (variant)
+    {
+        case Variant::lapacke: lapacke_potrf(matrix, N); return;
+
+        case Variant::plasma:
+#ifdef ENABLE_PLASMA
+            plasma_potrf(matrix, N);
+            return;
+#else
+            throw std::invalid_argument("Variant 'plasma' requested but the binary was built without ENABLE_PLASMA=ON");
+#endif
+    }
+}
+
+}  // end of namespace cpu
diff --git a/reference/core/src/functions.cpp b/reference/core/src/functions.cpp
new file mode 100644
index 0000000..1f15f26
--- /dev/null
+++ b/reference/core/src/functions.cpp
@@ -0,0 +1,21 @@
+#include "functions.hpp"
+
+#include "cholesky_factor.hpp"
+#include <chrono>
+
+namespace cpu
+{
+
+double cholesky(std::vector<double> &matrix, std::size_t N, const std::string &variant)
+{
+    const Variant v = to_variant(variant);
+    // steady_clock is monotonic; high_resolution_clock is not guaranteed to be.
+    const auto start = std::chrono::steady_clock::now();
+    // Launch Cholesky decomposition: A = L * L^T
+    parallel_cholesky(v, matrix, static_cast<int>(N));
+    const auto stop = std::chrono::steady_clock::now();
+    // duration<double> yields seconds without assuming nanosecond clock ticks.
+    return std::chrono::duration<double>(stop - start).count();
+}
+
+}  // end of namespace cpu
diff --git a/reference/core/src/matrix_generation.cpp b/reference/core/src/matrix_generation.cpp
new file mode 100644
index 0000000..a67ff5a
--- /dev/null
+++ b/reference/core/src/matrix_generation.cpp
@@ -0,0 +1,28 @@
+#include "matrix_generation.hpp"
+
+#include <random>
+#include <vector>
+
+std::vector<double> gen_matrix(std::size_t N)
+{
+    std::vector<double> result(N * N);
+
+    // Rows are filled concurrently. Seeding one engine per row with the row
+    // index keeps the values independent of the thread count, and each pair
+    // (row, col) / (col, row) with col <= row is written by that row only.
+#pragma omp parallel for schedule(static)
+    for (std::size_t row = 0; row < N; ++row)
+    {
+        std::mt19937 engine(static_cast<std::mt19937::result_type>(row + 1));
+        std::uniform_real_distribution<double> unit(0.0, 1.0);
+        for (std::size_t col = 0; col <= row; ++col)
+        {
+            const std::size_t lower = row * N + col;
+            const std::size_t upper = col * N + row;
+            result[lower] = result[upper] = unit(engine);
+        }
+        result[row * N + row] += static_cast<double>(N);
+    }
+
+    return result;
+}
diff --git a/reference/core/src/validate.cpp b/reference/core/src/validate.cpp
new file mode 100644
index 0000000..8b1f647
--- /dev/null
+++ b/reference/core/src/validate.cpp
@@ -0,0 +1,68 @@
+#include "validate.hpp"
+
+#include "matrix_generation.hpp"
+
+#ifdef ENABLE_MKL
+#include "mkl_cblas.h"
+#else
+#include "cblas.h"
+#endif
+
+#include <cmath>
+#include <cstddef>
+#include <vector>
+
+namespace cpu
+{
+
+double cholesky_residual(std::size_t N, const std::vector<double> &L)
+{
+    // Build a working copy of L with its strictly upper triangle zeroed out.
+    std::vector<double> Lwork(L);
+    for (std::size_t i = 0; i < N; ++i)
+    {
+        for (std::size_t j = i + 1; j < N; ++j)
+        {
+            Lwork[i * N + j] = 0.0;
+        }
+    }
+
+    // Compute LLt = L * L^T (full N x N) with a single dgemm.
+    std::vector<double> LLt(N * N, 0.0);
+    cblas_dgemm(
+        CblasRowMajor,
+        CblasNoTrans,
+        CblasTrans,
+        static_cast<int>(N),
+        static_cast<int>(N),
+        static_cast<int>(N),
+        1.0,
+        Lwork.data(),
+        static_cast<int>(N),
+        Lwork.data(),
+        static_cast<int>(N),
+        0.0,
+        LLt.data(),
+        static_cast<int>(N));
+
+    // Regenerate the original matrix A deterministically and accumulate Frobenius
+    // norms of (A - LLt) and A.
+    const std::vector<double> A = gen_matrix(N);
+
+    double r_norm_sq = 0.0;
+    double a_norm_sq = 0.0;
+    for (std::size_t idx = 0; idx < A.size(); ++idx)
+    {
+        const double d = A[idx] - LLt[idx];
+        r_norm_sq += d * d;
+        a_norm_sq += A[idx] * A[idx];
+    }
+
+    if (a_norm_sq == 0.0)
+    {
+        return 0.0;
+    }
+    return std::sqrt(r_norm_sq / a_norm_sq);
+}
+
+}  // namespace cpu
diff --git a/reference/main.cpp b/reference/main.cpp
new file mode 100644
index 0000000..3c824c9
--- /dev/null
+++ b/reference/main.cpp
@@ -0,0 +1,157 @@
+#include "functions.hpp"
+#include "matrix_generation.hpp"
+#ifdef ENABLE_VALIDATION
+#include "validate.hpp"
+#endif
+#ifdef ENABLE_PLASMA
+#include <plasma.h>
+#endif
+#include <cstddef>
+#include <exception>
+#include <fstream>
+#include <iostream>
+#include <limits>
+#include <omp.h>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+int main(int argc, char *argv[])
+{
+    ///////////////////////////////////////////////////////////////////////////
+    // cmdline arguments
+    std::size_t loop = 1;
+    std::size_t size_start = 32, size_stop = 128;
+
+    for (int i = 1; i < argc; ++i)
+    {
+        std::string arg = argv[i];
+        if (arg == "--loop" && i + 1 < argc)
+        {
+            loop = std::stoul(argv[++i]);
+        }
+        else if (arg == "--size_start" && i + 1 < argc)
+        {
+            size_start = std::stoul(argv[++i]);
+        }
+        else if (arg == "--size_stop" && i + 1 < argc)
+        {
+            size_stop = std::stoul(argv[++i]);
+        }
+        else if ((arg == "--tiles_start" || arg == "--tiles_stop") && i + 1 < argc)
+        {
+            // Accept-and-ignore for CLI parity with the tiled variants.
+            ++i;
+        }
+    }
+    ///////////////////////////////////////////////////////////////////////////
+    // configuration
+    const std::size_t LOOP = loop;
+
+    const std::size_t START_SIZE = size_start;
+    const std::size_t STOP_SIZE = size_stop;
+    const std::size_t STEP_SIZE = 2;
+
+    // print and write results
+    bool HEADER_FLAG = true;
+    std::string runtime_file_path = "runtimes_reference_cholesky_";
+    if (START_SIZE != STOP_SIZE)
+    {
+        runtime_file_path += std::string("size_");
+    }
+    runtime_file_path += std::to_string(LOOP) + std::string(".txt");
+    std::ofstream runtime_file;
+    runtime_file.open(runtime_file_path, std::ios_base::app);
+
+#ifdef ENABLE_PLASMA
+    if (plasma_init() != 0)
+    {
+        throw std::runtime_error("plasma_init() failed");
+    }
+#endif
+    // size > 0 guard: a zero start size would never grow (0 * 2 == 0) and loop forever.
+    for (std::size_t size = START_SIZE; size > 0 && size <= STOP_SIZE; size = size * STEP_SIZE)
+    {
+        for (std::size_t l = 0; l < LOOP; l++)
+        {
+            std::string header = "threads;problem_size;tile_size;n_tiles";
+            std::string values = std::to_string(omp_get_max_threads());
+            values += std::string(";") + std::to_string(size);
+            values += std::string(";") + std::to_string(size);
+            values += std::string(";") + std::to_string(1);
+            ///////////////////////////////////////////////////////////////////
+            // Reference modes:
+            std::vector<std::string> modes = {};
+#ifdef ENABLE_LAPACKE
+            modes.push_back("lapacke");
+#endif
+#ifdef ENABLE_PLASMA
+            modes.push_back("plasma");
+#endif
+
+            for (const auto &mode : modes)
+            {
+                header += ";" + mode;
+                std::size_t mode_size = size;
+
+                // PLASMA's triangular descriptor allocation overflows int32
+                // for N > 65280 with the default nb=256. For input sizes in
+                // (65280, 65536] PLASMA's working size is clamped to 65280;
+                // the problem_size column still reports the requested size.
+                if (mode == "plasma" && mode_size > 65'280 && mode_size <= 65'536)
+                {
+                    mode_size = 65'280;
+                }
+
+                std::vector<double> matrix = gen_matrix(mode_size);
+                // NaN guard
+                double cholesky_cpu = std::numeric_limits<double>::quiet_NaN();
+                try
+                {
+                    cholesky_cpu = cpu::cholesky(matrix, mode_size, mode);
+                }
+                catch (const std::exception &e)
+                {
+                    std::cerr << "Error: variant '" << mode << "' failed at size=" << mode_size << ": " << e.what()
+                              << ". Recording NaN and continuing." << std::endl;
+                    values += ";nan";
+                    continue;
+                }
+
+                values += ";" + std::to_string(cholesky_cpu);
+
+#ifdef ENABLE_VALIDATION
+                // Validate by computing relative residual ||A - L L^T||_F / ||A||_F
+                constexpr double residual_tol = 1e-10;
+                const double residual = cpu::cholesky_residual(mode_size, matrix);
+                std::cout << "[validate] mode=" << mode << " size=" << mode_size << " residual=" << residual
+                          << std::endl;
+                if (!(residual <= residual_tol))  // catches NaN too
+                {
+                    std::cerr << "Validation warning: variant '" << mode << "' residual " << residual
+                              << " exceeds tolerance " << residual_tol << " (size=" << mode_size << ")" << std::endl;
+                }
+#endif
+            }
+            ///////////////////////////////////////////////////////////////////
+            // print/write header only once
+            if (HEADER_FLAG)
+            {
+                HEADER_FLAG = false;
+                std::cout << header << std::endl;
+                runtime_file << header << std::endl;
+            }
+            // print/write runtimes
+            std::cout << values << std::endl;
+            runtime_file << values << std::endl;
+        }
+    }
+
+    runtime_file.close();
+
+#ifdef ENABLE_PLASMA
+    plasma_finalize();
+#endif
+
+    return 0;
+}
diff --git a/reference/run.sh b/reference/run.sh
new file mode 100755
index 0000000..0b5c772
--- /dev/null
+++ b/reference/run.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+#SBATCH --job-name=cholesky_reference
+#SBATCH --output=logs/cholesky_reference_%j.out
+#SBATCH --error=logs/cholesky_reference_%j.err
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=128
+#SBATCH --time=144:00:00
+#SBATCH --exclusive
+#
+# Usage: run.sh
+#
+# Submit example:
+#   sbatch run.sh
+
+set -e # Exit immediately if a command exits with a non-zero status.
+
+################################################################################
+# Toolchain runtime selection
+################################################################################
+module load gcc/14.2.0
+
+# Base directory is the current working directory (run/submit from reference/)
+SCRIPT_DIR="$(pwd)"
+
+# OpenMP settings
+export OMP_NUM_THREADS=128
+export OMP_PROC_BIND=close
+export OMP_PLACES=cores
+
+# Make sure threaded MKL uses the OpenMP runtime if ENABLE_MKL=ON was used at
+# build time.
+export MKL_NUM_THREADS=${MKL_NUM_THREADS:-$OMP_NUM_THREADS}
+
+echo "Running with gcc runtime"
+
+# Run executable
+srun --cpu-bind=cores "$SCRIPT_DIR/build/cholesky_reference" \
+  --loop 20 \
+  --size_start 1024 \
+  --size_stop 65536