QuEST-Kit · TysonRayJones · Jun 1, 2026 · May 29, 2026 · May 29, 2026 · May 29, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -161,6 +161,7 @@ option(
 )
 message(STATUS "Custom communicator support is turned ${QUEST_ENABLE_SUBCOMM}. Set QUEST_ENABLE_SUBCOMM to modify.")
 
+
 # GPU Acceleration
 option(
   QUEST_ENABLE_CUDA
@@ -183,20 +184,20 @@ option(
 )
 message(STATUS "AMD GPU acceleration is turned ${QUEST_ENABLE_HIP}. Set QUEST_ENABLE_HIP to modify.")
 
+
 # GPU Performance Tuning
-## We do not print this value when configuring CMake as it is for advanced users only.
+# (We do not print this value when configuring CMake as it is for advanced users only)
 
-set(QUEST_GPU_NUM_THREADS_PER_BLOCK 128
-  CACHE
-  STRING
-  "The default number of threads per block QuEST will use when offloading to a GPU. Set to 128 by default. Must be a multiple of 32."
+string(CONCAT quest_tpb_description # (joined into ONE string; a multi-arg set() would instead build a ;-separated list)
+  "The default number of threads per block QuEST will use when offloading to a GPU. Set to 128 by default. "
+  "Must be a multiple of 32 (on NVIDIA GPUs) or 64 (on AMD GPUs). Can be overridden at executable launch "
+  "via an environment variable of the same name, or during runtime via a corresponding API setter function."
 )
-mark_as_advanced(QUEST_GPU_NUM_THREADS_PER_BLOCK)
+set(QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK 128 
+  CACHE STRING
+  "${quest_tpb_description}")
+mark_as_advanced(QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK)
 
-math(EXPR quest_tpb_remainder "${QUEST_GPU_NUM_THREADS_PER_BLOCK} % 32")
-if ((NOT (quest_tpb_remainder EQUAL 0)) OR (QUEST_GPU_NUM_THREADS_PER_BLOCK LESS 32))
-    message(FATAL_ERROR "QUEST_GPU_NUM_THREADS_PER_BLOCK must be a multiple of 32. QUEST_GPU_NUM_THREADS_PER_BLOCK=${QUEST_GPU_NUM_THREADS_PER_BLOCK}.")
-endif()
 
 # Deprecated API
 option(
@@ -211,9 +212,15 @@ option(
   "Whether to disable compile-time warnings ordinarily triggered by use of the deprecated API. Turned OFF by default."
   OFF
 )
-message(STATUS "Disabling of deprecated API warnings is turned ${QUEST_DISABLE_DEPRECATION_WARNINGS}. Set QUEST_DISABLE_DEPRECATION_WARNINGS to modify.")
+message(STATUS 
+  "Disabling of deprecated API warnings is turned ${QUEST_DISABLE_DEPRECATION_WARNINGS}. "
+  "Set QUEST_DISABLE_DEPRECATION_WARNINGS to modify."
+)
 
 option(QUEST_INSTALL_BINARIES "Whether to include example and user binaries in the install." OFF)
+if (QUEST_INSTALL_BINARIES)
+  message(STATUS "Including example and user binaries in the install (if built).")
+endif()
 
 
 
@@ -236,10 +243,12 @@ if (QUEST_ENABLE_CUQUANTUM AND NOT QUEST_ENABLE_CUDA)
   message(FATAL_ERROR "Use of cuQuantum requires CUDA.")
 endif()
 
+
 if (QUEST_ENABLE_SUBCOMM AND NOT QUEST_ENABLE_MPI)
   message(FATAL_ERROR "Distribution must be enabled to make use of a user-defined communicator for QuEST.")
 endif()
 
+
 if(WIN32)
 
   # Force MSVC to export all symbols in a shared library, like GCC and clang
@@ -257,6 +266,37 @@ if(WIN32)
 endif()
 
 
+# validate numTPB is a positive multiple of the target's warp size (enforced even when no GPU backend is compiled)
+if (QUEST_ENABLE_HIP)
+  set(quest_warp_size 64)
+  set(quest_gpu_model "AMD GPUs (via HIP)")
+else()
+  set(quest_warp_size 32)
+  set(quest_gpu_model "NVIDIA GPUs (via CUDA), or when not targeting GPUs")
+endif()
+math(EXPR quest_tpb_remainder "${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK} % ${quest_warp_size}")
+if ((NOT (quest_tpb_remainder EQUAL 0)) OR NOT (QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK GREATER 0))
+  message(FATAL_ERROR
+    "QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK was set to ${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}, "
+    "but it must be a positive multiple of ${quest_warp_size} when compiling for ${quest_gpu_model}."
+  )
+endif()
+
+
+# warn when the same-named environment variable is currently set to a differing value, since it would override this CMake default at launch
+if(
+  DEFINED ENV{QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK} 
+  AND NOT "$ENV{QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}" STREQUAL ""
+  AND NOT "$ENV{QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}" STREQUAL "${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}"
+)
+  message(WARNING 
+    "The CMake option QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK=${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK} "
+    "differs from the current environment variable (of the same name) value of $ENV{QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}. "
+    "If not cleared before QuEST is launched, the latter will override the former."
+  )
+endif()
+
+
 # Encourage high-performance Release build
 
 # Taken from Kitware's exmaple of problematic code at
@@ -514,7 +554,6 @@ set(QUEST_COMPILE_MPI ${QUEST_ENABLE_MPI})
 set(QUEST_COMPILE_SUBCOMM ${QUEST_ENABLE_SUBCOMM})
 set(QUEST_COMPILE_CUQUANTUM ${QUEST_ENABLE_CUQUANTUM})
 set(QUEST_INCLUDE_DEPRECATED_FUNCTIONS ${QUEST_ENABLE_DEPRECATED_API})
-set(QUEST_DEFAULT_NUM_THREADS_PER_BLOCK ${QUEST_GPU_NUM_THREADS_PER_BLOCK})
 
 
 # (for the love of God cmake, create a concise syntax for this)
@@ -523,18 +562,19 @@ if (QUEST_ENABLE_CUDA OR QUEST_ENABLE_HIP)
 else()
   set(QUEST_COMPILE_CUDA 0)
 endif()
+set(QUEST_COMPILE_HIP ${QUEST_ENABLE_HIP})
+
+
+# non-binary set vars which will be written to config.h.in (with a differing name) 
+set(QUEST_UNSPECIFIED_DEFAULT_NUM_GPU_THREADS_PER_BLOCK ${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK})
 
 
-# these vars are already set, but repeated here for clarity
+# these vars are already set (cmake name matches the macro name), but repeated here for clarity
 set(QUEST_FLOAT_PRECISION ${QUEST_FLOAT_PRECISION})
 set(QUEST_ENABLE_NUMA ${QUEST_ENABLE_NUMA})
 set(QUEST_DISABLE_DEPRECATION_WARNINGS ${QUEST_DISABLE_DEPRECATION_WARNINGS})
 
 
-# these do not appear in src but are saved for record-keeping in config.h.in
-set(QUEST_COMPILE_HIP ${QUEST_ENABLE_HIP})
-
-
 
 # ============================
 # Pass files to library

diff --git a/docs/cmake.md b/docs/cmake.md
@@ -48,7 +48,7 @@ make
 | `QUEST_DISABLE_DEPRECATION_WARNINGS` | (`OFF`), `ON` | Whether to disable the compile-time deprecation warnings when using the deprecated (v3) API. |
 | `USER_SOURCE_NAMES` | (Undefined), String | The source file for a user program which will be compiled alongside QuEST. `USER_OUTPUT_EXE_NAME` *must* also be defined. |
 | `USER_OUTPUT_EXE_NAME` | (Undefined), String | The name of the executable which will be created from the provided `USER_SOURCE_NAMES`. `USER_SOURCE_NAMES` *must* also be defined. |
-| `QUEST_GPU_NUM_THREADS_PER_BLOCK` | (128), Number | The default number of threads per block QuEST will use when offloading to a GPU. *Must* be a multiple of 32. For AMD GPUs this *should* be a multiple of 64. |
+| `QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK` | (128), Number | The default number of threads per block QuEST will use when offloading to a GPU. *Must* be a multiple of 32 (on NVIDIA GPUs) or 64 (on AMD GPUs). This CMake variable sets the default if not later overridden. The number can be overridden at process launch time using an [environment variable](https://quest-kit.github.io/QuEST/group__modes.html#gaf1b71f54d270d3353fe072c66827339b) of the same name, or during runtime using [`setQuESTNumGpuThreadsPerBlock()`](https://quest-kit.github.io/QuEST/group__experimental.html#gae35a55c6d9366ce677e6aaaf4c1ff5ef). |
 
 
 

diff --git a/docs/launch.md b/docs/launch.md
@@ -270,6 +270,7 @@ QuEST execution can be configured prior to runtime using the below [environment
 
 - [`QUEST_PERMIT_NODES_TO_SHARE_GPU`](https://quest-kit.github.io/QuEST/group__modes.html#ga84b134d552464a82d29517e1ce1309a7)
 - [`QUEST_DEFAULT_VALIDATION_EPSILON`](https://quest-kit.github.io/QuEST/group__modes.html#gac4ab30619e411c965377c910680e242c)
+- [`QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK`](https://quest-kit.github.io/QuEST/group__modes.html#gaf1b71f54d270d3353fe072c66827339b)
 
 Note the unit tests in the preceding section accept additional environment variables.
 

diff --git a/quest/include/config.h.in b/quest/include/config.h.in
@@ -83,16 +83,15 @@
 #cmakedefine01 QUEST_COMPILE_SUBCOMM
 #cmakedefine01 QUEST_COMPILE_CUDA
 #cmakedefine01 QUEST_COMPILE_CUQUANTUM
+#cmakedefine01 QUEST_COMPILE_HIP
 
-// default parameters which may have been tuned for performance when building the library
-#cmakedefine QUEST_DEFAULT_NUM_THREADS_PER_BLOCK @QUEST_DEFAULT_NUM_THREADS_PER_BLOCK@
 
 // crucial to QuEST source (informs optional NUMA usage)
 #cmakedefine01 QUEST_ENABLE_NUMA
 
 
-// not consulted by src (included for book-keeping)
-#cmakedefine01 QUEST_COMPILE_HIP
+// default parameters which may have been tuned for performance when building the library
+#cmakedefine QUEST_UNSPECIFIED_DEFAULT_NUM_GPU_THREADS_PER_BLOCK @QUEST_UNSPECIFIED_DEFAULT_NUM_GPU_THREADS_PER_BLOCK@
 
 
 

diff --git a/quest/include/environment.h b/quest/include/environment.h
@@ -87,14 +87,6 @@ int isQuESTEnvInit();
 QuESTEnv getQuESTEnv();
 
 
-/** @notyetdoced
- * GPU thread per block control
- * This is somehow probably the best pre-existing place for this. It only really applies to GPU, because for
- * OpenMP the user can just export OMP_NUM_THREADS or call omp_set_num_threads.
- */
-int getQuESTNumGpuThreadsPerBlock();
-void setQuESTNumGpuThreadsPerBlock(const int newThreadsPerBlock);
-
 
 // end de-mangler
 #ifdef __cplusplus

diff --git a/quest/include/experimental.h b/quest/include/experimental.h
@@ -44,7 +44,6 @@ void initCustomMpiQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuAccel, in
 
 
 #if QUEST_COMPILE_SUBCOMM
-
 /** @notyetdoced
  * 
  *  Advanced initialiser which allows the user to provide an MPI communicator for QuEST to use.
@@ -61,10 +60,46 @@ void initCustomMpiQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuAccel, in
  * @author Oliver Brown
  */
 void initCustomMpiCommQuESTEnv(MPI_Comm questComm, int useGpuAccel, int useMultithread);
-
 #endif // QUEST_COMPILE_SUBCOMM
 
 
+/** @notyetdoced
+ * 
+ * @author Oliver Brown
+ */
+int getQuESTNumGpuThreadsPerBlock();
+
+
+/** Overrides the number of CUDA threads per block (or @p blockDim) used by QuEST's GPU-accelerated backend.
+ * 
+ * This changes the GPU parallelisation granularity and can affect performance, and is useful
+ * for performance tuning or diagnostics. Before this function is called, QuEST will use the
+ * number as specified by the environment variable @p QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK,
+ * if defined. Otherwise, it will use the value specified by the CMake/compile option of the
+ * same name, which itself presently defaults to @p 128. After this function is called, QuEST
+ * will adopt @p numThreadsPerBlock for the remainder of execution, or until this function is
+ * called again.
+ * 
+ * Practical values of @p numThreadsPerBlock can vary with the simulation size, the user's GPU hardware,
+ * and whether it is NVIDIA or AMD, which have respective warp sizes of @p 32 and @p 64.
+ * 
+ * @note
+ * This function has no effect when QuEST is not deployed with GPU-acceleration enabled.
+ *
+ * @param[in] numThreadsPerBlock the new block size.
+ * @throws @validationerror
+ * - if the @p QuESTEnv is not initialised.
+ * - if @p numThreadsPerBlock is negative.
+ * - if @p numThreadsPerBlock is not a multiple of the GPU warp size.
+ * - if @p numThreadsPerBlock exceeds the maximum @p blockDim imposed by the GPU hardware.
+ * @see
+ * - QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK
+ * @author Oliver Brown
+ * @author Tyson Jones
+ */
+void setQuESTNumGpuThreadsPerBlock(int numThreadsPerBlock);
+
+
 // end de-mangler
 #ifdef __cplusplus
 }

diff --git a/quest/include/modes.h b/quest/include/modes.h
@@ -43,6 +43,10 @@
      *  - forbid sharing: @p 0, @p '0', @p '', @p , (unspecified)
      *  - permit sharing: @p 1, @p '1'
      * 
+     * @constraints
+     * The function initQuESTEnv() will throw a validation error if any of the below are not satisfied.
+     *   - The specified string, if non-empty, must evaluate to the integer @p 0 or @p 1.
+     * 
      * @author Tyson Jones
      */
     const int QUEST_PERMIT_NODES_TO_SHARE_GPU = 0;
@@ -68,7 +72,7 @@
      *    default validation epsilon.
      * 
      * @constraints
-     * The function initQuESTEnv() will throw a validation error if:
+     * The function initQuESTEnv() will throw a validation error if any of the below are not satisfied.
      *   - The specified epsilon must be `0` or positive.
      *   - The specified epsilon must not exceed that maximum or minimum value which can be stored
      *     in a `qreal`, which is specific to its precision.
@@ -78,6 +82,40 @@
     const qreal QUEST_DEFAULT_VALIDATION_EPSILON = 0;
 
 
+    /** @envvardoc
+     * 
+     * Specifies the default number of threads per block (or "block dimension") used by GPU acceleration. 
+     * 
+     * The number of dispatched CUDA threads per block controls the parallelisation granularity of
+     * QuEST's GPU backend, affecting performance.
+     * Specifying `QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK` to a valid, positive integer overrides
+     * QuEST's default otherwise set during compilation via a CMake option of the same name. If 
+     * that CMake option was not set, the default is assumed to be @p 128.
+     * 
+     * The number specified by this environment variable will be used as the block dimension by all of
+     * QuEST's GPU backend functions, unless overridden at runtime via setQuESTNumGpuThreadsPerBlock().
+     * The actual number of threads per block used at any time can be queried via 
+     * getQuESTNumGpuThreadsPerBlock(), or reported by reportQuESTEnv().
+     * 
+     * @envvarvalues
+     *  - use the compile-time default (`128` unless changed via the CMake option): @p '', @p , (unspecified)
+     *  - use number `x`: @p x, @p 'x', @p '+x'
+     * 
+     * @constraints
+     * The function initQuESTEnv() will throw a validation error if any of the below are not satisfied.
+     *   - The specified number must be a positive integer.
+     *   - The specified number must not exceed the minimum or maximum value which can be stored in an @p int.
+     *   - The specified number must be divisible by the GPU warp size, which is 32 or 64, depending on
+     *     whether deployed to an NVIDIA or AMD GPU. This restriction is imposed even when QuEST is not
+     *     deployed with GPU-acceleration.
+     *   - The specified number must not exceed the maximum imposed by the available GPU hardware.
+     * 
+     * @author Oliver Brown
+     * @author Tyson Jones
+     */
+    const int QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK = 0;
+
+
 #endif
 
 

diff --git a/quest/include/precision.h b/quest/include/precision.h
@@ -126,13 +126,13 @@
  */
 
 #if QUEST_FLOAT_PRECISION == 1
-    #define UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-5
+    #define QUEST_UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-5
 
 #elif QUEST_FLOAT_PRECISION == 2
-    #define UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-12
+    #define QUEST_UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-12
 
 #elif QUEST_FLOAT_PRECISION == 4
-    #define UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-13
+    #define QUEST_UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-13
 
 #endif
 

diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
@@ -79,7 +79,10 @@ void validateAndInitCustomQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuA
     validate_envNeverInit(global_envPtr != nullptr, global_hasEnvBeenFinalized, caller);
 
     // load env-vars before validating deployment mode, because some env vars can
-    // affect validation (such as QUEST_PERMIT_NODES_TO_SHARE_GPU)
+    // affect validation (such as QUEST_PERMIT_NODES_TO_SHARE_GPU). note that
+    // some env-vars (like QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK) are here only
+    // validated to have a correct format (like an int); the validity of their
+    // actual values is checked later (since it requires deciding GPU-accel).
     envvars_validateAndLoadEnvVars(caller);
     validateconfig_setEpsilonToDefault();
 
@@ -131,6 +134,11 @@ void validateAndInitCustomQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuA
     /// should we warn here if each machine contains
     /// more GPUs than deployed MPI-processes (some GPUs idle)?
 
+    // validate the initial numTPB env-var (if specified) is valid
+    int initNumThreadsPerBlock = envvars_getDefaultNumGpuThreadsPerBlock();
+    validate_numGpuThreadsPerBlock(initNumThreadsPerBlock, useGpuAccel, caller);
+    gpu_setNumThreadsPerBlock(initNumThreadsPerBlock);
+
     // cuQuantum is always used in GPU-accelerated envs when available
     bool useCuQuantum = useGpuAccel && gpu_isCuQuantumCompiled();
     if (useCuQuantum) {
@@ -157,7 +165,7 @@ void validateAndInitCustomQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuA
     global_envPtr->isGpuAccelerated    = useGpuAccel;
     global_envPtr->isDistributed       = useDistrib;
     global_envPtr->isMpiUserOwned      = userOwnsMpi;
-    global_envPtr->isMpiGpuAware        = isMpiGpuAware;
+    global_envPtr->isMpiGpuAware       = isMpiGpuAware;
     global_envPtr->isCuQuantumEnabled  = useCuQuantum;
     global_envPtr->isGpuSharingEnabled = permitGpuSharing;
 
@@ -535,20 +543,5 @@ void getQuESTEnvironmentString(char str[200]) {
 }
 
 
-int getQuESTNumGpuThreadsPerBlock() {
-    validate_envIsInit(__func__);
-
-    return gpu_getNumThreadsPerBlock();
-}
-
-void setQuESTNumGpuThreadsPerBlock(const int newThreadsPerBlock) {
-    validate_envIsInit(__func__);
-
-    // just rely on the internal function to throw an error if there's no GPU support compiled
-    // or if newThreadsPerBlock is not a multiple of 32 (NVIDIA) or 64 (AMD)
-    gpu_setNumThreadsPerBlock(newThreadsPerBlock);
-    return;
-}
-
 // end de-mangler
 }