diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3acdbbb1d..b5a438713 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -161,6 +161,7 @@ option(
 )
 message(STATUS "Custom communicator support is turned ${QUEST_ENABLE_SUBCOMM}. Set QUEST_ENABLE_SUBCOMM to modify.")
 
+
 # GPU Acceleration
 option(
   QUEST_ENABLE_CUDA
@@ -183,20 +184,20 @@ option(
 )
 message(STATUS "AMD GPU acceleration is turned ${QUEST_ENABLE_HIP}. Set QUEST_ENABLE_HIP to modify.")
 
+
 # GPU Performance Tuning
-## We do not print this value when configuring CMake as it is for advanced users only.
+# (We do not print this value when configuring CMake as it is for advanced users only)
 
-set(QUEST_GPU_NUM_THREADS_PER_BLOCK 128
-  CACHE
-  STRING
-  "The default number of threads per block QuEST will use when offloading to a GPU. Set to 128 by default. Must be a multiple of 32."
+set(quest_tpb_description # (the games we play for multi-line set() strings!)
+  "The default number of threads per block QuEST will use when offloading to a GPU. Set to 128 by default. "
+  "Must be a multiple of 32 (on NVIDIA GPUs) or 64 (on AMD GPUs). Can be overridden at executable launch "
+  "via an environment variable of the same name, or during runtime via a corresponding API setter function."
 )
-mark_as_advanced(QUEST_GPU_NUM_THREADS_PER_BLOCK)
+set(QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK 128 
+  CACHE STRING
+  "${quest_tpb_description}")
+mark_as_advanced(QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK)
 
-math(EXPR quest_tpb_remainder "${QUEST_GPU_NUM_THREADS_PER_BLOCK} % 32")
-if ((NOT (quest_tpb_remainder EQUAL 0)) OR (QUEST_GPU_NUM_THREADS_PER_BLOCK LESS 32))
-    message(FATAL_ERROR "QUEST_GPU_NUM_THREADS_PER_BLOCK must be a multiple of 32. QUEST_GPU_NUM_THREADS_PER_BLOCK=${QUEST_GPU_NUM_THREADS_PER_BLOCK}.")
-endif()
 
 # Deprecated API
 option(
@@ -211,9 +212,15 @@ option(
   "Whether to disable compile-time warnings ordinarily triggered by use of the deprecated API. Turned OFF by default."
   OFF
 )
-message(STATUS "Disabling of deprecated API warnings is turned ${QUEST_DISABLE_DEPRECATION_WARNINGS}. Set QUEST_DISABLE_DEPRECATION_WARNINGS to modify.")
+message(STATUS 
+  "Disabling of deprecated API warnings is turned ${QUEST_DISABLE_DEPRECATION_WARNINGS}. "
+  "Set QUEST_DISABLE_DEPRECATION_WARNINGS to modify."
+)
 
 option(QUEST_INSTALL_BINARIES "Whether to include example and user binaries in the install." OFF)
+if (QUEST_INSTALL_BINARIES)
+  message(STATUS "Including example and user binaries in the install (if built).")
+endif()
 
 
 
@@ -236,10 +243,12 @@ if (QUEST_ENABLE_CUQUANTUM AND NOT QUEST_ENABLE_CUDA)
   message(FATAL_ERROR "Use of cuQuantum requires CUDA.")
 endif()
 
+
 if (QUEST_ENABLE_SUBCOMM AND NOT QUEST_ENABLE_MPI)
   message(FATAL_ERROR "Distribution must be enabled to make use of a user-defined communicator for QuEST.")
 endif()
 
+
 if(WIN32)
   
   # Force MSVC to export all symbols in a shared library, like GCC and clang
@@ -257,6 +266,37 @@ if(WIN32)
 endif()
 
 
+# validate numTPB even when GPU not compiled
+if (QUEST_ENABLE_HIP)
+  set(quest_warp_size 64)
+  set(quest_gpu_model "AMD GPUs (via HIP)")
+else()
+  set(quest_warp_size 32)
+  set(quest_gpu_model "NVIDIA GPUs (via CUDA), or when not targeting GPUs")
+endif()
+math(EXPR quest_tpb_remainder "${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK} % ${quest_warp_size}")
+if ((NOT (quest_tpb_remainder EQUAL 0)) OR NOT (QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK GREATER 0))
+  message(FATAL_ERROR
+    "QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK was set to ${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}, "
+    "but it must be a positive multiple of ${quest_warp_size} when compiling for ${quest_gpu_model}."
+  )
+endif()
+
+
+# warn when numTPB will be later overridden by the current environment variable
+if(
+  DEFINED ENV{QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK} 
+  AND NOT "$ENV{QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}" STREQUAL ""
+  AND NOT "$ENV{QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}" STREQUAL "${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}"
+)
+  message(WARNING 
+    "The CMake option QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK=${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK} "
+    "differs from the current environment variable (of the same name) value of $ENV{QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}. "
+    "If not cleared before QuEST is launched, the latter will override the former."
+  )
+endif()
+
+
 # Encourage high-performance Release build
 
 # Taken from Kitware's exmaple of problematic code at
@@ -514,7 +554,6 @@ set(QUEST_COMPILE_MPI ${QUEST_ENABLE_MPI})
 set(QUEST_COMPILE_SUBCOMM ${QUEST_ENABLE_SUBCOMM})
 set(QUEST_COMPILE_CUQUANTUM ${QUEST_ENABLE_CUQUANTUM})
 set(QUEST_INCLUDE_DEPRECATED_FUNCTIONS ${QUEST_ENABLE_DEPRECATED_API})
-set(QUEST_DEFAULT_NUM_THREADS_PER_BLOCK ${QUEST_GPU_NUM_THREADS_PER_BLOCK})
 
 
 # (for the love of God cmake, create a concise syntax for this)
@@ -523,18 +562,19 @@ if (QUEST_ENABLE_CUDA OR QUEST_ENABLE_HIP)
 else()
   set(QUEST_COMPILE_CUDA 0)
 endif()
+set(QUEST_COMPILE_HIP ${QUEST_ENABLE_HIP})
+
+
+# non-binary set vars which will be written to config.h.in (with a differing name) 
+set(QUEST_UNSPECIFIED_DEFAULT_NUM_GPU_THREADS_PER_BLOCK ${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK})
 
 
-# these vars are already set, but repeated here for clarity
+# these vars are already set (cmake name matches the macro name), but repeated here for clarity
 set(QUEST_FLOAT_PRECISION ${QUEST_FLOAT_PRECISION})
 set(QUEST_ENABLE_NUMA ${QUEST_ENABLE_NUMA})
 set(QUEST_DISABLE_DEPRECATION_WARNINGS ${QUEST_DISABLE_DEPRECATION_WARNINGS})
 
 
-# these do not appear in src but are saved for record-keeping in config.h.in
-set(QUEST_COMPILE_HIP ${QUEST_ENABLE_HIP})
-
-
 
 # ============================
 # Pass files to library
diff --git a/docs/cmake.md b/docs/cmake.md
index 34a4c0aeb..fec90d76a 100644
--- a/docs/cmake.md
+++ b/docs/cmake.md
@@ -48,7 +48,7 @@ make
 | `QUEST_DISABLE_DEPRECATION_WARNINGS` | (`OFF`), `ON` | Whether to disable the compile-time deprecation warnings when using the deprecated (v3) API. |
 | `USER_SOURCE_NAMES` | (Undefined), String | The source file for a user program which will be compiled alongside QuEST. `USER_OUTPUT_EXE_NAME` *must* also be defined. |
 | `USER_OUTPUT_EXE_NAME` | (Undefined), String | The name of the executable which will be created from the provided `USER_SOURCE_NAMES`. `USER_SOURCE_NAMES` *must* also be defined. |
-| `QUEST_GPU_NUM_THREADS_PER_BLOCK` | (128), Number | The default number of threads per block QuEST will use when offloading to a GPU. *Must* be a multiple of 32. For AMD GPUs this *should* be a multiple of 64. |
+| `QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK` | (128), Number | The default number of threads per block QuEST will use when offloading to a GPU. *Must* be a multiple of 32 (on NVIDIA GPUs) or 64 (on AMD GPUs). This CMake variable sets the default if not later overridden. The number can be overridden at process launch time using an [environment variable](https://quest-kit.github.io/QuEST/group__modes.html#gaf1b71f54d270d3353fe072c66827339b) of the same name, or during runtime using [`setQuESTNumGpuThreadsPerBlock()`](https://quest-kit.github.io/QuEST/group__experimental.html#gae35a55c6d9366ce677e6aaaf4c1ff5ef). |
 
 
 
diff --git a/docs/launch.md b/docs/launch.md
index 9d5e6ac22..3eb8493ee 100644
--- a/docs/launch.md
+++ b/docs/launch.md
@@ -270,6 +270,7 @@ QuEST execution can be configured prior to runtime using the below [environment
 
 - [`QUEST_PERMIT_NODES_TO_SHARE_GPU`](https://quest-kit.github.io/QuEST/group__modes.html#ga84b134d552464a82d29517e1ce1309a7)
 - [`QUEST_DEFAULT_VALIDATION_EPSILON`](https://quest-kit.github.io/QuEST/group__modes.html#gac4ab30619e411c965377c910680e242c)
+- [`QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK`](https://quest-kit.github.io/QuEST/group__modes.html#gaf1b71f54d270d3353fe072c66827339b)
 
 Note the unit tests in the preceding section accept additional environment variables.
 
diff --git a/quest/include/config.h.in b/quest/include/config.h.in
index c372c793a..1bb8a0470 100644
--- a/quest/include/config.h.in
+++ b/quest/include/config.h.in
@@ -83,16 +83,15 @@
 #cmakedefine01 QUEST_COMPILE_SUBCOMM
 #cmakedefine01 QUEST_COMPILE_CUDA
 #cmakedefine01 QUEST_COMPILE_CUQUANTUM
+#cmakedefine01 QUEST_COMPILE_HIP
 
-// default parameters which may have been tuned for performance when building the library
-#cmakedefine QUEST_DEFAULT_NUM_THREADS_PER_BLOCK @QUEST_DEFAULT_NUM_THREADS_PER_BLOCK@
 
 // crucial to QuEST source (informs optional NUMA usage)
 #cmakedefine01 QUEST_ENABLE_NUMA
 
 
-// not consulted by src (included for book-keeping)
-#cmakedefine01 QUEST_COMPILE_HIP
+// default parameters which may have been tuned for performance when building the library
+#cmakedefine QUEST_UNSPECIFIED_DEFAULT_NUM_GPU_THREADS_PER_BLOCK @QUEST_UNSPECIFIED_DEFAULT_NUM_GPU_THREADS_PER_BLOCK@
 
 
 
diff --git a/quest/include/environment.h b/quest/include/environment.h
index f8ef8998b..cdefa7d7d 100644
--- a/quest/include/environment.h
+++ b/quest/include/environment.h
@@ -87,14 +87,6 @@ int isQuESTEnvInit();
 QuESTEnv getQuESTEnv();
 
 
-/** @notyetdoced
- * GPU thread per block control
- * This is somehow probably the best pre-existing place for this. It only really applies to GPU, because for
- * OpenMP the user can just export OMP_NUM_THREADS or call omp_set_num_threads.
- */
-int getQuESTNumGpuThreadsPerBlock();
-void setQuESTNumGpuThreadsPerBlock(const int newThreadsPerBlock);
-
 
 // end de-mangler
 #ifdef __cplusplus
diff --git a/quest/include/experimental.h b/quest/include/experimental.h
index 2fabdc34f..8c2cc4e0a 100644
--- a/quest/include/experimental.h
+++ b/quest/include/experimental.h
@@ -44,7 +44,6 @@ void initCustomMpiQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuAccel, in
 
 
 #if QUEST_COMPILE_SUBCOMM
-
 /** @notyetdoced
  * 
  *  Advanced initialiser which allows the user to provide an MPI communicator for QuEST to use.
@@ -61,10 +60,46 @@ void initCustomMpiQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuAccel, in
  * @author Oliver Brown
  */
 void initCustomMpiCommQuESTEnv(MPI_Comm questComm, int useGpuAccel, int useMultithread);
-
 #endif // QUEST_COMPILE_SUBCOMM
 
 
+/** @notyetdoced
+ * 
+ * @author Oliver Brown
+ */
+int getQuESTNumGpuThreadsPerBlock();
+
+
+/** Overrides the number of CUDA threads per block (or @p blockDim) used by QuEST's GPU-accelerated backend.
+ * 
+ * This changes the GPU parallelisation granularity and can affect performance, and is useful
+ * for performance tuning or diagnostics. Before this function is called, QuEST will use the
+ * number as specified by the environment variable @p QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK,
+ * if defined. Otherwise, it will use the value specified by the CMake/compile option of the
+ * same name, which itself presently defaults to @p 128. After this function is called, QuEST
+ * will adopt @p numThreadsPerBlock for the remainder of execution, or until this function is
+ * called again.
+ * 
+ * Practical values of @p numThreadsPerBlock can vary with the simulation size, the user's GPU hardware,
+ * and whether it is NVIDIA or AMD, which have respective warp sizes of @p 32 and @p 64.
+ * 
+ * @note
+ * This function has no effect when QuEST is not deployed with GPU-acceleration enabled.
+ *
+ * @param[in] numThreadsPerBlock the new block size.
+ * @throws @validationerror
+ * - if the @p QuESTEnv is not initialised.
+ * - if @p numThreadsPerBlock is negative.
+ * - if @p numThreadsPerBlock is not a multiple of the GPU warp size.
+ * - if @p numThreadsPerBlock exceeds the maximum @p blockDim imposed by the GPU hardware.
+ * @see
+ * - QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK
+ * @author Oliver Brown
+ * @author Tyson Jones
+ */
+void setQuESTNumGpuThreadsPerBlock(int numThreadsPerBlock);
+
+
 // end de-mangler
 #ifdef __cplusplus
 }
diff --git a/quest/include/modes.h b/quest/include/modes.h
index 285b1cb5d..25ad8bb54 100644
--- a/quest/include/modes.h
+++ b/quest/include/modes.h
@@ -43,6 +43,10 @@
      *  - forbid sharing: @p 0, @p '0', @p '', @p , (unspecified)
      *  - permit sharing: @p 1, @p '1'
      * 
+     * @constraints
+     * The function initQuESTEnv() will throw a validation error if any of the below are not satisfied.
+     *   - The specified string does not evaluate to an integer @p 0 or @p 1.
+     * 
      * @author Tyson Jones
      */
     const int QUEST_PERMIT_NODES_TO_SHARE_GPU = 0;
@@ -68,7 +72,7 @@
      *    default validation epsilon.
      * 
      * @constraints
-     * The function initQuESTEnv() will throw a validation error if:
+     * The function initQuESTEnv() will throw a validation error if any of the below are not satisfied.
      *   - The specified epsilon must be `0` or positive.
      *   - The specified epsilon must not exceed that maximum or minimum value which can be stored
      *     in a `qreal`, which is specific to its precision.
@@ -78,6 +82,40 @@
     const qreal QUEST_DEFAULT_VALIDATION_EPSILON = 0;
 
 
+    /** @envvardoc
+     * 
+     * Specifies the default number of threads per block (or "block dimension") used by GPU acceleration. 
+     * 
+     * The number of dispatched CUDA threads per block controls the parallelisation granularity of
+     * QuEST's GPU backend, affecting performance.
+     * Specifying `QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK` to a valid, positive integer overrides
+     * QuEST's default otherwise set during compilation via a CMake option of the same name. If 
+     * that CMake option was not set, the default is assumed to be @p 128.
+     * 
+     * The number specified by this environment variable will be used as the block dimension by all of
+     * QuEST's GPU backend functions, unless overridden at runtime via setQuESTNumGpuThreadsPerBlock().
+     * The actual number of threads per block used at any time can be queried via 
+     * getQuESTNumGpuThreadsPerBlock(), or reported by reportQuESTEnv().
+     * 
+     * @envvarvalues
+     *  - use internal default of `128`: @p '', @p , (unspecified)
+     *  - use number `x`: @p x, @p 'x', @p '+x'
+     * 
+     * @constraints
+     * The function initQuESTEnv() will throw a validation error if any of the below are not satisfied.
+     *   - The specified number must be a positive integer.
+     *   - The specified number must not exceed the minimum or maximum value which can be stored in an @p int.
+     *   - The specified number must be divisible by the GPU warp size, which is 32 or 64, depending on
+     *     whether deployed to an NVIDIA or AMD GPU. This restriction is imposed even when QuEST is not
+     *     deployed with GPU-acceleration.
+     *   - The specified number exceeds the maximum imposed by the available GPU hardware.
+     * 
+     * @author Oliver Brown
+     * @author Tyson Jones
+     */
+    const qreal QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK = 0;
+
+
 #endif
 
 
diff --git a/quest/include/precision.h b/quest/include/precision.h
index 2c89545f7..7b932e678 100644
--- a/quest/include/precision.h
+++ b/quest/include/precision.h
@@ -126,13 +126,13 @@
  */
 
 #if QUEST_FLOAT_PRECISION == 1
-    #define UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-5
+    #define QUEST_UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-5
 
 #elif QUEST_FLOAT_PRECISION == 2
-    #define UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-12
+    #define QUEST_UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-12
 
 #elif QUEST_FLOAT_PRECISION == 4
-    #define UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-13
+    #define QUEST_UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-13
 
 #endif
 
diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index dee976c20..c59334b55 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -79,7 +79,10 @@ void validateAndInitCustomQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuA
     validate_envNeverInit(global_envPtr != nullptr, global_hasEnvBeenFinalized, caller);
 
     // load env-vars before validating deployment mode, because some env vars can
-    // affect validation (such as QUEST_PERMIT_NODES_TO_SHARE_GPU)
+    // affect validation (such as QUEST_PERMIT_NODES_TO_SHARE_GPU). note that
+    // some env-vars (like QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK) will be here
+    // validated to have a correct format (like an int), but the validity of its
+    // actual value will be checked later (since it requires deciding GPU-accel).
     envvars_validateAndLoadEnvVars(caller);
     validateconfig_setEpsilonToDefault();
 
@@ -131,6 +134,11 @@ void validateAndInitCustomQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuA
     /// should we warn here if each machine contains
     /// more GPUs than deployed MPI-processes (some GPUs idle)?
 
+    // validate the initial numTPB env-var (if specified) is valid
+    int initNumThreadsPerBlock = envvars_getDefaultNumGpuThreadsPerBlock();
+    validate_numGpuThreadsPerBlock(initNumThreadsPerBlock, useGpuAccel, caller);
+    gpu_setNumThreadsPerBlock(initNumThreadsPerBlock);
+
     // cuQuantum is always used in GPU-accelerated envs when available
     bool useCuQuantum = useGpuAccel && gpu_isCuQuantumCompiled();
     if (useCuQuantum) {
@@ -157,7 +165,7 @@ void validateAndInitCustomQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuA
     global_envPtr->isGpuAccelerated    = useGpuAccel;
     global_envPtr->isDistributed       = useDistrib;
     global_envPtr->isMpiUserOwned      = userOwnsMpi;
-    global_envPtr->isMpiGpuAware        = isMpiGpuAware;
+    global_envPtr->isMpiGpuAware       = isMpiGpuAware;
     global_envPtr->isCuQuantumEnabled  = useCuQuantum;
     global_envPtr->isGpuSharingEnabled = permitGpuSharing;
 
@@ -535,20 +543,5 @@ void getQuESTEnvironmentString(char str[200]) {
 }
 
 
-int getQuESTNumGpuThreadsPerBlock() {
-    validate_envIsInit(__func__);
-    
-    return gpu_getNumThreadsPerBlock();
-}
-
-void setQuESTNumGpuThreadsPerBlock(const int newThreadsPerBlock) {
-    validate_envIsInit(__func__);
-
-    // just rely on the internal function to throw an error if there's no GPU support compiled
-    // or if newThreadsPerBlock is not a multiple of 32 (NVIDIA) or 64 (AMD)
-    gpu_setNumThreadsPerBlock(newThreadsPerBlock);
-    return;
-}
-
 // end de-mangler
 }
diff --git a/quest/src/api/experimental.cpp b/quest/src/api/experimental.cpp
index 1ad6fdb42..a6f883656 100644
--- a/quest/src/api/experimental.cpp
+++ b/quest/src/api/experimental.cpp
@@ -13,6 +13,7 @@
 
 #include "quest/src/core/validation.hpp"
 #include "quest/src/comm/comm_config.hpp"
+#include "quest/src/gpu/gpu_config.hpp"
 
 #if QUEST_COMPILE_SUBCOMM && ! QUEST_COMPILE_MPI
     #error "Macro QUEST_COMPILE_SUBCOMM was true, but QUEST_COMPILE_MPI was illegally false."
@@ -59,7 +60,7 @@ void initCustomMpiQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuAccel, in
 
 
 #if QUEST_COMPILE_SUBCOMM // hide MPI_Comm
-
+ 
 void initCustomMpiCommQuESTEnv(MPI_Comm userQuestComm, int useGpuAccel, int useMultithread) {
 
     // useDistrib and userOwnsMpi are implied by the user of this initialiser
@@ -81,9 +82,26 @@ void initCustomMpiCommQuESTEnv(MPI_Comm userQuestComm, int useGpuAccel, int useM
     // perform remaining validation (some is harmlessly repeated) and init QuEST env
     validateAndInitCustomQuESTEnv(useDistrib, userOwnsMpi, useGpuAccel, useMultithread, __func__);
 }
-
 #endif // QUEST_COMPILE_SUBCOMM
 
 
+int getQuESTNumGpuThreadsPerBlock() {
+    validate_envIsInit(__func__);
+    
+    return gpu_getNumThreadsPerBlock();
+}
+
+
+void setQuESTNumGpuThreadsPerBlock(int numTPB) {
+    validate_envIsInit(__func__);
+
+    // validation messages and queries depend upon GPU usage
+    bool gpuIsActive = getQuESTEnv().isGpuAccelerated;
+    validate_numGpuThreadsPerBlock(numTPB, gpuIsActive, __func__);
+
+    gpu_setNumThreadsPerBlock(numTPB);
+}
+
+
 // end de-mangler
 }
diff --git a/quest/src/comm/comm_config.cpp b/quest/src/comm/comm_config.cpp
index 5c59477ca..4b76ca71e 100644
--- a/quest/src/comm/comm_config.cpp
+++ b/quest/src/comm/comm_config.cpp
@@ -209,6 +209,14 @@ bool comm_isMpiInit() {
 }
 
 
+bool comm_isMpiUserOwned() {
+
+    // this isn't presently used by the code base; I'm just naughtily silencing
+    // "unused var" warning when compiling without MPI :^)
+    return global_isMpiUserOwned;
+}
+
+
 
 /*
  * QUEST COMMUNICATION MANAGEMENT
diff --git a/quest/src/comm/comm_config.hpp b/quest/src/comm/comm_config.hpp
index 826ebdf1c..cc009ab9a 100644
--- a/quest/src/comm/comm_config.hpp
+++ b/quest/src/comm/comm_config.hpp
@@ -17,6 +17,7 @@ bool comm_isMpiCompiled();
 bool comm_isMpiSubCommCompiled();
 bool comm_isMpiGpuAware();
 bool comm_isMpiInit();
+bool comm_isMpiUserOwned();
 
 // control of QuEST's (possibly more limited) MPI env
 bool comm_isActive();
diff --git a/quest/src/core/envvars.cpp b/quest/src/core/envvars.cpp
index bd9f87b6f..c1d3e81ed 100644
--- a/quest/src/core/envvars.cpp
+++ b/quest/src/core/envvars.cpp
@@ -6,12 +6,14 @@
  * @author Tyson Jones
  */
 
+#include "quest/include/config.h"
 #include "quest/include/precision.h"
 #include "quest/include/types.h"
 
 #include "quest/src/core/errors.hpp"
 #include "quest/src/core/parser.hpp"
 #include "quest/src/core/validation.hpp"
+#include "quest/src/gpu/gpu_config.hpp"
 
 #include <string>
 #include <cstdlib>
@@ -26,8 +28,9 @@ using std::string;
 
 
 namespace envvar_names {
-    string QUEST_PERMIT_NODES_TO_SHARE_GPU = "QUEST_PERMIT_NODES_TO_SHARE_GPU";
-    string QUEST_DEFAULT_VALIDATION_EPSILON = "QUEST_DEFAULT_VALIDATION_EPSILON";
+    string QUEST_PERMIT_NODES_TO_SHARE_GPU         = "QUEST_PERMIT_NODES_TO_SHARE_GPU";
+    string QUEST_DEFAULT_VALIDATION_EPSILON        = "QUEST_DEFAULT_VALIDATION_EPSILON";
+    string QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK = "QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK";
 }
 
 
@@ -45,7 +48,11 @@ namespace envvar_values {
 
     // by default, the initial validation epsilon (before being overriden
     // by users at runtime) should depend on qreal (i.e. FLOAT_PRECISION)
-    qreal QUEST_DEFAULT_VALIDATION_EPSILON = UNSPECIFIED_DEFAULT_VALIDATION_EPSILON;
+    qreal QUEST_DEFAULT_VALIDATION_EPSILON = QUEST_UNSPECIFIED_DEFAULT_VALIDATION_EPSILON;
+
+    // by default, the initial number of GPU threads per block is informed by
+    // the below cmake variable (before being overridden by env-var or at runtime)
+    int QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK = QUEST_UNSPECIFIED_DEFAULT_NUM_GPU_THREADS_PER_BLOCK;
 }
 
 
@@ -123,6 +130,21 @@ void validateAndSetDefaultValidationEpsilon(const char* caller) {
 }
 
 
+void validateAndSetDefaultNumGpuThreadsPerBlock(const char* caller) {
+
+    // permit unspecified, falling back to the hardcoded default
+    string name = envvar_names::QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK;
+    if (!isEnvVarSpecified(name))
+        return;
+
+    string value = getSpecifiedEnvVarValue(name);
+    validate_envVarDefaultNumGpuThreadsPerBlockIsAnInt(value, caller);
+
+    // overwrite default env-var value
+    envvar_values::QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK = parser_parseInteger(value);
+}
+
+
 
 /*
  * PUBLIC
@@ -138,6 +160,7 @@ void envvars_validateAndLoadEnvVars(const char* caller) {
     // load all env-vars
     validateAndSetWhetherGpuSharingIsPermitted(caller);
     validateAndSetDefaultValidationEpsilon(caller);
+    validateAndSetDefaultNumGpuThreadsPerBlock(caller);
 
     // ensure no re-loading
     global_areEnvVarsLoaded = true;
@@ -156,3 +179,10 @@ qreal envvars_getDefaultValidationEpsilon() {
 
     return envvar_values::QUEST_DEFAULT_VALIDATION_EPSILON;
 }
+
+
+int envvars_getDefaultNumGpuThreadsPerBlock() {
+    assertEnvVarsAreLoaded();
+
+    return envvar_values::QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK;
+}
diff --git a/quest/src/core/envvars.hpp b/quest/src/core/envvars.hpp
index 555e76f15..4862e8d08 100644
--- a/quest/src/core/envvars.hpp
+++ b/quest/src/core/envvars.hpp
@@ -15,6 +15,7 @@
 namespace envvar_names { 
     extern std::string QUEST_PERMIT_NODES_TO_SHARE_GPU;
     extern std::string QUEST_DEFAULT_VALIDATION_EPSILON;
+    extern std::string QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK;
 }
 
 
@@ -33,5 +34,7 @@ bool envvars_getWhetherGpuSharingIsPermitted();
 
 qreal envvars_getDefaultValidationEpsilon();
 
+int envvars_getDefaultNumGpuThreadsPerBlock();
+
 
 #endif // ENVVARS_HPP
diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp
index 372c2244b..0d4f7ea16 100644
--- a/quest/src/core/errors.cpp
+++ b/quest/src/core/errors.cpp
@@ -650,6 +650,11 @@ void error_gpuUnexpectedlyInaccessible() {
     raiseInternalError("A function internally assumed (as a precondition) that QuEST was compiled with GPU-acceleration enabled, and that one was physically accessible, though this was untrue.");
 }
 
+void error_gpuNumThreadsPerBlockNotSet() {
+
+    raiseInternalError("A function queried the GPU numThreadsPerBlock before it had been set (intendedly by QuESTEnv initialisation).");
+}
+
 void error_gpuMemSyncQueriedButEnvNotGpuAccelerated() {
 
     raiseInternalError("A function checked whether persistent GPU memory (such as in a CompMatr) had been synchronised, but the QuEST environment is not GPU accelerated.");  
@@ -665,9 +670,10 @@ void error_gpuDenseMatrixConjugatedAndTransposed() {
     raiseInternalError("The GPU + cuQuantum implementation of anyCtrlAnyTargDenseMatr() assumes that at most one of template arguments ApplyConj and ApplyTransp is true, though this was violated.");
 }
 
-void error_gpuBadNumThreadsPerBlock() {
-
-    raiseInternalError("The number of threads per block must be a multiple of 32 on NVIDIA GPUs or a multiple of 64 on AMD GPUs.");
+void assert_gpuNumThreadsPerBlockIsWarpDivisible(int numThreadsPerBlock) {
+    int warpSize = gpu_isHipCompiled()? gpu_HIP_WARP_SIZE : gpu_CUDA_WARP_SIZE;
+    if (numThreadsPerBlock > 0 && numThreadsPerBlock % warpSize != 0)
+        raiseInternalError("The number of threads per block was not a positive multiple of the platform warp size (32 for NVIDIA, 64 for AMD).");
 }
 
 void assert_quregIsGpuAccelerated(Qureg qureg) {
@@ -889,6 +895,16 @@ void error_attemptedToParseRealFromInvalidString() {
     raiseInternalError("A function attempted to parse a string to a qreal but the string was not validly formatted. This should have been caught by prior user validation.");
 }
 
+void error_attemptedToParseIntegerFromInvalidString() {
+
+    raiseInternalError("A function attempted to parse a string to an int but the string was not validly formatted. This should have been caught by prior user validation.");
+}
+
+void error_attemptedToParseOutOfRangeInteger() {
+
+    raiseInternalError("A function attempted to parse a string to an integer but the numerical value of the string literal exceeded the range of the integer. This should have been caught by prior validation.");
+}
+
 void error_attemptedToParseOutOfRangeReal() {
 
     raiseInternalError("A function attempted to parse a string to a qreal but the numerical value of the string literal exceeded the range of the qreal. This should have been caught by prior user validation.");
diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp
index 56a5aaa40..12465917f 100644
--- a/quest/src/core/errors.hpp
+++ b/quest/src/core/errors.hpp
@@ -249,6 +249,8 @@ void error_gpuCopyButMatrixNotGpuAccelerated();
 
 void error_gpuMemSyncQueriedButEnvNotGpuAccelerated();
 
+void error_gpuNumThreadsPerBlockNotSet();
+
 void error_gpuUnexpectedlyInaccessible();
 
 void error_gpuDeadCopyMatrixFunctionCalled();
@@ -261,6 +263,8 @@ void assert_gpuIsAccessible();
 
 void assert_gpuHasBeenBound(bool isBound);
 
+void assert_gpuNumThreadsPerBlockIsWarpDivisible(int numThreadsPerBlock);
+
 void assert_quregIsGpuAccelerated(Qureg qureg);
 
 void assert_mixQuregTempGpuAllocSucceeded(qcomp* gpuPtr);
@@ -367,6 +371,10 @@ void error_attemptedToParseComplexFromInvalidString();
 
 void error_attemptedToParseRealFromInvalidString();
 
+void error_attemptedToParseIntegerFromInvalidString();
+
+void error_attemptedToParseOutOfRangeInteger();
+
 void error_attemptedToParseOutOfRangeReal();
 
 void error_attemptedToParsePauliStringFromInvalidString();
diff --git a/quest/src/core/parser.cpp b/quest/src/core/parser.cpp
index 140d77745..9d9194a3f 100644
--- a/quest/src/core/parser.cpp
+++ b/quest/src/core/parser.cpp
@@ -82,6 +82,9 @@ namespace patterns {
     // full complex; any format, importantly in order of decreasing specificity. do not consult for captured groups
     string num = group(comp) + "|" + group(imag) + "|" + group(real);
 
+    // full signed integer
+    string signedInt = optSign + "[0-9]+";
+
     // no capturing because 'num' pollutes captured groups, and pauli syntax overlaps real integers
     string pauli = "[" + parser_RECOGNISED_PAULI_CHARS + "]";
     string paulis = group(optSpace + pauli + optSpace) + "+";
@@ -96,6 +99,7 @@ namespace regexes {
     regex imag(patterns::imag);
     regex comp(patterns::comp);
     regex num(patterns::num);
+    regex signedInt(patterns::signedInt);
     regex paulis(patterns::paulis);
     regex weightedPaulis(patterns::weightedPaulis);
 }
@@ -173,6 +177,63 @@ int getNumPaulisInLine(string line) {
 
 
 
+/*
+ * INTEGER PARSING
+ */
+
+
+bool parser_isAnySizedInteger(string str) {
+
+    smatch match;
+    return regex_match(str, match, regexes::signedInt);
+}
+
+
+bool parser_isValidInteger(string str) {
+
+    // reject str if it doesn't match regex
+    if (!parser_isAnySizedInteger(str))
+        return false;
+
+    // remove whitespace which stoi() below cannot handle after the sign
+    removeWhiteSpace(str);
+
+    // check number is in-range of int via duck-typing
+    try {
+        std::stoi(str);
+    } catch (const out_of_range&) {
+        return false;
+
+    // error if our regex permitted an unparsable string
+    } catch (const invalid_argument&) {
+        error_attemptedToParseIntegerFromInvalidString();
+    }
+
+    return true;
+}
+
+
+int parser_parseInteger(string str) {
+
+    if (!parser_isValidInteger(str))
+        error_attemptedToParseIntegerFromInvalidString();
+
+    removeWhiteSpace(str); // stoi can't handle
+
+    try {
+        return std::stoi(str);
+    } catch (const invalid_argument&) {
+        error_attemptedToParseIntegerFromInvalidString();
+    } catch (const out_of_range&) {
+        error_attemptedToParseOutOfRangeInteger();
+    }
+
+    // unreachable
+    return -1;
+}
+
+
+
 /*
  * REAL NUMBER PARSING
  */
diff --git a/quest/src/core/parser.hpp b/quest/src/core/parser.hpp
index 4a9df2d02..3d34588ae 100644
--- a/quest/src/core/parser.hpp
+++ b/quest/src/core/parser.hpp
@@ -20,12 +20,16 @@ using std::string;
  * PARSING NUMBERS
  */
 
+bool parser_isAnySizedInteger(string str);
+bool parser_isValidInteger(string str);
+
 bool parser_isAnySizedReal(string str);
 bool parser_isAnySizedComplex(string str);
 
 bool parser_isValidReal(string str);
 bool parser_isValidComplex(string str);
 
+int parser_parseInteger(string str);
 qreal parser_parseReal(string str);
 qcomp parser_parseComplex(string str);
 
diff --git a/quest/src/core/validation.cpp b/quest/src/core/validation.cpp
index c727ad1c5..30119f2dd 100644
--- a/quest/src/core/validation.cpp
+++ b/quest/src/core/validation.cpp
@@ -159,6 +159,31 @@ namespace report {
     string INVALID_REPORTED_PAULI_STR_STYLE_FLAG =
         "Given an unrecognised style flag (${FLAG}). Legal flags are 0 and 1.";
 
+    // substrings re-used below
+    string _invalid_num_tpb_prefix =
+        "An invalid number of GPU threads per block (${NUM_TPB}) was passed, or specified via environment variable " + envvar_names::QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK + ", or compiled into the QuEST library through the CMake option of the same name.";
+    string _num_tpb_warp_indivisible_infix =
+        "The specified number does not divide evenly into the warp size of ${CUDA_WARP_SIZE} (NVIDIA GPUs) or ${HIP_WARP_SIZE} (AMD GPUs).";
+    string _num_tpb_warp_negative_infix =
+        "The specified number must be positive.";
+    string _num_tpb_ineffectual_suffix =
+        "Note GPU acceleration is not active so this parameter has no effect anyway.";
+
+    string GPU_NUM_THREADS_PER_BLOCK_IS_NOT_POSITIVE =
+        _invalid_num_tpb_prefix + " " + _num_tpb_warp_negative_infix;
+
+    string GPU_NUM_THREADS_PER_BLOCK_IS_NOT_POSITIVE_BUT_GPU_NOT_ACTIVE_ANYWAY =
+        _invalid_num_tpb_prefix + " " + _num_tpb_warp_negative_infix + " " + _num_tpb_ineffectual_suffix;
+
+    string GPU_NUM_THREADS_PER_BLOCK_IS_NOT_WARP_DIVISIBLE =
+        _invalid_num_tpb_prefix + " " + _num_tpb_warp_indivisible_infix;
+
+    string GPU_NUM_THREADS_PER_BLOCK_IS_NOT_WARP_DIVISIBLE_BUT_GPU_NOT_AVAILABLE_ANYWAY =
+        _invalid_num_tpb_prefix + " " + _num_tpb_warp_indivisible_infix + " " + _num_tpb_ineffectual_suffix;
+
+    string GPU_NUM_THREADS_PER_BLOCK_EXCEEDS_HARDWARE_MAX =
+        _invalid_num_tpb_prefix + " Exceeds the hardware-imposed maximum of ${MAX_TPB}.";
+
 
     /*
      * QUREG CREATION
@@ -1147,6 +1172,13 @@ namespace report {
 
     string DEFAULT_EPSILON_ENV_VAR_IS_NEGATIVE =
         "The optional '" + envvar_names::QUEST_DEFAULT_VALIDATION_EPSILON + "' environment variable was negative. The value must be zero or positive.";
+
+    string DEFAULT_NUM_GPU_THREADS_PER_BLOCK_ENV_VAR_NOT_AN_INT =
+        "The optional '" + envvar_names::QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK + "' environment variable was not a recognisable integer.";
+
+    string DEFAULT_NUM_GPU_THREADS_PER_BLOCK_ENV_VAR_EXCEEDS_INT_RANGE =
+        "The optional '" + envvar_names::QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK + "' environment variable was larger (in magnitude) than the maximum value which can be stored in an integer.";
+
 }
 
 
@@ -1647,6 +1679,49 @@ void validate_reportedPauliStrStyleFlag(int flag, const char* caller) {
     assertThat(flag==0 || flag==1, report::INVALID_REPORTED_PAULI_STR_STYLE_FLAG, {{"${FLAG}",flag}}, caller);
 }
 
+void validate_numGpuThreadsPerBlock(int numTPB, bool isGpuActive, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    // var 'isGpuActive' indicates that the GPU backend is compiled, a physical
+    // GPU is available, AND that the QuESTEnv has GPU-acceleration enabled, i.e.
+    // isGPuActive = gpu_isGpuCompiled() && gpu_isGpuAvailable() && env.isGpuAccelerated,
+    // though is established before QuESTEnv initialisation has completed.
+
+    // validate numTPB > 0 with an error message that points out TPB may be redundant
+    tokenSubs vars = {{"${NUM_TPB}", numTPB}};
+    auto errorMsg = isGpuActive? 
+        report::GPU_NUM_THREADS_PER_BLOCK_IS_NOT_POSITIVE :
+        report::GPU_NUM_THREADS_PER_BLOCK_IS_NOT_POSITIVE_BUT_GPU_NOT_ACTIVE_ANYWAY;
+    assertThat(numTPB > 0, errorMsg, vars, caller);
+
+    // prepare to validate TPB is warp-divisible, again pointing out redundancy...
+    vars["${CUDA_WARP_SIZE}"] = gpu_CUDA_WARP_SIZE;
+    vars["${HIP_WARP_SIZE}"] = gpu_HIP_WARP_SIZE;
+    errorMsg = isGpuActive? 
+        report::GPU_NUM_THREADS_PER_BLOCK_IS_NOT_WARP_DIVISIBLE :
+        report::GPU_NUM_THREADS_PER_BLOCK_IS_NOT_WARP_DIVISIBLE_BUT_GPU_NOT_AVAILABLE_ANYWAY;
+
+    // ... but note that when the GPU backend isn't compiled, we don't know whether the
+    // user has an NVIDIA or AMD GPU, which have distinct warps of 32 (CUDA) and 64 (HIP),
+    // and so choose the smaller divisor (32,CUDA), ergo potentially permitting warp TPB
+    // that are incompatible with HIP. An extremely unimportant subtlety!
+    static_assert(gpu_HIP_WARP_SIZE >= gpu_CUDA_WARP_SIZE);
+    int warpSize = gpu_isHipCompiled()? gpu_HIP_WARP_SIZE : gpu_CUDA_WARP_SIZE;
+    assertThat(numTPB % warpSize == 0, errorMsg, vars, caller);
+
+    // the final check of max numTBP requires querying the hardware device, which obviously
+    // isn't possible if not available (and is pointless if available but we're not using!)
+    if (!isGpuActive)
+        return;
+
+    // otherwise, we verify numTPB doesn't exceed the hardware-declared maximum
+    auto maxNumTPB = gpu_getMaxNumThreadsPerBlock();
+    vars = {{"${NUM_TPB}", numTPB}, {"${MAX_TPB}", maxNumTPB}};
+    assertThat(numTPB <= maxNumTPB, report::GPU_NUM_THREADS_PER_BLOCK_EXCEEDS_HARDWARE_MAX, vars, caller);
+}
+
 
 
 /*
@@ -4991,6 +5066,9 @@ void validate_tempAllocSucceeded(bool succeeded, size_t numBytes, const char* ca
 
 void validate_envVarPermitNodesToShareGpu(string varValue, const char* caller) {
 
+    // this presently does absolutely nothing; environment variables are
+    // loaded during QuESTEnv initialisation, before which there is no
+    // way to disable validation... but we keep for clarity/consistency!
     if (!global_isValidationEnabled)
         return;
 
@@ -5002,6 +5080,9 @@ void validate_envVarPermitNodesToShareGpu(string varValue, const char* caller) {
 
 void validate_envVarDefaultValidationEpsilon(string varValue, const char* caller) {
 
+    // this presently does absolutely nothing; environment variables are
+    // loaded during QuESTEnv initialisation, before which there is no
+    // way to disable validation... but we keep for clarity/consistency!
     if (!global_isValidationEnabled)
         return;
 
@@ -5011,3 +5092,17 @@ void validate_envVarDefaultValidationEpsilon(string varValue, const char* caller
     qreal eps = parser_parseReal(varValue);
     assertThat(eps >= 0, report::DEFAULT_EPSILON_ENV_VAR_IS_NEGATIVE, caller);
 }
+
+void validate_envVarDefaultNumGpuThreadsPerBlockIsAnInt(string varValue, const char* caller) {
+
+    // this presently does absolutely nothing; environment variables are
+    // loaded during QuESTEnv initialisation, before which there is no
+    // way to disable validation... but we keep for clarity/consistency!
+    if (!global_isValidationEnabled)
+        return;
+
+    // we here only validate that the value is a valid signed integer;
+    // validation of its GPU-compatibility is performed by another func
+    assertThat(parser_isAnySizedInteger(varValue), report::DEFAULT_NUM_GPU_THREADS_PER_BLOCK_ENV_VAR_NOT_AN_INT, caller);
+    assertThat(parser_isValidInteger(varValue), report::DEFAULT_NUM_GPU_THREADS_PER_BLOCK_ENV_VAR_EXCEEDS_INT_RANGE, caller);
+}
diff --git a/quest/src/core/validation.hpp b/quest/src/core/validation.hpp
index 787316326..58a0b632f 100644
--- a/quest/src/core/validation.hpp
+++ b/quest/src/core/validation.hpp
@@ -113,6 +113,8 @@ void validate_numPauliChars(const char* paulis, const char* caller);
 
 void validate_reportedPauliStrStyleFlag(int flag, const char* caller);
 
+void validate_numGpuThreadsPerBlock(int numTBP, bool isGpuActive, const char* caller);
+
 
 
 /*
@@ -554,6 +556,8 @@ void validate_envVarPermitNodesToShareGpu(string varValue, const char* caller);
 
 void validate_envVarDefaultValidationEpsilon(string varValue, const char* caller);
 
+void validate_envVarDefaultNumGpuThreadsPerBlockIsAnInt(string varValue, const char* caller);
+
 
 
 #endif // VALIDATION_HPP
\ No newline at end of file
diff --git a/quest/src/gpu/gpu_config.cpp b/quest/src/gpu/gpu_config.cpp
index 19c0233bb..fc68969ad 100644
--- a/quest/src/gpu/gpu_config.cpp
+++ b/quest/src/gpu/gpu_config.cpp
@@ -335,27 +335,41 @@ qindex gpu_getMaxNumConcurrentThreads() {
  * ENVIRONMENT MANAGEMENT
  */
 
-int global_numThreadsPerBlock = QUEST_DEFAULT_NUM_THREADS_PER_BLOCK;
+
+// the default numTPB is not known until runtime since the initial value
+// (provided either by the CMake var, or the environment variable) must
+// be validated during QuEST initialisation.
+static int global_numThreadsPerBlock = -1;
+
 
 int gpu_getNumThreadsPerBlock() {
-    // permitted even when GPU backend not compiled
+    if (global_numThreadsPerBlock == -1)
+        error_gpuNumThreadsPerBlockNotSet();
+
     return global_numThreadsPerBlock;
 }
 
-void gpu_setNumThreadsPerBlock(const int newNumThreadsPerBlock) {
-    if (gpu_isHipCompiled()) {
-        // number of threads per block should be a multiple of 64
-        if (newNumThreadsPerBlock % 64)
-            error_gpuBadNumThreadsPerBlock();
-    } else {
-        // number of threads per block should be a multiple of 32
-        if (newNumThreadsPerBlock % 32)
-            error_gpuBadNumThreadsPerBlock();
-    }
 
-    // permitted even when GPU backend not compiled
-    global_numThreadsPerBlock = newNumThreadsPerBlock;
-    return;
+void gpu_setNumThreadsPerBlock(int newNumTPB) {
+#if QUEST_COMPILE_CUDA
+    assert_gpuNumThreadsPerBlockIsWarpDivisible(newNumTPB); // CUDA vs HIP specific
+#endif
+
+    global_numThreadsPerBlock = newNumTPB;
+}
+
+
+int gpu_getMaxNumThreadsPerBlock() {
+#if QUEST_COMPILE_CUDA
+
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, getBoundGpuId());
+    return prop.maxThreadsPerBlock; // HIP compatible
+
+#else
+    error_gpuQueriedButGpuNotCompiled();
+    return -1;
+#endif
 }
 
 
diff --git a/quest/src/gpu/gpu_config.hpp b/quest/src/gpu/gpu_config.hpp
index e95c9f4f7..98cb9c8a3 100644
--- a/quest/src/gpu/gpu_config.hpp
+++ b/quest/src/gpu/gpu_config.hpp
@@ -19,6 +19,16 @@
 #include "quest/include/channels.h"
 
 
+
+/*
+ * CONSTANTS
+ */
+
+constexpr int gpu_CUDA_WARP_SIZE = 32;
+constexpr int gpu_HIP_WARP_SIZE = 64;
+
+
+
 /*
  * CUDA ERROR HANDLING
  */
@@ -68,7 +78,9 @@ qindex gpu_getMaxNumConcurrentThreads();
 
 int gpu_getNumThreadsPerBlock();
 
-void gpu_setNumThreadsPerBlock(const int newThreadsPerBlock);
+void gpu_setNumThreadsPerBlock(int newThreadsPerBlock);
+
+int gpu_getMaxNumThreadsPerBlock();
 
 void gpu_bindLocalGPUsToNodes();
 
diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh
index 16af56621..b6954f701 100644
--- a/quest/src/gpu/gpu_kernels.cuh
+++ b/quest/src/gpu/gpu_kernels.cuh
@@ -301,7 +301,11 @@ __global__ void kernel_statevec_anyCtrlFewTargDenseMatr(
     // must be strictly through compile-time-known indices, otherwise it will auto-
     // spill to local memory). Hence, this _subA() function is not a subroutine 
     // despite some logic being common to non-compile-time _subB(), and hence
-    // why the loops below are explicitly compile-time unrolled
+    // why the loops below are explicitly compile-time unrolled. Beware that when
+    // numThreadsPerBlock is increased from 128, this kernel will still behave
+    // correctly, but privateCache below will spill over into local memory at a
+    // performance penalty for NumTargs <= 5, with spillage occurring for fewer
+    // NumTargs as numThreadsPerBlock increases.
     REGISTER gpu_qcomp privateCache[1 << NumTargs];
 
     // we know NumTargs <= 5, though NumCtrls is permitted anything (including -1)
diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp
index b994f46e0..9b8e819b5 100644
--- a/quest/src/gpu/gpu_subroutines.cpp
+++ b/quest/src/gpu/gpu_subroutines.cpp
@@ -462,9 +462,12 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, ConstList64 ctrls, Co
     if constexpr (NumTargs != -1) {
 
         // when NumTargs <= 5, each thread has a private array stored in the registers,
-        // enabling rapid IO. Given numThreadsPerBlock = 128, the maximum size of 
-        // this array per-block is 16 * 128 * 2^5 B = 64 KiB which exceeds shared
-        // memory capacity, but does NOT exceed maximum register capacity.
+        // enabling rapid IO. When using the default numThreadsPerBlock = 128, the max
+        // size of this array per-block is 16 * 128 * 2^5 B = 64 KiB which exceeds shared
+        // memory capacity, but does NOT exceed maximum register capacity. When the user
+        // increases numThreadsPerBlock, the thread-private array in the below kernel
+        // will spill from registers into local memory, degrading performance, but
+        // behaving correctly and stably.
 
         /// @todo
         /// We should really check the above claims, otherwise the thread-private arrays could
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index d617ba8df..59341759f 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -7,6 +7,7 @@ target_sources(tests
   debug.cpp
   decoherence.cpp
   environment.cpp
+  experimental.cpp
   initialisations.cpp
   matrices.cpp
   multiplication.cpp
diff --git a/tests/unit/decoherence.cpp b/tests/unit/decoherence.cpp
index f36c491bb..60b4cd640 100644
--- a/tests/unit/decoherence.cpp
+++ b/tests/unit/decoherence.cpp
@@ -38,7 +38,8 @@ using std::vector;
  */
 
 
-#define TEST_CATEGORY "[unit][decoherence]"
+#define TEST_CATEGORY \
+    LABEL_UNIT_TAG "[decoherence]"
 
 
 void TEST_ON_CACHED_QUREGS(auto apiFunc, vector<int> targs, vector<qmatrix> kraus) {
diff --git a/tests/unit/environment.cpp b/tests/unit/environment.cpp
index ee259e220..9ecf8e376 100644
--- a/tests/unit/environment.cpp
+++ b/tests/unit/environment.cpp
@@ -178,40 +178,6 @@ TEST_CASE( "getQuESTEnv", TEST_CATEGORY ) {
 }
 
 
-TEST_CASE( "QuESTNumGpuThreadsPerBlock", TEST_CATEGORY ) {
-
-    SECTION( LABEL_CORRECTNESS ) {
-        // Check that it initially matches the compile time value
-        // stored in config.h
-        REQUIRE(getQuESTNumGpuThreadsPerBlock() == QUEST_DEFAULT_NUM_THREADS_PER_BLOCK);
-
-        // try a set/get iteration
-        const int test_num_tpb = 64;
-        REQUIRE_NOTHROW(setQuESTNumGpuThreadsPerBlock(test_num_tpb));
-        REQUIRE(getQuESTNumGpuThreadsPerBlock() == test_num_tpb);
-
-        // set it back to the original and confirm that also worked
-        REQUIRE_NOTHROW(setQuESTNumGpuThreadsPerBlock(QUEST_DEFAULT_NUM_THREADS_PER_BLOCK));
-        REQUIRE(getQuESTNumGpuThreadsPerBlock() == QUEST_DEFAULT_NUM_THREADS_PER_BLOCK);
-
-    }
-
-    SECTION( LABEL_VALIDATION ) {
-
-        // The way the error-handling currently works, Catch2 can't catch these (ironically)
-        // but leaving them in case we ever update the way errors are done.
-        
-        SECTION( "Less than 32" ) {
-            //REQUIRE_THROWS_WITH( setQuESTNumGpuThreadsPerBlock(31) , ContainsSubstring("number of threads per block") );
-        }
-
-        SECTION("Not a multiple of 32 or 64.") {
-            //REQUIRE_THROWS_WITH( setQuESTNumGpuThreadsPerBlock(94) , ContainsSubstring("number of threads per block") );
-        }
-
-    }
-}
-
 /** @} (end defgroup) */
 
 
diff --git a/tests/unit/experimental.cpp b/tests/unit/experimental.cpp
new file mode 100644
index 000000000..b36f67ad1
--- /dev/null
+++ b/tests/unit/experimental.cpp
@@ -0,0 +1,133 @@
+/** @file
+ * Unit tests of the environment module.
+ *
+ * @author Oliver Brown
+ * @author Tyson Jones
+ * 
+ * @defgroup unitexperi Experimental
+ * @ingroup unittests
+ */
+
+#include "quest.h"
+
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/matchers/catch_matchers_string.hpp>
+#include <catch2/generators/catch_generators_range.hpp>
+
+#include "tests/utils/macros.hpp"
+#include "tests/utils/config.hpp"
+
+using Catch::Matchers::ContainsSubstring;
+
+
+
+/*
+ * UTILITIES
+ */
+
+#define TEST_CATEGORY \
+    LABEL_UNIT_TAG "[experimental]"
+
+
+
+/** 
+ * TESTS
+ * 
+ * @ingroup unitexperi
+ * @{
+ */
+
+
+TEST_CASE( "setQuESTNumGpuThreadsPerBlock", TEST_CATEGORY ) {
+
+    // remember the default number for later restoration (hence static)
+    static int initNumTPB = getQuESTNumGpuThreadsPerBlock();
+
+    SECTION( LABEL_CORRECTNESS ) {
+
+        // begin at 64 (AMD min, larger than NVIDIA min of 32),
+        // stop at 1024 (should be less than dev-specific max)
+        int inNumTPB = GENERATE( 64, 128, 256, 512, 1024 ); 
+        setQuESTNumGpuThreadsPerBlock(inNumTPB);
+
+        int outNumTPB = getQuESTNumGpuThreadsPerBlock();
+        REQUIRE( inNumTPB == outNumTPB );
+        
+        // BEWARE that we do not here test whether all QuEST
+        // operators succeed with the various numTBP; that must
+        // be ad hoc asssesed via updating the numTBP env-var
+        // before launching the entirety of the tests
+    }
+
+    SECTION( LABEL_VALIDATION ) {
+
+        SECTION( "Negative" ) {
+
+            int badNumTPB = GENERATE( 0, -1, -9999 );
+            REQUIRE_THROWS_WITH( setQuESTNumGpuThreadsPerBlock(badNumTPB), ContainsSubstring( "must be positive" ) );
+        }
+
+        SECTION( "Indivisible by warp size" ) {
+
+            // If HIP status was attached to QuESTEnv, we could do:
+            //     QuESTEnv env = getQuESTEnv();
+            //     int warpSize = (env.isGpuAccelerated && env.isHipCompiled)? 64 : 32;
+            // Since this currently isn't the case, we assume a warp size of 32,
+            // which will mean when this test is run on AMD GPUs, the below tested
+            // badNumTBP won't be as interestingly/rigorously spread
+            int warpSize = 32;
+
+            int badNumTPB = GENERATE_COPY( warpSize - 1, warpSize + 1, warpSize + warpSize/2, 3*warpSize + warpSize/2 );
+
+            REQUIRE_THROWS_WITH( setQuESTNumGpuThreadsPerBlock(badNumTPB), ContainsSubstring( "Number does not divide evenly into the warp size" ) );
+        }
+
+        SECTION( "Exceeds device maximum" ) {
+
+            int badNumTPB = 999999; // exceeds expected 1024 max
+
+            // Cannot be tested (since validation not imposed) when GPU is not actively used
+            if (getQuESTEnv().isGpuAccelerated)
+                REQUIRE_THROWS_WITH( setQuESTNumGpuThreadsPerBlock(badNumTPB), ContainsSubstring( "Exceeds the hardware-imposed maximum" ) );
+
+            SUCCEED( );
+        }
+    }
+
+    // restore numTBP, so as not to interfere with other tests
+    setQuESTNumGpuThreadsPerBlock(initNumTPB);
+}
+
+
+TEST_CASE( "getQuESTNumGpuThreadsPerBlock", TEST_CATEGORY ) {
+
+    SECTION( LABEL_CORRECTNESS ) {
+
+        // check initial value matches either the env-var (if set),
+        // or the fixed default in the codebase (hardcoded in test utils)
+        int defaultNum = getDefaultNumGpuThreadsPerBlock(); // test util via env-var
+        int reportedNum = getQuESTNumGpuThreadsPerBlock();  // QuEST API
+
+        REQUIRE( defaultNum == reportedNum );
+
+        // further testing of this function appears in setQuESTNumGpuThreadsPerBlock()
+    }
+
+    SECTION( LABEL_VALIDATION ) {
+
+        // there is none (except untestable env is init!)
+        SUCCEED( );
+    }
+}
+
+
+/** @} (end defgroup) */
+
+
+
+/**
+ * @todo
+ * UNTESTED FUNCTIONS
+ */
+
+// nothing! :^)
diff --git a/tests/utils/config.cpp b/tests/utils/config.cpp
index 30a3844ba..d8eeab605 100644
--- a/tests/utils/config.cpp
+++ b/tests/utils/config.cpp
@@ -40,9 +40,7 @@ int getIntEnvVarValueOrDefault(string name, int defaultValue) {
 
 
 /*
- * PUBLIC
- *
- * which each call std::getenv only once
+ * PUBLIC TEST ENV VARS
  */
 
 int getNumQubitsInUnitTestedQuregs() {
@@ -74,3 +72,20 @@ bool getWhetherToTestAllDeployments() {
     static bool value = getIntEnvVarValueOrDefault("QUEST_TEST_TRY_ALL_DEPLOYMENTS", 1);
     return value;
 }
+
+
+
+/*
+ * PUBLIC QUEST ENV VARS
+ */
+
+int getDefaultNumGpuThreadsPerBlock() {
+
+    // when the env-var is not present, we MUST return the default assumed by the QuEST src code,
+    // which at the time of writing, is a fixed 128 (rather than hardware-specific value)
+    const int compileTimeDefaultTPB = 128;
+
+    // when the env-var is present, we consult that, just like QuEST
+    static int value = getIntEnvVarValueOrDefault("QUEST_NUM_GPU_THREADS_PER_BLOCK", compileTimeDefaultTPB);
+    return value;
+}
diff --git a/tests/utils/config.hpp b/tests/utils/config.hpp
index 10a61f67a..80be56e01 100644
--- a/tests/utils/config.hpp
+++ b/tests/utils/config.hpp
@@ -82,12 +82,16 @@
  * ACCESSING ENV-VARS 
  */
 
+// test env-vars
 int getNumQubitsInUnitTestedQuregs();
 int getMaxNumTestedQubitPermutations();
 int getMaxNumTestedSuperoperatorTargets();
 int getNumTestedMixedDeploymentRepetitions();
 bool getWhetherToTestAllDeployments();
 
+// quest env-vars
+int getDefaultNumGpuThreadsPerBlock();
+
 
 #endif // CONFIG_PP