diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml
index 583749df4..0cec48613 100644
--- a/.github/workflows/audit.yml
+++ b/.github/workflows/audit.yml
@@ -50,9 +50,9 @@ jobs:
         run: >
           cmake -B ${{ env.build_dir }}
           -DCMAKE_CXX_COMPILER=clang++
-          -DENABLE_TESTING=ON
-          -DENABLE_MULTITHREADING=OFF
-          -DFLOAT_PRECISION=${{ matrix.precision }}
+          -DQUEST_BUILD_TESTS=ON
+          -DQUEST_ENABLE_OMP=OFF
+          -DQUEST_FLOAT_PRECISION=${{ matrix.precision }}
           -DCMAKE_CXX_FLAGS="${{ env.sanitiser_flags }}"
           -DCMAKE_EXE_LINKER_FLAGS="${{ env.sanitiser_flags }}"
 
@@ -92,9 +92,9 @@ jobs:
       - name: Configure CMake
         run: >
           cmake -B ${{ env.build_dir }}
-          -DENABLE_TESTING=ON
-          -DENABLE_MULTITHREADING=OFF
-          -DFLOAT_PRECISION=${{ matrix.precision }}
+          -DQUEST_BUILD_TESTS=ON
+          -DQUEST_ENABLE_OMP=OFF
+          -DQUEST_FLOAT_PRECISION=${{ matrix.precision }}
 
       - name: Compile QuEST
         run: cmake --build ${{ env.build_dir }} --parallel
@@ -147,8 +147,8 @@ jobs:
         run: >
           cmake -B .
           -DCMAKE_BUILD_TYPE=Release
-          -DENABLE_TESTING=ON
-          -DENABLE_MULTITHREADING=OFF
+          -DQUEST_BUILD_TESTS=ON
+          -DQUEST_ENABLE_OMP=OFF
           -DCMAKE_CXX_FLAGS="--coverage"
           -DCMAKE_EXE_LINKER_FLAGS="--coverage"
 
diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml
index 0950b7dbb..c86de84f1 100644
--- a/.github/workflows/compile.yml
+++ b/.github/workflows/compile.yml
@@ -239,16 +239,16 @@ jobs:
       - name: Configure CMake
         run: >
           cmake -B ${{ env.build_dir }}
-          -DBUILD_EXAMPLES=ON
-          -DENABLE_TESTING=ON
-          -DFLOAT_PRECISION=${{ matrix.precision }}
-          -DENABLE_DEPRECATED_API=${{ matrix.deprecated }}
-          -DDISABLE_DEPRECATION_WARNINGS=${{ matrix.deprecated }}
-          -DENABLE_MULTITHREADING=${{ matrix.omp }}
-          -DENABLE_DISTRIBUTION=${{ matrix.mpi }}
-          -DENABLE_CUDA=${{ matrix.cuda }}
-          -DENABLE_HIP=${{ matrix.hip }}
-          -DENABLE_CUQUANTUM=${{ matrix.cuquantum }}
+          -DQUEST_BUILD_EXAMPLES=ON
+          -DQUEST_BUILD_TESTS=ON
+          -DQUEST_FLOAT_PRECISION=${{ matrix.precision }}
+          -DQUEST_ENABLE_DEPRECATED_API=${{ matrix.deprecated }}
+          -DQUEST_DISABLE_DEPRECATION_WARNINGS=${{ matrix.deprecated }}
+          -DQUEST_ENABLE_OMP=${{ matrix.omp }}
+          -DQUEST_ENABLE_MPI=${{ matrix.mpi }}
+          -DQUEST_ENABLE_CUDA=${{ matrix.cuda }}
+          -DQUEST_ENABLE_HIP=${{ matrix.hip }}
+          -DQUEST_ENABLE_CUQUANTUM=${{ matrix.cuquantum }}
           -DCMAKE_CUDA_ARCHITECTURES=${{ env.cuda_arch }}
           -DCMAKE_HIP_ARCHITECTURES=${{ env.hip_arch }}
           -DCMAKE_CXX_COMPILER=${{ matrix.compiler }}
diff --git a/.github/workflows/test_free.yml b/.github/workflows/test_free.yml
index e0837bfde..2d332e842 100644
--- a/.github/workflows/test_free.yml
+++ b/.github/workflows/test_free.yml
@@ -63,11 +63,11 @@ jobs:
       - name: Configure CMake
         run: >
           cmake -B ${{ env.build_dir }}
-          -DENABLE_TESTING=ON
-          -DENABLE_MULTITHREADING=OFF
-          -DENABLE_DEPRECATED_API=${{ matrix.version == 3 && 'ON' || 'OFF' }}
-          -DDISABLE_DEPRECATION_WARNINGS=${{ matrix.version == 3 && 'ON' || 'OFF' }}
-          -DFLOAT_PRECISION=${{ matrix.precision }}
+          -DQUEST_BUILD_TESTS=ON
+          -DQUEST_ENABLE_OMP=OFF
+          -DQUEST_ENABLE_DEPRECATED_API=${{ matrix.version == 3 && 'ON' || 'OFF' }}
+          -DQUEST_DISABLE_DEPRECATION_WARNINGS=${{ matrix.version == 3 && 'ON' || 'OFF' }}
+          -DQUEST_FLOAT_PRECISION=${{ matrix.precision }}
 
       # force 'Release' build (needed by MSVC to enable optimisations)
       - name: Compile
@@ -80,11 +80,11 @@ jobs:
       # are manually excluding each integration test by name
       - name: Run v4 tests
         if: ${{ matrix.version == 4 }}
-        run: ctest -j2 --output-on-failure --schedule-random -E "density evolution"
+        run: ctest -j2 --output-on-failure --schedule-random -C Release -E "density evolution"
         working-directory: ${{ env.build_dir }}
 
       # run v3 unit tests in random order
       - name: Run v3 tests
         if: ${{ matrix.version == 3 }}
-        run: ctest -j2 --output-on-failure --schedule-random
+        run: ctest -j2 --output-on-failure --schedule-random -C Release
         working-directory: ${{ env.depr_dir }}
diff --git a/.github/workflows/test_paid.yml b/.github/workflows/test_paid.yml
index 070592399..63518c90a 100644
--- a/.github/workflows/test_paid.yml
+++ b/.github/workflows/test_paid.yml
@@ -136,16 +136,16 @@ jobs:
       - name: Configure CMake
         run: >
           cmake -B ${{ env.build_dir }}
-          -DENABLE_TESTING=ON
-          -DFLOAT_PRECISION=${{ matrix.precision }}
-          -DENABLE_DEPRECATED_API=${{ matrix.version == 3 && 'ON' || 'OFF' }}
-          -DENABLE_MULTITHREADING=${{ matrix.omp }}
-          -DENABLE_DISTRIBUTION=${{ matrix.mpi }}
-          -DENABLE_CUDA=${{ matrix.cuda }}
-          -DENABLE_CUQUANTUM=${{ matrix.cuquantum }}
+          -DQUEST_BUILD_TESTS=ON
+          -DQUEST_FLOAT_PRECISION=${{ matrix.precision }}
+          -DQUEST_ENABLE_DEPRECATED_API=${{ matrix.version == 3 && 'ON' || 'OFF' }}
+          -DQUEST_ENABLE_OMP=${{ matrix.omp }}
+          -DQUEST_ENABLE_MPI=${{ matrix.mpi }}
+          -DQUEST_ENABLE_CUDA=${{ matrix.cuda }}
+          -DQUEST_ENABLE_CUQUANTUM=${{ matrix.cuquantum }}
           -DCMAKE_CUDA_ARCHITECTURES=${{ env.cuda_arch }}
-          -DTEST_ALL_DEPLOYMENTS=${{ env.test_all_deploys }}
-          -DTEST_MAX_NUM_QUBIT_PERMUTATIONS=${{ env.num_qubit_perms }}
+          -DQUEST_TEST_TRY_ALL_DEPLOYMENTS=${{ env.test_all_deploys }}
+          -DQUEST_TEST_MAX_NUM_QUBIT_PERMUTATIONS=${{ env.num_qubit_perms }}
 
       - name: Compile
         run: cmake --build ${{ env.build_dir }} --parallel
@@ -153,8 +153,8 @@ jobs:
       # specifying only env-vars with non-default values
       - name: Configure tests with environment variables
         run: | 
-          echo "TEST_MAX_NUM_QUBIT_PERMUTATIONS=${{ env.num_qubit_perms }}" >> $GITHUB_ENV
-          echo "TEST_ALL_DEPLOYMENTS=${{ env.test_all_deploys }}" >> $GITHUB_ENV
+          echo "QUEST_TEST_MAX_NUM_QUBIT_PERMUTATIONS=${{ env.num_qubit_perms }}" >> $GITHUB_ENV
+          echo "QUEST_TEST_TRY_ALL_DEPLOYMENTS=${{ env.test_all_deploys }}" >> $GITHUB_ENV
 
       # cannot use ctests when distributed, grr!
       - name: Run multithreaded + distributed v4 tests (16 nodes, 4 threads eeach)
@@ -264,13 +264,13 @@ jobs:
       - name: Configure CMake
         run: >
           cmake -B ${{ env.build_dir }}
-          -DENABLE_TESTING=ON
-          -DFLOAT_PRECISION=${{ matrix.precision }}
-          -DENABLE_DEPRECATED_API=${{ matrix.version == 3 && 'ON' || 'OFF' }}
-          -DENABLE_MULTITHREADING=${{ matrix.omp }}
-          -DENABLE_DISTRIBUTION=${{ matrix.mpi }}
-          -DENABLE_CUDA=${{ matrix.cuda }}
-          -DENABLE_CUQUANTUM=${{ matrix.cuquantum }}
+          -DQUEST_BUILD_TESTS=ON
+          -DQUEST_FLOAT_PRECISION=${{ matrix.precision }}
+          -DQUEST_ENABLE_DEPRECATED_API=${{ matrix.version == 3 && 'ON' || 'OFF' }}
+          -DQUEST_ENABLE_OMP=${{ matrix.omp }}
+          -DQUEST_ENABLE_MPI=${{ matrix.mpi }}
+          -DQUEST_ENABLE_CUDA=${{ matrix.cuda }}
+          -DQUEST_ENABLE_CUQUANTUM=${{ matrix.cuquantum }}
           -DCMAKE_CUDA_ARCHITECTURES=${{ env.cuda_arch }}
           -DCMAKE_CXX_FLAGS=${{ matrix.mpi == 'ON' && matrix.cuda == 'ON' && '-fno-lto' || '' }}
 
@@ -280,9 +280,9 @@ jobs:
       # specify only env-vars with non-default values
       - name: Configure tests with environment variables
         run: | 
-          echo "TEST_ALL_DEPLOYMENTS=${{ env.test_all_deploys }}" >> $GITHUB_ENV
-          echo "TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS=${{ env.test_repetitions }}" >> $GITHUB_ENV
-          echo "PERMIT_NODES_TO_SHARE_GPU=${{ env.mpi_share_gpu }}" >> $GITHUB_ENV
+          echo "QUEST_TEST_TRY_ALL_DEPLOYMENTS=${{ env.test_all_deploys }}" >> $GITHUB_ENV
+          echo "QUEST_TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS=${{ env.test_repetitions }}" >> $GITHUB_ENV
+          echo "QUEST_PERMIT_NODES_TO_SHARE_GPU=${{ env.mpi_share_gpu }}" >> $GITHUB_ENV
 
       # cannot use ctests when distributed, grr!
       - name: Run GPU + distributed v4 mixed tests (4 nodes sharing 1 GPU)
diff --git a/AUTHORS.txt b/AUTHORS.txt
index 907135679..b06846df8 100644
--- a/AUTHORS.txt
+++ b/AUTHORS.txt
@@ -44,6 +44,8 @@ Dr Ian Bush [consultant]
     HPC
 
 External contributors:
+Íñigo Aréjula Aísa
+    patched validation error in the experimental user-owned MPI interface (#722)
 Daniel Expósito Patiño
     patched the applyMultiStateControlledSqrtSwap C++ signature (#737)
 Diogo Pratas Maia
@@ -70,8 +72,8 @@ SchineCompton
     patched GPU Cmake Release build
 Christopher J. Anders
-    patched Cmake build when multhithreading defaults off
-    revsied Cmake min version for GPU build
+    patched Cmake build when multithreading defaults off
+    revised Cmake min version for GPU build
 Gleb Struchalin
     patched the cmake standalone build
 Milos Prokop
-    implemented serial prototype of initDiagonalOpFromPauliHamil
\ No newline at end of file
+    implemented serial prototype of initDiagonalOpFromPauliHamil
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f91c05f83..b5a438713 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -60,10 +60,10 @@ endif()
 # Default to "Release"
 # Using recipe from Kitware Blog post
 # https://www.kitware.com/cmake-and-the-default-build-type/
-set(default_build_type "Release")
+set(quest_default_build_type "Release")
 if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
-  message(STATUS "Setting build type to '${default_build_type}' as none was specified.")
-  set(CMAKE_BUILD_TYPE "${default_build_type}" CACHE
+  message(STATUS "Setting build type to '${quest_default_build_type}' as none was specified.")
+  set(CMAKE_BUILD_TYPE "${quest_default_build_type}" CACHE
       STRING "Choose the type of build." FORCE)
   # Set the possible values of build type for cmake-gui
   set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
@@ -79,50 +79,50 @@ if(PROJECT_IS_TOP_LEVEL)
 endif ()
 
 # Library naming
-set(LIB_NAME QuEST 
-  CACHE 
+set(QUEST_OUTPUT_LIB_NAME QuEST
+  CACHE
   STRING
-  "Change library name. LIB_NAME is QuEST by default."
+  "Change library name. QUEST_OUTPUT_LIB_NAME is QuEST by default."
 )
-message(STATUS "Library will be named lib${LIB_NAME}. Set LIB_NAME to modify.")
+message(STATUS "Library will be named lib${QUEST_OUTPUT_LIB_NAME}. Set QUEST_OUTPUT_LIB_NAME to modify.")
 
-option(VERBOSE_LIB_NAME "Modify library name based on compilation configuration. Turned OFF by default." OFF)
-message(STATUS "Verbose library naming is turned ${VERBOSE_LIB_NAME}. Set VERBOSE_LIB_NAME to modify.")
+option(QUEST_APPEND_CONFIG_TO_LIB_NAME "Modify library name based on compilation configuration. Turned OFF by default." OFF)
+message(STATUS "Verbose library naming is turned ${QUEST_APPEND_CONFIG_TO_LIB_NAME}. Set QUEST_APPEND_CONFIG_TO_LIB_NAME to modify.")
 
 
 # Precision
-set(FLOAT_PRECISION 2 
-  CACHE 
-  STRING 
+set(QUEST_FLOAT_PRECISION 2
+  CACHE
+  STRING
   "Whether to use single, double, or quad floating point precision in the state vector. {1,2,4}"
 )
-set_property(CACHE FLOAT_PRECISION PROPERTY STRINGS
+set_property(CACHE QUEST_FLOAT_PRECISION PROPERTY STRINGS
   1
   2
   4
 )
-message(STATUS "Precision set to ${FLOAT_PRECISION}. Set FLOAT_PRECISION to modify.")
+message(STATUS "Precision set to ${QUEST_FLOAT_PRECISION}. Set QUEST_FLOAT_PRECISION to modify.")
 
 
 # Examples
 option(
-  BUILD_EXAMPLES
+  QUEST_BUILD_EXAMPLES
   "Whether the example programs will be built alongside the QuEST library. Turned OFF by default."
   OFF
 )
-message(STATUS "Examples are turned ${BUILD_EXAMPLES}. Set BUILD_EXAMPLES to modify.")
+message(STATUS "Examples are turned ${QUEST_BUILD_EXAMPLES}. Set QUEST_BUILD_EXAMPLES to modify.")
 
 
 # Testing
 option(
-  ENABLE_TESTING
-  "Whether the test suite will be built alongside the QuEST library. Turned ON by default."
+  QUEST_BUILD_TESTS
+  "Whether the test suite will be built alongside the QuEST library. Turned OFF by default."
   OFF
 )
-message(STATUS "Testing is turned ${ENABLE_TESTING}. Set ENABLE_TESTING to modify.")
+message(STATUS "Testing is turned ${QUEST_BUILD_TESTS}. Set QUEST_BUILD_TESTS to modify.")
 
 option(
-  DOWNLOAD_CATCH2
+  QUEST_TESTS_DOWNLOAD_CATCH2
   "Whether Catch2 v3 will be downloaded if it is not found. Turned ON by default."
   ON
 )
@@ -130,61 +130,97 @@ option(
 
 # Multithreading
 option(
-  ENABLE_MULTITHREADING 
-  "Whether QuEST will be built with shared-memory parallelism support using OpenMP. Turned ON by default." 
+  QUEST_ENABLE_OMP
+  "Whether QuEST will be built with shared-memory parallelism support using OpenMP. Turned ON by default."
   ON
 )
-message(STATUS "Multithreading is turned ${ENABLE_MULTITHREADING}. Set ENABLE_MULTITHREADING to modify.")
+message(STATUS "Multithreading is turned ${QUEST_ENABLE_OMP}. Set QUEST_ENABLE_OMP to modify.")
+
+
+# NUMA
+option(
+  QUEST_ENABLE_NUMA
+  "Whether QuEST will be built with NUMA awareness, when also using OpenMP. Turned ON by default."
+  ON
+)
+message(STATUS "NUMA awareness is turned ${QUEST_ENABLE_NUMA}. Set QUEST_ENABLE_NUMA to modify.")
 
 
 # Distribution
 option(
-  ENABLE_DISTRIBUTION 
-  "Whether QuEST will be built with distributed parallelism support using MPI. Turned OFF by default." 
+  QUEST_ENABLE_MPI
+  "Whether QuEST will be built with distributed parallelism support using MPI. Turned OFF by default."
   OFF
 )
-message(STATUS "Distribution is turned ${ENABLE_DISTRIBUTION}. Set ENABLE_DISTRIBUTION to modify.")
+message(STATUS "Distribution is turned ${QUEST_ENABLE_MPI}. Set QUEST_ENABLE_MPI to modify.")
+
+option(
+  QUEST_ENABLE_SUBCOMM
+  "Whether QuEST will be built with support for restricting it to a user-defined MPI communicator. Turned OFF by default."
+  OFF
+)
+message(STATUS "Custom communicator support is turned ${QUEST_ENABLE_SUBCOMM}. Set QUEST_ENABLE_SUBCOMM to modify.")
 
 
 # GPU Acceleration
 option(
-  ENABLE_CUDA
+  QUEST_ENABLE_CUDA
   "Whether QuEST will be built with support for NVIDIA GPU acceleration. Turned OFF by default."
   OFF
 )
-message(STATUS "NVIDIA GPU acceleration is turned ${ENABLE_CUDA}. Set ENABLE_CUDA to modify.")
+message(STATUS "NVIDIA GPU acceleration is turned ${QUEST_ENABLE_CUDA}. Set QUEST_ENABLE_CUDA to modify.")
 
 option(
-  ENABLE_CUQUANTUM
+  QUEST_ENABLE_CUQUANTUM
   "Whether QuEST will be built with support for NVIDIA cuQuantum. Turned OFF by default."
   OFF
 )
-message(STATUS "CuQuantum support is turned ${ENABLE_CUQUANTUM}. Set ENABLE_CUQUANTUM to modify.")
+message(STATUS "CuQuantum support is turned ${QUEST_ENABLE_CUQUANTUM}. Set QUEST_ENABLE_CUQUANTUM to modify.")
 
 option(
-  ENABLE_HIP
+  QUEST_ENABLE_HIP
   "Whether QuEST will be built with support for AMD GPU acceleration. Turned OFF by default."
   OFF
 )
-message(STATUS "AMD GPU acceleration is turned ${ENABLE_HIP}. Set ENABLE_HIP to modify.")
+message(STATUS "AMD GPU acceleration is turned ${QUEST_ENABLE_HIP}. Set QUEST_ENABLE_HIP to modify.")
+
+
+# GPU Performance Tuning
+# (We do not print this value when configuring CMake as it is for advanced users only)
+
+string(CONCAT quest_tpb_description # (set() would join these pieces into a ';'-separated list, so CONCAT them instead)
+  "The default number of threads per block QuEST will use when offloading to a GPU. Set to 128 by default. "
+  "Must be a multiple of 32 (on NVIDIA GPUs) or 64 (on AMD GPUs). Can be overridden at executable launch "
+  "via an environment variable of the same name, or during runtime via a corresponding API setter function."
+)
+set(QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK 128 
+  CACHE STRING
+  "${quest_tpb_description}")
+mark_as_advanced(QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK)
 
 
 # Deprecated API
 option(
-  ENABLE_DEPRECATED_API
+  QUEST_ENABLE_DEPRECATED_API
   "Whether QuEST will be built with deprecated API support. Turned OFF by default."
   OFF
 )
-message(STATUS "Deprecated API support is turned ${ENABLE_DEPRECATED_API}. Set ENABLE_DEPRECATED_API to modify.")
+message(STATUS "Deprecated API support is turned ${QUEST_ENABLE_DEPRECATED_API}. Set QUEST_ENABLE_DEPRECATED_API to modify.")
 
 option(
-  DISABLE_DEPRECATION_WARNINGS
+  QUEST_DISABLE_DEPRECATION_WARNINGS
   "Whether to disable compile-time warnings ordinarily triggered by use of the deprecated API. Turned OFF by default."
   OFF
 )
-message(STATUS "Disabling of deprecated API warnings is turned ${DISABLE_DEPRECATION_WARNINGS}. Set DISABLE_DEPRECATION_WARNINGS to modify.")
+message(STATUS 
+  "Disabling of deprecated API warnings is turned ${QUEST_DISABLE_DEPRECATION_WARNINGS}. "
+  "Set QUEST_DISABLE_DEPRECATION_WARNINGS to modify."
+)
 
-option(INSTALL_BINARIES "Whether to include example and user binaries in the install." OFF)
+option(QUEST_INSTALL_BINARIES "Whether to include example and user binaries in the install." OFF)
+if (QUEST_INSTALL_BINARIES)
+  message(STATUS "Including example and user binaries in the install (if built).")
+endif()
 
 
 
@@ -193,38 +229,74 @@ option(INSTALL_BINARIES "Whether to include example and user binaries in the ins
 # ============================
 
 
-if (ENABLE_CUDA AND ENABLE_HIP)
+if (QUEST_ENABLE_CUDA AND QUEST_ENABLE_HIP)
   message(FATAL_ERROR "QuEST cannot support CUDA and HIP simultaneously.")
 endif()
 
 
-if ((ENABLE_CUDA OR ENABLE_HIP) AND FLOAT_PRECISION STREQUAL 4)
+if ((QUEST_ENABLE_CUDA OR QUEST_ENABLE_HIP) AND QUEST_FLOAT_PRECISION STREQUAL 4)
   message(FATAL_ERROR "Quad precision is not supported on GPU. Please disable GPU acceleration or lower precision.")
 endif()
 
 
-if (ENABLE_CUQUANTUM AND NOT ENABLE_CUDA)
+if (QUEST_ENABLE_CUQUANTUM AND NOT QUEST_ENABLE_CUDA)
   message(FATAL_ERROR "Use of cuQuantum requires CUDA.")
 endif()
 
 
+if (QUEST_ENABLE_SUBCOMM AND NOT QUEST_ENABLE_MPI)
+  message(FATAL_ERROR "Distribution must be enabled to make use of a user-defined communicator for QuEST.")
+endif()
+
+
 if(WIN32)
   
   # Force MSVC to export all symbols in a shared library, like GCC and clang
   set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
 
-  if (ENABLE_TESTING AND BUILD_SHARED_LIBS)
+  if (QUEST_BUILD_TESTS AND BUILD_SHARED_LIBS)
     message(WARNING "Compiling the tests on Windows requires BUILD_SHARED_LIBS=OFF which we now force.")
     set(BUILD_SHARED_LIBS OFF)
   endif()
 
-  if (ENABLE_DEPRECATED_API)
+  if (QUEST_ENABLE_DEPRECATED_API)
     message(FATAL_ERROR "The deprecated API is not compatible with MSVC.")
   endif()
 
 endif()
 
 
+# validate the default number of threads per block, even when GPU support is not being compiled
+if (QUEST_ENABLE_HIP)
+  set(quest_warp_size 64)
+  set(quest_gpu_model "AMD GPUs (via HIP)")
+else()
+  set(quest_warp_size 32)
+  set(quest_gpu_model "NVIDIA GPUs (via CUDA), or when not targeting GPUs")
+endif()
+math(EXPR quest_tpb_remainder "${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK} % ${quest_warp_size}")
+if ((NOT (quest_tpb_remainder EQUAL 0)) OR NOT (QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK GREATER 0))
+  message(FATAL_ERROR
+    "QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK was set to ${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}, "
+    "but it must be a positive multiple of ${quest_warp_size} when compiling for ${quest_gpu_model}."
+  )
+endif()
+
+
+# warn when numTPB will be later overridden by the current environment variable
+if(
+  DEFINED ENV{QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK} 
+  AND NOT "$ENV{QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}" STREQUAL ""
+  AND NOT "$ENV{QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}" STREQUAL "${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}"
+)
+  message(WARNING 
+    "The CMake option QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK=${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK} "
+    "differs from the current environment variable (of the same name) value of $ENV{QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK}. "
+    "If not cleared before QuEST is launched, the latter will override the former."
+  )
+endif()
+
+
 # Encourage high-performance Release build
 
 # Taken from Kitware's exmaple of problematic code at
@@ -251,32 +323,32 @@ endif()
 # ============================
 
 
-if (VERBOSE_LIB_NAME)
+if (QUEST_APPEND_CONFIG_TO_LIB_NAME)
 
-  string(CONCAT LIB_NAME ${LIB_NAME} "-fp${FLOAT_PRECISION}")
+  string(CONCAT QUEST_OUTPUT_LIB_NAME ${QUEST_OUTPUT_LIB_NAME} "-fp${QUEST_FLOAT_PRECISION}")
 
-  if (ENABLE_MULTITHREADING)
-    string(CONCAT LIB_NAME ${LIB_NAME} "+mt")
+  if (QUEST_ENABLE_OMP)
+    string(CONCAT QUEST_OUTPUT_LIB_NAME ${QUEST_OUTPUT_LIB_NAME} "+mt")
   endif()
 
-  if (ENABLE_DISTRIBUTION)
-    string(CONCAT LIB_NAME ${LIB_NAME} "+mpi")
+  if (QUEST_ENABLE_MPI)
+    string(CONCAT QUEST_OUTPUT_LIB_NAME ${QUEST_OUTPUT_LIB_NAME} "+mpi")
   endif()
 
-  if (ENABLE_CUDA)
-    string(CONCAT LIB_NAME ${LIB_NAME} "+cuda")
+  if (QUEST_ENABLE_CUDA)
+    string(CONCAT QUEST_OUTPUT_LIB_NAME ${QUEST_OUTPUT_LIB_NAME} "+cuda")
   endif()
 
-  if (ENABLE_HIP)
-    string(CONCAT LIB_NAME ${LIB_NAME} "+hip")
+  if (QUEST_ENABLE_HIP)
+    string(CONCAT QUEST_OUTPUT_LIB_NAME ${QUEST_OUTPUT_LIB_NAME} "+hip")
   endif()
 
-  if (ENABLE_CUQUANTUM)
-    string(CONCAT LIB_NAME ${LIB_NAME} "+cuquantum")
+  if (QUEST_ENABLE_CUQUANTUM)
+    string(CONCAT QUEST_OUTPUT_LIB_NAME ${QUEST_OUTPUT_LIB_NAME} "+cuquantum")
   endif()
 
-  if (ENABLE_DEPRECATED_API)
-    string(CONCAT LIB_NAME ${LIB_NAME} "+depr")
+  if (QUEST_ENABLE_DEPRECATED_API)
+    string(CONCAT QUEST_OUTPUT_LIB_NAME ${QUEST_OUTPUT_LIB_NAME} "+depr")
   endif()
 
 endif()
@@ -313,7 +385,7 @@ set_target_properties(QuEST PROPERTIES
 # while the source code is entirely C++ and requires C++17,
 # and the tests further require C++20 (handled in tests/).
 # Yet, we here specify C++17 for the source, and C11 as only
-# applies to the C interface when users specify USER_SOURCE,
+# applies to the C interface when users specify USER_SOURCE_NAMES,
 # to attemptedly minimise user confusion. Users wishing to
 # link QuEST with C++14 should separate compilation.
 target_compile_features(QuEST
@@ -344,7 +416,7 @@ target_compile_options(QuEST
 
 
 # OpenMP
-if (ENABLE_MULTITHREADING)
+if (QUEST_ENABLE_OMP)
 
   # find OpenMP, but fail gracefully...
   find_package(OpenMP QUIET)
@@ -375,35 +447,38 @@ endif()
 
 
 # NUMA (only relevant when multithreading)
-if (ENABLE_MULTITHREADING)
+if (QUEST_ENABLE_OMP AND QUEST_ENABLE_NUMA)
 
   # Find NUMA - location of NUMA headers
   if (WIN32)
-    set(NUMA_AWARE 0)
+    set(QUEST_ENABLE_NUMA 0)
     message(WARNING "Building on Windows, QuEST will not be aware of numa locality")
   else()
     include(FindPkgConfig)
     pkg_search_module(NUMA numa IMPORTED_TARGET GLOBAL)
     if (${NUMA_FOUND})
-      set(NUMA_AWARE ${NUMA_FOUND})
+      set(QUEST_ENABLE_NUMA ${NUMA_FOUND})
       target_link_libraries(QuEST PRIVATE PkgConfig::NUMA)
       message(STATUS "NUMA awareness is enabled.")
     else()
-      set(NUMA_AWARE 0)
+      set(QUEST_ENABLE_NUMA 0)
       message(WARNING "libnuma not found, QuEST will not be aware of numa locality")
     endif()
   endif()
 
 else()
-  set(NUMA_AWARE 0)
+  set(QUEST_ENABLE_NUMA 0)
 endif()
 
 
 # MPI
-if (ENABLE_DISTRIBUTION)
+if (QUEST_ENABLE_MPI)
   find_package(MPI REQUIRED
+    # The CXX component provides the MPI C API, usable from C++;
+    # it is NOT the deprecated MPI C++ bindings
     COMPONENTS CXX
   )
+
   target_link_libraries(QuEST
     PRIVATE
     MPI::MPI_CXX
@@ -412,7 +487,7 @@ endif()
 
 
 # CUDA
-if (ENABLE_CUDA)
+if (QUEST_ENABLE_CUDA)
 
   # make nvcc use user cxx-compiler as default host (before cuda-host is set below)
   if (NOT DEFINED CMAKE_CUDA_HOST_COMPILER)
@@ -437,7 +512,7 @@ endif()
 
 
 # HIP
-if (ENABLE_HIP)
+if (QUEST_ENABLE_HIP)
 
   # if generation fails (hip::amdhip64 not found), users can try setting
   # CMAKE_MODULE_PATH to '/opt/rocm/cmake' or '/opt/rocm/hip/lib/cmake/hip'
@@ -460,7 +535,7 @@ endif()
 
 
 # cuQuantum
-if (ENABLE_CUQUANTUM)
+if (QUEST_ENABLE_CUQUANTUM)
   find_package(CUQUANTUM REQUIRED)
   target_link_libraries(QuEST PRIVATE CUQUANTUM::cuStateVec)
   set(CMAKE_INSTALL_RPATH_USE_LINK_PATH ON)
@@ -474,28 +549,30 @@ endif()
 
 
 # set vars which will be written to config.h.in (auto-converted to 0 or 1)
-set(COMPILE_OPENMP ${ENABLE_MULTITHREADING})
-set(COMPILE_MPI ${ENABLE_DISTRIBUTION})
-set(COMPILE_CUQUANTUM ${ENABLE_CUQUANTUM})
-set(INCLUDE_DEPRECATED_FUNCTIONS ${ENABLE_DEPRECATED_API})
+set(QUEST_COMPILE_OMP ${QUEST_ENABLE_OMP})
+set(QUEST_COMPILE_MPI ${QUEST_ENABLE_MPI})
+set(QUEST_COMPILE_SUBCOMM ${QUEST_ENABLE_SUBCOMM})
+set(QUEST_COMPILE_CUQUANTUM ${QUEST_ENABLE_CUQUANTUM})
+set(QUEST_INCLUDE_DEPRECATED_FUNCTIONS ${QUEST_ENABLE_DEPRECATED_API})
 
 
 # (for the love of God cmake, create a concise syntax for this)
-if (ENABLE_CUDA OR ENABLE_HIP)
-  set(COMPILE_CUDA 1)
+if (QUEST_ENABLE_CUDA OR QUEST_ENABLE_HIP)
+  set(QUEST_COMPILE_CUDA 1)
 else()
-  set(COMPILE_CUDA 0)
+  set(QUEST_COMPILE_CUDA 0)
 endif()
+set(QUEST_COMPILE_HIP ${QUEST_ENABLE_HIP})
 
 
-# these vars are already set, but repeated here for clarity
-set(FLOAT_PRECISION ${FLOAT_PRECISION})
-set(NUMA_AWARE ${NUMA_AWARE})
-set(DISABLE_DEPRECATION_WARNINGS ${DISABLE_DEPRECATION_WARNINGS})
+# non-boolean vars which will be written to config.h.in (under a different macro name)
+set(QUEST_UNSPECIFIED_DEFAULT_NUM_GPU_THREADS_PER_BLOCK ${QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK})
 
 
-# these do not appear in src but are saved for record-keeping in config.h.in
-set(COMPILE_HIP ${ENABLE_HIP})
+# these vars are already set (cmake name matches the macro name), but repeated here for clarity
+set(QUEST_FLOAT_PRECISION ${QUEST_FLOAT_PRECISION})
+set(QUEST_ENABLE_NUMA ${QUEST_ENABLE_NUMA})
+set(QUEST_DISABLE_DEPRECATION_WARNINGS ${QUEST_DISABLE_DEPRECATION_WARNINGS})
 
 
 
@@ -511,7 +588,7 @@ endif()
 
 
 # Set output name
-set_target_properties(QuEST PROPERTIES OUTPUT_NAME ${LIB_NAME})
+set_target_properties(QuEST PROPERTIES OUTPUT_NAME ${QUEST_OUTPUT_LIB_NAME})
 
 
 # Add source files
@@ -530,7 +607,11 @@ add_executable(min_example
 )
 target_link_libraries(min_example PRIVATE QuEST::QuEST)
 
-if (INSTALL_BINARIES)
+if (QUEST_ENABLE_MPI AND QUEST_ENABLE_SUBCOMM)
+  target_link_libraries(min_example PRIVATE MPI::MPI_CXX)
+endif()
+
+if (QUEST_INSTALL_BINARIES)
   install(TARGETS min_example
     RUNTIME
     DESTINATION ${CMAKE_INSTALL_BINDIR}
@@ -539,7 +620,7 @@ endif ()
 
 
 # all examples optionally built
-if (BUILD_EXAMPLES)
+if (QUEST_BUILD_EXAMPLES)
   add_subdirectory(examples)
 endif()
 
@@ -576,26 +657,26 @@ setup_quest_rpath(min_example)
 
 
 # validate
-if (USER_SOURCE AND NOT OUTPUT_EXE)
-    message(SEND_ERROR "USER_SOURCE specified, but not OUTPUT_EXE.")
+if (USER_SOURCE_NAMES AND NOT USER_OUTPUT_EXE_NAME)
+    message(SEND_ERROR "USER_SOURCE_NAMES specified, but not USER_OUTPUT_EXE_NAME.")
 endif()
-if (OUTPUT_EXE AND NOT USER_SOURCE)
-    message(SEND_ERROR "OUTPUT_EXE specified, but not USER_SOURCE.")
+if (USER_OUTPUT_EXE_NAME AND NOT USER_SOURCE_NAMES)
+    message(SEND_ERROR "USER_OUTPUT_EXE_NAME specified, but not USER_SOURCE_NAMES.")
 endif()
 
 
 # compile user source
-if (USER_SOURCE AND OUTPUT_EXE)
-  message(STATUS "Compiling ${USER_SOURCE} to executable ${OUTPUT_EXE}.")
+if (USER_SOURCE_NAMES AND USER_OUTPUT_EXE_NAME)
+  message(STATUS "Compiling ${USER_SOURCE_NAMES} to executable ${USER_OUTPUT_EXE_NAME}.")
 
-  add_executable(${OUTPUT_EXE} ${USER_SOURCE})
-  target_link_libraries(${OUTPUT_EXE} PUBLIC QuEST)
+  add_executable(${USER_OUTPUT_EXE_NAME} ${USER_SOURCE_NAMES})
+  target_link_libraries(${USER_OUTPUT_EXE_NAME} PUBLIC QuEST)
    
-  if (INSTALL_BINARIES)
-    install(TARGETS ${OUTPUT_EXE} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+  if (QUEST_INSTALL_BINARIES)
+    install(TARGETS ${USER_OUTPUT_EXE_NAME} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
   endif()
   
-  setup_quest_rpath(${OUTPUT_EXE})
+  setup_quest_rpath(${USER_OUTPUT_EXE_NAME})
 endif()
 
 
@@ -605,14 +686,14 @@ endif()
 # ============================
 
 
-if (ENABLE_TESTING)
+if (QUEST_BUILD_TESTS)
 
   # try find Catch2
   set(CatchVersion 3.8.0)
   find_package(Catch2 ${CatchVersion} QUIET)
 
   # else try download Catch2
-  if (NOT TARGET Catch2::Catch2 AND DOWNLOAD_CATCH2)
+  if (NOT TARGET Catch2::Catch2 AND QUEST_TESTS_DOWNLOAD_CATCH2)
     message(STATUS "Catch2 not found, it will be downloaded and built in the build directory.")
     Include(FetchContent)
 
@@ -654,12 +735,12 @@ install(TARGETS QuEST
 
 
 # Write CMake version file for QuEST
-set(QuEST_INSTALL_CONFIGDIR "${CMAKE_INSTALL_LIBDIR}/cmake/QuEST")
+set(quest_install_config_dir "${CMAKE_INSTALL_LIBDIR}/cmake/QuEST")
 
 
 # Write QuESTConfigVersion.cmake
 write_basic_package_version_file(
-  "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}ConfigVersion.cmake"
+  "${CMAKE_CURRENT_BINARY_DIR}/${QUEST_OUTPUT_LIB_NAME}ConfigVersion.cmake"
   VERSION ${PROJECT_VERSION}
   COMPATIBILITY AnyNewerVersion
 )
@@ -668,16 +749,16 @@ write_basic_package_version_file(
 # Configure QuESTConfig.cmake (from template)
 configure_package_config_file(
   "${CMAKE_CURRENT_SOURCE_DIR}/cmake/QuESTConfig.cmake.in"
-  "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}Config.cmake"
-  INSTALL_DESTINATION "${QuEST_INSTALL_CONFIGDIR}"
+  "${CMAKE_CURRENT_BINARY_DIR}/${QUEST_OUTPUT_LIB_NAME}Config.cmake"
+  INSTALL_DESTINATION "${quest_install_config_dir}"
 )
 
 
 # Install them
 install(FILES
-  "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}Config.cmake"
-  "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}ConfigVersion.cmake"
-  DESTINATION "${QuEST_INSTALL_CONFIGDIR}"
+  "${CMAKE_CURRENT_BINARY_DIR}/${QUEST_OUTPUT_LIB_NAME}Config.cmake"
+  "${CMAKE_CURRENT_BINARY_DIR}/${QUEST_OUTPUT_LIB_NAME}ConfigVersion.cmake"
+  DESTINATION "${quest_install_config_dir}"
 )
 
 install(FILES 
@@ -699,9 +780,9 @@ install(
 
 install(
   EXPORT QuESTTargets
-  FILE "${LIB_NAME}Targets.cmake"
+  FILE "${QUEST_OUTPUT_LIB_NAME}Targets.cmake"
   NAMESPACE QuEST::
-  DESTINATION "${QuEST_INSTALL_CONFIGDIR}"
+  DESTINATION "${quest_install_config_dir}"
 )
 
 if(PROJECT_IS_TOP_LEVEL)
diff --git a/cmake/QuESTConfig.cmake.in b/cmake/QuESTConfig.cmake.in
index 5f112d9a4..76f7ff3d6 100644
--- a/cmake/QuESTConfig.cmake.in
+++ b/cmake/QuESTConfig.cmake.in
@@ -1,5 +1,5 @@
 # @author Erich Essmann
-# @author Luc Jaulmes (patched use of LIB_NAME)
+# @author Luc Jaulmes (patched use of QUEST_OUTPUT_LIB_NAME)
 
 @PACKAGE_INIT@
-include("${CMAKE_CURRENT_LIST_DIR}/@LIB_NAME@Targets.cmake")
+include("${CMAKE_CURRENT_LIST_DIR}/@QUEST_OUTPUT_LIB_NAME@Targets.cmake")
diff --git a/docs/cmake.md b/docs/cmake.md
index d3c23ee4c..fec90d76a 100644
--- a/docs/cmake.md
+++ b/docs/cmake.md
@@ -11,7 +11,7 @@
 Version 4 of QuEST includes reworked CMake to support library builds, CMake export, and installation. Here we detail useful variables to configure the compilation of QuEST. If using a Unix-like operating system, any of these variables can be set using the `-D` flag when invoking CMake, for example:
 
 ```
-cmake -Bbuild -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/opt/QuEST -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DENABLE_MULTITHREADING=ON -DENABLE_DISTRIBUTION=OFF ./
+cmake -Bbuild -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/opt/QuEST -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DQUEST_ENABLE_OMP=ON -DQUEST_ENABLE_MPI=OFF ./
 ```
 
 Then, as detailed in [`compile.md`](compile.md), one need only move to the build directory and compile by invoking make:
@@ -32,21 +32,23 @@ make
 
 | Variable | (Default) Values | Notes |
 | -------- | ---------------- | ----- |
-| `LIB_NAME` | (`QuEST`), String | The QuEST library will be named `lib${LIB_NAME}.so`. Can be used to differentiate multiple versions of QuEST which have been compiled. |
-| `VERBOSE_LIB_NAME` | (`OFF`), `ON` | When turned on `LIB_NAME` will be modified according to the other configuration options chosen. For example compiling QuEST with multithreading, distribution, and double precision with `VERBOSE_LIB_NAME` turned on creates `libQuEST-fp2+mt+mpi.so`. |
-| `FLOAT_PRECISION` | (`2`), `1`, `4` | Determines which floating-point precision QuEST will use: double, single, or quad. *Note: Quad precision is not supported when also compiling for GPU.* |
-| `BUILD_EXAMPLES` | (`OFF`), `ON` | Determines whether the example programs will be built alongside QuEST. Note that `min_example` is always built. |
-| `INSTALL_BINARIES` | (`OFF`), `ON` | Determines whether compiled binaries such as the examples will be installed as well as the QuEST library. |
-| `ENABLE_MULTITHREADING` | (`ON`), `OFF` | Determines whether QuEST will be built with support for parallelisation with OpenMP. |
-| `ENABLE_DISTRIBUTION` | (`OFF`), `ON` | Determines whether QuEST will be built with support for parallelisation with MPI. |
-| `ENABLE_CUDA` | (`OFF`), `ON` | Determines whether QuEST will be built with support for NVIDIA GPU acceleration. If turned on, `CMAKE_CUDA_ARCHITECTURES` should probably also be set. |
-| `ENABLE_CUQUANTUM` | (`OFF`), `ON` | Determines whether QuEST will make use of the NVIDIA CuQuantum library. Cannot be turned on if `ENABLE_CUDA` is off. |
-| `ENABLE_HIP` | (`OFF`), `ON` | Determines whether QuEST will be built with support for AMD GPU acceleration. If turned on, `CMAKE_HIP_ARCHITECTURES` should probably also be set. |
-| `ENABLE_DEPRECATED_API` | (`OFF`), `ON` | Determines whether QuEST will be built with support for the deprecated (v3) API. ***Note**: this will generate compiler warnings and is not supported by MSVC.* |
-| `DISABLE_DEPRECATION_WARNINGS` | (`OFF`), `ON` | Whether to disable the compile-time deprecation warnings when using the deprecated (v3) API. |
-| `USER_SOURCE` | (Undefined), String | The source file for a user program which will be compiled alongside QuEST. `OUTPUT_EXE` *must* also be defined. |
-| `OUTPUT_EXE` | (Undefined), String | The name of the executable which will be created from the provided `USER_SOURCE`. `USER_SOURCE` *must* also be defined. |
-
+| `QUEST_OUTPUT_LIB_NAME` | (`QuEST`), String | The QuEST library will be named `lib${QUEST_OUTPUT_LIB_NAME}.so`. Can be used to differentiate multiple versions of QuEST which have been compiled. |
+| `QUEST_APPEND_CONFIG_TO_LIB_NAME` | (`OFF`), `ON` | When turned on, `QUEST_OUTPUT_LIB_NAME` will be modified according to the other configuration options chosen. For example, compiling QuEST with multithreading, distribution, and double precision with `QUEST_APPEND_CONFIG_TO_LIB_NAME` turned on creates `libQuEST-fp2+mt+mpi.so`. |
+| `QUEST_FLOAT_PRECISION` | (`2`), `1`, `4` | Determines which floating-point precision QuEST will use: double, single, or quad. *Note: Quad precision is not supported when also compiling for GPU.* |
+| `QUEST_BUILD_EXAMPLES` | (`OFF`), `ON` | Determines whether the example programs will be built alongside QuEST. Note that `min_example` is always built. |
+| `QUEST_INSTALL_BINARIES` | (`OFF`), `ON` | Determines whether compiled binaries such as the examples will be installed as well as the QuEST library. |
+| `QUEST_ENABLE_OMP` | (`ON`), `OFF` | Determines whether QuEST will be built with support for parallelisation with OpenMP. |
+| `QUEST_ENABLE_NUMA` | (`ON`), `OFF` | Determines whether QuEST will attempt to build with NUMA awareness when OpenMP is also enabled. |
+| `QUEST_ENABLE_MPI` | (`OFF`), `ON` | Determines whether QuEST will be built with support for parallelisation with MPI. |
+| `QUEST_ENABLE_SUBCOMM` | (`OFF`), `ON` | Determines whether QuEST will be built with support for custom MPI communicators. _**Note**: This has the unfortunate side-effect of requiring the MPI header in the public header for QuEST, meaning MPI will become a dependency of any application or library which includes the QuEST header whether it uses MPI or not._ |
+| `QUEST_ENABLE_CUDA` | (`OFF`), `ON` | Determines whether QuEST will be built with support for NVIDIA GPU acceleration. If turned on, `CMAKE_CUDA_ARCHITECTURES` should probably also be set. |
+| `QUEST_ENABLE_CUQUANTUM` | (`OFF`), `ON` | Determines whether QuEST will make use of the NVIDIA CuQuantum library. Cannot be turned on if `QUEST_ENABLE_CUDA` is off. |
+| `QUEST_ENABLE_HIP` | (`OFF`), `ON` | Determines whether QuEST will be built with support for AMD GPU acceleration. If turned on, `CMAKE_HIP_ARCHITECTURES` should probably also be set. |
+| `QUEST_ENABLE_DEPRECATED_API` | (`OFF`), `ON` | Determines whether QuEST will be built with support for the deprecated (v3) API. ***Note**: this will generate compiler warnings and is not supported by MSVC.* |
+| `QUEST_DISABLE_DEPRECATION_WARNINGS` | (`OFF`), `ON` | Whether to disable the compile-time deprecation warnings when using the deprecated (v3) API. |
+| `USER_SOURCE_NAMES` | (Undefined), String | The source file(s) for a user program which will be compiled alongside QuEST; multiple files are separated by `;`. `USER_OUTPUT_EXE_NAME` *must* also be defined. |
+| `USER_OUTPUT_EXE_NAME` | (Undefined), String | The name of the executable which will be created from the provided `USER_SOURCE_NAMES`. `USER_SOURCE_NAMES` *must* also be defined. |
+| `QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK` | (`128`), Number | The default number of threads per block QuEST will use when offloading to a GPU. *Must* be a multiple of 32 (on NVIDIA GPUs) or 64 (on AMD GPUs). This CMake variable sets the default if not later overridden. The number can be overridden at process launch time using an [environment variable](https://quest-kit.github.io/QuEST/group__modes.html#gaf1b71f54d270d3353fe072c66827339b) of the same name, or at runtime using [`setQuESTNumGpuThreadsPerBlock()`](https://quest-kit.github.io/QuEST/group__experimental.html#gae35a55c6d9366ce677e6aaaf4c1ff5ef). |
 
 
 
@@ -56,11 +58,11 @@ make
 
 | Variable | (Default) Values | Notes |
 | -------- | ---------------- | ----- |
-| `ENABLE_TESTING` | (`OFF`), `ON` | Determines whether to additionally build QuEST's unit and integration tests. If built, tests can be run from the `build` directory with `make test`, or `ctest`, or manually launched with `./tests/tests` which enables distribution (i.e. `mpirun -np 8 ./tests/tests`) |
-| `ENABLE_DEPRECATED_API` | (`OFF`), `ON` | As described above. When enabled alongside testing, the `v3 deprecated` unit tests will additionally be compiled and can be run from within `build` via `cd tests/deprecated; ctest`, or manually launched with `./tests/deprecated/dep_tests` (enabling distribution, as above).
-| `DOWNLOAD_CATCH2` | (`ON`), `OFF` | QuEST's tests require Catch2. By default, if you don't have Catch2 installed (or CMake doesn't find it) it will be downloaded from Github and built for you. If you don't want that to happen, for example because you _do_ have Catch2 installed, set this to `OFF`. |
+| `QUEST_BUILD_TESTS` | (`OFF`), `ON` | Determines whether to additionally build QuEST's unit and integration tests. If built, tests can be run from the `build` directory with `make test` or `ctest`, or launched manually with `./tests/tests`, which permits distributed execution (e.g. `mpirun -np 8 ./tests/tests`). |
+| `QUEST_ENABLE_DEPRECATED_API` | (`OFF`), `ON` | As described above. When enabled alongside testing, the `v3 deprecated` unit tests will additionally be compiled and can be run from within `build` via `cd tests/deprecated; ctest`, or manually launched with `./tests/deprecated/dep_tests` (enabling distribution, as above). |
+| `QUEST_TESTS_DOWNLOAD_CATCH2` | (`ON`), `OFF` | QuEST's tests require Catch2. By default, if you don't have Catch2 installed (or CMake doesn't find it) it will be downloaded from Github and built for you. If you don't want that to happen, for example because you _do_ have Catch2 installed, set this to `OFF`. |
 
-> As of `v4.2`, macros which configure the unit tests such as `TEST_MAX_NUM_QUBIT_PERMUTATIONS` have become environment variables specified before launch. See [`launch.md`](launch.md)
+> As of `v4.2`, macros which configure the unit tests, such as `QUEST_TEST_MAX_NUM_QUBIT_PERMUTATIONS`, have become environment variables specified before launch. See [`launch.md`](launch.md).
 
 ---------------------------
 
diff --git a/docs/compile.md b/docs/compile.md
index f11677fbf..ba4306a85 100644
--- a/docs/compile.md
+++ b/docs/compile.md
@@ -9,7 +9,7 @@
   Some notes about this guide:
   - we will always use a build directory called 'build'
   - we will use spaces around cmake argnames and values for clarity, e.g.
-    cmake -B build -D ENABLE_CUDA=ON
+    cmake -B build -D QUEST_ENABLE_CUDA=ON
   - we will demonstrate the simplest and visually clear (and likely sub-optimal) 
     use-cases before progressively more visually complicated examples
 -->
@@ -183,10 +183,10 @@ int main() {
     return 0;
 }
 ```
-simply specify variables `USER_SOURCE` and `OUTPUT_EXE` at _configure time_:
+simply specify variables `USER_SOURCE_NAMES` and `USER_OUTPUT_EXE_NAME` at _configure time_:
 ```bash
 # configure
-cmake .. -D USER_SOURCE=myfile.c -D OUTPUT_EXE=myexec
+cmake .. -D USER_SOURCE_NAMES=myfile.c -D USER_OUTPUT_EXE_NAME=myexec
 ```
 where 
 - `myfile.c` is your `C` source file (or `myfile.cpp` if using `C++`).
@@ -194,7 +194,7 @@ where
 
 
 > [!IMPORTANT]
-> `USER_SOURCE` can be any relative or absolute path to a file, but `OUTPUT_EXE` must be strictly a filename and cannot contain subdirectories. See <a href="#compile_location">Location</a> to change the output directory.
+> `USER_SOURCE_NAMES` can be any relative or absolute path to a file, but `USER_OUTPUT_EXE_NAME` must be strictly a filename and cannot contain subdirectories. See <a href="#compile_location">Location</a> to change the output directory.
 
 
 To compile multiple dependent files, such as
@@ -221,10 +221,10 @@ void myfunc() {
     printf("hello quworld!\n");
 }
 ```
-simply separate them by `;` in `USER_SOURCE`, wrapped in quotations:
+simply separate them by `;` in `USER_SOURCE_NAMES`, wrapped in quotations:
 ```bash
 # configure
-cmake .. -D USER_SOURCE="myfile.cpp;otherfile.cpp" -D OUTPUT_EXE=myexec
+cmake .. -D USER_SOURCE_NAMES="myfile.cpp;otherfile.cpp" -D USER_OUTPUT_EXE_NAME=myexec
 ```
 
 
@@ -297,7 +297,7 @@ This applies to _all_ built executables, including your own custom files, the ex
 > [!IMPORTANT]
 > Configuration will fail if any two executables have the same output name since they will not be separated into subdirectories and will collide. We do not gaurantee that all test and example filenames will remain unique in the future, such that use of `CMAKE_RUNTIME_OUTPUT_DIRECTORY` may become invalid except when also specifying
 > ```
-> -D ENABLE_TESTING=OFF -D BUILD_EXAMPLES=OFF
+> -D QUEST_BUILD_TESTS=OFF -D QUEST_BUILD_EXAMPLES=OFF
 > ```
 
 
@@ -311,11 +311,11 @@ This applies to _all_ built executables, including your own custom files, the ex
 
 QuEST's numerical precision can be configured at compile-time, informing what _type_, and ergo how many _bytes_, are used to represent each `qreal` (a floating-point real number) and `qcomp` (a complex amplitude). This affects the memory used by each `Qureg`, but also the user-facing `qreal` and `qcomp` types, as detailed below. Reducing the precision accelerates QuEST at the cost of worsened numerical accuracy. 
 
-Precision is set at configure-time using the `FLOAT_PRECISION` [cmake variable](cmake.md), taking on the values `1`, `2` (default) or `4`.
+Precision is set at configure-time using the `QUEST_FLOAT_PRECISION` [cmake variable](cmake.md), taking on the values `1`, `2` (default) or `4`.
 For example
 ```bash
 # configure
-cmake .. -D FLOAT_PRECISION=1
+cmake .. -D QUEST_FLOAT_PRECISION=1
 ```
 
 The values inform types:
@@ -393,7 +393,7 @@ QuEST itself accepts a variety of its preprocessors (mostly related to testing)
 To compile all of QuEST's [`examples/`](/examples/), use
 ```bash
 # configure
-cmake .. -D BUILD_EXAMPLES=ON
+cmake .. -D QUEST_BUILD_EXAMPLES=ON
 
 # build
 cmake --build .
@@ -433,7 +433,7 @@ To compile QuEST's latest unit and integration tests, use
 
 ```bash
 # configure
-cmake .. -D ENABLE_TESTING=ON
+cmake .. -D QUEST_BUILD_TESTS=ON
 
 # build
 cmake --build .
@@ -451,7 +451,7 @@ This will compile an executable `tests` in subdirectory `build/tests/`, which ca
 QuEST's deprecated v3 API has its own unit tests which can be additionally compiled (_except_ on Windows) via
 ```bash
 # configure
-cmake .. -D ENABLE_TESTING=ON -D ENABLE_DEPRECATED_API=ON
+cmake .. -D QUEST_BUILD_TESTS=ON -D QUEST_ENABLE_DEPRECATED_API=ON
 
 # build
 cmake --build .
@@ -488,7 +488,7 @@ QuEST uses [OpenMP](https://www.openmp.org/) to perform multithreading, so accel
 To compile with multithreading, simply enable it during configuration:
 ```bash
 # configure
-cmake .. -D ENABLE_MULTITHREADING=ON
+cmake .. -D QUEST_ENABLE_OMP=ON
 
 # build
 cmake --build .
@@ -533,13 +533,13 @@ nvcc --version
 To compile your QuEST application with CUDA-acceleration, specify both
 ```bash
 # configure
-cmake .. -D ENABLE_CUDA=ON -D CMAKE_CUDA_ARCHITECTURES=$CC
+cmake .. -D QUEST_ENABLE_CUDA=ON -D CMAKE_CUDA_ARCHITECTURES=$CC
 ```
 where `$CC` is your GPU's [compute capability](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities) (excluding the full-stop) which you can look up [here](https://developer.nvidia.com/cuda-gpus). 
 For example, compiling for the [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) looks like:
 ```bash
 # configure
-cmake .. -D ENABLE_CUDA=ON -D CMAKE_CUDA_ARCHITECTURES=80
+cmake .. -D QUEST_ENABLE_CUDA=ON -D CMAKE_CUDA_ARCHITECTURES=80
 ```
 
 
@@ -567,14 +567,14 @@ The compiled executable can be run like any other, though the GPU behaviour can
 
 > TODO!
 > - ROCm
-> - ENABLE_HIP
+> - QUEST_ENABLE_HIP
 > - CMAKE_HIP_ARCHITECTURES
 
 
 To compile your QuEST application with HIP-acceleration, specify both
 ```bash
 # configure
-cmake .. -D ENABLE_HIP=ON -D CMAKE_HIP_ARCHITECTURES=$TN
+cmake .. -D QUEST_ENABLE_HIP=ON -D CMAKE_HIP_ARCHITECTURES=$TN
 ```
 where `$TN` is your AMD GPU's [LLVM target name](https://rocm.docs.amd.com/en/latest/reference/gpu-arch-specs.html#glossary). You can look this up [here](https://rocm.docs.amd.com/en/latest/reference/gpu-arch-specs.html), or find the names of all of your local GPUs by running the [ROCM agent enumerator](https://rocm.docs.amd.com/projects/rocminfo/en/latest/how-to/use-rocm-agent-enumerator.html) command, i.e.
 ```bash
@@ -583,7 +583,7 @@ rocm_agent_enumerator -name
 For example, compiling for the [AMD Instinct MI210 accelerator](https://www.amd.com/en/products/accelerators/instinct/mi200/mi210.html) looks like:
 ```bash
 # configure
-cmake .. -D ENABLE_HIP=ON -D CMAKE_HIP_ARCHITECTURES=gfx90a
+cmake .. -D QUEST_ENABLE_HIP=ON -D CMAKE_HIP_ARCHITECTURES=gfx90a
 ```
 
 
@@ -626,11 +626,11 @@ After download and installation, and before compiling, you must set the `CUQUANT
 export CUQUANTUM_ROOT=/path/to/cuquantum-folder
 ```
 
-Compilation is then simple; we specify `ENABLE_CUQUANTUM` in addition to the above GPU CMake variables. 
+Compilation is then simple; we specify `QUEST_ENABLE_CUQUANTUM` in addition to the above GPU CMake variables.
 For example
 ```bash
 # configure
-cmake .. -D ENABLE_CUDA=ON -D CMAKE_CUDA_ARCHITECTURES=80 -D ENABLE_CUQUANTUM=ON
+cmake .. -D QUEST_ENABLE_CUDA=ON -D CMAKE_CUDA_ARCHITECTURES=80 -D QUEST_ENABLE_CUQUANTUM=ON
 
 # build
 cmake --build . --parallel
@@ -665,7 +665,7 @@ Compiling QuEST's distributed backend is as simple as
 
 ```bash
 # configure
-cmake .. -D ENABLE_DISTRIBUTION=ON
+cmake .. -D QUEST_ENABLE_MPI=ON
 
 # build
 cmake --build . --parallel
diff --git a/docs/launch.md b/docs/launch.md
index a76ce612b..3eb8493ee 100644
--- a/docs/launch.md
+++ b/docs/launch.md
@@ -223,11 +223,11 @@ The `v4` unit tests make use of the below, optional environment variables to con
 
 | Environment variable  | Default | Description |
 | -------- | ------- | ------- |
-| `TEST_NUM_QUBITS_IN_QUREG` | `6` | The number of qubits in the Qureg(s) undergoing unit testing. In addition to operation upon larger Quregs being exponentially slower, beware that more qubits permit more variations and permutations of input parameters like target qubits, factorially increasing the number of tests per operation. |
-| `TEST_MAX_NUM_QUBIT_PERMUTATIONS`  | `0` | The maximum number of control and target qubit permutations under which to unit test each function. Set to `0` (default) to test all permutations, or to a positive integer (e.g. `50`) to accelerate the unit tests. See more info [here](https://quest-kit.github.io/QuEST/group__testutilsconfig.html#gac5adcc10bd26c56f20344f5ae3d9ba41). |
-| `TEST_MAX_NUM_SUPEROP_TARGETS` | `4` | The maximum number of superoperator targets for which to unit test functions `mixKrausMap()` and `mixSuperOp()`. These are computationally equivalent to simulating unitaries with double the number of targets upon a density matrix. Set to `0` to test all sizes which is likely prohibitively slow, or to a positive integer (e.g. the default of `4`) to accelerate the unit tests. |
-| `NUM_MIXED_DEPLOYMENT_REPETITIONS` | `10` | The number of times (minimum of `1`) to repeat each random mixed-deployment unit test for each deployment combination. |
-| `TEST_ALL_DEPLOYMENTS` | `1` | Whether unit tests will be run using all possible deployment combinations (i.e. OpenMP, CUDA, MPI) in-turn (`=1`), or only once using all available deployments simultaneously (`=0`). |
+| `QUEST_TEST_NUM_QUBITS_IN_QUREG` | `6` | The number of qubits in the Qureg(s) undergoing unit testing. In addition to operation upon larger Quregs being exponentially slower, beware that more qubits permit more variations and permutations of input parameters like target qubits, factorially increasing the number of tests per operation. |
+| `QUEST_TEST_MAX_NUM_QUBIT_PERMUTATIONS`  | `0` | The maximum number of control and target qubit permutations under which to unit test each function. Set to `0` (default) to test all permutations, or to a positive integer (e.g. `50`) to accelerate the unit tests. See more info [here](https://quest-kit.github.io/QuEST/group__testutilsconfig.html#ga34b54a167498c27babfcc9b28c4ac680). |
+| `QUEST_TEST_MAX_NUM_SUPEROP_TARGETS` | `4` | The maximum number of superoperator targets for which to unit test functions `mixKrausMap()` and `mixSuperOp()`. These are computationally equivalent to simulating unitaries with double the number of targets upon a density matrix. Set to `0` to test all sizes which is likely prohibitively slow, or to a positive integer (e.g. the default of `4`) to accelerate the unit tests. |
+| `QUEST_TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS` | `10` | The number of times (minimum of `1`) to repeat each random mixed-deployment unit test for each deployment combination. |
+| `QUEST_TEST_TRY_ALL_DEPLOYMENTS` | `1` | Whether unit tests will be run using all possible deployment combinations (i.e. OpenMP, CUDA, MPI) in-turn (`=1`), or only once using all available deployments simultaneously (`=0`). |
 
 
 
@@ -268,8 +268,9 @@ ctest
 
 QuEST execution can be configured prior to runtime using the below [environment variables](https://en.wikipedia.org/wiki/Environment_variable).
 
-- [`PERMIT_NODES_TO_SHARE_GPU`](https://quest-kit.github.io/QuEST/group__modes.html#ga7e12922138caa68ddaa6221e40f62dda)
-- [`DEFAULT_VALIDATION_EPSILON`](https://quest-kit.github.io/QuEST/group__modes.html#ga55810d6f3d23de810cd9b12a2bbb8cc2)
+- [`QUEST_PERMIT_NODES_TO_SHARE_GPU`](https://quest-kit.github.io/QuEST/group__modes.html#ga84b134d552464a82d29517e1ce1309a7)
+- [`QUEST_DEFAULT_VALIDATION_EPSILON`](https://quest-kit.github.io/QuEST/group__modes.html#gac4ab30619e411c965377c910680e242c)
+- [`QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK`](https://quest-kit.github.io/QuEST/group__modes.html#gaf1b71f54d270d3353fe072c66827339b)
 
 Note the unit tests in the preceding section accept additional environment variables.
 
diff --git a/docs/tutorial.md b/docs/tutorial.md
index b3e99706e..306a1f1ec 100644
--- a/docs/tutorial.md
+++ b/docs/tutorial.md
@@ -14,11 +14,7 @@ QuEST is included into a `C` or `C++` project via
 
 <!-- @todo the below link fails in Doxygen; it's too stupid to recognise the section ref -->
 > [!TIP]
-> Some of QuEST's deprecated `v3` API can be accessed by specifying `ENABLE_DEPRECATED_API` when [compiling](/docs/compile.md#v3), or defining it before import, i.e. 
-> ```cpp
-> #define ENABLE_DEPRECATED_API 1
-> #include "quest.h"
-> ```
+> Some of QuEST's deprecated `v3` API can be accessed by specifying `QUEST_ENABLE_DEPRECATED_API` when [compiling](/docs/compile.md#v3).
 > We recommend migrating to the latest `v4` API however as will be showcased below.
 
 Simulation typically proceeds as:
@@ -173,29 +169,29 @@ if (env.isGpuAccelerated)
 
 Configuring the environment is ordinarily not necessary, but convenient in certain applications.
 
-For example, we may wish our simulations to deterministically obtain the same measurement outcomes and random states as a previous or future run, and ergo choose to [override](https://quest-kit.github.io/QuEST/group__debug__seed.html#ga9e3a6de413901afbf50690573add1587) the default seeds.
+For example, we may wish our simulations to deterministically obtain the same measurement outcomes and random states as a previous or future run, and ergo choose to [override](https://quest-kit.github.io/QuEST/group__debug__seed.html#ga4fea21c26edfea5a64cbdab860dbf583) the default seeds.
 ```cpp
 unsigned seeds[] = {123u, 1u << 10};
-setSeeds(seeds, 2);
+setQuESTSeeds(seeds, 2);
 ```
 
 We may wish further to [adjust](https://quest-kit.github.io/QuEST/group__debug__reporting.html) how subsequent functions will display information to the screen
 ```cpp
 int maxRows = 8;
 int maxCols = 4;
-setMaxNumReportedItems(maxRows, maxCols);
-setMaxNumReportedSigFigs(3);
+setQuESTMaxNumReportedItems(maxRows, maxCols);
+setQuESTMaxNumReportedSigFigs(3);
 ```
-or [add](https://quest-kit.github.io/QuEST/group__debug__reporting.html#ga29413703d609254244d6b13c663e6e06) extra spacing between QuEST's printed outputs
+or [add](https://quest-kit.github.io/QuEST/group__debug__reporting.html#gac5fa20b24814c555eae1d77229959b5e) extra spacing between QuEST's printed outputs
 ```cpp
-setNumReportedNewlines(3);
+setQuESTNumReportedNewlines(3);
 ```
 
-Perhaps we also wish to relax the [precision](https://quest-kit.github.io/QuEST/group__debug__validation.html#gae395568df6def76045ec1881fcb4e6d1) with which our future inputs will be asserted unitary or Hermitian
+Perhaps we also wish to relax the [precision](https://quest-kit.github.io/QuEST/group__debug__validation.html#ga6be7e12fc056a751a03073ee6844b0eb) with which our future inputs will be asserted unitary or Hermitian
 ```cpp
-setValidationEpsilon(0.001);
+setQuESTValidationEpsilon(0.001);
 ```
-but when unitarity _is_ violated, or we otherwise pass an invalid input, we wish to execute a [custom function](https://quest-kit.github.io/QuEST/group__debug__validation.html#ga14b6e7ce08465e36750da3acbc41062f) before exiting.
+but when unitarity _is_ violated, or we otherwise pass an invalid input, we wish to execute a [custom function](https://quest-kit.github.io/QuEST/group__debug__validation.html#gaa02a39c21c770e06ff891e028fd1fe75) before exiting.
 ```cpp
 #include <stdlib.h>
 
@@ -205,7 +201,7 @@ void myErrorHandler(const char *func, const char *msg) {
     exit(1);
 }
 
-setInputErrorHandler(myErrorHandler);
+setQuESTInputErrorHandler(myErrorHandler);
 ```
 
 > [!TIP]
@@ -218,7 +214,7 @@ setInputErrorHandler(myErrorHandler);
 >     std::string msg(errMsg);
 >     throw std::runtime_error(func + ": " + msg);
 > }
-> setInputErrorHandler(myErrorHandler);
+> setQuESTInputErrorHandler(myErrorHandler);
 > ```
 <!-- newlines removed above because doxygen renders them as <br> text, how stupid! -->
 
@@ -253,7 +249,7 @@ Qureg (10 qubit statevector, 1024 qcomps, 16.1 KiB):
     0  |1022⟩
     0  |1023⟩
 ```
-> This printed only `8` amplitudes as per our setting of [`setMaxNumReportedItems()`](https://quest-kit.github.io/QuEST/group__debug__reporting.html#ga093c985b1970a0fd8616c01b9825979a) above.
+> This printed only `8` amplitudes as per our setting of [`setQuESTMaxNumReportedItems()`](https://quest-kit.github.io/QuEST/group__debug__reporting.html#ga2f2d0258f4f7acd6bfe74a19f697d0c2) above.
 
 Behind the scenes, the function `createQureg` did something clever; it consulted the compiled deployments and available hardware to decide whether to distribute `qureg`, or dedicate it persistent GPU memory, and marked whether or not to multithread its subsequent modification. It attempts to choose _optimally_, avoiding gratuitous parallelisation if the overheads outweigh the benefits, or if the hardware devices have insufficient memory.
 
@@ -356,7 +352,7 @@ Qureg:
     globalTotal.......16 MiB
 ```
 
-> The spacing between the outputs of those two consecutive QuEST functions was determined by our earlier call to [`setNumReportedNewlines()`](https://quest-kit.github.io/QuEST/group__debug__reporting.html#ga29413703d609254244d6b13c663e6e06).
+> The spacing between the outputs of those two consecutive QuEST functions was determined by our earlier call to [`setQuESTNumReportedNewlines()`](https://quest-kit.github.io/QuEST/group__debug__reporting.html#gac5fa20b24814c555eae1d77229959b5e).
 
 
 A density matrix `Qureg` can model classical uncertainty as results from [decoherence](https://quest-kit.github.io/QuEST/group__decoherence.html), and proves useful when simulating quantum operations on a noisy quantum computer.
@@ -415,7 +411,7 @@ Qureg (5 qubit density matrix, 32x32 qcomps, 16.1 KiB):
     -0.00597-0.00615i   -0.00207-0.00451i   …  0.000509-0.00401i   0.0173+(3.12e-19)i
 ```
 
-> The number of printed significant figures above results from our earlier calling of [`setMaxNumReportedSigFigs()`](https://quest-kit.github.io/QuEST/group__debug__reporting.html#ga15d46e5d813f70b587762814964e1994).
+> The number of printed significant figures above results from our earlier calling of [`setQuESTMaxNumReportedSigFigs()`](https://quest-kit.github.io/QuEST/group__debug__reporting.html#ga3b4156994fdcf65eee0875316a9cc95f).
 
 
 
@@ -609,10 +605,10 @@ QuEST encountered a validation error during function 'applyCompMatr1':
 The given matrix was not (approximately) unitary.
 Exiting...
 ```
-If we're satisfied our matrix _is_ sufficiently approximately unitary, we can [adjust](https://quest-kit.github.io/QuEST/group__debug__validation.html#gae395568df6def76045ec1881fcb4e6d1) or [disable](https://quest-kit.github.io/QuEST/group__debug__validation.html#ga5999824df0785ea88fb2d5b5582f2b46) the validation.
+If we're satisfied our matrix _is_ sufficiently approximately unitary, we can [adjust](https://quest-kit.github.io/QuEST/group__debug__validation.html#ga6be7e12fc056a751a03073ee6844b0eb) or [disable](https://quest-kit.github.io/QuEST/group__debug__validation.html#ga0a20ca2bc35e22e914bc25671dabdb9b) the validation.
 ```cpp
 // max(norm(m * dagger(m) - identity)) = 0.9025
-setValidationEpsilon(0.903);
+setQuESTValidationEpsilon(0.903);
 applyCompMatr1(qureg, 0, m);
 ```
 
@@ -783,7 +779,7 @@ reportScalar("entanglement", calcPurity(reduced));
 ## Report the results
 
 
-We've seen above that [scalars](https://quest-kit.github.io/QuEST/group__types.html) can be reported, handling the pretty formatting of real and complex numbers, controlled by settings like [`setMaxNumReportedSigFigs()`](https://quest-kit.github.io/QuEST/group__debug__reporting.html#ga15d46e5d813f70b587762814964e1994). But we can also report every data structure in the QuEST API, such as Pauli strings
+We've seen above that [scalars](https://quest-kit.github.io/QuEST/group__types.html) can be reported, handling the pretty formatting of real and complex numbers, controlled by settings like [`setQuESTMaxNumReportedSigFigs()`](https://quest-kit.github.io/QuEST/group__debug__reporting.html#ga3b4156994fdcf65eee0875316a9cc95f). But we can also report every data structure in the QuEST API, such as Pauli strings
 ```cpp
 reportPauliStr(
     getInlinePauliStr("XXYYZZ", {5,50, 10,60, 30,40})
@@ -805,8 +801,8 @@ PauliStrSum (4 terms, 160 bytes):
 ```
 All outputs are affected by the [reporter settings](https://quest-kit.github.io/QuEST/group__debug__reporting.html).
 ```cpp
-setMaxNumReportedItems(4,4);
-setMaxNumReportedSigFigs(1);
+setQuESTMaxNumReportedItems(4,4);
+setQuESTMaxNumReportedSigFigs(1);
 reportCompMatr(bigmatrix);
 ```
 ```
diff --git a/docs/v4.md b/docs/v4.md
index bc8018355..42c109521 100644
--- a/docs/v4.md
+++ b/docs/v4.md
@@ -53,7 +53,7 @@ QuEST `v4` has completely overhauled the API, software architecture, algorithms,
   The set of supported quantum operations has greatly expanded. _All_ unitaries can be effected with any number of control qubits (in any [state](https://quest-kit.github.io/QuEST/group__op__compmatr.html#ga2f4526fe3a4f96509040151f3d31535a)), diagonal matrices can be [raised to powers](https://quest-kit.github.io/QuEST/group__op__diagmatr.html#ga7e07c28332d7d89784166f82cdd26eb9), density matrices can undergo [partial tracing](https://quest-kit.github.io/QuEST/group__calc__partialtrace.html) and [inhomogeneous Pauli channels](https://quest-kit.github.io/QuEST/group__decoherence.html#ga51a7f8d5ba0b142c37a698deed07bc28) (in addition to general [Kraus maps](https://quest-kit.github.io/QuEST/group__decoherence.html#ga57753c0d2deac93d3395c5b20a0122f0) and [superoperatos](https://quest-kit.github.io/QuEST/group__decoherence.html#ga6afbb4f2bb3a9c382861feb8a7b70951)), and multi-qubit projectors can now be performed, [with](https://quest-kit.github.io/QuEST/group__op__measurement.html#ga6bd438f3ebd80cf017292bb68542ed8f) and [without](https://quest-kit.github.io/QuEST/group__op__projectors.html#gaa4bde7e5a344fb46cf3119d462b18745) renormalisation.
   <br><br>
 - **more control** <br>
-  Extensive new [debugging](https://quest-kit.github.io/QuEST/group__debug.html) facilities allow [disabling](https://quest-kit.github.io/QuEST/group__debug__validation.html#ga5999824df0785ea88fb2d5b5582f2b46) or [changing](https://quest-kit.github.io/QuEST/group__debug__validation.html#gae395568df6def76045ec1881fcb4e6d1) the validation precision and [error response](https://quest-kit.github.io/QuEST/group__debug__validation.html#ga14b6e7ce08465e36750da3acbc41062f) at runtime, and controlling how many [amplitudes](https://quest-kit.github.io/QuEST/group__debug__reporting.html#ga093c985b1970a0fd8616c01b9825979a) and [significant figures](https://quest-kit.github.io/QuEST/group__debug__reporting.html#ga15d46e5d813f70b587762814964e1994) of `Qureg` and matrices are printed.
+  Extensive new [debugging](https://quest-kit.github.io/QuEST/group__debug.html) facilities allow [disabling](https://quest-kit.github.io/QuEST/group__debug__validation.html#ga0a20ca2bc35e22e914bc25671dabdb9b) or [changing](https://quest-kit.github.io/QuEST/group__debug__validation.html#ga6be7e12fc056a751a03073ee6844b0eb) the validation precision and [error response](https://quest-kit.github.io/QuEST/group__debug__validation.html#gaa02a39c21c770e06ff891e028fd1fe75) at runtime, and controlling how many [amplitudes](https://quest-kit.github.io/QuEST/group__debug__reporting.html#ga2f2d0258f4f7acd6bfe74a19f697d0c2) and [significant figures](https://quest-kit.github.io/QuEST/group__debug__reporting.html#ga3b4156994fdcf65eee0875316a9cc95f) of `Qureg` and matrices are printed.
   <br><br>
 - **better documentation** <br>
   The [documentation](/docs/) has been rewritten from the ground-up, and the [API doc](https://quest-kit.github.io/QuEST/topics.html) grouped into sub-categories and aesthetically overhauled with [Doxygen Awesome](https://jothepro.github.io/doxygen-awesome-css/). It is now more consistently structured, mathematically explicit, and is a treat on the eyes!
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index afc8f85d6..10278afb6 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -20,7 +20,11 @@ function(add_example direc in_fn)
   add_executable(${target} ${in_fn})
   target_link_libraries(${target} PUBLIC QuEST)
 
-  if (INSTALL_BINARIES)
+  if (QUEST_ENABLE_MPI AND QUEST_ENABLE_SUBCOMM)
+    target_link_libraries(${target} PRIVATE MPI::MPI_CXX)
+  endif()
+
+  if (QUEST_INSTALL_BINARIES)
     install(
       TARGETS ${target}
       RUNTIME
diff --git a/examples/extended/dynamics.c b/examples/extended/dynamics.c
index 03abcc16c..8c72d71ab 100644
--- a/examples/extended/dynamics.c
+++ b/examples/extended/dynamics.c
@@ -103,16 +103,16 @@ PauliStrSum createMyObservable(int numQubits) {
 
 void reportMyStructs(Qureg qureg, PauliStrSum hamil, PauliStrSum observ) {
 
-    setMaxNumReportedSigFigs(6);   // sig-figs in scalars
-    setNumReportedNewlines(2);     // spacing between reports
-    setReportedPauliChars(".XYZ"); // print I as .
-    setReportedPauliStrStyle(0);   // print XYZ (0) or Z3 Y2 X1 (1)
-    setMaxNumReportedItems(8, 8);  // show max 8 qureg amplitudes
+    setQuESTMaxNumReportedSigFigs(6);   // sig-figs in scalars
+    setQuESTNumReportedNewlines(2);     // spacing between reports
+    setQuESTReportedPauliChars(".XYZ"); // print I as .
+    setQuESTReportedPauliStrStyle(0);   // print XYZ (0) or Z3 Y2 X1 (1)
+    setQuESTMaxNumReportedItems(8, 8);  // show max 8 qureg amplitudes
 
     reportStr("[Initial state]");
     reportQureg(qureg);
 
-    setMaxNumReportedItems(0, 0); // show 0=all Pauli operators
+    setQuESTMaxNumReportedItems(0, 0); // show 0=all Pauli operators
 
     reportStr("[Hamiltonian]");
     reportPauliStrSum(hamil);
@@ -144,8 +144,8 @@ int main() {
     reportMyStructs(qureg, hamil, observ);
 
     // tidy reporting of below expectation values
-    setMaxNumReportedSigFigs(3);
-    setNumReportedNewlines(1);
+    setQuESTMaxNumReportedSigFigs(3);
+    setQuESTNumReportedNewlines(1);
 
     // evolve by repeatedly (each is a "step") Trotterising
     // exp(-i dt H) with the specified order and repetitions.
@@ -172,8 +172,8 @@ int main() {
     reportStr("");
     
     // preview the final state...
-    setNumReportedNewlines(2);
-    setMaxNumReportedItems(25, 25);
+    setQuESTNumReportedNewlines(2);
+    setQuESTMaxNumReportedItems(25, 25);
     reportStr("[Final state]");
     reportQureg(qureg);
 
diff --git a/examples/extended/dynamics.cpp b/examples/extended/dynamics.cpp
index 636145387..da4fd9223 100644
--- a/examples/extended/dynamics.cpp
+++ b/examples/extended/dynamics.cpp
@@ -100,16 +100,16 @@ PauliStrSum createMyObservable(int numQubits) {
 
 void reportMyStructs(Qureg qureg, PauliStrSum hamil, PauliStrSum observ) {
 
-    setMaxNumReportedSigFigs(6);   // sig-figs in scalars
-    setNumReportedNewlines(2);     // spacing between reports
-    setReportedPauliChars(".XYZ"); // print I as .
-    setReportedPauliStrStyle(0);   // print XYZ (0) or Z3 Y2 X1 (1)
-    setMaxNumReportedItems(8, 8);  // show max 8 qureg amplitudes
+    setQuESTMaxNumReportedSigFigs(6);   // sig-figs in scalars
+    setQuESTNumReportedNewlines(2);     // spacing between reports
+    setQuESTReportedPauliChars(".XYZ"); // print I as .
+    setQuESTReportedPauliStrStyle(0);   // print XYZ (0) or Z3 Y2 X1 (1)
+    setQuESTMaxNumReportedItems(8, 8);  // show max 8 qureg amplitudes
 
     reportStr("[Initial state]");
     reportQureg(qureg);
 
-    setMaxNumReportedItems(0, 0); // show 0=all Pauli operators
+    setQuESTMaxNumReportedItems(0, 0); // show 0=all Pauli operators
 
     reportStr("[Hamiltonian]");
     reportPauliStrSum(hamil);
@@ -141,8 +141,8 @@ int main() {
     reportMyStructs(qureg, hamil, observ);
 
     // tidy reporting of below expectation values
-    setMaxNumReportedSigFigs(3);
-    setNumReportedNewlines(1);
+    setQuESTMaxNumReportedSigFigs(3);
+    setQuESTNumReportedNewlines(1);
 
     // evolve by repeatedly (each is a "step") Trotterising
     // exp(-i dt H) with the specified order and repetitions.
@@ -166,8 +166,8 @@ int main() {
     reportStr("");
     
     // preview the final state...
-    setNumReportedNewlines(2);
-    setMaxNumReportedItems(25, 25);
+    setQuESTNumReportedNewlines(2);
+    setQuESTMaxNumReportedItems(25, 25);
     reportStr("[Final state]");
     reportQureg(qureg);
 
diff --git a/examples/extended/set_num_gpu_threads.c b/examples/extended/set_num_gpu_threads.c
new file mode 100644
index 000000000..1b3dc175f
--- /dev/null
+++ b/examples/extended/set_num_gpu_threads.c
@@ -0,0 +1,91 @@
+/** @file
+ * 
+ * An example of using QuEST's experimental
+ * setQuESTNumGpuThreadsPerBlock() function
+ * to change the parallelisation granularity
+ * of GPU simulation
+ * 
+ * @author Tyson Jones
+ */
+
+#include "quest.h"
+#include <stdio.h>
+#include <time.h>
+
+
+const int NUM_REPS = 10;
+const int NUM_QUBITS = 25;  // 512 MiB (at double precision)
+
+
+void simulation(Qureg qureg)
+{
+    // put your favourite QuEST simulation here
+    initRandomPureState(qureg);
+    applyFullQuantumFourierTransform(qureg, /*inverse=*/false);
+    calcTotalProb(qureg);
+}
+
+
+void benchmark(Qureg qureg, int numThreadsPerBlock)
+{
+    printf("Using %d threads per block... ", numThreadsPerBlock);
+    fflush(stdout);
+
+    setQuESTNumGpuThreadsPerBlock(numThreadsPerBlock);
+
+    // warmup (untimed)
+    for (int r=0; r<NUM_REPS; r++)
+        simulation(qureg);
+    syncQuESTEnv();
+
+    // time with the wall-clock; clock() would report only host CPU time
+    struct timespec start, end;
+    timespec_get(&start, TIME_UTC);
+    for (int r = 0; r < NUM_REPS; r++)
+        simulation(qureg);
+    syncQuESTEnv();
+    timespec_get(&end, TIME_UTC);
+
+    double dur = (double) (end.tv_sec - start.tv_sec) + 1e-9 * (double) (end.tv_nsec - start.tv_nsec);
+    double av = dur / NUM_REPS; // average seconds per repetition
+    printf("took %fs\n", av);
+}
+
+
+int main(void)
+{
+    initQuESTEnv();
+
+    // This example is pointless without a GPU!
+    if (!getQuESTEnv().isGpuAccelerated)
+    {
+        printf(
+            "GPU acceleration is not enabled, and so changing the number "
+            "of threads per block has no effect. Exiting...\n");
+        finalizeQuESTEnv();
+        return 0;
+    }
+
+    int initNumTPB = getQuESTNumGpuThreadsPerBlock();
+    printf("Initial numThreadsPerBlock: %d\n\n", initNumTPB);
+
+    // Create a statevector parallelised only by the GPU
+    Qureg qureg = createCustomQureg(NUM_QUBITS, 0, 0, 1, 0);
+    reportQuregParams(qureg);
+
+    // Benchmark sensible parameters
+    int goodTPB[] = {64, 128, 256, 512, 1024};
+    for (int i = 0; i < 5; i++)
+        benchmark(qureg, goodTPB[i]);
+
+    // Try silly parameters
+    setQuESTValidationOff();
+    int badTPB[] = {31, 15, 5, 1};
+    for (int i = 0; i < 4; i++)
+        benchmark(qureg, badTPB[i]);
+
+    destroyQureg(qureg);
+    finalizeQuESTEnv();
+
+    return 0;
+}
diff --git a/examples/extended/set_num_gpu_threads.cpp b/examples/extended/set_num_gpu_threads.cpp
new file mode 100644
index 000000000..c298d736d
--- /dev/null
+++ b/examples/extended/set_num_gpu_threads.cpp
@@ -0,0 +1,91 @@
+/** @file
+ * 
+ * An example of using QuEST's experimental
+ * setQuESTNumGpuThreadsPerBlock() function
+ * to change the parallelisation granularity
+ * of GPU simulation
+ * 
+ * @author Tyson Jones
+ */
+
+#include "quest.h"
+#include <iostream>
+#include <chrono>
+
+
+const int NUM_REPS = 10;
+const int NUM_QUBITS = 25;  // 512 MiB (at double precision)
+
+
+void simulation(Qureg qureg)
+{
+    // put your favourite QuEST simulation here
+    initRandomPureState(qureg);
+    applyFullQuantumFourierTransform(qureg, /*inverse=*/false);
+    calcTotalProb(qureg);
+}
+
+
+void benchmark(Qureg qureg, int numThreadsPerBlock)
+{
+    std::cout << "Using " << numThreadsPerBlock << " threads per block... " << std::flush;
+
+    setQuESTNumGpuThreadsPerBlock(numThreadsPerBlock);
+
+    // warmup (untimed)
+    for (int r=0; r<NUM_REPS; r++)
+        simulation(qureg);
+    syncQuESTEnv();
+
+    using clock = std::chrono::steady_clock;
+    auto start = clock::now();
+
+    for (int r=0; r<NUM_REPS; r++)
+        simulation(qureg);
+    syncQuESTEnv();
+
+    auto end = clock::now();
+    auto dur = std::chrono::duration<double>(end - start).count();
+    auto av  = dur / NUM_REPS; // average seconds per repetition
+
+    std::cout << "took " << av << "s" << std::endl; // prefix above already ends in a space
+}
+
+
+int main()
+{
+    initQuESTEnv();
+
+    // This example is pointless without a GPU!
+    if (!getQuESTEnv().isGpuAccelerated) {
+        std::cout
+            << "GPU acceleration is not enabled, and so changing the number "
+            << "of threads per block has no effect. Exiting..."
+            << std::endl;
+        finalizeQuESTEnv();
+        return 0;
+    }
+
+    // The initial number of threads per block is informed by the optional environment
+    // variable QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK. If not specified, QuEST will
+    // use the value of the CMake option of the same name passed during compilation,
+    // which itself has a default of 128
+    auto initNumTPB = getQuESTNumGpuThreadsPerBlock();
+    std::cout << "Initial numThreadsPerBlock: " << initNumTPB << "\n\n";
+
+    // Create a statevector parallelised only by the GPU
+    Qureg qureg = createCustomQureg(NUM_QUBITS, 0, 0, 1, 0);
+    reportQuregParams(qureg);
+
+    // Benchmark QuEST with sensible numbers of threads per block (multiples of warp size)
+    for (auto numTPB : {64, 128, 256, 512, 1024})
+        benchmark(qureg, numTPB);
+
+    // Try silly parameters ¯\_(ツ)_/¯
+    setQuESTValidationOff();
+    for (auto numTPB : {31, 15, 5, 1})
+        benchmark(qureg, numTPB);
+    destroyQureg(qureg); // mirror the C example: destroy the Qureg before finalising
+    finalizeQuESTEnv();
+    return 0;
+}
diff --git a/examples/extended/user_owned_mpi.c b/examples/extended/user_owned_mpi.c
new file mode 100644
index 000000000..4e3c766f4
--- /dev/null
+++ b/examples/extended/user_owned_mpi.c
@@ -0,0 +1,49 @@
+/** @file
+ * 
+ * An example of using QuEST's experimental
+ * initCustomMpiQuESTEnv() function, to
+ * initialise QuEST in an environment where
+ * MPI is owned and controlled by the user.
+ * 
+ * @author Oliver Brown
+ * @author Tyson Jones (doc)
+ */
+
+#include "quest.h"
+#include <stdio.h>
+
+
+// This example requires linking with MPI, which the CMake
+// build only enables when QUEST_ENABLE_SUBCOMM is ON, which
+// results in quest.h defining QUEST_COMPILE_SUBCOMM. To
+// enable this example to always be compilable (like during
+// our CI), we guard against when QUEST_ENABLE_SUBCOMM is OFF.
+#if ! QUEST_COMPILE_SUBCOMM
+int main(void)
+{    
+    printf("Example skipped since MPI is not linked.\n");
+    return 0;
+}
+#else 
+
+
+#include <mpi.h>
+
+int main(void)
+{
+    const int  USE_DISTRIB = 1;
+    const bool USER_MPI    = 1;
+    const int  USE_OPENMP  = 1;
+    const int  USE_GPU     = 0;
+
+    MPI_Init(NULL, NULL);
+    initCustomMpiQuESTEnv(USE_DISTRIB, USER_MPI, USE_GPU, USE_OPENMP);
+    reportQuESTEnv();
+    finalizeQuESTEnv();
+    MPI_Finalize();
+
+    return 0;
+}
+
+
+#endif // QUEST_COMPILE_SUBCOMM
diff --git a/examples/extended/user_owned_mpi.cpp b/examples/extended/user_owned_mpi.cpp
new file mode 100644
index 000000000..54345d576
--- /dev/null
+++ b/examples/extended/user_owned_mpi.cpp
@@ -0,0 +1,49 @@
+/** @file
+ * 
+ * An example of using QuEST's experimental
+ * initCustomMpiQuESTEnv() function to
+ * initialise QuEST in an environment where
+ * MPI is owned and controlled by the user.
+ * 
+ * @author Oliver Brown
+ * @author Tyson Jones (doc)
+ */
+
+#include "quest.h"
+#include <cstdio>
+
+
+// This example requires linking with MPI, which the CMake
+// build only enables when QUEST_ENABLE_SUBCOMM is ON, which
+// results in quest.h defining QUEST_COMPILE_SUBCOMM. To
+// enable this example to always be compilable (like during
+// our CI), we guard against when QUEST_ENABLE_SUBCOMM is OFF.
+#if ! QUEST_COMPILE_SUBCOMM
+int main(void)
+{    
+    std::printf("Example skipped since MPI is not linked.\n");
+    return 0;
+}
+#else 
+
+
+#include <mpi.h>
+
+int main(void)
+{
+    const int  USE_DISTRIB = 1;
+    const bool USER_MPI    = 1;
+    const int  USE_OPENMP  = 1;
+    const int  USE_GPU     = 0;
+
+    MPI_Init(NULL, NULL);
+    initCustomMpiQuESTEnv(USE_DISTRIB, USER_MPI, USE_GPU, USE_OPENMP);
+    reportQuESTEnv();
+    finalizeQuESTEnv();
+    MPI_Finalize();
+
+    return 0;
+}
+
+
+#endif // QUEST_COMPILE_SUBCOMM
diff --git a/examples/extended/user_owned_submpi.c b/examples/extended/user_owned_submpi.c
new file mode 100644
index 000000000..6f2ea6290
--- /dev/null
+++ b/examples/extended/user_owned_submpi.c
@@ -0,0 +1,84 @@
+/** @file
+ * 
+ * An example of using QuEST's experimental
+ * initCustomMpiCommQuESTEnv() function to
+ * dedicate only some user-owned MPI processes
+ * to QuEST, and dedicate the remainder to
+ * other tasks.
+ * 
+ * @author Oliver Brown
+ * @author Tyson Jones (doc)
+ */
+
+#include "quest.h"
+#include <stdio.h>
+
+
+// This example requires linking with MPI, which the CMake
+// build only enables when QUEST_ENABLE_SUBCOMM is ON, which
+// results in quest.h defining QUEST_COMPILE_SUBCOMM. To
+// enable this example to always be compilable (like during
+// our CI), we guard against when QUEST_ENABLE_SUBCOMM is OFF.
+#if ! QUEST_COMPILE_SUBCOMM
+int main()
+{    
+    printf("Example skipped since MPI is not linked.\n");
+    return 0;
+}
+#else 
+
+
+#include <mpi.h>
+
+int main (void)
+{
+    int nprocs, quest_nprocs, world_rank, quest_rank;
+    MPI_Comm comm_split, comm_quantum, comm_classical;
+
+    MPI_Init(NULL, NULL);
+
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
+
+    const int I_AM_QUANTUM = world_rank % 2;
+
+    printf("[%d] Hello from rank %d of %d in MPI_COMM_WORLD.\n", world_rank, world_rank, nprocs);
+
+    MPI_Comm_split(MPI_COMM_WORLD, I_AM_QUANTUM, world_rank, &comm_split);
+
+    if (I_AM_QUANTUM) {
+        MPI_Comm_dup(comm_split, &comm_quantum);
+        MPI_Comm_size(comm_quantum, &quest_nprocs);
+        MPI_Comm_rank(comm_quantum, &quest_rank);
+        printf("[%d] Hello from rank %d of %d in comm_quantum.\n", world_rank, quest_rank, quest_nprocs);
+    } else {
+        MPI_Comm_dup(comm_split, &comm_classical);
+        quest_rank = -1;
+        quest_nprocs = -1;
+    }
+
+    // only procs in quantum comm initialise QuEST
+    if (I_AM_QUANTUM) {
+        printf("[%d] Initialising QuEST.\n", world_rank);
+        initCustomMpiCommQuESTEnv(comm_quantum, -1, -1); // -1 = auto-deployments
+
+        reportQuESTEnv();
+
+        printf("[%d] Finalising QuEST.\n", world_rank);
+        finalizeQuESTEnv();
+    }
+
+    MPI_Comm_free(&comm_split);
+    if (I_AM_QUANTUM) {
+        MPI_Comm_free(&comm_quantum);
+    } else {
+        MPI_Comm_free(&comm_classical);
+    }
+
+    MPI_Finalize();
+
+    return 0;
+}
+
+
+#endif // QUEST_COMPILE_SUBCOMM
diff --git a/examples/extended/user_owned_submpi.cpp b/examples/extended/user_owned_submpi.cpp
new file mode 100644
index 000000000..ea82a4f9d
--- /dev/null
+++ b/examples/extended/user_owned_submpi.cpp
@@ -0,0 +1,84 @@
+/** @file
+ * 
+ * An example of using QuEST's experimental
+ * initCustomMpiCommQuESTEnv() function to
+ * dedicate only some user-owned MPI processes
+ * to QuEST, and dedicate the remainder to
+ * other tasks.
+ * 
+ * @author Oliver Brown
+ * @author Tyson Jones (doc)
+ */
+
+#include "quest.h"
+#include <cstdio>
+
+
+// This example requires linking with MPI, which the CMake
+// build only enables when QUEST_ENABLE_SUBCOMM is ON, which
+// results in quest.h defining QUEST_COMPILE_SUBCOMM. To
+// enable this example to always be compilable (like during
+// our CI), we guard against when QUEST_ENABLE_SUBCOMM is OFF.
+#if ! QUEST_COMPILE_SUBCOMM
+int main()
+{    
+    std::printf("Example skipped since MPI is not linked.\n");
+    return 0;
+}
+#else 
+
+
+#include <mpi.h>
+
+int main (void)
+{
+    int nprocs, quest_nprocs, world_rank, quest_rank;
+    MPI_Comm comm_split, comm_quantum, comm_classical;
+
+    MPI_Init(NULL, NULL);
+
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
+
+    const int I_AM_QUANTUM = world_rank % 2;
+
+    std::printf("[%d] Hello from rank %d of %d in MPI_COMM_WORLD.\n", world_rank, world_rank, nprocs);
+
+    MPI_Comm_split(MPI_COMM_WORLD, I_AM_QUANTUM, world_rank, &comm_split);
+
+    if (I_AM_QUANTUM) {
+        MPI_Comm_dup(comm_split, &comm_quantum);
+        MPI_Comm_size(comm_quantum, &quest_nprocs);
+        MPI_Comm_rank(comm_quantum, &quest_rank);
+        std::printf("[%d] Hello from rank %d of %d in comm_quantum.\n", world_rank, quest_rank, quest_nprocs);
+    } else {
+        MPI_Comm_dup(comm_split, &comm_classical);
+        quest_rank = -1;
+        quest_nprocs = -1;
+    }
+
+    // only procs in quantum comm initialise QuEST
+    if (I_AM_QUANTUM) {
+        std::printf("[%d] Initialising QuEST.\n", world_rank);
+        initCustomMpiCommQuESTEnv(comm_quantum, modeflag::USE_AUTO, modeflag::USE_AUTO);
+
+        reportQuESTEnv();
+
+        std::printf("[%d] Finalising QuEST.\n", world_rank);
+        finalizeQuESTEnv();
+    }
+
+    MPI_Comm_free(&comm_split);
+    if (I_AM_QUANTUM) {
+        MPI_Comm_free(&comm_quantum);
+    } else {
+        MPI_Comm_free(&comm_classical);
+    }
+
+    MPI_Finalize();
+
+    return 0;
+}
+
+
+#endif // QUEST_COMPILE_SUBCOMM
diff --git a/examples/isolated/reporting_matrices.c b/examples/isolated/reporting_matrices.c
index 319c758cb..cb497593e 100644
--- a/examples/isolated/reporting_matrices.c
+++ b/examples/isolated/reporting_matrices.c
@@ -49,7 +49,7 @@ void demo_CompMatr() {
     for (int i=0; i<len; i++) {
         qindex num = numReportElems[i];
         rootPrint(num);
-        setMaxNumReportedItems(num, num);
+        setQuESTMaxNumReportedItems(num, num);
         reportCompMatr(matr);
     }
 }
@@ -69,7 +69,7 @@ void demo_DiagMatr() {
     for (int i=0; i<len; i++) {
         qindex num = numReportElems[i];
         rootPrint(num);
-        setMaxNumReportedItems(num, num);
+        setQuESTMaxNumReportedItems(num, num);
         reportDiagMatr(matr);
     }
 }
@@ -91,7 +91,7 @@ void demo_FullStateDiagMatr() {
     for (int i=0; i<6; i++) {
         qindex num = numReportElems[i];
         rootPrint(num);
-        setMaxNumReportedItems(num, num);
+        setQuESTMaxNumReportedItems(num, num);
         reportFullStateDiagMatr(matr);
     }
 }
diff --git a/examples/isolated/reporting_matrices.cpp b/examples/isolated/reporting_matrices.cpp
index 18cdf5eb1..d390a2d40 100644
--- a/examples/isolated/reporting_matrices.cpp
+++ b/examples/isolated/reporting_matrices.cpp
@@ -39,7 +39,7 @@ void demo_CompMatr() {
 
     for (int num : {0, 12, 5, 1}) {
         rootPrint(num);
-        setMaxNumReportedItems(num, num);
+        setQuESTMaxNumReportedItems(num, num);
         reportCompMatr(matr);
     }
 }
@@ -55,7 +55,7 @@ void demo_DiagMatr() {
 
     for (int num : {0, 10}) {
         rootPrint(num);
-        setMaxNumReportedItems(num, num);
+        setQuESTMaxNumReportedItems(num, num);
         reportDiagMatr(matr);
     }
 }
@@ -74,7 +74,7 @@ void demo_FullStateDiagMatr() {
 
     for (int num : {0, 50, 30, 10, 5, 1}) {
         rootPrint(num);
-        setMaxNumReportedItems(num, num);
+        setQuESTMaxNumReportedItems(num, num);
         reportFullStateDiagMatr(matr);
     }
 }
diff --git a/examples/isolated/reporting_paulis.c b/examples/isolated/reporting_paulis.c
index 56c2c4124..4ce2d737c 100644
--- a/examples/isolated/reporting_paulis.c
+++ b/examples/isolated/reporting_paulis.c
@@ -93,7 +93,7 @@ void demo_PauliStrSum() {
         qindex num = numReportElems[i];
         rootPrint(num);
 
-        setMaxNumReportedItems(num, num);
+        setQuESTMaxNumReportedItems(num, num);
         reportPauliStrSum(sum);
     }
 
diff --git a/examples/isolated/reporting_paulis.cpp b/examples/isolated/reporting_paulis.cpp
index c85df1b3b..7d2bff2c1 100644
--- a/examples/isolated/reporting_paulis.cpp
+++ b/examples/isolated/reporting_paulis.cpp
@@ -99,7 +99,7 @@ void demo_PauliStrSum() {
     for (int num : numReportElems) {
         rootPrint(num);
 
-        setMaxNumReportedItems(num, num);
+        setQuESTMaxNumReportedItems(num, num);
         reportPauliStrSum(sum);
     }
 
diff --git a/examples/isolated/setting_errorhandler.c b/examples/isolated/setting_errorhandler.c
index 9f777f122..7d980f495 100644
--- a/examples/isolated/setting_errorhandler.c
+++ b/examples/isolated/setting_errorhandler.c
@@ -19,7 +19,7 @@ void myErrorHandler(const char* errFunc, const char* errMsg) {
 
 int main() {
     initQuESTEnv();
-    setInputErrorHandler(myErrorHandler);
+    setQuESTInputErrorHandler(myErrorHandler);
 
     Qureg qureg = createQureg(-123);
 
diff --git a/examples/isolated/setting_errorhandler.cpp b/examples/isolated/setting_errorhandler.cpp
index 4a41ef30c..79d3995dc 100644
--- a/examples/isolated/setting_errorhandler.cpp
+++ b/examples/isolated/setting_errorhandler.cpp
@@ -48,7 +48,7 @@ void myErrorHandlerB(const char* errFunc, const char* errMsg) {
 int main() {
     initQuESTEnv();
 
-    setInputErrorHandler(myErrorHandlerA);
+    setQuESTInputErrorHandler(myErrorHandlerA);
 
     try {
         Qureg qureg = createQureg(-123);
@@ -59,7 +59,7 @@ int main() {
             << std::endl;
     }
 
-    setInputErrorHandler(myErrorHandlerB);
+    setQuESTInputErrorHandler(myErrorHandlerB);
     initQuESTEnv(); // illegal to recall
     
     std::cout << "this will never be reached, because myErrorHandlerB exits!" << std::endl;
diff --git a/quest/include/CMakeLists.txt b/quest/include/CMakeLists.txt
index 2ab3d569d..43146ceb4 100644
--- a/quest/include/CMakeLists.txt
+++ b/quest/include/CMakeLists.txt
@@ -4,7 +4,7 @@
 # @author Tyson Jones (doc)
 
 # Generate a header file which defines all configurable preprocessors
-# needed by the QuEST source (e.g. COMPILE_MPI), as informed by the
+# needed by the QuEST source (e.g. QUEST_COMPILE_MPI), as informed by the
 # user-set CMake options. This permits us to avoid passing any macros
 # through compiler flags and the associated conflicts arising when
 # installing QuEST. Note that config.h must be manually created when
diff --git a/quest/include/calculations.h b/quest/include/calculations.h
index 645f54c34..2ce813748 100644
--- a/quest/include/calculations.h
+++ b/quest/include/calculations.h
@@ -69,7 +69,7 @@ extern "C" {
  * 
  * - Postcondition validation will check that the calculated expectation value is approximately
  *   real (i.e. the imaginary component is smaller in size than the validation epsilon), as admitted
- *   when @p qureg is correctly normalised. This behaviour can be adjusted using setValidationEpsilon(). 
+ *   when @p qureg is correctly normalised. This behaviour can be adjusted using setQuESTValidationEpsilon(). 
  * - Regardless of the validation epsilon, the returned value is always real and the imaginary component
  *   is discarded. The full complex value can be obtained using calcExpecNonHermitianPauliStrSum().
  * 
@@ -129,7 +129,7 @@ qreal calcExpecPauliStr(Qureg qureg, PauliStr str);
  *   @f[ 
      |\im{c}| \le \valeps
  *   @f]
- *   for all @f$c \in @f$ `sum.coeffs`. Adjust @f$\valeps@f$ using setValidationEpsilon().
+ *   for all @f$c \in @f$ `sum.coeffs`. Adjust @f$\valeps@f$ using setQuESTValidationEpsilon().
  *   The sub-epsilon imaginary components of the coefficients _are_ included in calculation.
  * - Postcondition validation will check that the calculated expectation value is approximately
  *   real (i.e. the imaginary component is smaller in size than the validation epsilon), as should be
@@ -201,7 +201,7 @@ qreal calcExpecPauliStrSum(Qureg qureg, PauliStrSum sum);
  *   @f[ 
      |\im{c}| \le \valeps
  *   @f]
- *   for all @f$c \in @f$ `matr.cpuElems`. Adjust @f$\valeps@f$ using setValidationEpsilon().
+ *   for all @f$c \in @f$ `matr.cpuElems`. Adjust @f$\valeps@f$ using setQuESTValidationEpsilon().
  * - Postcondition validation will check that the calculated expectation value is approximately
  *   real (i.e. the imaginary component is smaller in size than the validation epsilon), as should be
  *   admitted when @p qureg is correctly normalised, and @p matr is Hermitian.
@@ -277,14 +277,14 @@ qreal calcExpecFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matr);
  *   @f[ 
      |\im{c}| \le \valeps
  *   @f]
- *   for all @f$c \in @f$ `matr.cpuElems`. Adjust @f$\valeps@f$ using setValidationEpsilon().
+ *   for all @f$c \in @f$ `matr.cpuElems`. Adjust @f$\valeps@f$ using setQuESTValidationEpsilon().
  * 
  *   > [!CAUTION]
  *   > Unlike other functions (including calcExpecFullStateDiagMatr()), this function will _NOT_
  *   > consult the imaginary components of the elements of @p matrix, since a non-complex exponentiation
  *   > function is used. That is, while validation permits the imaginary components to be small, they
  *   > will be internally treated as precisely zero. This is true even when Hermiticity validation
- *   > is disabled using setValidationOff(). To consult the imaginary components of @p matrix, use
+ *   > is disabled using setQuESTValidationOff(). To consult the imaginary components of @p matrix, use
  *   > calcExpecNonHermitianFullStateDiagMatrPower().
  * 
  * - Hermiticity of @p matrix when raised to @p exponent further requires that, when @p exponent is 
@@ -298,7 +298,7 @@ qreal calcExpecFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matr);
  *   zero elements which would otherwise create divergences in @f$\hat{D}^x@f$. Validation ergo
  *   checks that when @p exponent is (strictly) negative, @p matrix contains no elements within 
  *   distance @f$\valeps@f$ to zero (regardless of the magnitude of @p exponent). Adjust
- *   @f$\valeps@f$ using setValidationEpsilon().
+ *   @f$\valeps@f$ using setQuESTValidationEpsilon().
  * - The passed @p exponent is always real, but can be relaxed to a general complex scalar via
  *   calcExpecNonHermitianFullStateDiagMatrPower().
  * - The returned value is always real, and the imaginary component is neglected even when 
@@ -890,7 +890,7 @@ qreal calcPurity(Qureg qureg);
  * - The output of this function is always real, which validation will check after computing the
  *   fidelity as a complex scalar. Specifically, validation will assert that the result has an
  *   absolute imaginary component less than the validation epsilon, which can be adjusted with
- *   setValidationEpsilon().
+ *   setQuESTValidationEpsilon().
  * 
  * - This function does not yet support both @p qureg and @p other being density matrices, for
  *   which the fidelity calculation is more substantial.
@@ -1004,7 +1004,7 @@ qreal calcFidelity(Qureg qureg, Qureg other);
          \left| \, \im{ \brapsi \dmrho \svpsi } \, \right| \le \valeps, \\
          \re{ \brapsi \dmrho \svpsi } \le 1 + \valeps,
  *   @f]
- *   where @f$\valeps@f$ is the validation epsilon, adjustable via setValidationEpsilon().
+ *   where @f$\valeps@f$ is the validation epsilon, adjustable via setQuESTValidationEpsilon().
  * 
  * - Even when the above postcondition validation is disabled, the Bures and purified distance
  *   calculations will respectively replace @f$\left| \braket{\phi}{\psi} \right|@f$ and 
diff --git a/quest/include/config.h.in b/quest/include/config.h.in
index 2cb12fa90..1bb8a0470 100644
--- a/quest/include/config.h.in
+++ b/quest/include/config.h.in
@@ -7,9 +7,9 @@
  * defined in one central place (right here) rather than being 
  * passed to each source file as compiler flags. It further
  * ensures that when QuEST is installed, critical user-facing
- * macros such as FLOAT_PRECISION cannot ever be changed from
+ * macros such as QUEST_FLOAT_PRECISION cannot ever be changed from
  * their value during source compilation. Finally, it enables
- * users to access macros such as COMPILE_OPENMP at pre-build
+ * users to access macros such as QUEST_COMPILE_OMP at pre-build
  * time of their own source code, which could prove necessary 
  * when interfacing with external libraries.
  * 
@@ -34,15 +34,16 @@
  */
 
 
-#if defined(FLOAT_PRECISION)              || \
-    defined(COMPILE_OPENMP)               || \
-    defined(COMPILE_MPI)                  || \
-    defined(COMPILE_CUDA)                 || \
-    defined(COMPILE_HIP)                  || \
-    defined(COMPILE_CUQUANTUM)            || \
-    defined(NUMA_AWARE)                   || \
-    defined(INCLUDE_DEPRECATED_FUNCTIONS) || \
-    defined(DISABLE_DEPRECATION_WARNINGS)
+#if defined(QUEST_FLOAT_PRECISION)              || \
+    defined(QUEST_COMPILE_OMP)                  || \
+    defined(QUEST_COMPILE_MPI)                  || \
+    defined(QUEST_COMPILE_SUBCOMM)              || \
+    defined(QUEST_COMPILE_CUDA)                 || \
+    defined(QUEST_COMPILE_HIP)                  || \
+    defined(QUEST_COMPILE_CUQUANTUM)            || \
+    defined(QUEST_ENABLE_NUMA)                  || \
+    defined(QUEST_INCLUDE_DEPRECATED_FUNCTIONS) || \
+    defined(QUEST_DISABLE_DEPRECATION_WARNINGS)
 
     #error "Pre-config macros were erroneously passed directly to the source rather than through the CMake config file."
 
@@ -71,24 +72,26 @@
 
 
 // crucial to user source (informs API)
-#cmakedefine FLOAT_PRECISION @FLOAT_PRECISION@
-#cmakedefine01 INCLUDE_DEPRECATED_FUNCTIONS
-#cmakedefine01 DISABLE_DEPRECATION_WARNINGS
+#cmakedefine QUEST_FLOAT_PRECISION @QUEST_FLOAT_PRECISION@
+#cmakedefine01 QUEST_INCLUDE_DEPRECATED_FUNCTIONS
+#cmakedefine01 QUEST_DISABLE_DEPRECATION_WARNINGS
 
 
 // crucial to QuEST source (informs external library usage)
-#cmakedefine01 COMPILE_OPENMP
-#cmakedefine01 COMPILE_MPI
-#cmakedefine01 COMPILE_CUDA
-#cmakedefine01 COMPILE_CUQUANTUM
+#cmakedefine01 QUEST_COMPILE_OMP
+#cmakedefine01 QUEST_COMPILE_MPI
+#cmakedefine01 QUEST_COMPILE_SUBCOMM
+#cmakedefine01 QUEST_COMPILE_CUDA
+#cmakedefine01 QUEST_COMPILE_CUQUANTUM
+#cmakedefine01 QUEST_COMPILE_HIP
 
 
-// not actually a CMake option (user cannot disable) but nonetheless crucial
-#cmakedefine01 NUMA_AWARE
+// crucial to QuEST source (informs optional NUMA usage)
+#cmakedefine01 QUEST_ENABLE_NUMA
 
 
-// not consulted by src (included for book-keeping)
-#cmakedefine01 COMPILE_HIP
+// default parameters which may have been tuned for performance when building the library
+#cmakedefine QUEST_UNSPECIFIED_DEFAULT_NUM_GPU_THREADS_PER_BLOCK @QUEST_UNSPECIFIED_DEFAULT_NUM_GPU_THREADS_PER_BLOCK@
 
 
 
@@ -115,15 +118,16 @@
  */
 
 
-#if ! defined(FLOAT_PRECISION)              || \
-    ! defined(COMPILE_OPENMP)               || \
-    ! defined(COMPILE_MPI)                  || \
-    ! defined(COMPILE_CUDA)                 || \
-    ! defined(COMPILE_HIP)                  || \
-    ! defined(COMPILE_CUQUANTUM)            || \
-    ! defined(NUMA_AWARE)                   || \
-    ! defined(INCLUDE_DEPRECATED_FUNCTIONS) || \
-    ! defined(DISABLE_DEPRECATION_WARNINGS)
+#if ! defined(QUEST_FLOAT_PRECISION)              || \
+    ! defined(QUEST_COMPILE_OMP)                  || \
+    ! defined(QUEST_COMPILE_MPI)                  || \
+    ! defined(QUEST_COMPILE_SUBCOMM)              || \
+    ! defined(QUEST_COMPILE_CUDA)                 || \
+    ! defined(QUEST_COMPILE_HIP)                  || \
+    ! defined(QUEST_COMPILE_CUQUANTUM)            || \
+    ! defined(QUEST_ENABLE_NUMA)                  || \
+    ! defined(QUEST_INCLUDE_DEPRECATED_FUNCTIONS) || \
+    ! defined(QUEST_DISABLE_DEPRECATION_WARNINGS)
 
     #error "Expected macros were not defined by the config.h header, possibly because their corresponding CMake variables were not substituted."
 
@@ -142,14 +146,15 @@
  */
 
 
-#if ! (COMPILE_OPENMP               == 0 || COMPILE_OPENMP               == 1) || \
-    ! (COMPILE_MPI                  == 0 || COMPILE_MPI                  == 1) || \
-    ! (COMPILE_CUDA                 == 0 || COMPILE_CUDA                 == 1) || \
-    ! (COMPILE_HIP                  == 0 || COMPILE_HIP                  == 1) || \
-    ! (COMPILE_CUQUANTUM            == 0 || COMPILE_CUQUANTUM            == 1) || \
-    ! (NUMA_AWARE                   == 0 || NUMA_AWARE                   == 1) || \
-    ! (INCLUDE_DEPRECATED_FUNCTIONS == 0 || INCLUDE_DEPRECATED_FUNCTIONS == 1) || \
-    ! (DISABLE_DEPRECATION_WARNINGS == 0 || DISABLE_DEPRECATION_WARNINGS == 1)
+#if ! (QUEST_COMPILE_OMP                  == 0 || QUEST_COMPILE_OMP                  == 1) || \
+    ! (QUEST_COMPILE_MPI                  == 0 || QUEST_COMPILE_MPI                  == 1) || \
+    ! (QUEST_COMPILE_SUBCOMM              == 0 || QUEST_COMPILE_SUBCOMM              == 1) || \
+    ! (QUEST_COMPILE_CUDA                 == 0 || QUEST_COMPILE_CUDA                 == 1) || \
+    ! (QUEST_COMPILE_HIP                  == 0 || QUEST_COMPILE_HIP                  == 1) || \
+    ! (QUEST_COMPILE_CUQUANTUM            == 0 || QUEST_COMPILE_CUQUANTUM            == 1) || \
+    ! (QUEST_ENABLE_NUMA                  == 0 || QUEST_ENABLE_NUMA                  == 1) || \
+    ! (QUEST_INCLUDE_DEPRECATED_FUNCTIONS == 0 || QUEST_INCLUDE_DEPRECATED_FUNCTIONS == 1) || \
+    ! (QUEST_DISABLE_DEPRECATION_WARNINGS == 0 || QUEST_DISABLE_DEPRECATION_WARNINGS == 1)
 
     #error "A macro defined by the config.h header (as inferred from a CMake variable) had an illegal value."
 
@@ -166,4 +171,4 @@
 
 
 
-#endif // CONFIG_H
\ No newline at end of file
+#endif // CONFIG_H
diff --git a/quest/include/debug.h b/quest/include/debug.h
index 48ef22527..a51236141 100644
--- a/quest/include/debug.h
+++ b/quest/include/debug.h
@@ -43,19 +43,19 @@ extern "C" {
 
 
 /// @notyetdoced
-void setSeeds(unsigned* seeds, int numSeeds);
+void setQuESTSeeds(unsigned* seeds, int numSeeds);
 
 
 /// @notyetdoced
-void setSeedsToDefault();
+void setQuESTSeedsToDefault();
 
 
 /// @notyetdoced
-void getSeeds(unsigned* seeds);
+void getQuESTSeeds(unsigned* seeds);
 
 
 /// @notyetdoced
-int getNumSeeds();
+int getQuESTNumSeeds();
 
 
 /** @} */
@@ -79,27 +79,27 @@ int getNumSeeds();
  * - [C](https://github.com/QuEST-Kit/QuEST/blob/devel/examples/isolated/setting_errorhandler.c) and 
  *   [C++](https://github.com/QuEST-Kit/QuEST/blob/devel/examples/isolated/setting_errorhandler.cpp) examples
  */
-void setInputErrorHandler(void (*callback)(const char* func, const char* msg));
+void setQuESTInputErrorHandler(void (*callback)(const char* func, const char* msg));
 
 
 /// @notyetdoced
-void setValidationOn();
+void setQuESTValidationOn();
 
 
 /// @notyetdoced
-void setValidationOff();
+void setQuESTValidationOff();
 
 
 /// @notyetdoced
-void setValidationEpsilonToDefault();
+void setQuESTValidationEpsilonToDefault();
 
 
 /// @notyetdoced
-void setValidationEpsilon(qreal eps);
+void setQuESTValidationEpsilon(qreal eps);
 
 
 /// @notyetdoced
-qreal getValidationEpsilon();
+qreal getQuESTValidationEpsilon();
 
 
 /** @} */
@@ -115,7 +115,7 @@ qreal getValidationEpsilon();
 
 /// @notyetdoced
 /// @notyettested
-void setMaxNumReportedItems(qindex numRows, qindex numCols);
+void setQuESTMaxNumReportedItems(qindex numRows, qindex numCols);
 
 
 /** @notyetdoced
@@ -123,11 +123,11 @@ void setMaxNumReportedItems(qindex numRows, qindex numCols);
  * > (e.g. `5.32 KiB`) which is always shown with three significant figures 
  * > (or four when in bytes, e.g. `1023 bytes`).
  */
-void setMaxNumReportedSigFigs(int numSigFigs);
+void setQuESTMaxNumReportedSigFigs(int numSigFigs);
 
 
 /// @notyetdoced
-void setNumReportedNewlines(int numNewlines);
+void setQuESTNumReportedNewlines(int numNewlines);
 
 
 /** 
@@ -138,11 +138,11 @@ void setNumReportedNewlines(int numNewlines);
    PauliStr str = getInlinePauliStr("XYZ", {0,10,20});
    reportPauliStr(str);
 
-   setReportedPauliChars(".xyz");
+   setQuESTReportedPauliChars(".xyz");
    reportPauliStr(str);
  * ```
  */
-void setReportedPauliChars(const char* paulis);
+void setQuESTReportedPauliChars(const char* paulis);
 
 
 /** 
@@ -152,14 +152,14 @@ void setReportedPauliChars(const char* paulis);
  * ```
    PauliStr str = getInlinePauliStr("XYZ", {0,10,20});
 
-   setReportedPauliStrStyle(0);
+   setQuESTReportedPauliStrStyle(0);
    reportPauliStr(str);
 
-   setReportedPauliStrStyle(1);
+   setQuESTReportedPauliStrStyle(1);
    reportPauliStr(str);
  * ```
  */
-void setReportedPauliStrStyle(int style);
+void setQuESTReportedPauliStrStyle(int style);
 
 
 /** @} */
@@ -174,11 +174,11 @@ void setReportedPauliStrStyle(int style);
 
 
 /// @notyetdoced
-qindex getGpuCacheSize();
+qindex getQuESTGpuCacheSize();
 
 
 /// @notyetdoced
-void clearGpuCache();
+void clearQuESTGpuCache();
 
 
 /** @} */
@@ -194,7 +194,7 @@ void clearGpuCache();
 
 /// @notyetdoced
 /// @notyettested
-void getEnvironmentString(char str[200]);
+void getQuESTEnvironmentString(char str[200]);
 
 
 /** @} */
@@ -225,16 +225,16 @@ void getEnvironmentString(char str[200]);
 /// @notyettested
 /// @notyetdoced
 /// @cppvectoroverload
-/// @see setSeeds()
-void setSeeds(std::vector<unsigned> seeds);
+/// @see setQuESTSeeds()
+void setQuESTSeeds(std::vector<unsigned> seeds);
 
 
 /// @ingroup debug_seed
 /// @notyettested
 /// @notyetdoced
 /// @cpponly
-/// @see getSeeds()
-std::vector<unsigned> getSeeds();
+/// @see getQuESTSeeds()
+std::vector<unsigned> getQuESTSeeds();
 
 
 #endif // __cplusplus
diff --git a/quest/include/deprecated.h b/quest/include/deprecated.h
index 1a63b2044..92032efb8 100644
--- a/quest/include/deprecated.h
+++ b/quest/include/deprecated.h
@@ -29,7 +29,7 @@
  * INITIAL WARNING
  */
 
-#if !defined(DISABLE_DEPRECATION_WARNINGS) || DISABLE_DEPRECATION_WARNINGS == 0
+#if !defined(QUEST_DISABLE_DEPRECATION_WARNINGS) || QUEST_DISABLE_DEPRECATION_WARNINGS == 0
 
     // #warning command is always recognised (deprecated API is not MSVC-compatible)
     #warning "\
@@ -49,7 +49,7 @@ refactor your code to v4, and should absolutely not continue to use the old v3 A
 /*
  * TOGGLEABLE WARNING MESSAGES
  *
- * users can define precompiler constant DISABLE_DEPRECATION_WARNINGS=1
+ * users can define precompiler constant QUEST_DISABLE_DEPRECATION_WARNINGS=1
  * in order to disable compile-time deprecation warnings. This will
  * make most of the QuEST v3 API silently work by casting to the 
  * v4 API at compile-time. Note that _Pragma() are resolved at
@@ -62,7 +62,7 @@ refactor your code to v4, and should absolutely not continue to use the old v3 A
 
 #define _EFFECT_PRAGMA(cmd) _Pragma(#cmd)
 
-#if DISABLE_DEPRECATION_WARNINGS
+#if QUEST_DISABLE_DEPRECATION_WARNINGS
 
     #define _WARN_TYPE_RENAMED(oldname, newname)
 
@@ -449,13 +449,6 @@ typedef enum pauliOpType _NoWarnPauliOpType;
         "setDensityQuregAmps(Qureg, qindex startRow, qindex startCol, qcomp** amps, qindex numRows, qindex numCols)")
 
 
-#define getQuESTSeeds(...) \
-    _ERROR_GENERAL_MSG( \
-        "The QuEST function 'getQuESTSeeds(QuESTEnv env, unsigned long int* out, int numOut)' has been deprecated. " \
-        "Please instead use 'getSeeds(unsigned* out)' which accepts a pointer to pre-allocated memory of length " \
-        "equal to that returned by 'getNumSeeds()'. We cannot automatically invoke this replacement routine." )
-
-
 #define applyPhaseFunc(...) \
     _ERROR_PHASE_FUNC_REMOVED("applyPhaseFunc")
 
@@ -548,17 +541,44 @@ typedef enum pauliOpType _NoWarnPauliOpType;
 
 
 #define _GET_ENVIRONMENT_STRING_1(str) \
-    getEnvironmentString(str)
+    getQuESTEnvironmentString(str)
 
 #define _GET_ENVIRONMENT_STRING_2(str) \
-    _WARN_FUNC_NOW_HAS_FEWER_ARGS("getEnvironmentString(QuESTEnv, char[200])", "getEnvironmentString(char[200])") \
+    _WARN_FUNC_NOW_HAS_FEWER_ARGS("getQuESTEnvironmentString(QuESTEnv, char[200])", "getQuESTEnvironmentString(char[200])") \
     _GET_ENVIRONMENT_STRING_1(str)
 
-#define getEnvironmentString(...) \
+#define getQuESTEnvironmentString(...) \
     _CALL_MACRO_WITH_1_OR_2_ARGS(_GET_ENVIRONMENT_STRING, __VA_ARGS__)
 
 
 
+/*
+ * FUNCTIONS WITH THE SAME NAME BUT 1 INSTEAD OF 3 ARGS
+ *
+ * which are handled similar to above
+ */
+
+
+#define _GET_MACRO_WITH_1_OR_3_ARGS(_1, _2, _3, macroname, ...) macroname
+
+#define _CALL_MACRO_WITH_1_OR_3_ARGS(prefix, ...) \
+    _GET_MACRO_WITH_1_OR_3_ARGS(__VA_ARGS__, prefix##_3, prefix##_2, prefix##_1)(__VA_ARGS__)
+
+
+#define _GET_QUEST_SEEDS_1(out) \
+    getQuESTSeeds(out)
+
+#define _GET_QUEST_SEEDS_3(env, out, numOut) \
+    _WARN_FUNC_NOW_HAS_FEWER_ARGS( \
+        "getQuESTSeeds(QuESTEnv env, unsigned long int* out, int numOut)", \
+        "getQuESTSeeds(unsigned* out)") \
+    _GET_QUEST_SEEDS_1(out)
+
+#define getQuESTSeeds(...) \
+    _CALL_MACRO_WITH_1_OR_3_ARGS(_GET_QUEST_SEEDS, __VA_ARGS__)
+
+
+
 /*
  * FUNCTIONS WITH THE SAME NAME BUT 0 INSTEAD OF 1 ARGS
  *
@@ -657,10 +677,10 @@ static inline void v3_mixKrausMap(Qureg qureg, int targ, _NoWarnComplexMatrix2 *
 
 
 static inline void _mixNonTPKrausMap(Qureg qureg, int targ, _NoWarnComplexMatrix2 *ops, int numOps) {
-    qreal eps = getValidationEpsilon();
-    setValidationEpsilon(0);
+    qreal eps = getQuESTValidationEpsilon();
+    setQuESTValidationEpsilon(0);
     _MIX_KRAUS_MAP_INNER(qureg, ops, numOps, &targ, 1);
-    setValidationEpsilon(eps);
+    setQuESTValidationEpsilon(eps);
 }
 
 #define mixNonTPKrausMap(...) \
@@ -673,10 +693,10 @@ static inline void _mixNonTPKrausMap(Qureg qureg, int targ, _NoWarnComplexMatrix
 
 static inline void _mixTwoQubitKrausMap(Qureg qureg, int targ1, int targ2, _NoWarnComplexMatrix4 *ops, int numOps, int isNonCPTP) {
     int targs[] = {targ1, targ2};
-    qreal eps = getValidationEpsilon();
-    if (isNonCPTP) setValidationEpsilon(0);
+    qreal eps = getQuESTValidationEpsilon();
+    if (isNonCPTP) setQuESTValidationEpsilon(0);
     _MIX_KRAUS_MAP_INNER(qureg, ops, numOps, targs, 2);
-    setValidationEpsilon(eps);
+    setQuESTValidationEpsilon(eps);
 }
 
 #define mixTwoQubitKrausMap(...) \
@@ -703,11 +723,11 @@ static inline void _mixMultiQubitKrausMap(Qureg qureg, int* targs, int numTargs,
     setKrausMap(map, ptrs);
     free(ptrs);
 
-    qreal eps = getValidationEpsilon();
-    if (isNonCPTP) setValidationEpsilon(0);
+    qreal eps = getQuESTValidationEpsilon();
+    if (isNonCPTP) setQuESTValidationEpsilon(0);
     (mixKrausMap)(qureg, targs, numTargs, map); // calls above macro, wrapped to avoid warning */
     destroyKrausMap(map);
-    setValidationEpsilon(eps);
+    setQuESTValidationEpsilon(eps);
 }
 
 #define mixMultiQubitKrausMap(...) \
@@ -827,16 +847,16 @@ static inline QuESTEnv _createQuESTEnv() {
     leftapplyDiagMatr(__VA_ARGS__)
 
 static inline void _applyGateSubDiagonalOp(Qureg qureg, int* targets, int numTargets, DiagMatr op) {
-    qreal eps = getValidationEpsilon();
-    setValidationEpsilon(0);
+    qreal eps = getQuESTValidationEpsilon();
+    setQuESTValidationEpsilon(0);
     applyDiagMatr(qureg, targets, numTargets, op);
-    setValidationEpsilon(eps);
+    setQuESTValidationEpsilon(eps);
 }
 #define applyGateSubDiagonalOp(...) \
     _WARN_GENERAL_MSG( \
         "The QuEST function 'applyGateSubDiagonalOp()' is deprecated. To achieve the same thing, disable " \
-        "numerical validation via 'setValidationEpsilon(0)' before calling 'applyDiagMatr()'. You can " \
-        "save the existing epsilon via 'getValidationEpsilon()' to thereafter restore. This procedure " \
+        "numerical validation via 'setQuESTValidationEpsilon(0)' before calling 'applyDiagMatr()'. You can " \
+        "save the existing epsilon via 'getQuESTValidationEpsilon()' to thereafter restore. This procedure " \
         "has been performed here automatically.") \
     _applyGateSubDiagonalOp(__VA_ARGS__)
 
@@ -1131,32 +1151,32 @@ static inline void _applyPauliHamil(Qureg inQureg, PauliStrSum hamil, Qureg outQ
 
 
 static inline void _applyGateMatrixN(Qureg qureg, int* targs, int numTargs, CompMatr u) {
-    qreal eps = getValidationEpsilon();
-    setValidationEpsilon(0);
+    qreal eps = getQuESTValidationEpsilon();
+    setQuESTValidationEpsilon(0);
     applyCompMatr(qureg, targs, numTargs, u);
-    setValidationEpsilon(eps);
+    setQuESTValidationEpsilon(eps);
 }
 
 #define applyGateMatrixN(...) \
     _WARN_GENERAL_MSG( \
         "The QuEST function 'applyGateMatrixN()' is deprecated. To achieve the same thing, disable " \
-        "numerical validation via 'setValidationEpsilon(0)' before calling 'applyCompMatr()'. You can " \
-        "save the existing epsilon via 'getValidationEpsilon()' to thereafter restore. This procedure " \
+        "numerical validation via 'setQuESTValidationEpsilon(0)' before calling 'applyCompMatr()'. You can " \
+        "save the existing epsilon via 'getQuESTValidationEpsilon()' to thereafter restore. This procedure " \
         "has been performed here automatically.") \
     _applyGateMatrixN(__VA_ARGS__)
 
 static inline void _applyMultiControlledGateMatrixN(Qureg qureg, int* ctrls, int numCtrls, int* targs, int numTargs, CompMatr u) {
-    qreal eps = getValidationEpsilon();
-    setValidationEpsilon(0);
+    qreal eps = getQuESTValidationEpsilon();
+    setQuESTValidationEpsilon(0);
     applyMultiControlledCompMatr(qureg, ctrls, numCtrls, targs, numTargs, u);
-    setValidationEpsilon(eps);
+    setQuESTValidationEpsilon(eps);
 }
 
 #define applyMultiControlledGateMatrixN(...) \
     _WARN_GENERAL_MSG( \
         "The QuEST function 'applyMultiControlledGateMatrixN()' is deprecated. To achieve the same thing, disable " \
-        "numerical validation via 'setValidationEpsilon(0)' before calling 'applyMultiControlledCompMatr()'. You can " \
-        "save the existing epsilon via 'getValidationEpsilon()' to thereafter restore. This procedure has been " \
+        "numerical validation via 'setQuESTValidationEpsilon(0)' before calling 'applyMultiControlledCompMatr()'. You can " \
+        "save the existing epsilon via 'getQuESTValidationEpsilon()' to thereafter restore. This procedure has been " \
         "performed here automatically.") \
     _applyMultiControlledGateMatrixN(__VA_ARGS__)
 
@@ -1331,12 +1351,12 @@ static inline void _multiControlledMultiRotatePauli(Qureg qureg, int* ctrls, int
 
 
 #define seedQuESTDefault(...) \
-    _WARN_FUNC_RENAMED("seedQuESTDefault(QuESTEnv)", "setSeedsToDefault()") \
-    setSeedsToDefault()
+    _WARN_FUNC_RENAMED("seedQuESTDefault(QuESTEnv)", "setQuESTSeedsToDefault()") \
+    setQuESTSeedsToDefault()
 
 #define seedQuEST(env, seeds, numSeeds) \
-    _WARN_FUNC_RENAMED("seedQuEST(QuESTEnv, unsigned long int*, int)", "setSeeds(unsigned*, int)") \
-    setSeeds(seeds, numSeeds)
+    _WARN_FUNC_RENAMED("seedQuEST(QuESTEnv, unsigned long int*, int)", "setQuESTSeeds(unsigned*, int)") \
+    setQuESTSeeds(seeds, numSeeds)
 
 
 
diff --git a/quest/include/environment.h b/quest/include/environment.h
index 04f24bfe2..cdefa7d7d 100644
--- a/quest/include/environment.h
+++ b/quest/include/environment.h
@@ -14,6 +14,8 @@
 #ifndef ENVIRONMENT_H
 #define ENVIRONMENT_H
 
+#include <stdbool.h>
+
 // enable invocation by both C and C++ binaries
 #ifdef __cplusplus
 extern "C" {
@@ -33,15 +35,17 @@ extern "C" {
 typedef struct {
 
     // deployment modes which can be runtime disabled
-    int isMultithreaded;
-    int isGpuAccelerated;
-    int isDistributed;
+    bool isMultithreaded;
+    bool isGpuAccelerated;
+    bool isDistributed;
+    bool isMpiUserOwned;
 
     // deployment modes which cannot be directly changed after compilation
-    int isCuQuantumEnabled;
+    bool isCuQuantumEnabled;
 
     // deployment configurations which can be changed via environment variables
     int isGpuSharingEnabled;
+    int isMpiGpuAware;
 
     // distributed configuration
     int rank;
diff --git a/quest/include/experimental.h b/quest/include/experimental.h
new file mode 100644
index 000000000..8c2cc4e0a
--- /dev/null
+++ b/quest/include/experimental.h
@@ -0,0 +1,114 @@
+/** @file
+ * Experimental functions which are liable to
+ * API breaks within QuEST minor version releases.
+ * Some optional functions require compiling this
+ * file against MPI, despite being outside of /comm/, 
+ * and so require opt-in macros (QUEST_COMPILE_SUBCOMM)
+ * 
+ * @author Oliver Brown
+ * @author Tyson Jones (formatting)
+ * 
+ * @defgroup experimental Experimental
+ * @ingroup api
+ * @brief Experimental functions with tentative APIs
+ * @{
+ */
+
+#ifndef EXPERIMENTAL_H
+#define EXPERIMENTAL_H
+
+#include "quest/include/config.h"
+
+// bool appears in the signature of initCustomMpiQuESTEnv(), so C
+// translation units need stdbool.h even when including this header alone
+#include <stdbool.h>
+
+#if QUEST_COMPILE_SUBCOMM && ! QUEST_COMPILE_MPI
+    #error "Macro QUEST_COMPILE_SUBCOMM was true, but QUEST_COMPILE_MPI was illegally false."
+#endif
+
+#if QUEST_COMPILE_SUBCOMM
+    #include <mpi.h>
+#endif
+
+// enable invocation by both C and C++ binaries
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** @notyetdoced
+ *
+ *  Advanced initialiser which lets the user positively declare that they take responsibility for MPI.
+ *  This means we assume they have called MPI_Init, and that they will call MPI_Finalize.
+ * 
+ * @author Oliver Brown
+ */
+void initCustomMpiQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuAccel, int useMultithread);
+
+
+#if QUEST_COMPILE_SUBCOMM
+/** @notyetdoced
+ * 
+ *  Advanced initialiser which allows the user to provide an MPI communicator for QuEST to use.
+ *  Use of this initialiser implies userOwnsMpi = true, (exposed by initCustomMpiQuESTEnv) and 
+ *  therefore that they have already initialised MPI, and they will call MPI_Finalize at the 
+ *  appropriate time.
+ *
+ *  The user-provided MPI communicator undergoes the same validation procedure as any that QuEST
+ *  would use, and so must contain a power-of-2 number of processes.
+ * 
+ * This function is only compiled and exposed when macro QUEST_COMPILE_SUBCOMM is 1, as is
+ * defined when providing CMake option QUEST_ENABLE_SUBCOMM during building.
+ *
+ * @author Oliver Brown
+ */
+void initCustomMpiCommQuESTEnv(MPI_Comm questComm, int useGpuAccel, int useMultithread);
+#endif // QUEST_COMPILE_SUBCOMM
+
+
+/** @notyetdoced
+ * 
+ * @author Oliver Brown
+ */
+int getQuESTNumGpuThreadsPerBlock();
+
+
+/** Overrides the number of CUDA threads per block (or @p blockDim) used by QuEST's GPU-accelerated backend.
+ * 
+ * This changes the GPU parallelisation granularity and can affect performance, and is useful
+ * for performance tuning or diagnostics. Before this function is called, QuEST will use the
+ * number as specified by the environment variable @p QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK,
+ * if defined. Otherwise, it will use the value specified by the CMake/compile option of the
+ * same name, which itself presently defaults to @p 128. After this function is called, QuEST
+ * will adopt @p numThreadsPerBlock for the remainder of execution, or until this function is
+ * called again.
+ * 
+ * Practical values of @p numThreadsPerBlock can vary with the simulation size, the user's GPU hardware,
+ * and whether it is NVIDIA or AMD, which have respective warp sizes of @p 32 and @p 64.
+ * 
+ * @note
+ * This function has no effect when QuEST is not deployed with GPU-acceleration enabled.
+ *
+ * @param[in] numThreadsPerBlock the new block size.
+ * @throws @validationerror
+ * - if the @p QuESTEnv is not initialised.
+ * - if @p numThreadsPerBlock is negative.
+ * - if @p numThreadsPerBlock is not a multiple of the GPU warp size.
+ * - if @p numThreadsPerBlock exceeds the maximum @p blockDim imposed by the GPU hardware.
+ * @see
+ * - QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK
+ * @author Oliver Brown
+ * @author Tyson Jones
+ */
+void setQuESTNumGpuThreadsPerBlock(int numThreadsPerBlock);
+
+
+// end de-mangler
+#ifdef __cplusplus
+}
+#endif
+
+#endif // EXPERIMENTAL_H
+
+/** @} */ // (end file-wide doxygen defgroup)
diff --git a/quest/include/modes.h b/quest/include/modes.h
index f8fc52a1c..25ad8bb54 100644
--- a/quest/include/modes.h
+++ b/quest/include/modes.h
@@ -43,39 +43,77 @@
      *  - forbid sharing: @p 0, @p '0', @p '', @p , (unspecified)
      *  - permit sharing: @p 1, @p '1'
      * 
+     * @constraints
+     * The function initQuESTEnv() will throw a validation error if any of the below are not satisfied.
+     *   - The specified string, when non-empty, must evaluate to the integer @p 0 or @p 1.
+     * 
      * @author Tyson Jones
      */
-    const int PERMIT_NODES_TO_SHARE_GPU = 0;
+    const int QUEST_PERMIT_NODES_TO_SHARE_GPU = 0;
 
 
     /** @envvardoc
      * 
      * Specifies the default validation epsilon. 
      * 
-     * Specifying `DEFAULT_VALIDATION_EPSILON` to a positive, real number overrides the 
+     * Specifying `QUEST_DEFAULT_VALIDATION_EPSILON` to a positive, real number overrides the 
      * precision-specific default (`1E-5`, `1E-12`, `1E-13` for single, double and quadruple 
      * precision respectively). The specified epsilon is used by QuEST for numerical validation
-     * unless overriden at runtime via setValidationEpsilon(), in which case it can be
-     * restored to that specified by this environment variable using setValidationEpsilonToDefault().
+     * unless overridden at runtime via setQuESTValidationEpsilon(), in which case it can be
+     * restored to that specified by this environment variable using setQuESTValidationEpsilonToDefault().
      * 
      * @envvarvalues
-     *  - setting @p DEFAULT_VALIDATION_EPSILON=0 disables numerical validation, as if the value
+     *  - setting @p QUEST_DEFAULT_VALIDATION_EPSILON=0 disables numerical validation, as if the value
      *    were instead infinity.
-     *  - setting @p DEFAULT_VALIDATION_EPSILON='' is equivalent to _not_ specifying the variable,
+     *  - setting @p QUEST_DEFAULT_VALIDATION_EPSILON='' is equivalent to _not_ specifying the variable,
      *    adopting instead the precision-specific default above.
-     *  - setting @p DEFAULT_VALIDATION_EPSILON=x where `x` is a positive, valid `qreal` in any
+     *  - setting @p QUEST_DEFAULT_VALIDATION_EPSILON=x where `x` is a positive, valid `qreal` in any
      *    format accepted by `C` or `C++` (e.g. `0.01`, `1E-2`, `+1e-2`) will use `x` as the
      *    default validation epsilon.
      * 
      * @constraints
-     * The function initQuESTEnv() will throw a validation error if:
+     * The function initQuESTEnv() will throw a validation error if any of the below are not satisfied.
      *   - The specified epsilon must be `0` or positive.
      *   - The specified epsilon must not exceed that maximum or minimum value which can be stored
      *     in a `qreal`, which is specific to its precision.
      * 
      * @author Tyson Jones
      */
-    const qreal DEFAULT_VALIDATION_EPSILON = 0;
+    const qreal QUEST_DEFAULT_VALIDATION_EPSILON = 0;
+
+
+    /** @envvardoc
+     * 
+     * Specifies the default number of threads per block (or "block dimension") used by GPU acceleration. 
+     * 
+     * The number of dispatched CUDA threads per block controls the parallelisation granularity of
+     * QuEST's GPU backend, affecting performance.
+     * Specifying `QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK` to a valid, positive integer overrides
+     * QuEST's default otherwise set during compilation via a CMake option of the same name. If 
+     * that CMake option was not set, the default is assumed to be @p 128.
+     * 
+     * The number specified by this environment variable will be used as the block dimension by all of
+     * QuEST's GPU backend functions, unless overridden at runtime via setQuESTNumGpuThreadsPerBlock().
+     * The actual number of threads per block used at any time can be queried via 
+     * getQuESTNumGpuThreadsPerBlock(), or reported by reportQuESTEnv().
+     * 
+     * @envvarvalues
+     *  - use internal default of `128`: @p '', @p , (unspecified)
+     *  - use number `x`: @p x, @p 'x', @p '+x'
+     * 
+     * @constraints
+     * The function initQuESTEnv() will throw a validation error if any of the below are not satisfied.
+     *   - The specified number must be a positive integer.
+     *   - The specified number must not exceed the minimum or maximum value which can be stored in an @p int.
+     *   - The specified number must be divisible by the GPU warp size, which is 32 or 64, depending on
+     *     whether deployed to an NVIDIA or AMD GPU. This restriction is imposed even when QuEST is not
+     *     deployed with GPU-acceleration.
+     *   - The specified number must not exceed the maximum imposed by the available GPU hardware.
+     * 
+     * @author Oliver Brown
+     * @author Tyson Jones
+     */
+    const int QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK = 0;
 
 
 #endif
diff --git a/quest/include/operations.h b/quest/include/operations.h
index ea4a316ae..3c97d2c61 100644
--- a/quest/include/operations.h
+++ b/quest/include/operations.h
@@ -95,7 +95,7 @@ digraph {
  *   @f[ 
         \max\limits_{ij} \Big|\left(\hat{U} \hat{U}^\dagger - \id\right)_{ij}\Big|^2 \le \valeps
  *   @f]
- *   where the validation epsilon @f$ \valeps @f$ can be adjusted with setValidationEpsilon().
+ *   where the validation epsilon @f$ \valeps @f$ can be adjusted with setQuESTValidationEpsilon().
  * 
  * @myexample
  * ```
@@ -194,7 +194,7 @@ digraph {
  *   @f[ 
         \max\limits_{ij} \Big|\left(\hat{U} \hat{U}^\dagger - \id\right)_{ij}\Big|^2 \le \valeps
  *   @f]
- *   where the validation epsilon @f$ \valeps @f$ can be adjusted with setValidationEpsilon().
+ *   where the validation epsilon @f$ \valeps @f$ can be adjusted with setQuESTValidationEpsilon().
  *
  * @equivalences
  * 
@@ -573,7 +573,7 @@ void applyMultiControlledCompMatr2(Qureg qureg, std::vector<int> controls, int t
 /// @notyetdoced
 /// @cppvectoroverload
 /// @see applyMultiStateControlledCompMatr2()
-void applyMultiStateControlledCompMatr2(Qureg qureg, std::vector<int> controls, std::vector<int> states, int numControls, int target1, int target2, CompMatr2 matr);
+void applyMultiStateControlledCompMatr2(Qureg qureg, std::vector<int> controls, std::vector<int> states, int target1, int target2, CompMatr2 matr);
 
 
 #endif // __cplusplus
diff --git a/quest/include/precision.h b/quest/include/precision.h
index d37b9a2d3..7b932e678 100644
--- a/quest/include/precision.h
+++ b/quest/include/precision.h
@@ -77,16 +77,16 @@
  */
 
 // validate precision is 1 (float), 2 (double) or 4 (long double)
-#if ! (FLOAT_PRECISION == 1 || FLOAT_PRECISION == 2 || FLOAT_PRECISION == 4)
-    #error "FLOAT_PRECISION must be 1 (float), 2 (double) or 4 (long double)"
+#if ! (QUEST_FLOAT_PRECISION == 1 || QUEST_FLOAT_PRECISION == 2 || QUEST_FLOAT_PRECISION == 4)
+    #error "QUEST_FLOAT_PRECISION must be 1 (float), 2 (double) or 4 (long double)"
 #endif
 
 // infer floating-point type from precision
-#if FLOAT_PRECISION == 1
+#if QUEST_FLOAT_PRECISION == 1
     #define FLOAT_TYPE float
-#elif FLOAT_PRECISION == 2
+#elif QUEST_FLOAT_PRECISION == 2
     #define FLOAT_TYPE double
-#elif FLOAT_PRECISION == 4
+#elif QUEST_FLOAT_PRECISION == 4
     #define FLOAT_TYPE long double
 #endif
 
@@ -96,13 +96,13 @@
     /// @notyetdoced
     /// @macrodoc
     ///
-    /// (note this macro is informed by the FLOAT_PRECISION CMake variable)
-    const int FLOAT_PRECISION = 2;
+    /// (note this macro is informed by the QUEST_FLOAT_PRECISION CMake variable)
+    const int QUEST_FLOAT_PRECISION = 2;
 
     /// @notyetdoced
     /// @macrodoc
     ///
-    /// (note this macro is informed by the FLOAT_PRECISION CMake variable)
+    /// (note this macro is informed by the QUEST_FLOAT_PRECISION CMake variable)
     typedef double int FLOAT_TYPE;
 
 #endif
@@ -113,8 +113,8 @@
  * CHECK PRECISION TYPES ARE COMPATIBLE WITH DEPLOYMENT
  */
 
-#if COMPILE_CUDA && (FLOAT_PRECISION == 4)
-    #error "A quad floating-point precision (FLOAT_PRECISION=4, i.e. long double) is not supported by GPU deployment"
+#if QUEST_COMPILE_CUDA && (QUEST_FLOAT_PRECISION == 4)
+    #error "A quad floating-point precision (QUEST_FLOAT_PRECISION=4, i.e. long double) is not supported by GPU deployment"
 #endif
 
 
@@ -125,14 +125,14 @@
  * which is pre-run-time overridable by specifying the corresponding environment variable.
  */
 
-#if FLOAT_PRECISION == 1
-    #define UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-5
+#if QUEST_FLOAT_PRECISION == 1
+    #define QUEST_UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-5
 
-#elif FLOAT_PRECISION == 2
-    #define UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-12
+#elif QUEST_FLOAT_PRECISION == 2
+    #define QUEST_UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-12
 
-#elif FLOAT_PRECISION == 4
-    #define UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-13
+#elif QUEST_FLOAT_PRECISION == 4
+    #define QUEST_UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-13
 
 #endif
 
@@ -142,13 +142,13 @@
  * PRECISION-AGNOSTIC CONVENIENCE MACROS
  */
 
-#if FLOAT_PRECISION == 1
+#if QUEST_FLOAT_PRECISION == 1
     #define QREAL_FORMAT_SPECIFIER "%.8g"
 
-#elif FLOAT_PRECISION == 2
+#elif QUEST_FLOAT_PRECISION == 2
     #define QREAL_FORMAT_SPECIFIER "%.14g"
 
-#elif FLOAT_PRECISION == 4
+#elif QUEST_FLOAT_PRECISION == 4
     #define QREAL_FORMAT_SPECIFIER "%.17Lg"
     
 #endif
diff --git a/quest/include/quest.h b/quest/include/quest.h
index 409253ff8..da1c778e2 100644
--- a/quest/include/quest.h
+++ b/quest/include/quest.h
@@ -38,6 +38,7 @@
 #include "quest/include/debug.h"
 #include "quest/include/decoherence.h"
 #include "quest/include/environment.h"
+#include "quest/include/experimental.h"
 #include "quest/include/trotterisation.h"
 #include "quest/include/initialisations.h"
 #include "quest/include/channels.h"
@@ -49,7 +50,7 @@
 #include "quest/include/wrappers.h"
 
 
-#if INCLUDE_DEPRECATED_FUNCTIONS
+#if QUEST_INCLUDE_DEPRECATED_FUNCTIONS
     #include "quest/include/deprecated.h"
 #endif
 
diff --git a/quest/include/qureg.h b/quest/include/qureg.h
index f3284fa14..4ff4c5627 100644
--- a/quest/include/qureg.h
+++ b/quest/include/qureg.h
@@ -281,10 +281,10 @@ Qureg createForcedDensityQureg(int numQubits);
  * @par Memory
  * The total allocated memory depends on all parameters (_except_ 
  * @p useMultithread), and the size of the variable-precision @c qcomp used to represent each
- * amplitude. This is determined by preprocessor @c FLOAT_PRECISION via
+ * amplitude. This is determined by preprocessor @c QUEST_FLOAT_PRECISION via
 * 
  * <center>
- * | @c FLOAT_PRECISION | @c qcomp size (bytes) | 
+ * | @c QUEST_FLOAT_PRECISION | @c qcomp size (bytes) |
  * | --- | --- |
  * | 1   | 8   |
  * | 2   | 16  | 
@@ -310,7 +310,7 @@ Qureg createForcedDensityQureg(int numQubits);
  * | 1 | 1 | @f$ 2 \, B \, D \, / \, W @f$ | @f$ 2 \, B \, D @f$ | @f$ 2 \, B \, D \, / \, W @f$ | @f$ 2 \, B \, D @f$ | @f$ 4 \, B \, D @f$ |
  * </center>
  *
- * For illustration, using the default @c FLOAT_PRECISION=2 whereby @f$ B = 16 @f$ bytes, the <b>RAM _per node_</b>
+ * For illustration, using the default @c QUEST_FLOAT_PRECISION=2 whereby @f$ B = 16 @f$ bytes, the <b>RAM _per node_</b>
  * over varying distributions is:
  * 
  * <center>
diff --git a/quest/include/trotterisation.h b/quest/include/trotterisation.h
index 6fd493264..59600c9d9 100644
--- a/quest/include/trotterisation.h
+++ b/quest/include/trotterisation.h
@@ -138,7 +138,7 @@ extern "C" {
  *   @f[ 
         \max\limits_{i} |c_i| \le \valeps
  *   @f]
- *   where the validation epsilon @f$ \valeps @f$ can be adjusted with setValidationEpsilon().
+ *   where the validation epsilon @f$ \valeps @f$ can be adjusted with setQuESTValidationEpsilon().
  *   Otherwise, use applyTrotterizedNonUnitaryPauliStrSumGadget() to permit non-Hermitian @p sum
  *   and ergo effect a non-unitary exponential(s). 
  * 
@@ -352,7 +352,7 @@ extern "C" {
  *   @f[ 
         \max\limits_{i} |c_i| \le \valeps
  *   @f]
- *   where the validation epsilon @f$ \valeps @f$ can be adjusted with setValidationEpsilon(). The imaginary components
+ *   where the validation epsilon @f$ \valeps @f$ can be adjusted with setQuESTValidationEpsilon(). The imaginary components
  *   of the Hamiltonian _are_ considered during simulation.
  * 
  * - The @p time parameter is necessarily real to retain unitarity. It can be substituted for a strictly imaginary
@@ -488,7 +488,7 @@ void applyTrotterizedUnitaryTimeEvolution(Qureg qureg, PauliStrSum hamil, qreal
  *   @f[ 
         \max\limits_{i} |c_i| \le \valeps
  *   @f]
- *   where the validation epsilon @f$ \valeps @f$ can be adjusted with setValidationEpsilon(). Beware however that 
+ *   where the validation epsilon @f$ \valeps @f$ can be adjusted with setQuESTValidationEpsilon(). Beware however that 
  *   imaginary-time evolution under a non-Hermitian Hamiltonian will _not_ necessarily approach the lowest lying eigenstate
  *   (the eigenvalues may be non-real) so is likely of limited utility.
  * 
@@ -604,8 +604,8 @@ void applyTrotterizedImaginaryTimeEvolution(Qureg qureg, PauliStrSum hamil, qrea
  *   @f[
           \min\limits_{i} \gamma_i \ge - \valeps
  *   @f]
- *   where the validation epsilon @f$ \valeps @f$ can be adjusted with setValidationEpsilon(). Non-trace-preserving,
- *   negative damping rates can be simulated by disabling numerical validation via `setValidationEpsilon(0)`.
+ *   where the validation epsilon @f$ \valeps @f$ can be adjusted with setQuESTValidationEpsilon(). Non-trace-preserving,
+ *   negative damping rates can be simulated by disabling numerical validation via `setQuESTValidationEpsilon(0)`.
  * 
  * - The @p time parameter is necessarily real, and cannot be generalised to imaginary or complex like in other
  *   functions. Generalisation is trivially numerically possible, but has no established physical meaning and so
diff --git a/quest/include/types.h b/quest/include/types.h
index f1f49315d..ac0ef36c1 100644
--- a/quest/include/types.h
+++ b/quest/include/types.h
@@ -53,13 +53,13 @@ typedef INDEX_TYPE qindex;
     // which is either MSVC's custom C complex...
     #ifdef _MSC_VER
 
-        #if (FLOAT_PRECISION == 1)
+        #if (QUEST_FLOAT_PRECISION == 1)
             typedef _Fcomplex qcomp;
 
-        #elif (FLOAT_PRECISION == 2)
+        #elif (QUEST_FLOAT_PRECISION == 2)
             typedef _Dcomplex qcomp;
 
-        #elif (FLOAT_PRECISION == 4)
+        #elif (QUEST_FLOAT_PRECISION == 4)
             typedef _Lcomplex qcomp;
 
         #endif
diff --git a/quest/src/api/CMakeLists.txt b/quest/src/api/CMakeLists.txt
index 0979f2f6c..7f90dcf17 100644
--- a/quest/src/api/CMakeLists.txt
+++ b/quest/src/api/CMakeLists.txt
@@ -5,6 +5,7 @@ target_sources(QuEST
   debug.cpp
   decoherence.cpp
   environment.cpp
+  experimental.cpp
   initialisations.cpp
   matrices.cpp
   modes.cpp
@@ -14,4 +15,4 @@ target_sources(QuEST
   qureg.cpp
   trotterisation.cpp
   types.cpp
-)
\ No newline at end of file
+)
diff --git a/quest/src/api/calculations.cpp b/quest/src/api/calculations.cpp
index 1143a4ecd..47e5d8a63 100644
--- a/quest/src/api/calculations.cpp
+++ b/quest/src/api/calculations.cpp
@@ -12,6 +12,7 @@
 #include "quest/include/calculations.h"
 
 #include "quest/src/core/validation.hpp"
+#include "quest/src/core/lists.hpp"
 #include "quest/src/core/utilities.hpp"
 #include "quest/src/core/localiser.hpp"
 #include "quest/src/core/bitwise.hpp"
@@ -253,12 +254,12 @@ qreal calcProbOfMultiQubitOutcome(Qureg qureg, int* qubits, int* outcomes, int n
     validate_targets(qureg, qubits, numQubits, __func__);
     validate_measurementOutcomesAreValid(outcomes, numQubits, __func__);
 
-    auto qubitVec = util_getVector(qubits, numQubits);
-    auto outcomeVec = util_getVector(outcomes, numQubits);
+    auto qubitList = lists_getList64(qubits, numQubits);
+    auto outcomeList = lists_getList64(outcomes, numQubits);
 
     return (qureg.isDensityMatrix)?
-        localiser_densmatr_calcProbOfMultiQubitOutcome(qureg, qubitVec, outcomeVec):
-        localiser_statevec_calcProbOfMultiQubitOutcome(qureg, qubitVec, outcomeVec);
+        localiser_densmatr_calcProbOfMultiQubitOutcome(qureg, qubitList, outcomeList):
+        localiser_statevec_calcProbOfMultiQubitOutcome(qureg, qubitList, outcomeList);
 }
 
 
@@ -267,11 +268,11 @@ void calcProbsOfAllMultiQubitOutcomes(qreal* outcomeProbs, Qureg qureg, int* qub
     validate_targets(qureg, qubits, numQubits, __func__);
     validate_measurementOutcomesFitInGpuMem(qureg, numQubits, __func__);
 
-    auto qubitVec = util_getVector(qubits, numQubits);
+    auto qubitList = lists_getList64(qubits, numQubits);
 
     (qureg.isDensityMatrix)?
-        localiser_densmatr_calcProbsOfAllMultiQubitOutcomes(outcomeProbs, qureg, qubitVec):
-        localiser_statevec_calcProbsOfAllMultiQubitOutcomes(outcomeProbs, qureg, qubitVec);
+        localiser_densmatr_calcProbsOfAllMultiQubitOutcomes(outcomeProbs, qureg, qubitList):
+        localiser_statevec_calcProbsOfAllMultiQubitOutcomes(outcomeProbs, qureg, qubitList);
 }
 
 
@@ -383,7 +384,7 @@ Qureg calcPartialTrace(Qureg qureg, int* traceOutQubits, int numTraceQubits) {
         qureg.isGpuAccelerated, qureg.isMultithreaded, __func__);
 
     // set it to reduced density matrix
-    auto targets = util_getVector(traceOutQubits, numTraceQubits);
+    auto targets = lists_getList64(traceOutQubits, numTraceQubits);
     localiser_densmatr_partialTrace(qureg, out, targets);
 
     return out;
@@ -396,7 +397,7 @@ Qureg calcReducedDensityMatrix(Qureg qureg, int* retainQubits, int numRetainQubi
     validate_targets(qureg, retainQubits, numRetainQubits, __func__);
     validate_quregCanBeReduced(qureg, qureg.numQubits - numRetainQubits, __func__);
 
-    auto traceQubits = util_getNonTargetedQubits(retainQubits, numRetainQubits, qureg.numQubits);
+    auto traceQubits = util_getNonTargetedQubits(lists_getList64(retainQubits, numRetainQubits), qureg.numQubits);
 
     // harmlessly re-validates
     return calcPartialTrace(qureg, traceQubits.data(), traceQubits.size());
diff --git a/quest/src/api/channels.cpp b/quest/src/api/channels.cpp
index d6e3ac4fb..c6702438a 100644
--- a/quest/src/api/channels.cpp
+++ b/quest/src/api/channels.cpp
@@ -107,7 +107,7 @@ void freeAllMemoryIfAnyAllocsFailed(T& obj) {
 
     // determine whether any node experienced a failure
     bool anyFail = didAnyLocalAllocsFail(obj);
-    if (comm_isInit())
+    if (comm_isActive())
         anyFail = comm_isTrueOnAllNodes(anyFail);
 
     // if so, free all memory before subsequent validation
@@ -456,11 +456,15 @@ extern "C" void reportSuperOp(SuperOp op) {
     size_t elemMem = mem_getLocalSuperOpMemoryRequired(op.numQubits);
     size_t structMem = sizeof(op);
 
+    printer_sync();
+
     print_header(op, elemMem + structMem);
     print_elems(op);
 
     // exclude mandatory newline above
     print_oneFewerNewlines();
+
+    printer_sync();
 }
 
 
@@ -479,6 +483,8 @@ extern "C" void reportKrausMap(KrausMap map) {
     size_t superMem = mem_getLocalSuperOpMemoryRequired(map.superop.numQubits);
     size_t strucMem = sizeof(map);
 
+    printer_sync();
+
     // gauranteed not to overflow
     size_t totalMem = krausMem + superMem + strucMem;
     print_header(map, totalMem);
@@ -486,4 +492,6 @@ extern "C" void reportKrausMap(KrausMap map) {
 
     // exclude mandatory newline above
     print_oneFewerNewlines();
+    
+    printer_sync();
 }
diff --git a/quest/src/api/debug.cpp b/quest/src/api/debug.cpp
index 82146da2a..e6c6b9f2a 100644
--- a/quest/src/api/debug.cpp
+++ b/quest/src/api/debug.cpp
@@ -34,7 +34,7 @@ extern "C" {
  */
 
 
-void setSeeds(unsigned* seeds, int numSeeds) {
+void setQuESTSeeds(unsigned* seeds, int numSeeds) {
     validate_envIsInit(__func__);
     validate_randomSeeds(seeds, numSeeds, __func__);
 
@@ -42,20 +42,20 @@ void setSeeds(unsigned* seeds, int numSeeds) {
     rand_setSeeds(vector<unsigned>(seeds, seeds+numSeeds));
 }
 
-void setSeedsToDefault() {
+void setQuESTSeedsToDefault() {
     validate_envIsInit(__func__);
 
     rand_setSeedsToDefault();
 }
 
 
-int getNumSeeds() {
+int getQuESTNumSeeds() {
     validate_envIsInit(__func__);
 
     return rand_getNumSeeds();
 }
 
-void getSeeds(unsigned* seeds) {
+void getQuESTSeeds(unsigned* seeds) {
     validate_envIsInit(__func__);
 
     auto vec = rand_getSeeds();
@@ -71,19 +71,19 @@ void getSeeds(unsigned* seeds) {
  * VALIDATION
  */
 
-void setInputErrorHandler(void (*callback)(const char*, const char*)) {
+void setQuESTInputErrorHandler(void (*callback)(const char*, const char*)) {
     validate_envIsInit(__func__);
 
     validateconfig_setErrorHandler(callback);
 }
 
-void setValidationOn() {
+void setQuESTValidationOn() {
     validate_envIsInit(__func__);
     
     validateconfig_enable();
 }
 
-void setValidationOff() {
+void setQuESTValidationOff() {
     validate_envIsInit(__func__);
 
     // disables all validation and computation
@@ -97,7 +97,7 @@ void setValidationOff() {
 }
 
 
-void setValidationEpsilon(qreal eps) {
+void setQuESTValidationEpsilon(qreal eps) {
     validate_envIsInit(__func__);
     validate_newEpsilonValue(eps, __func__);
 
@@ -105,14 +105,14 @@ void setValidationEpsilon(qreal eps) {
     util_setEpsilonSensitiveHeapFlagsToUnknown();
 }
 
-void setValidationEpsilonToDefault() {
+void setQuESTValidationEpsilonToDefault() {
     validate_envIsInit(__func__);
 
     validateconfig_setEpsilonToDefault();
     util_setEpsilonSensitiveHeapFlagsToUnknown();
 }
 
-qreal getValidationEpsilon() {
+qreal getQuESTValidationEpsilon() {
     validate_envIsInit(__func__);
 
     return validateconfig_getEpsilon();
@@ -125,7 +125,7 @@ qreal getValidationEpsilon() {
  */
 
 
-void setMaxNumReportedItems(qindex numRows, qindex numCols) {
+void setQuESTMaxNumReportedItems(qindex numRows, qindex numCols) {
     validate_envIsInit(__func__);
     validate_newMaxNumReportedScalars(numRows, numCols, __func__);
 
@@ -139,7 +139,7 @@ void setMaxNumReportedItems(qindex numRows, qindex numCols) {
 }
 
 
-void setMaxNumReportedSigFigs(int numSigFigs) {
+void setQuESTMaxNumReportedSigFigs(int numSigFigs) {
     validate_envIsInit(__func__);
     validate_newMaxNumReportedSigFigs(numSigFigs, __func__);
 
@@ -147,7 +147,7 @@ void setMaxNumReportedSigFigs(int numSigFigs) {
 }
 
 
-void setNumReportedNewlines(int numNewlines) {
+void setQuESTNumReportedNewlines(int numNewlines) {
     validate_envIsInit(__func__);
     validate_newNumReportedNewlines(numNewlines, __func__);
 
@@ -155,7 +155,7 @@ void setNumReportedNewlines(int numNewlines) {
 }
 
 
-void setReportedPauliChars(const char* paulis) {
+void setQuESTReportedPauliChars(const char* paulis) {
     validate_envIsInit(__func__);
     validate_numPauliChars(paulis, __func__);
 
@@ -163,7 +163,7 @@ void setReportedPauliChars(const char* paulis) {
 }
 
 
-void setReportedPauliStrStyle(int flag) {
+void setQuESTReportedPauliStrStyle(int flag) {
     validate_envIsInit(__func__);
     validate_reportedPauliStrStyleFlag(flag, __func__);
 
@@ -177,7 +177,7 @@ void setReportedPauliStrStyle(int flag) {
  */
 
 
-qindex getGpuCacheSize() {
+qindex getQuESTGpuCacheSize() {
     validate_envIsInit(__func__);
 
     if (getQuESTEnv().isGpuAccelerated)
@@ -188,7 +188,7 @@ qindex getGpuCacheSize() {
 }
 
 
-void clearGpuCache() {
+void clearQuESTGpuCache() {
     validate_envIsInit(__func__);
 
     // safely do nothing if not GPU accelerated
@@ -206,19 +206,19 @@ void clearGpuCache() {
  */
 
 
-void setSeeds(vector<unsigned> seeds) {
-    setSeeds(seeds.data(), seeds.size());
+void setQuESTSeeds(vector<unsigned> seeds) {
+    setQuESTSeeds(seeds.data(), seeds.size());
 }
 
-vector<unsigned> getSeeds() {
+vector<unsigned> getQuESTSeeds() {
     validate_envIsInit(__func__);
 
     // allocate temp vector, and pedantically validate successful
     vector<unsigned> out;
-    int numSeeds = getNumSeeds();
+    int numSeeds = rand_getNumSeeds();
     auto callback = [&]() { validate_tempListAllocSucceeded(false, numSeeds, sizeof(unsigned), __func__); };
     util_tryAllocVector(out, numSeeds, callback);
 
-    getSeeds(out.data());
+    getQuESTSeeds(out.data());
     return out;
 }
diff --git a/quest/src/api/decoherence.cpp b/quest/src/api/decoherence.cpp
index d2fadf621..4e1901f25 100644
--- a/quest/src/api/decoherence.cpp
+++ b/quest/src/api/decoherence.cpp
@@ -126,7 +126,7 @@ void mixKrausMap(Qureg qureg, int* qubits, int numQubits, KrausMap map) {
     validate_krausMapIsCPTP(map, __func__); // also checks fields and is-sync
     validate_krausMapMatchesTargets(map, numQubits, __func__);
 
-    localiser_densmatr_krausMap(qureg, map, util_getVector(qubits, numQubits));
+    localiser_densmatr_krausMap(qureg, map, lists_getList64(qubits, numQubits));
 }
 
 
@@ -149,7 +149,7 @@ void mixSuperOp(Qureg qureg, int* targets, int numTargets, SuperOp superop) {
     validate_superOpDimMatchesTargs(superop, numTargets, __func__);
     validate_mixedAmpsFitInNode(qureg, 2*numTargets, __func__); // superop acts on 2x
 
-    localiser_densmatr_superoperator(qureg, superop, util_getVector(targets, numTargets));
+    localiser_densmatr_superoperator(qureg, superop, lists_getList64(targets, numTargets));
 }
 
 
diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index 541491899..c59334b55 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -48,7 +48,7 @@ using std::string;
  */
 
 
-static QuESTEnv* globalEnvPtr = nullptr;
+static QuESTEnv* global_envPtr = nullptr;
 
 
 
@@ -62,7 +62,7 @@ static QuESTEnv* globalEnvPtr = nullptr;
  */
 
 
-static bool hasEnvBeenFinalized = false;
+static bool global_hasEnvBeenFinalized = false;
 
 
 
@@ -71,12 +71,18 @@ static bool hasEnvBeenFinalized = false;
  */
 
 
-void validateAndInitCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMultithread, const char* caller) {
+void validateAndInitCustomQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuAccel, int useMultithread, const char* caller) {
 
     // ensure that we are never re-initialising QuEST (even after finalize) because
-    // this leads to undefined behaviour in distributed mode, as per the MPI
-    validate_envNeverInit(globalEnvPtr != nullptr, hasEnvBeenFinalized, caller);
-
+    // this leads to undefined behaviour in distributed mode, as per the MPI std,
+    // regardless of whether the user owns MPI
+    validate_envNeverInit(global_envPtr != nullptr, global_hasEnvBeenFinalized, caller);
+
+    // load env-vars before validating deployment mode, because some env-vars can
+    // affect validation (such as QUEST_PERMIT_NODES_TO_SHARE_GPU). Note that
+    // some env-vars (like QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK) are validated
+    // here only to have a correct format (like an int); the validity of their
+    // actual values is checked later (since it requires deciding GPU-accel).
     envvars_validateAndLoadEnvVars(caller);
     validateconfig_setEpsilonToDefault();
 
@@ -86,15 +92,19 @@ void validateAndInitCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMulti
     // by mpirun believe they are each the main rank. This seems unavoidable.
     validate_newEnvDeploymentMode(useDistrib, useGpuAccel, useMultithread, caller);
 
-    // overwrite deployments left as modeflag::USE_AUTO
+    // overwrite deployments left as modeflag::USE_AUTO (=-1) with 0 or 1 (a bool)
+    // which, crucially, resolves useDistrib, permitting its consultation below
     autodep_chooseQuESTEnvDeployment(useDistrib, useGpuAccel, useMultithread);
 
+    // ensure that current state of MPI is valid
+    validate_mpiInitStatus(useDistrib, userOwnsMpi, caller);
+
     // optionally initialise MPI; necessary before completing validation,
     // and before any GPU initialisation and validation, since we will
     // perform that specifically upon the MPI-process-bound GPU(s). Further,
     // we can make sure validation errors are reported only by the root node.
     if (useDistrib)
-        comm_init();
+        comm_init(userOwnsMpi);
 
     validate_newEnvDistributedBetweenPower2Nodes(caller);
 
@@ -124,6 +134,11 @@ void validateAndInitCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMulti
     /// should we warn here if each machine contains
     /// more GPUs than deployed MPI-processes (some GPUs idle)?
 
+    // validate the initial numTPB (from its env-var, if specified) and then apply it
+    int initNumThreadsPerBlock = envvars_getDefaultNumGpuThreadsPerBlock();
+    validate_numGpuThreadsPerBlock(initNumThreadsPerBlock, useGpuAccel, caller);
+    gpu_setNumThreadsPerBlock(initNumThreadsPerBlock);
+
     // cuQuantum is always used in GPU-accelerated envs when available
     bool useCuQuantum = useGpuAccel && gpu_isCuQuantumCompiled();
     if (useCuQuantum) {
@@ -131,26 +146,32 @@ void validateAndInitCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMulti
         gpu_initCuQuantum();
     }
 
+    // MPI GPU-awareness detection is platform specific; sometimes it is
+    // known at compile-time, other times according to env-vars
+    bool isMpiGpuAware = comm_isMpiGpuAware();
+
     // initialise RNG, used by measurements and random-state generation
     rand_setSeedsToDefault();
 
     // allocate space for the global QuESTEnv singleton (overwriting nullptr, unless malloc fails)
-    globalEnvPtr = (QuESTEnv*) malloc(sizeof(QuESTEnv));
+    global_envPtr = (QuESTEnv*) malloc(sizeof(QuESTEnv));
 
     // pedantically check that teeny tiny malloc just succeeded
-    if (globalEnvPtr == nullptr)
+    if (global_envPtr == nullptr)
         error_allocOfQuESTEnvFailed();
 
-    // bind deployment info to global instance
-    globalEnvPtr->isMultithreaded     = useMultithread;
-    globalEnvPtr->isGpuAccelerated    = useGpuAccel;
-    globalEnvPtr->isDistributed       = useDistrib;
-    globalEnvPtr->isCuQuantumEnabled  = useCuQuantum;
-    globalEnvPtr->isGpuSharingEnabled = permitGpuSharing;
+    // bind deployment info to global instance (autocasting int to bool)
+    global_envPtr->isMultithreaded     = useMultithread;
+    global_envPtr->isGpuAccelerated    = useGpuAccel;
+    global_envPtr->isDistributed       = useDistrib;
+    global_envPtr->isMpiUserOwned      = userOwnsMpi;
+    global_envPtr->isMpiGpuAware       = isMpiGpuAware;
+    global_envPtr->isCuQuantumEnabled  = useCuQuantum;
+    global_envPtr->isGpuSharingEnabled = permitGpuSharing;
 
     // bind distributed info
-    globalEnvPtr->rank     = (useDistrib)? comm_getRank()     : 0;
-    globalEnvPtr->numNodes = (useDistrib)? comm_getNumNodes() : 1;
+    global_envPtr->rank     = (useDistrib)? comm_getRank()     : 0;
+    global_envPtr->numNodes = (useDistrib)? comm_getNumNodes() : 1;
 }
 
 
@@ -187,10 +208,12 @@ void printCompilationInfo() {
 
     print_table(
         "compilation", {
-        {"isMpiCompiled",       comm_isMpiCompiled()},
-        {"isGpuCompiled",       gpu_isGpuCompiled()},
-        {"isOmpCompiled",       cpu_isOpenmpCompiled()},
-        {"isCuQuantumCompiled", gpu_isCuQuantumCompiled()},
+        {"isOmpCompiled",         cpu_isOpenmpCompiled()},
+        {"isMpiCompiled",         comm_isMpiCompiled()},
+        {"isMpiSubCommCompiled",  comm_isMpiSubCommCompiled()},
+        {"isGpuCompiled",         gpu_isGpuCompiled()},
+        {"isHipCompiled",         gpu_isHipCompiled()},
+        {"isCuQuantumCompiled",   gpu_isCuQuantumCompiled()},
     });
 }
 
@@ -199,11 +222,10 @@ void printDeploymentInfo() {
 
     print_table(
         "deployment", {
-        {"isMpiEnabled",        globalEnvPtr->isDistributed},
-        {"isGpuEnabled",        globalEnvPtr->isGpuAccelerated},
-        {"isOmpEnabled",        globalEnvPtr->isMultithreaded},
-        {"isCuQuantumEnabled",  globalEnvPtr->isCuQuantumEnabled},
-        {"isGpuSharingEnabled", globalEnvPtr->isGpuSharingEnabled},
+        {"isOmpEnabled",        global_envPtr->isMultithreaded},
+        {"isMpiEnabled",        global_envPtr->isDistributed},
+        {"isGpuEnabled",        global_envPtr->isGpuAccelerated},
+        {"isCuQuantumEnabled",  global_envPtr->isCuQuantumEnabled},
     });
 }
 
@@ -252,6 +274,7 @@ void printGpuInfo() {
         {"gpuMemory",     isGpu?  printer_getMemoryWithUnitStr(gpu_getTotalMemoryInBytes())            + pg : na},
         {"gpuMemoryFree", isGpu?  printer_getMemoryWithUnitStr(gpu_getCurrentAvailableMemoryInBytes()) + pg : na},
         {"gpuCache",      isGpu?  printer_getMemoryWithUnitStr(gpu_getCacheMemoryInBytes())            + pg : na},
+        {"numThreadsPerBlock", isGpu? printer_toStr(gpu_getNumThreadsPerBlock()) : na},
     });
 }
 
@@ -260,10 +283,16 @@ void printDistributionInfo() {
 
     using namespace printer_substrings;
 
+    bool comm = global_envPtr->isDistributed;
+    bool gpu  = global_envPtr->isGpuAccelerated;
+    bool both = comm && gpu;
+
     print_table(
         "distribution", {
-        {"isMpiGpuAware", (comm_isMpiCompiled())? printer_toStr(comm_isMpiGpuAware()) : na},
-        {"numMpiNodes",   printer_toStr(globalEnvPtr->numNodes)},
+        {"isMpiUserOwned",      comm? printer_toStr(global_envPtr->isMpiUserOwned) : na},
+        {"isMpiGpuAware",       comm? printer_toStr(global_envPtr->isMpiGpuAware ) : na},
+        {"isGpuSharingEnabled", both? printer_toStr(global_envPtr->isGpuSharingEnabled) : na},
+        {"numMpiNodes",         printer_toStr(global_envPtr->numNodes)},
     });
 }
 
@@ -273,7 +302,7 @@ void printQuregSizeLimits(bool isDensMatr) {
     using namespace printer_substrings;
 
     // for brevity
-    int numNodes = globalEnvPtr->numNodes;
+    int numNodes = global_envPtr->numNodes;
 
     // by default, CPU limits are unknown (because memory query might fail)
     string maxQbForCpu = un;
@@ -285,7 +314,7 @@ void printQuregSizeLimits(bool isDensMatr) {
         maxQbForCpu = printer_toStr(mem_getMaxNumQuregQubitsWhichCanFitInMemory(isDensMatr, 1, cpuMem));
 
         // and the max MPI sizes are only relevant when env is distributed
-        if (globalEnvPtr->isDistributed)
+        if (global_envPtr->isDistributed)
             maxQbForMpiCpu = printer_toStr(mem_getMaxNumQuregQubitsWhichCanFitInMemory(isDensMatr, numNodes, cpuMem));
 
         // when MPI irrelevant, change their status from "unknown" to "N/A"
@@ -300,12 +329,12 @@ void printQuregSizeLimits(bool isDensMatr) {
     string maxQbForMpiGpu = na;
 
     // max GPU registers only relevant if env is GPU-accelerated
-    if (globalEnvPtr->isGpuAccelerated) {
+    if (global_envPtr->isGpuAccelerated) {
         qindex gpuMem = gpu_getCurrentAvailableMemoryInBytes();
         maxQbForGpu = printer_toStr(mem_getMaxNumQuregQubitsWhichCanFitInMemory(isDensMatr, 1, gpuMem));
 
         // and the max MPI sizes are further only relevant when env is distributed 
-        if (globalEnvPtr->isDistributed)
+        if (global_envPtr->isDistributed)
             maxQbForMpiGpu = printer_toStr(mem_getMaxNumQuregQubitsWhichCanFitInMemory(isDensMatr, numNodes, gpuMem));
     }
 
@@ -342,7 +371,7 @@ void printQuregAutoDeployments(bool isDensMatr) {
 
     // test to theoretically max #qubits, surpassing max that can fit in RAM and GPUs, because
     // auto-deploy will still try to deploy there to (then subsequent validation will fail)
-    int maxQubits = mem_getMaxNumQuregQubitsBeforeGlobalMemSizeofOverflow(isDensMatr, globalEnvPtr->numNodes);
+    int maxQubits = mem_getMaxNumQuregQubitsBeforeGlobalMemSizeofOverflow(isDensMatr, global_envPtr->numNodes);
 
     for (int numQubits=1; numQubits<maxQubits; numQubits++) {
 
@@ -350,7 +379,7 @@ void printQuregAutoDeployments(bool isDensMatr) {
         useDistrib  = modeflag::USE_AUTO;
         useGpuAccel = modeflag::USE_AUTO;
         useMulti    = modeflag::USE_AUTO;;
-        autodep_chooseQuregDeployment(numQubits, isDensMatr, useDistrib, useGpuAccel, useMulti, *globalEnvPtr);
+        autodep_chooseQuregDeployment(numQubits, isDensMatr, useDistrib, useGpuAccel, useMulti, *global_envPtr);
 
         // skip if deployments are unchanged
         if (useDistrib  == prevDistrib  &&
@@ -397,19 +426,21 @@ extern "C" {
 
 void initCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMultithread) {
 
-    validateAndInitCustomQuESTEnv(useDistrib, useGpuAccel, useMultithread, __func__);
+    const bool userOwnsMpi = false;
+    validateAndInitCustomQuESTEnv(useDistrib, userOwnsMpi, useGpuAccel, useMultithread, __func__);
 }
 
 
 void initQuESTEnv() {
 
-    validateAndInitCustomQuESTEnv(modeflag::USE_AUTO, modeflag::USE_AUTO, modeflag::USE_AUTO, __func__);
+    const bool userOwnsMpi = false;
+    validateAndInitCustomQuESTEnv(modeflag::USE_AUTO, userOwnsMpi, modeflag::USE_AUTO, modeflag::USE_AUTO, __func__);
 }
 
 
 int isQuESTEnvInit() {
 
-    return (int) (globalEnvPtr != nullptr);
+    return (int) (global_envPtr != nullptr);
 }
 
 
@@ -417,7 +448,7 @@ QuESTEnv getQuESTEnv() {
     validate_envIsInit(__func__);
 
     // returns a copy, so cheeky users calling memcpy() upon const struct still won't mutate
-    return *globalEnvPtr;
+    return *global_envPtr;
 }
 
 
@@ -428,33 +459,33 @@ void finalizeQuESTEnv() {
     // calling this will not automatically
     // free the memory of existing Quregs
 
-    if (globalEnvPtr->isGpuAccelerated)
+    if (global_envPtr->isGpuAccelerated)
         gpu_clearCache(); // syncs first
 
-    if (globalEnvPtr->isGpuAccelerated && gpu_isCuQuantumCompiled())
+    if (global_envPtr->isGpuAccelerated && gpu_isCuQuantumCompiled())
         gpu_finalizeCuQuantum();
 
-    if (globalEnvPtr->isDistributed) {
+    if (global_envPtr->isDistributed) {
         comm_sync();
         comm_end();
     }
 
     // free global env's heap memory and flag it as unallocated
-    free(globalEnvPtr);
-    globalEnvPtr = nullptr;
+    free(global_envPtr);
+    global_envPtr = nullptr;
 
     // flag that the environment was finalised, to ensure it is never re-initialised
-    hasEnvBeenFinalized = true;
+    global_hasEnvBeenFinalized = true;
 }
 
 
 void syncQuESTEnv() {
     validate_envIsInit(__func__);
 
-    if (globalEnvPtr->isGpuAccelerated)
+    if (global_envPtr->isGpuAccelerated)
         gpu_sync();
 
-    if (globalEnvPtr->isDistributed)
+    if (global_envPtr->isDistributed)
         comm_sync();
 }
 
@@ -465,6 +496,8 @@ void reportQuESTEnv() {
 
     /// @todo add function to write this output to file (useful for HPC debugging)
 
+    printer_sync();
+
     print_label("QuEST execution environment");
 
     bool statevec = false;
@@ -486,24 +519,25 @@ void reportQuESTEnv() {
 
     // exclude mandatory newline above
     print_oneFewerNewlines();
+
+    printer_sync();
 }
 
 
-void getEnvironmentString(char str[200]) {
+void getQuESTEnvironmentString(char str[200]) {
     validate_envIsInit(__func__);
 
-    QuESTEnv env = getQuESTEnv();
-
     int numThreads = cpu_isOpenmpCompiled()? cpu_getAvailableNumThreads() : 1;
-    int cuQuantum = env.isGpuAccelerated && gpu_isCuQuantumCompiled();
-    int gpuDirect = env.isGpuAccelerated && gpu_isDirectGpuCommPossible();
-
-    snprintf(str, 200, "CUDA=%d OpenMP=%d MPI=%d threads=%d ranks=%d cuQuantum=%d gpuDirect=%d",
-        env.isGpuAccelerated,
-        env.isMultithreaded,
-        env.isDistributed,
+    int cuQuantum = global_envPtr->isGpuAccelerated && gpu_isCuQuantumCompiled();
+    int gpuDirect = global_envPtr->isGpuAccelerated && gpu_isDirectGpuCommPossible();
+
+    snprintf(str, 200, "CUDA=%d OpenMP=%d MPI=%d userOwnsMPI=%d threads=%d ranks=%d cuQuantum=%d gpuDirect=%d",
+        global_envPtr->isGpuAccelerated,
+        global_envPtr->isMultithreaded,
+        global_envPtr->isDistributed,
+        global_envPtr->isMpiUserOwned,
         numThreads,
-        env.numNodes,
+        global_envPtr->numNodes,
         cuQuantum,
         gpuDirect);
 }
diff --git a/quest/src/api/experimental.cpp b/quest/src/api/experimental.cpp
new file mode 100644
index 000000000..a6f883656
--- /dev/null
+++ b/quest/src/api/experimental.cpp
@@ -0,0 +1,107 @@
+/** @file
+ * Experimental functions which are liable to
+ * API breaks within QuEST minor version releases.
+ * Some optional functions require compiling this
+ * file against MPI, despite being outside of /comm/, 
+ * and so require opt-in macros (QUEST_COMPILE_SUBCOMM)
+ * 
+ * @author Oliver Brown
+ */
+
+#include "quest/include/config.h"
+#include "quest/include/environment.h"
+
+#include "quest/src/core/validation.hpp"
+#include "quest/src/comm/comm_config.hpp"
+#include "quest/src/gpu/gpu_config.hpp"
+
+#if QUEST_COMPILE_SUBCOMM && ! QUEST_COMPILE_MPI
+    #error "Macro QUEST_COMPILE_SUBCOMM was true, but QUEST_COMPILE_MPI was illegally false."
+#endif
+
+#if QUEST_COMPILE_SUBCOMM
+    #include <mpi.h>
+#endif
+
+
+
+/*
+ * EXTERNAL FUNCTIONS
+ *
+ * which we here regretfully 'extern' because we are either
+ * unsure which header should expose them, or because they
+ * contain deployment-specific types (like MPI_Comm) which
+ * we do not wish to expose within internal headers 
+ */
+
+
+extern void validateAndInitCustomQuESTEnv(
+    int useDistrib, bool userOwnsMpi, int useGpuAccel, int useMultithread, const char* caller);
+
+
+#if QUEST_COMPILE_SUBCOMM // hide MPI_Comm
+    extern bool comm_setMpiComm(MPI_Comm newComm, bool userOwnsMpi);
+#endif
+
+
+
+/*
+ * API FUNCTIONS
+ */
+
+
+// enable invocation by both C and C++ binaries
+extern "C" {
+
+
+void initCustomMpiQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuAccel, int useMultithread) {
+    validateAndInitCustomQuESTEnv(useDistrib, userOwnsMpi, useGpuAccel, useMultithread, __func__);
+}
+
+
+#if QUEST_COMPILE_SUBCOMM // hide MPI_Comm
+ 
+void initCustomMpiCommQuESTEnv(MPI_Comm userQuestComm, int useGpuAccel, int useMultithread) {
+
+    // useDistrib and userOwnsMpi are implied by the use of this initialiser
+    const int useDistrib = 1;
+    const bool userOwnsMpi = true;
+
+    // pre-validate that we are able to set the MPI communicator
+    validate_mpiInitStatus(useDistrib, userOwnsMpi, __func__);
+    validate_mpiSubCommIsNonNull(userQuestComm != MPI_COMM_NULL, __func__);
+
+    // avoid re-setting the MPI comm (to avoid an internal error), which happens
+    // if a user illegally re-calls this function, which will be subsequently
+    // caught by the validation in validateAndInitCustomQuESTEnv() below
+    if (!comm_isActive()) {
+        bool success = comm_setMpiComm(userQuestComm, userOwnsMpi);
+        validate_mpiSubCommSetSucceeded(success, __func__);
+    }
+
+    // perform remaining validation (some is harmlessly repeated) and init QuEST env
+    validateAndInitCustomQuESTEnv(useDistrib, userOwnsMpi, useGpuAccel, useMultithread, __func__);
+}
+#endif // QUEST_COMPILE_SUBCOMM
+
+
+int getQuESTNumGpuThreadsPerBlock() {
+    validate_envIsInit(__func__);
+    
+    return gpu_getNumThreadsPerBlock();
+}
+
+
+void setQuESTNumGpuThreadsPerBlock(int numTPB) {
+    validate_envIsInit(__func__);
+
+    // validation messages and queries depend upon GPU usage
+    bool gpuIsActive = getQuESTEnv().isGpuAccelerated;
+    validate_numGpuThreadsPerBlock(numTPB, gpuIsActive, __func__);
+
+    gpu_setNumThreadsPerBlock(numTPB);
+}
+
+
+// end de-mangler
+}
diff --git a/quest/src/api/initialisations.cpp b/quest/src/api/initialisations.cpp
index aba838e0f..36f63910c 100644
--- a/quest/src/api/initialisations.cpp
+++ b/quest/src/api/initialisations.cpp
@@ -13,6 +13,7 @@
 
 #include "quest/src/core/validation.hpp"
 #include "quest/src/core/localiser.hpp"
+#include "quest/src/core/lists.hpp"
 #include "quest/src/core/utilities.hpp"
 #include "quest/src/core/bitwise.hpp"
 #include "quest/src/gpu/gpu_config.hpp"
@@ -220,7 +221,7 @@ void setQuregToPartialTrace(Qureg out, Qureg in, int* traceOutQubits, int numTra
     validate_targets(in, traceOutQubits, numTraceQubits, __func__);
     validate_quregCanBeSetToReducedDensMatr(out, in, numTraceQubits, __func__);
 
-    auto targets = util_getVector(traceOutQubits, numTraceQubits);
+    auto targets = lists_getList64(traceOutQubits, numTraceQubits);
     localiser_densmatr_partialTrace(in, out, targets);
 }
 
@@ -233,7 +234,7 @@ void setQuregToReducedDensityMatrix(Qureg out, Qureg in, int* retainQubits, int
     validate_targets(in, retainQubits, numRetainQubits, __func__);
     validate_quregCanBeSetToReducedDensMatr(out, in, in.numQubits - numRetainQubits, __func__);
 
-    auto traceQubits = util_getNonTargetedQubits(retainQubits, numRetainQubits, in.numQubits);
+    auto traceQubits = util_getNonTargetedQubits(lists_getList64(retainQubits, numRetainQubits), in.numQubits);
     localiser_densmatr_partialTrace(in, out, traceQubits);
 }
 
diff --git a/quest/src/api/matrices.cpp b/quest/src/api/matrices.cpp
index b17987eb4..07e37025c 100644
--- a/quest/src/api/matrices.cpp
+++ b/quest/src/api/matrices.cpp
@@ -165,7 +165,7 @@ void freeAllMemoryIfAnyAllocsFailed(T matr) {
 
     // ascertain whether any allocs failed on any node
     bool anyFail = didAnyLocalAllocsFail(matr);
-    if (comm_isInit())
+    if (comm_isActive())
         anyFail = comm_isTrueOnAllNodes(anyFail);
 
     // if so, free all heap fields
@@ -763,11 +763,16 @@ void validateAndPrintMatrix(T matr, const char* caller) {
         structMem -= elemMem;
 
     size_t numBytesPerNode = elemMem + structMem;
+
+    printer_sync();
+    
     print_header(matr, numBytesPerNode);
     print_elems(matr);
 
     // exclude mandatory newline above
     print_oneFewerNewlines();
+    
+    printer_sync();
 }
 
 
diff --git a/quest/src/api/multiplication.cpp b/quest/src/api/multiplication.cpp
index 9761735a5..a4b72e6da 100644
--- a/quest/src/api/multiplication.cpp
+++ b/quest/src/api/multiplication.cpp
@@ -12,6 +12,7 @@
 #include "quest/include/multiplication.h"
 
 #include "quest/src/core/validation.hpp"
+#include "quest/src/core/lists.hpp"
 #include "quest/src/core/utilities.hpp"
 #include "quest/src/core/localiser.hpp"
 #include "quest/src/core/paulilogic.hpp"
@@ -22,6 +23,14 @@ using std::vector;
 
 
 
+// The multiplication API doesn't accept control qubits (which have
+// little relevance to non-unitaries), so it passes this empty list as
+// both ctrls and ctrlStates to most internal functions. Kept file-local
+// (static) and immutable so the generic name cannot clash at link time.
+static const List64 none = lists_getEmptyList64();
+
+
+
 /*
  * CompMatr1
  */
@@ -35,7 +44,7 @@ void leftapplyCompMatr1(Qureg qureg, int target, CompMatr1 matrix) {
 
     bool conj = false;
     bool transp = false;
-    localiser_statevec_anyCtrlOneTargDenseMatr(qureg, {}, {}, target, matrix, conj, transp);
+    localiser_statevec_anyCtrlOneTargDenseMatr(qureg, none, none, target, matrix, conj, transp);
 }
 
 void rightapplyCompMatr1(Qureg qureg, int target, CompMatr1 matrix) {
@@ -48,7 +57,7 @@ void rightapplyCompMatr1(Qureg qureg, int target, CompMatr1 matrix) {
     bool conj = false;
     bool transp = true;
     int qubit = util_getBraQubit(target, qureg);
-    localiser_statevec_anyCtrlOneTargDenseMatr(qureg, {}, {}, qubit, matrix, conj, transp);
+    localiser_statevec_anyCtrlOneTargDenseMatr(qureg, none, none, qubit, matrix, conj, transp);
 }
 
 } // end de-mangler
@@ -69,7 +78,7 @@ void leftapplyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matrix)
 
     bool conj = false;
     bool transp = false;
-    localiser_statevec_anyCtrlTwoTargDenseMatr(qureg, {}, {}, target1, target2, matrix, conj, transp);
+    localiser_statevec_anyCtrlTwoTargDenseMatr(qureg, none, none, target1, target2, matrix, conj, transp);
 }
 
 void rightapplyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matrix) {
@@ -84,7 +93,7 @@ void rightapplyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matrix
     bool transp = true;
     int qubit1 = util_getBraQubit(target1, qureg);
     int qubit2 = util_getBraQubit(target2, qureg);
-    localiser_statevec_anyCtrlTwoTargDenseMatr(qureg, {}, {}, qubit1, qubit2, matrix, conj, transp);
+    localiser_statevec_anyCtrlTwoTargDenseMatr(qureg, none, none, qubit1, qubit2, matrix, conj, transp);
 }
 
 } // end de-mangler
@@ -105,7 +114,7 @@ void leftapplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matri
 
     bool conj = false;
     bool transp = false;
-    localiser_statevec_anyCtrlAnyTargDenseMatr(qureg, {}, {}, util_getVector(targets, numTargets), matrix, conj, transp);
+    localiser_statevec_anyCtrlAnyTargDenseMatr(qureg, none, none, lists_getList64(targets, numTargets), matrix, conj, transp);
 }
 
 void rightapplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matrix) {
@@ -118,8 +127,8 @@ void rightapplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matr
     // rho matrix ~ transpose(rho) (x) I ||rho>>
     bool conj = false;
     bool transp = true;
-    auto qubits = util_getBraQubits(util_getVector(targets, numTargets), qureg);
-    localiser_statevec_anyCtrlAnyTargDenseMatr(qureg, {}, {}, qubits, matrix, conj, transp);
+    auto qubits = util_getBraQubits(lists_getList64(targets, numTargets), qureg);
+    localiser_statevec_anyCtrlAnyTargDenseMatr(qureg, none, none, qubits, matrix, conj, transp);
 }
 
 } // end de-mangler
@@ -148,7 +157,7 @@ void leftapplyDiagMatr1(Qureg qureg, int target, DiagMatr1 matrix) {
     validate_matrixFields(matrix, __func__);
 
     bool conj = false;
-    localiser_statevec_anyCtrlOneTargDiagMatr(qureg, {}, {}, target, matrix, conj);
+    localiser_statevec_anyCtrlOneTargDiagMatr(qureg, none, none, target, matrix, conj);
 }
 
 void rightapplyDiagMatr1(Qureg qureg, int target, DiagMatr1 matrix) {
@@ -159,7 +168,7 @@ void rightapplyDiagMatr1(Qureg qureg, int target, DiagMatr1 matrix) {
 
     bool conj = false;
     int qubit = util_getBraQubit(target, qureg);
-    localiser_statevec_anyCtrlOneTargDiagMatr(qureg, {}, {}, qubit, matrix, conj);
+    localiser_statevec_anyCtrlOneTargDiagMatr(qureg, none, none, qubit, matrix, conj);
 }
 
 } // end de-mangler
@@ -178,7 +187,7 @@ void leftapplyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matrix)
     validate_matrixFields(matrix, __func__);
 
     bool conj = false;
-    localiser_statevec_anyCtrlTwoTargDiagMatr(qureg, {}, {}, target1, target2, matrix, conj);
+    localiser_statevec_anyCtrlTwoTargDiagMatr(qureg, none, none, target1, target2, matrix, conj);
 }
 
 void rightapplyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matrix) {
@@ -190,7 +199,7 @@ void rightapplyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matrix
     bool conj = false;
     int qubit1 = util_getBraQubit(target1, qureg);
     int qubit2 = util_getBraQubit(target2, qureg);
-    localiser_statevec_anyCtrlTwoTargDiagMatr(qureg, {}, {}, qubit1, qubit2, matrix, conj);
+    localiser_statevec_anyCtrlTwoTargDiagMatr(qureg, none, none, qubit1, qubit2, matrix, conj);
 }
 
 } // end de-mangler
@@ -210,8 +219,8 @@ void leftapplyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matri
 
     bool conj = false;
     qcomp exponent = 1;
-    auto qubits = util_getVector(targets, numTargets);
-    localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, {}, {}, qubits, matrix, exponent, conj);
+    auto qubits = lists_getList64(targets, numTargets);
+    localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, none, none, qubits, matrix, exponent, conj);
 }
 
 void rightapplyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matrix) {
@@ -222,8 +231,8 @@ void rightapplyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matr
 
     bool conj = false;
     qcomp exponent = 1;
-    auto qubits = util_getBraQubits(util_getVector(targets, numTargets), qureg);
-    localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, {}, {}, qubits, matrix, exponent, conj);
+    auto qubits = util_getBraQubits(lists_getList64(targets, numTargets), qureg);
+    localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, none, none, qubits, matrix, exponent, conj);
 }
 
 } // end de-mangler
@@ -253,8 +262,8 @@ void leftapplyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMatr
     validate_matrixExpIsNonDiverging(matrix, exponent, __func__); // harmlessly re-validates fields and is-sync
 
     bool conj = false;
-    auto qubits = util_getVector(targets, numTargets);
-    localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, {}, {}, qubits, matrix, exponent, conj);
+    auto qubits = lists_getList64(targets, numTargets);
+    localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, none, none, qubits, matrix, exponent, conj);
 }
 
 void rightapplyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMatr matrix, qcomp exponent) {
@@ -265,8 +274,8 @@ void rightapplyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMatr
     validate_matrixExpIsNonDiverging(matrix, exponent, __func__); // harmlessly re-validates fields and is-sync
 
     bool conj = false;
-    auto qubits = util_getBraQubits(util_getVector(targets, numTargets), qureg);
-    localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, {}, {}, qubits, matrix, exponent, conj);
+    auto qubits = util_getBraQubits(lists_getList64(targets, numTargets), qureg);
+    localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, none, none, qubits, matrix, exponent, conj);
 }
 
 } // end de-mangler
@@ -350,7 +359,7 @@ void leftapplySwap(Qureg qureg, int qubit1, int qubit2) {
     validate_quregFields(qureg, __func__);
     validate_twoTargets(qureg, qubit1, qubit2, __func__);
 
-    localiser_statevec_anyCtrlSwap(qureg, {}, {}, qubit1, qubit2);
+    localiser_statevec_anyCtrlSwap(qureg, none, none, qubit1, qubit2);
 }
 
 void rightapplySwap(Qureg qureg, int qubit1, int qubit2) {
@@ -360,7 +369,7 @@ void rightapplySwap(Qureg qureg, int qubit1, int qubit2) {
 
     qubit1 = util_getBraQubit(qubit1, qureg);
     qubit2 = util_getBraQubit(qubit2, qureg);
-    localiser_statevec_anyCtrlSwap(qureg, {}, {}, qubit1, qubit2);
+    localiser_statevec_anyCtrlSwap(qureg, none, none, qubit1, qubit2);
 }
 
 } // end de-mangler
@@ -378,7 +387,7 @@ void leftapplyPauliX(Qureg qureg, int target) {
     validate_target(qureg, target, __func__);
 
     PauliStr str = getPauliStr("X", {target});
-    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str);
+    localiser_statevec_anyCtrlPauliTensor(qureg, none, none, str);
 }
 
 void leftapplyPauliY(Qureg qureg, int target) {
@@ -386,7 +395,7 @@ void leftapplyPauliY(Qureg qureg, int target) {
     validate_target(qureg, target, __func__);
 
     PauliStr str = getPauliStr("Y", {target});
-    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str);
+    localiser_statevec_anyCtrlPauliTensor(qureg, none, none, str);
 }
 
 void leftapplyPauliZ(Qureg qureg, int target) {
@@ -394,7 +403,7 @@ void leftapplyPauliZ(Qureg qureg, int target) {
     validate_target(qureg, target, __func__);
 
     PauliStr str = getPauliStr("Z", {target});
-    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str);
+    localiser_statevec_anyCtrlPauliTensor(qureg, none, none, str);
 }
 
 void rightapplyPauliX(Qureg qureg, int target) {
@@ -404,7 +413,7 @@ void rightapplyPauliX(Qureg qureg, int target) {
 
     PauliStr str = getPauliStr("X", {target});
     str = paulis_getShiftedPauliStr(str, qureg.numQubits);
-    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str);
+    localiser_statevec_anyCtrlPauliTensor(qureg, none, none, str);
 }
 
 void rightapplyPauliY(Qureg qureg, int target) {
@@ -415,7 +424,7 @@ void rightapplyPauliY(Qureg qureg, int target) {
     qcomp factor = -1; // undo transpose
     PauliStr str = getPauliStr("Y", {target});
     str = paulis_getShiftedPauliStr(str, qureg.numQubits);
-    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str, factor);
+    localiser_statevec_anyCtrlPauliTensor(qureg, none, none, str, factor);
 }
 
 void rightapplyPauliZ(Qureg qureg, int target) {
@@ -425,7 +434,7 @@ void rightapplyPauliZ(Qureg qureg, int target) {
 
     PauliStr str = getPauliStr("Z", {target});
     str = paulis_getShiftedPauliStr(str, qureg.numQubits);
-    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str);
+    localiser_statevec_anyCtrlPauliTensor(qureg, none, none, str);
 }
 
 } // end de-mangler
@@ -442,7 +451,7 @@ void leftapplyPauliStr(Qureg qureg, PauliStr str) {
     validate_quregFields(qureg, __func__);
     validate_pauliStrTargets(qureg, str, __func__);
 
-    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str);
+    localiser_statevec_anyCtrlPauliTensor(qureg, none, none, str);
 }
 
 void rightapplyPauliStr(Qureg qureg, PauliStr str) {
@@ -452,7 +461,7 @@ void rightapplyPauliStr(Qureg qureg, PauliStr str) {
 
     qcomp factor = paulis_getSignOfPauliStrConj(str); // undo transpose
     str = paulis_getShiftedPauliStr(str, qureg.numQubits);
-    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str, factor);
+    localiser_statevec_anyCtrlPauliTensor(qureg, none, none, str, factor);
 }
 
 } // end de-mangler
@@ -470,7 +479,7 @@ void leftapplyPauliGadget(Qureg qureg, PauliStr str, qreal angle) {
     validate_pauliStrTargets(qureg, str, __func__);
 
     qreal phase = util_getPhaseFromGateAngle(angle);
-    localiser_statevec_anyCtrlPauliGadget(qureg, {}, {}, str, phase);
+    localiser_statevec_anyCtrlPauliGadget(qureg, none, none, str, phase);
 }
 
 void rightapplyPauliGadget(Qureg qureg, PauliStr str, qreal angle) {
@@ -481,7 +490,7 @@ void rightapplyPauliGadget(Qureg qureg, PauliStr str, qreal angle) {
     qreal factor = paulis_getSignOfPauliStrConj(str);
     qreal phase = factor * util_getPhaseFromGateAngle(angle);
     str = paulis_getShiftedPauliStr(str, qureg.numQubits);
-    localiser_statevec_anyCtrlPauliGadget(qureg, {}, {}, str, phase);
+    localiser_statevec_anyCtrlPauliGadget(qureg, none, none, str, phase);
 }
 
 } // end de-mangler
@@ -499,8 +508,8 @@ void leftapplyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal angle
     validate_targets(qureg, targets, numTargets, __func__);
 
     qreal phase = util_getPhaseFromGateAngle(angle);
-    auto qubits = util_getVector(targets, numTargets);
-    localiser_statevec_anyCtrlPhaseGadget(qureg, {}, {}, qubits, phase);
+    auto qubits = lists_getList64(targets, numTargets);
+    localiser_statevec_anyCtrlPhaseGadget(qureg, none, none, qubits, phase);
 }
 
 void rightapplyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal angle) {
@@ -509,8 +518,8 @@ void rightapplyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal angl
     validate_targets(qureg, targets, numTargets, __func__);
 
     qreal phase = util_getPhaseFromGateAngle(angle);
-    auto qubits = util_getBraQubits(util_getVector(targets, numTargets), qureg);
-    localiser_statevec_anyCtrlPhaseGadget(qureg, {}, {}, qubits, phase);
+    auto qubits = util_getBraQubits(lists_getList64(targets, numTargets), qureg);
+    localiser_statevec_anyCtrlPhaseGadget(qureg, none, none, qubits, phase);
 }
 
 } // end de-mangler
@@ -578,7 +587,7 @@ void leftapplyQubitProjector(Qureg qureg, int qubit, int outcome) {
     validate_measurementOutcomeIsValid(outcome, __func__); 
 
     qreal prob = 1;
-    localiser_statevec_multiQubitProjector(qureg, {qubit}, {outcome}, prob);
+    localiser_statevec_multiQubitProjector(qureg, lists_getList64({qubit}), lists_getList64({outcome}), prob);
 }
 
 void leftapplyMultiQubitProjector(Qureg qureg, int* qubits, int* outcomes, int numQubits) {
@@ -587,8 +596,8 @@ void leftapplyMultiQubitProjector(Qureg qureg, int* qubits, int* outcomes, int n
     validate_measurementOutcomesAreValid(outcomes, numQubits, __func__);
 
     qreal prob = 1;
-    auto qubitVec = util_getVector(qubits, numQubits);
-    auto outcomeVec = util_getVector(outcomes, numQubits);
+    auto qubitVec = lists_getList64(qubits, numQubits);
+    auto outcomeVec = lists_getList64(outcomes, numQubits);
     localiser_statevec_multiQubitProjector(qureg, qubitVec, outcomeVec, prob);
 }
 
@@ -599,7 +608,8 @@ void rightapplyQubitProjector(Qureg qureg, int qubit, int outcome) {
     validate_measurementOutcomeIsValid(outcome, __func__); 
     
     qreal prob = 1;
-    localiser_statevec_multiQubitProjector(qureg, {util_getBraQubit(qubit,qureg)}, {outcome}, prob);
+    auto qubitList = lists_getList64({util_getBraQubit(qubit,qureg)});
+    localiser_statevec_multiQubitProjector(qureg, qubitList, lists_getList64({outcome}), prob);
 }
 
 void rightapplyMultiQubitProjector(Qureg qureg, int* qubits, int* outcomes, int numQubits) {
@@ -609,8 +619,8 @@ void rightapplyMultiQubitProjector(Qureg qureg, int* qubits, int* outcomes, int
     validate_measurementOutcomesAreValid(outcomes, numQubits, __func__);
 
     qreal prob = 1;
-    auto qubitVec = util_getBraQubits(util_getVector(qubits, numQubits), qureg);
-    auto outcomeVec = util_getVector(outcomes, numQubits);
+    auto qubitVec = util_getBraQubits(lists_getList64(qubits, numQubits), qureg);
+    auto outcomeVec = lists_getList64(outcomes, numQubits);
     localiser_statevec_multiQubitProjector(qureg, qubitVec, outcomeVec, prob);
 }
 
@@ -649,9 +659,9 @@ void leftapplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace) {
 
     // left-multiply each term in-turn, mixing into output qureg, then undo using idempotency
     for (qindex i=0; i<sum.numTerms; i++) {
-        localiser_statevec_anyCtrlPauliTensor(workspace, {}, {}, sum.strings[i]);
+        localiser_statevec_anyCtrlPauliTensor(workspace, none, none, sum.strings[i]);
         localiser_statevec_setQuregToWeightedSum(qureg, {1, sum.coeffs[i]}, {qureg, workspace});
-        localiser_statevec_anyCtrlPauliTensor(workspace, {}, {}, sum.strings[i]);
+        localiser_statevec_anyCtrlPauliTensor(workspace, none, none, sum.strings[i]);
     }
 
     // workspace -> qureg, and qureg -> sum * qureg
@@ -674,9 +684,9 @@ void rightapplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace) {
         PauliStr str =  paulis_getShiftedPauliStr(sum.strings[i], qureg.numQubits);
         qcomp factor = paulis_getSignOfPauliStrConj(str); // undoes transpose
 
-        localiser_statevec_anyCtrlPauliTensor(workspace, {}, {}, str, factor);
+        localiser_statevec_anyCtrlPauliTensor(workspace, none, none, str, factor);
         localiser_statevec_setQuregToWeightedSum(qureg, {1, sum.coeffs[i]}, {qureg, workspace});
-        localiser_statevec_anyCtrlPauliTensor(workspace, {}, {}, str, factor);
+        localiser_statevec_anyCtrlPauliTensor(workspace, none, none, str, factor);
     }
 
     // workspace -> qureg, and qureg -> sum * qureg
diff --git a/quest/src/api/operations.cpp b/quest/src/api/operations.cpp
index c15f9f1bd..15574b281 100644
--- a/quest/src/api/operations.cpp
+++ b/quest/src/api/operations.cpp
@@ -42,20 +42,20 @@ void validateAndApplyAnyCtrlAnyTargUnitaryMatrix(Qureg qureg, int* ctrls, int* s
     if (util_isDenseMatrixType<T>())
         validate_mixedAmpsFitInNode(qureg, numTargs, caller);
 
-    auto ctrlVec  = util_getVector(ctrls,  numCtrls);
-    auto stateVec = util_getVector(states, numCtrls);
-    auto targVec  = util_getVector(targs,  numTargs);
+    List64 ctrlList  = lists_getList64(ctrls,  numCtrls);
+    List64 stateList = util_getList64OrAllOnes(states, numCtrls);
+    List64 targList  = lists_getList64(targs,  numTargs);
 
     bool conj = false;
-    localiser_statevec_anyCtrlAnyTargAnyMatr(qureg, ctrlVec, stateVec, targVec, matr, conj);
+    localiser_statevec_anyCtrlAnyTargAnyMatr(qureg, ctrlList, stateList, targList, matr, conj);
 
     if (!qureg.isDensityMatrix)
         return;
 
     conj = true;
-    ctrlVec = util_getBraQubits(ctrlVec, qureg);
-    targVec = util_getBraQubits(targVec, qureg);
-    localiser_statevec_anyCtrlAnyTargAnyMatr(qureg, ctrlVec, stateVec, targVec, matr, conj);
+    ctrlList = util_getBraQubits(ctrlList, qureg);
+    targList = util_getBraQubits(targList, qureg);
+    localiser_statevec_anyCtrlAnyTargAnyMatr(qureg, ctrlList, stateList, targList, matr, conj);
 
     /// @todo
     /// the above logic always performs two in-turn operations upon density matrices, 
@@ -144,7 +144,7 @@ void applyMultiControlledCompMatr2(Qureg qureg, vector<int> controls, int target
     applyMultiControlledCompMatr2(qureg, controls.data(), controls.size(), target1, target2, matr);
 }
 
-void applyMultiStateControlledCompMatr2(Qureg qureg, vector<int> controls, vector<int> states, int numControls, int target1, int target2, CompMatr2 matr) {
+void applyMultiStateControlledCompMatr2(Qureg qureg, vector<int> controls, vector<int> states, int target1, int target2, CompMatr2 matr) {
     validate_controlsMatchStates(controls.size(), states.size(), __func__);
 
     applyMultiStateControlledCompMatr2(qureg, controls.data(), states.data(), controls.size(), target1, target2, matr);
@@ -410,18 +410,18 @@ void applyMultiStateControlledDiagMatrPower(Qureg qureg, int* controls, int* sta
     // when numerical validation is disabled without a separate func.
 
     bool conj = false;
-    auto ctrlVec = util_getVector(controls, numControls);
-    auto stateVec = util_getVector(states,  numControls); // empty if states==nullptr
-    auto targVec = util_getVector(targets,  numTargets);
-    localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, ctrlVec, stateVec, targVec, matrix, exponent, conj);
+    auto ctrlList = lists_getList64(controls, numControls);
+    auto stateList = util_getList64OrAllOnes(states, numControls);
+    auto targList = lists_getList64(targets,  numTargets);
+    localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, ctrlList, stateList, targList, matrix, exponent, conj);
 
     if (!qureg.isDensityMatrix)
         return;
 
     conj = true;
-    ctrlVec = util_getBraQubits(ctrlVec, qureg);
-    targVec = util_getBraQubits(targVec, qureg);
-    localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, ctrlVec, stateVec, targVec, matrix, exponent, conj);
+    ctrlList = util_getBraQubits(ctrlList, qureg);
+    targList = util_getBraQubits(targList, qureg);
+    localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, ctrlList, stateList, targList, matrix, exponent, conj);
 }
 
 } // end de-mangler
@@ -518,7 +518,7 @@ void applyMultiControlledS(Qureg qureg, int* controls, int numControls, int targ
 
 void applyMultiStateControlledS(Qureg qureg, int* controls, int* states, int numControls, int target) {
 
-    DiagMatr1 matr = getDiagMatr1({1, 1_i});
+    static const DiagMatr1 matr = getDiagMatr1({1, 1_i}); // diag(1, i); function-local static, so constructed once and reused
     validateAndApplyAnyCtrlAnyTargUnitaryMatrix(qureg, controls, states, numControls, &target, 1, matr, __func__);
 }
 
@@ -569,7 +569,7 @@ void applyMultiControlledT(Qureg qureg, int* controls, int numControls, int targ
 
 void applyMultiStateControlledT(Qureg qureg, int* controls, int* states, int numControls, int target) {
 
-    DiagMatr1 matr = getDiagMatr1({1, 1/std::sqrt(2) + 1_i/std::sqrt(2)});
+    static const DiagMatr1 matr = getDiagMatr1({1, (1 + 1_i)/std::sqrt(2)}); // diag(1, (1+i)/sqrt(2)) = diag(1, e^(i pi/4)); constructed once
     validateAndApplyAnyCtrlAnyTargUnitaryMatrix(qureg, controls, states, numControls, &target, 1, matr, __func__);
 }
 
@@ -620,11 +620,11 @@ void applyMultiControlledHadamard(Qureg qureg, int* controls, int numControls, i
 
 void applyMultiStateControlledHadamard(Qureg qureg, int* controls, int* states, int numControls, int target) {
 
-    qcomp a = 1/std::sqrt(2);
-    CompMatr1 matr = getCompMatr1({
-        {a, a}, 
-        {a,-a}});
-
+    static const qcomp a = 1 / std::sqrt(2); // magnitude of every matrix entry below
+    static const CompMatr1 matr = getCompMatr1({ // function-local static: constructed once and reused
+        {a,  a}, 
+        {a, -a}
+    });
     validateAndApplyAnyCtrlAnyTargUnitaryMatrix(qureg, controls, states, numControls, &target, 1, matr, __func__);
 }
 
@@ -678,17 +678,17 @@ void applyMultiStateControlledSwap(Qureg qureg, int* controls, int* states, int
     validate_controlsAndTwoTargets(qureg, controls, numControls, qubit1, qubit2, __func__);
     validate_controlStates(states, numControls, __func__); // permits states==nullptr
 
-    auto ctrlVec = util_getVector(controls, numControls);
-    auto stateVec = util_getVector(states, numControls); // empty if states==nullptr
-    localiser_statevec_anyCtrlSwap(qureg, ctrlVec, stateVec, qubit1, qubit2);
+    auto ctrlList = lists_getList64(controls, numControls);
+    auto stateList = util_getList64OrAllOnes(states, numControls);
+    localiser_statevec_anyCtrlSwap(qureg, ctrlList, stateList, qubit1, qubit2);
 
     if (!qureg.isDensityMatrix)
         return;
 
-    ctrlVec = util_getBraQubits(ctrlVec, qureg);
+    ctrlList = util_getBraQubits(ctrlList, qureg);
     qubit1 = util_getBraQubit(qubit1, qureg);
     qubit2 = util_getBraQubit(qubit2, qureg);
-    localiser_statevec_anyCtrlSwap(qureg, ctrlVec, stateVec, qubit1, qubit2);
+    localiser_statevec_anyCtrlSwap(qureg, ctrlList, stateList, qubit1, qubit2);
 }
 
 } // end de-mangler
@@ -749,7 +749,7 @@ void applyMultiStateControlledSqrtSwap(Qureg qureg, int* controls, int* states,
 
     validate_mixedAmpsFitInNode(qureg, 2, __func__); // to throw SqrtSwap error, not generic CompMatr2 error
 
-    CompMatr2 matr = getCompMatr2({
+    static const CompMatr2 matr = getCompMatr2({
         {1, 0, 0, 0},
         {0, .5+.5_i, .5-.5_i, 0},
         {0, .5-.5_i, .5+.5_i, 0},
@@ -869,7 +869,7 @@ void applyMultiStateControlledPauliX(Qureg qureg, int* controls, int* states, in
     /// since it avoids all superfluous flops; check worthwhile for multi-qubit
 
     // harmlessly re-validates, including hardcoded matrix unitarity
-    CompMatr1 matrix = util_getPauliX();
+    static const CompMatr1 matrix = util_getPauliX();
     validateAndApplyAnyCtrlAnyTargUnitaryMatrix(qureg, controls, states, numControls, &target, 1, matrix, __func__);
 }
 
@@ -879,7 +879,7 @@ void applyMultiStateControlledPauliY(Qureg qureg, int* controls, int* states, in
     validate_controlStates(states, numControls, __func__); // permits states==nullptr
 
     // harmlessly re-validates, including hardcoded matrix unitarity
-    CompMatr1 matrix = util_getPauliY();
+    static const CompMatr1 matrix = util_getPauliY();
     validateAndApplyAnyCtrlAnyTargUnitaryMatrix(qureg, controls, states, numControls, &target, 1, matrix, __func__);
 }
 
@@ -889,7 +889,7 @@ void applyMultiStateControlledPauliZ(Qureg qureg, int* controls, int* states, in
     validate_controlStates(states, numControls, __func__); // permits states==nullptr
 
     // harmlessly re-validates, including hardcoded matrix unitarity
-    DiagMatr1 matrix = util_getPauliZ();
+    static const DiagMatr1 matrix = util_getPauliZ();
     validateAndApplyAnyCtrlAnyTargUnitaryMatrix(qureg, controls, states, numControls, &target, 1, matrix, __func__);
 }
 
@@ -966,27 +966,27 @@ void applyMultiStateControlledPauliStr(Qureg qureg, int* controls, int* states,
     validate_controlStates(states, numControls, __func__); // permits states==nullptr
 
     qcomp factor = 1;
-    auto ctrlVec = util_getVector(controls, numControls);
-    auto stateVec = util_getVector(states, numControls); // empty if states==nullptr
+    auto ctrlList = lists_getList64(controls, numControls);
+    auto stateList = util_getList64OrAllOnes(states, numControls);
 
     // when there are no control qubits, we can merge the density matrix's 
     // operation sinto a single tensor, i.e. +- (shift(str) (x) str), to 
     // avoid superfluous re-enumeration of the state
     if (qureg.isDensityMatrix && numControls == 0) {
         factor = paulis_getSignOfPauliStrConj(str);
-        ctrlVec = util_getConcatenated(ctrlVec, util_getBraQubits(ctrlVec, qureg));
-        stateVec = util_getConcatenated(stateVec, stateVec); 
+        ctrlList = util_getConcatenated(ctrlList, util_getBraQubits(ctrlList, qureg));
+        stateList = util_getConcatenated(stateList, stateList); 
         str = paulis_getKetAndBraPauliStr(str, qureg);
     }
 
-    localiser_statevec_anyCtrlPauliTensor(qureg, ctrlVec, stateVec, str, factor);
+    localiser_statevec_anyCtrlPauliTensor(qureg, ctrlList, stateList, str, factor);
 
     // but density-matrix control qubits require two distinct operations
     if (qureg.isDensityMatrix && numControls > 0) {
         factor = paulis_getSignOfPauliStrConj(str);
-        ctrlVec = util_getBraQubits(ctrlVec, qureg);
+        ctrlList = util_getBraQubits(ctrlList, qureg);
         str = paulis_getShiftedPauliStr(str, qureg.numQubits);
-        localiser_statevec_anyCtrlPauliTensor(qureg, ctrlVec, stateVec, str, factor);
+        localiser_statevec_anyCtrlPauliTensor(qureg, ctrlList, stateList, str, factor);
     }
 }
 
@@ -1250,7 +1250,8 @@ void applyNonUnitaryPauliGadget(Qureg qureg, PauliStr str, qcomp angle) {
     validate_pauliStrTargets(qureg, str, __func__);
 
     qcomp phase = util_getPhaseFromGateAngle(angle);
-    localiser_statevec_anyCtrlPauliGadget(qureg, {}, {}, str, phase);
+    auto none = lists_getEmptyList64();
+    localiser_statevec_anyCtrlPauliGadget(qureg, none, none, str, phase);
 
     if (!qureg.isDensityMatrix)
         return;
@@ -1258,7 +1259,7 @@ void applyNonUnitaryPauliGadget(Qureg qureg, PauliStr str, qcomp angle) {
     // conj(e^i(a)P) = e^(-i s conj(a) P)
     phase = - std::conj(phase) * paulis_getSignOfPauliStrConj(str);
     str = paulis_getShiftedPauliStr(str, qureg.numQubits);
-    localiser_statevec_anyCtrlPauliGadget(qureg, {}, {}, str, phase);
+    localiser_statevec_anyCtrlPauliGadget(qureg, none, none, str, phase);
 }
 
 void applyControlledPauliGadget(Qureg qureg, int control, PauliStr str, qreal angle) {
@@ -1291,18 +1292,18 @@ void applyMultiStateControlledPauliGadget(Qureg qureg, int* controls, int* state
     // which is sufficiently efficient using the existing gadget backend function
 
     qreal phase = util_getPhaseFromGateAngle(angle);
-    auto ctrlVec = util_getVector(controls, numControls);
-    auto stateVec = util_getVector(states, numControls); // empty if states==nullptr
-    localiser_statevec_anyCtrlPauliGadget(qureg, ctrlVec, stateVec, str, phase);
+    auto ctrlList = lists_getList64(controls, numControls);
+    auto stateList = util_getList64OrAllOnes(states, numControls);
+    localiser_statevec_anyCtrlPauliGadget(qureg, ctrlList, stateList, str, phase);
 
     if (!qureg.isDensityMatrix)
         return;
 
     // conj(e^(i a P)) = e^(-i s a P)
     phase *= - paulis_getSignOfPauliStrConj(str);
-    ctrlVec = util_getBraQubits(ctrlVec, qureg);
+    ctrlList = util_getBraQubits(ctrlList, qureg);
     str = paulis_getShiftedPauliStr(str, qureg.numQubits);
-    localiser_statevec_anyCtrlPauliGadget(qureg, ctrlVec, stateVec, str, phase);
+    localiser_statevec_anyCtrlPauliGadget(qureg, ctrlList, stateList, str, phase);
 }
 
 } // end de-mangler
@@ -1356,18 +1357,18 @@ void applyMultiStateControlledPhaseGadget(Qureg qureg, int* controls, int* state
     validate_controlStates(states, numControls, __func__);
 
     qreal phase = util_getPhaseFromGateAngle(angle);
-    auto ctrlVec = util_getVector(controls, numControls);
-    auto targVec = util_getVector(targets,  numTargets);
-    auto stateVec = util_getVector(states,  numControls); // empty if states==nullptr
-    localiser_statevec_anyCtrlPhaseGadget(qureg, ctrlVec, stateVec, targVec, phase);
+    auto ctrlList = lists_getList64(controls, numControls);
+    auto stateList = util_getList64OrAllOnes(states, numControls);
+    auto targList = lists_getList64(targets,  numTargets);
+    localiser_statevec_anyCtrlPhaseGadget(qureg, ctrlList, stateList, targList, phase);
 
     if (!qureg.isDensityMatrix)
         return;
 
     phase *= -1;
-    ctrlVec = util_getBraQubits(ctrlVec, qureg);
-    targVec = util_getBraQubits(targVec, qureg);
-    localiser_statevec_anyCtrlPhaseGadget(qureg, ctrlVec, stateVec, targVec, phase);
+    ctrlList = util_getBraQubits(ctrlList, qureg);
+    targList = util_getBraQubits(targList, qureg);
+    localiser_statevec_anyCtrlPhaseGadget(qureg, ctrlList, stateList, targList, phase);
 }
 
 } // end de-mangler
@@ -1423,7 +1424,8 @@ void applyMultiQubitPhaseShift(Qureg qureg, int* targets, int numTargets, qreal
     validate_targets(qureg, targets, numTargets, __func__);
 
     // treat as a (numTargets-1)-controlled 1-target diagonal matrix
-    DiagMatr1 matr = getDiagMatr1({1, std::exp(1_i * angle)});
+    static DiagMatr1 matr = getDiagMatr1({1, /*un-init*/ 0});
+    matr.elems[1] = std::exp(1_i * angle); // micro-optimisation
 
     // harmlessly re-validates
     applyMultiStateControlledDiagMatr1(qureg, &targets[1], nullptr, numTargets-1, targets[0], matr);
@@ -1466,7 +1468,7 @@ void applyMultiQubitPhaseFlip(Qureg qureg, int* targets, int numTargets) {
     validate_targets(qureg, targets, numTargets, __func__);
 
     // treat as a (numTargets-1)-controlled 1-target Pauli Z
-    DiagMatr1 matr = getDiagMatr1({1, -1});
+    static const DiagMatr1 matr = getDiagMatr1({1, -1});
 
     // harmlessly re-validates
     applyMultiStateControlledDiagMatr1(qureg, &targets[1], nullptr, numTargets-1, targets[0], matr);
@@ -1561,10 +1563,13 @@ void applyQubitProjector(Qureg qureg, int target, int outcome) {
     
     qreal prob = 1;
 
+    auto targList    = lists_getList64({target});
+    auto outcomeList = lists_getList64({outcome});
+
     // density matrix has an optimised func in lieu of calling the statevector func twice
     (qureg.isDensityMatrix)?
-        localiser_densmatr_multiQubitProjector(qureg, {target}, {outcome}, prob):
-        localiser_statevec_multiQubitProjector(qureg, {target}, {outcome}, prob);
+        localiser_densmatr_multiQubitProjector(qureg, targList, outcomeList, prob):
+        localiser_statevec_multiQubitProjector(qureg, targList, outcomeList, prob);
 }
 
 void applyMultiQubitProjector(Qureg qureg, int* qubits, int* outcomes, int numQubits) {
@@ -1573,13 +1578,13 @@ void applyMultiQubitProjector(Qureg qureg, int* qubits, int* outcomes, int numQu
     validate_measurementOutcomesAreValid(outcomes, numQubits, __func__);
 
     qreal prob = 1;
-    auto qubitVec = util_getVector(qubits, numQubits);
-    auto outcomeVec = util_getVector(outcomes, numQubits);
+    auto qubitList = lists_getList64(qubits, numQubits);
+    auto outcomeList = lists_getList64(outcomes, numQubits);
 
     // density matrix has an optimised func in lieu of calling the statevector func twice
     (qureg.isDensityMatrix)?
-        localiser_densmatr_multiQubitProjector(qureg, qubitVec, outcomeVec, prob):
-        localiser_statevec_multiQubitProjector(qureg, qubitVec, outcomeVec, prob);
+        localiser_densmatr_multiQubitProjector(qureg, qubitList, outcomeList, prob):
+        localiser_statevec_multiQubitProjector(qureg, qubitList, outcomeList, prob);
 }
 
 } // end de-mangler
@@ -1623,10 +1628,13 @@ int applyQubitMeasurementAndGetProb(Qureg qureg, int target, qreal* probability)
     int outcome = rand_getRandomSingleQubitOutcome(probs[0]);
     *probability = probs[outcome];
 
+    auto targList    = lists_getList64({target});
+    auto outcomeList = lists_getList64({outcome});
+
     // collapse to the outcome
     (qureg.isDensityMatrix)?
-        localiser_densmatr_multiQubitProjector(qureg, {target}, {outcome}, *probability):
-        localiser_statevec_multiQubitProjector(qureg, {target}, {outcome}, *probability);
+        localiser_densmatr_multiQubitProjector(qureg, targList, outcomeList, *probability):
+        localiser_statevec_multiQubitProjector(qureg, targList, outcomeList, *probability);
 
     return outcome;
 }
@@ -1642,10 +1650,13 @@ qreal applyForcedQubitMeasurement(Qureg qureg, int target, int outcome) {
     qreal prob = calcProbOfQubitOutcome(qureg, target, outcome); // harmlessly re-validates
     validate_measurementOutcomeProbNotZero(outcome, prob, __func__);
 
+    auto targList    = lists_getList64({target});
+    auto outcomeList = lists_getList64({outcome});
+
     // project to the outcome, renormalising the surviving states
     (qureg.isDensityMatrix)?
-        localiser_densmatr_multiQubitProjector(qureg, {target}, {outcome}, prob):
-        localiser_statevec_multiQubitProjector(qureg, {target}, {outcome}, prob);
+        localiser_densmatr_multiQubitProjector(qureg, targList, outcomeList, prob):
+        localiser_statevec_multiQubitProjector(qureg, targList, outcomeList, prob);
 
     return prob;
 }
@@ -1683,14 +1694,14 @@ qindex applyMultiQubitMeasurementAndGetProb(Qureg qureg, int* qubits, int numQub
     *probability = probs[outcome];
 
     // map outcome to individual qubit outcomes
-    auto qubitVec = util_getVector(qubits, numQubits);
-    auto outcomeVec = vector<int>(numQubits);
-    getBitsFromInteger(outcomeVec.data(), outcome, numQubits);
+    auto qubitList = lists_getList64(qubits, numQubits);
+    auto outcomeList = util_getConstantList(-1, numQubits);
+    setToBitsOfInteger(outcomeList.data(), outcome, numQubits);
 
     // project to the outcomes, renormalising the surviving states
     (qureg.isDensityMatrix)?
-        localiser_densmatr_multiQubitProjector(qureg, qubitVec, outcomeVec, *probability):
-        localiser_statevec_multiQubitProjector(qureg, qubitVec, outcomeVec, *probability);
+        localiser_densmatr_multiQubitProjector(qureg, qubitList, outcomeList, *probability):
+        localiser_statevec_multiQubitProjector(qureg, qubitList, outcomeList, *probability);
 
     return outcome;
 }
@@ -1700,8 +1711,8 @@ qreal applyForcedMultiQubitMeasurement(Qureg qureg, int* qubits, int* outcomes,
     validate_targets(qureg, qubits, numQubits, __func__);
     validate_measurementOutcomesAreValid(outcomes, numQubits, __func__);
 
-    auto qubitVec = util_getVector(qubits, numQubits);
-    auto outcomeVec = util_getVector(outcomes, numQubits);
+    auto qubitList = lists_getList64(qubits, numQubits);
+    auto outcomeList = lists_getList64(outcomes, numQubits);
 
     // ensure probability of the forced measurement outcome is not negligible
     qreal prob = calcProbOfMultiQubitOutcome(qureg, qubits, outcomes, numQubits); // harmlessly re-validates
@@ -1709,8 +1720,8 @@ qreal applyForcedMultiQubitMeasurement(Qureg qureg, int* qubits, int* outcomes,
 
     // project to the outcome, renormalising the surviving states
     (qureg.isDensityMatrix)?
-        localiser_densmatr_multiQubitProjector(qureg, qubitVec, outcomeVec, prob):
-        localiser_statevec_multiQubitProjector(qureg, qubitVec, outcomeVec, prob);
+        localiser_densmatr_multiQubitProjector(qureg, qubitList, outcomeList, prob):
+        localiser_statevec_multiQubitProjector(qureg, qubitList, outcomeList, prob);
 
     return prob;
 }
@@ -1782,11 +1793,7 @@ void applyQuantumFourierTransform(Qureg qureg, int* targets, int numTargets, boo
 void applyFullQuantumFourierTransform(Qureg qureg, bool inverse) {
     validate_quregFields(qureg, __func__);
 
-    // tiny; no need to validate alloc
-    vector<int> targets(qureg.numQubits);
-    for (size_t i=0; i<targets.size(); i++)
-        targets[i] = i;
-
+    auto targets = util_getRange(qureg.numQubits);
     applyQuantumFourierTransform(qureg, targets.data(), targets.size(), inverse);
 }
 
diff --git a/quest/src/api/paulis.cpp b/quest/src/api/paulis.cpp
index 855a9cfd8..d5ac4d8e7 100644
--- a/quest/src/api/paulis.cpp
+++ b/quest/src/api/paulis.cpp
@@ -38,7 +38,7 @@ bool didAnyAllocsFailOnAnyNode(PauliStrSum sum) {
         ! mem_isAllocated(sum.coeffs)  || 
         ! mem_isAllocated(sum.isApproxHermitian) );
     
-    if (comm_isInit())
+    if (comm_isActive())
         anyFail = comm_isTrueOnAllNodes(anyFail);
 
     return anyFail;
@@ -263,12 +263,16 @@ extern "C" void destroyPauliStrSum(PauliStrSum sum) {
 
 extern "C" void reportPauliStr(PauliStr str) {
 
+    printer_sync();
+
     // no header, so no indentation
     string indent = "";
     print_elemsWithoutNewline(str, indent);
 
     // print all user-set newlines (including none)
     print_newlines();
+
+    printer_sync();
 }
 
 
@@ -285,11 +289,15 @@ extern "C" void reportPauliStrSum(PauliStrSum sum) {
     // linearly with user input parameters, unlike Qureg and matrices.
     qindex numTotalBytes = numStrBytes + numCoeffBytes + numStrucBytes;
 
+    printer_sync();
+
     print_header(sum, numTotalBytes);
     print_elems(sum);
     
     // exclude mandatory newline above
     print_oneFewerNewlines();
+
+    printer_sync();
 }
 
 
diff --git a/quest/src/api/qureg.cpp b/quest/src/api/qureg.cpp
index 034c96e5c..84bcd2bd0 100644
--- a/quest/src/api/qureg.cpp
+++ b/quest/src/api/qureg.cpp
@@ -116,7 +116,7 @@ bool didAnyLocalAllocsFail(Qureg qureg) {
 bool didAnyAllocsFailOnAnyNode(Qureg qureg) {
 
     bool anyFail = didAnyLocalAllocsFail(qureg);
-    if (comm_isInit())
+    if (comm_isActive())
         anyFail = comm_isTrueOnAllNodes(anyFail);
 
     return anyFail;
@@ -360,7 +360,8 @@ void reportQuregParams(Qureg qureg) {
 
     /// @todo add function to write this output to file (useful for HPC debugging)
 
-    // printer routines will consult env rank to avoid duplicate printing
+    printer_sync();
+
     print_label("Qureg");
     printDeploymentInfo(qureg);
     printDimensionInfo(qureg);
@@ -369,6 +370,8 @@ void reportQuregParams(Qureg qureg) {
 
     // exclude mandatory newline above
     print_oneFewerNewlines();
+
+    printer_sync();
 }
 
 
@@ -385,11 +388,15 @@ void reportQureg(Qureg qureg) {
     // include struct size (expected negligibly tiny)
     localMem += sizeof(qureg);
 
+    printer_sync();
+
     print_header(qureg, localMem);
     print_elems(qureg);
 
     // exclude mandatory newline above
     print_oneFewerNewlines();
+
+    printer_sync();
 }
 
 
diff --git a/quest/src/api/trotterisation.cpp b/quest/src/api/trotterisation.cpp
index 8a5a6863a..6fd5781ba 100644
--- a/quest/src/api/trotterisation.cpp
+++ b/quest/src/api/trotterisation.cpp
@@ -11,6 +11,7 @@
 #include "quest/include/matrices.h"
 
 #include "quest/src/core/validation.hpp"
+#include "quest/src/core/lists.hpp"
 #include "quest/src/core/utilities.hpp"
 #include "quest/src/core/localiser.hpp"
 #include "quest/src/core/paulilogic.hpp"
@@ -29,8 +30,8 @@ using std::vector;
  */
 
 void internal_applyFirstOrderTrotterRepetition(
-    Qureg qureg, vector<int>& ketCtrls, vector<int>& braCtrls,
-    vector<int>& states, PauliStrSum sum, vector<qindex>& sumOrdering,
+    Qureg qureg, ConstList64 ketCtrls, ConstList64 braCtrls,
+    ConstList64 states, PauliStrSum sum, vector<qindex>& sumOrdering,
     qcomp angle, bool onlyLeftApply, bool reverse
 ) {
     // apply each sum term as a gadget, in forward or reverse order
@@ -62,8 +63,8 @@ void internal_applyFirstOrderTrotterRepetition(
 }
 
 void internal_applyHigherOrderTrotterRepetition(
-    Qureg qureg, vector<int>& ketCtrls, vector<int>& braCtrls,
-    vector<int>& states, PauliStrSum sum, vector<qindex>& sumOrdering, 
+    Qureg qureg, ConstList64 ketCtrls, ConstList64 braCtrls,
+    ConstList64 states, PauliStrSum sum, vector<qindex>& sumOrdering, 
     qcomp angle, int order, bool onlyLeftApply
 ) {
     if (order == 1) {
@@ -107,9 +108,9 @@ void internal_applyAllTrotterRepetitions(
     }
 
     // prepare control-qubit lists once for all invoked gadgets below
-    auto ketCtrlsVec = util_getVector(controls, numControls);
-    auto braCtrlsVec = (qureg.isDensityMatrix)? util_getBraQubits(ketCtrlsVec, qureg) : vector<int>{};
-    auto statesVec = util_getVector(states, numControls);
+    auto ketCtrlsList = lists_getList64(controls, numControls);
+    auto braCtrlsList = (qureg.isDensityMatrix)? util_getBraQubits(ketCtrlsList, qureg) : lists_getEmptyList64();
+    auto statesList = lists_getList64(states, numControls * (states != nullptr));
 
     qcomp arg = angle / reps;
 
@@ -120,7 +121,7 @@ void internal_applyAllTrotterRepetitions(
             rand_setListToShuffled(sumOrdering);
 
         internal_applyHigherOrderTrotterRepetition(
-            qureg, ketCtrlsVec, braCtrlsVec, statesVec, sum, sumOrdering, arg, order, onlyLeftApply);
+            qureg, ketCtrlsList, braCtrlsList, statesList, sum, sumOrdering, arg, order, onlyLeftApply);
     }
 }
 
@@ -167,7 +168,7 @@ void applyTrotterizedNonUnitaryPauliStrSumGadget(Qureg qureg, PauliStrSum sum, q
     validate_quregFields(qureg, __func__);
     validate_pauliStrSumFields(sum, __func__);
     validate_pauliStrSumTargets(sum, qureg, __func__);
-    validate_trotterParams(qureg, order, reps, __func__);
+    validate_trotterParams(order, reps, __func__);
     // sum is permitted to be non-Hermitian
 
     // |psi> -> U |psi>, rho -> U rho U^dagger
@@ -180,7 +181,7 @@ void applyTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qreal angle
     validate_pauliStrSumFields(sum, __func__);
     validate_pauliStrSumTargets(sum, qureg, __func__);
     validate_pauliStrSumIsHermitian(sum, __func__);
-    validate_trotterParams(qureg, order, reps, __func__);
+    validate_trotterParams(order, reps, __func__);
 
     bool onlyLeftApply = false;
     internal_applyAllTrotterRepetitions(qureg, nullptr, nullptr, 0, sum, angle, order, reps, onlyLeftApply, permuteTerms, __func__);
@@ -194,7 +195,7 @@ void applyTrotterizedControlledPauliStrSumGadget(
     validate_pauliStrSumFields(sum, __func__);
     validate_pauliStrSumIsHermitian(sum, __func__);
     validate_controlAndPauliStrSumTargets(qureg, control, sum, __func__);
-    validate_trotterParams(qureg, order, reps, __func__);
+    validate_trotterParams(order, reps, __func__);
     
     bool onlyLeftApply = false;
     internal_applyAllTrotterRepetitions(qureg, &control, nullptr, 1, sum, angle, order, reps, onlyLeftApply, permuteTerms, __func__);
@@ -208,7 +209,7 @@ void applyTrotterizedMultiControlledPauliStrSumGadget(
     validate_pauliStrSumFields(sum, __func__);
     validate_pauliStrSumIsHermitian(sum, __func__);
     validate_controlsAndPauliStrSumTargets(qureg, controls, numControls, sum, __func__);
-    validate_trotterParams(qureg, order, reps, __func__);
+    validate_trotterParams(order, reps, __func__);
 
     bool onlyLeftApply = false;
     internal_applyAllTrotterRepetitions(qureg, controls, nullptr, numControls, sum, angle, order, reps, onlyLeftApply, permuteTerms, __func__);
@@ -223,7 +224,7 @@ void applyTrotterizedMultiStateControlledPauliStrSumGadget(
     validate_pauliStrSumIsHermitian(sum, __func__);
     validate_controlsAndPauliStrSumTargets(qureg, controls, numControls, sum, __func__);
     validate_controlStates(states, numControls, __func__); // permits states==nullptr
-    validate_trotterParams(qureg, order, reps, __func__);
+    validate_trotterParams(order, reps, __func__);
 
     bool onlyLeftApply = false;
     internal_applyAllTrotterRepetitions(qureg, controls, states, numControls, sum, angle, order, reps, onlyLeftApply, permuteTerms, __func__);
@@ -260,7 +261,7 @@ void applyTrotterizedUnitaryTimeEvolution(Qureg qureg, PauliStrSum hamil, qreal
     validate_pauliStrSumFields(hamil, __func__);
     validate_pauliStrSumTargets(hamil, qureg, __func__);
     validate_pauliStrSumIsHermitian(hamil, __func__);
-    validate_trotterParams(qureg, order, reps, __func__);
+    validate_trotterParams(order, reps, __func__);
 
     // exp(-i t H) = exp(x i H) | x=-t
     qcomp angle = - time;
@@ -273,7 +274,7 @@ void applyTrotterizedImaginaryTimeEvolution(Qureg qureg, PauliStrSum hamil, qrea
     validate_pauliStrSumFields(hamil, __func__);
     validate_pauliStrSumTargets(hamil, qureg, __func__);
     validate_pauliStrSumIsHermitian(hamil, __func__);
-    validate_trotterParams(qureg, order, reps, __func__);
+    validate_trotterParams(order, reps, __func__);
 
     // exp(-tau H) = exp(x i H) | x=tau*i
     qcomp angle = qcomp(0, tau);
@@ -300,7 +301,7 @@ void applyTrotterizedNoisyTimeEvolution(
     validate_pauliStrSumFields(hamil, __func__);
     validate_pauliStrSumTargets(hamil, qureg, __func__);
     validate_pauliStrSumIsHermitian(hamil, __func__);
-    validate_trotterParams(qureg, order, reps, __func__);
+    validate_trotterParams(order, reps, __func__);
     validate_lindbladJumpOps(jumps, numJumps, qureg, __func__);
     validate_lindbladDampingRates(damps, numJumps, __func__);
     
diff --git a/quest/src/api/types.cpp b/quest/src/api/types.cpp
index 4fadf1cb5..cead74301 100644
--- a/quest/src/api/types.cpp
+++ b/quest/src/api/types.cpp
@@ -22,8 +22,12 @@ using std::string;
 void reportStr(std::string str) {
     validate_envIsInit(__func__);
 
+    printer_sync();
+
     print(str);
     print_newlines();
+
+    printer_sync();
 }
 
 extern "C" void reportStr(const char* str) {
diff --git a/quest/src/comm/comm_config.cpp b/quest/src/comm/comm_config.cpp
index 854a12bd5..4b76ca71e 100644
--- a/quest/src/comm/comm_config.cpp
+++ b/quest/src/comm/comm_config.cpp
@@ -4,10 +4,13 @@
  * implementation (like OpenMPI vs MPICH). These functions
  * are callable even when MPI has not been compiled/linked.
  * 
- * Note that even when COMPILE_MPI=1, the user may have
+ * Note that even when QUEST_COMPILE_MPI=1, the user may have
  * disabled distribution when creating the QuEST environment
- * at runtime. Ergo we use comm_isInit() to determine whether
- * functions should invoke the MPI API.
+ * at runtime - even though they may themselves have initialised
+ * and be using MPI. So we must be careful about consulting MPI status!
+ * Furthermore, all routines here will only ever consult/affect
+ * the QuEST communicator, never the entire MPI environment,
+ * the latter of which may contain non-participating processes.
  * 
  * @author Tyson Jones
  */
@@ -18,7 +21,9 @@
 #include "quest/src/comm/comm_config.hpp"
 #include "quest/src/core/errors.hpp"
 
-#if COMPILE_MPI
+#include <string>
+
+#if QUEST_COMPILE_MPI
     #include <mpi.h>
 #endif
 
@@ -28,7 +33,8 @@
  * WARN ABOUT CUDA-AWARENESS
  */
 
-#if COMPILE_MPI && COMPILE_CUDA
+
+#if QUEST_COMPILE_MPI && QUEST_COMPILE_CUDA
 
     // this check is OpenMPI specific
     #ifdef OPEN_MPI
@@ -50,22 +56,115 @@
 
 
 
+/*
+ * COMMUNICATOR MANAGEMENT
+ *
+ * QuEST will only ever use the overridable global_mpiComm communicator,
+ * so that superusers can dedicate external MPI processes to other tasks.
+ * Beware that it's valid for QuEST to be compiled with MPI, but have
+ * distribution runtime-disabled, while the user is themselves using
+ * (and ergo has initialised) MPI. In that scenario, we must not touch
+ * MPI, hence why comm_isActive() below is distinct from comm_isMpiInit().
+ */
+
+
+// We must record whether the user owns MPI, so that we do not ever attempt
+// to kill it when gracefully exiting, or due to a validation error
+static bool global_isMpiUserOwned = false;
+
+
+// Guarded since MPI_Comm cannot be exposed when not compiling MPI. This
+// communicator is overridden from NULL either BEFORE or DURING comm_init()
+#if QUEST_COMPILE_MPI
+    static MPI_Comm global_mpiComm = MPI_COMM_NULL;
+#endif
+
+
+bool comm_isActive() {
+#if QUEST_COMPILE_MPI
+
+    // comm_init(), or potentially comm_setMpiComm() before it, will only
+    // ever override mpiComm with non-NULL, indicating active comm. Note
+    // it's possible in principle for mpiComm to later return to NULL, via
+    // comm_end(), and for QuEST execution to continue (though not supported
+    // presently). If comm_isActive() is true, MPI is guaranteed initialised
+    return global_mpiComm != MPI_COMM_NULL;
+
+    // note it is legal for QuEST distribution to be disabled (and ergo
+    // mpiComm never initialised) even when the user is themselves accessing
+    // MPI, hence this function is semantically distinct from comm_isMpiInit()
+#else
+
+    // QuEST communication is obviously never active if
+    // not even MPI is compiled; though this does not
+    // imply at all the user isn't themselves using MPI!
+    return false;
+
+#endif
+}
+
+
+// Hide MPI_Comm from signatures when MPI is not compiled. Beware that
+// these are not exposed in comm_config.hpp; callers must 'extern' them!
+#if QUEST_COMPILE_MPI
+
+
+MPI_Comm comm_getMpiComm() {
+
+    // illegal to call before communicator has been overridden
+    if (global_mpiComm == MPI_COMM_NULL)
+        error_commMpiCommIsNull();
+
+    return global_mpiComm;
+}
+
+
+bool comm_setMpiComm(MPI_Comm newComm, bool userOwnsMpi) {
+
+    // illegal to re-set, or set to null
+    if (global_mpiComm != MPI_COMM_NULL)
+        error_commAlreadyHasSetMpiComm();
+    if (newComm == MPI_COMM_NULL)
+        error_commNewMpiCommIsNull();
+
+    // detect bad communicator, and inform validation
+    auto status = MPI_Comm_dup(newComm, &global_mpiComm);
+    if (status != MPI_SUCCESS)
+        return false;
+
+    // record ownership as soon as QuEST communication becomes active, so
+    // validation errors during env initialisation never kill user-owned MPI
+    global_isMpiUserOwned = userOwnsMpi;
+    return true;
+}
+
+
+#endif // QUEST_COMPILE_MPI
+
+
+
 /*
  * MPI ENVIRONMENT MANAGEMENT
- * all of which is safely callable in non-distributed mode
+ *
+ * which queries MPI itself (as may be user-activated), rather
+ * than QuEST's (possibly more limited) MPI environment
  */
 
 
 bool comm_isMpiCompiled() {
-    return (bool) COMPILE_MPI;
+    return (bool) QUEST_COMPILE_MPI;
+}
+
+bool comm_isMpiSubCommCompiled() {
+    return (bool) QUEST_COMPILE_SUBCOMM;
 }
 
 
 bool comm_isMpiGpuAware() {
 
-    /// @todo these checks may be OpenMPI specific, so that
-    /// non-OpenMPI MPI compilers are always dismissed as
-    /// not being CUDA-aware. Check e.g. MPICH method!
+    // MPI cannot be GPU-aware if it is not even compiled
+    if (!comm_isMpiCompiled())
+        return false;
 
     // definitely not GPU-aware if compiler declares it is not
     #if defined(MPIX_CUDA_AWARE_SUPPORT) && ! MPIX_CUDA_AWARE_SUPPORT
@@ -77,71 +176,135 @@ bool comm_isMpiGpuAware() {
         return (bool) MPIX_Query_cuda_support();
     #endif
 
+    // check whether an MPICH env-var indicates support (we assume it never lies!)
+    static const auto var = std::getenv("MPICH_GPU_SUPPORT_ENABLED");
+    if (var && std::string(var) == "1") // ill-formed vars = 0
+        return true;
+
     // if we can't ascertain CUDA-awareness, just assume no to avoid seg-fault
     return false;
 }
 
 
-bool comm_isInit() {
-#if COMPILE_MPI
+bool comm_isMpiInit() {
+#if QUEST_COMPILE_MPI
 
     // safely callable before MPI initialisation, but NOT after comm_end()
     int isInit;
     MPI_Initialized(&isInit);
+
+    // when MPI is not initialised, it is guaranteed that QuEST's communicator
+    // is inactive, which we double check here so callers can be absolutely sure
+    if (!isInit && comm_isActive())
+        error_commActiveButMpiNotInit();
+
     return (bool) isInit;
 
 #else
 
     // obviously MPI is never initialised if not even compiled
     return false;
+
 #endif
 }
 
 
-void comm_init() {
-#if COMPILE_MPI
+bool comm_isMpiUserOwned() {
+
+    // this isn't presently used by the code base; I'm just naughtily silencing
+    // "unused var" warning when compiling without MPI :^)
+    return global_isMpiUserOwned;
+}
 
-    // error if attempting re-initialisation
-    if (comm_isInit())
+
+
+/*
+ * QUEST COMMUNICATION MANAGEMENT
+ *
+ * which interacts only with QuEST's MPI environment,
+ * which may be smaller than the user-controlled MPI env
+ */
+
+
+void comm_init(bool userOwnsMpi) {
+#if QUEST_COMPILE_MPI
+
+    // re-assert prior user-validations for clarity
+    if (userOwnsMpi && !comm_isMpiInit())
+        error_commNotInit();
+    if (!userOwnsMpi && comm_isMpiInit())
         error_commAlreadyInit();
-    
-    MPI_Init(NULL, NULL);
+   
+    // init MPI only when it's not the user's responsibility
+    if (!userOwnsMpi)
+        MPI_Init(NULL, NULL);
+
+    // choose communicator only when the user hasn't already
+    // (via comm_setMpiComm, during custom env initialisation)
+    if (global_mpiComm == MPI_COMM_NULL)
+        comm_setMpiComm(MPI_COMM_WORLD, userOwnsMpi);
 
 #endif
 }
 
 
 void comm_end() {
-#if COMPILE_MPI
-
-    // gracefully permit comm_end() before comm_init(), as input validation can trigger
-    if (!comm_isInit())
+#if QUEST_COMPILE_MPI
+
+    // If QuEST isn't using distribution, regardless of whether the user is using MPI,
+    // then we gracefully exit. We do NOT attempt to end MPI on the user's behalf (as we
+    // may be tempted to do during validation failure to avoid their MPI-crash), because
+    // it's possible/legal that not all processes are participating in this comm_end()
+    // call, in which case MPI_Finalize() could just cause a hang.
+    if (!comm_isActive())
         return;
 
-    MPI_Barrier(MPI_COMM_WORLD);
-    MPI_Finalize();
+    // Syncing is not strictly necessary, but it ensures that finalizeQuESTEnv() never
+    // completes on one process while another process is still performing simulation
+    // (though that'd be weird), and so may avoid a silly user benchmarking pitfall
+    MPI_Barrier(global_mpiComm);
+    MPI_Comm_free(&global_mpiComm);
+    
+    // Do NOT close MPI if the user owns; they may still wish to use it after QuEST!
+    if (!global_isMpiUserOwned)
+        MPI_Finalize();
+
+    // Presently, comm_end() is only ever called during QuESTEnv destruction (either
+    // deliberately, or because of failed validation during QuESTEnv initialisation).
+    // This means any comm_*() call hereafter is invalid/illegal and will be prevented
+    // by validation. However, we can imagine a future where distribution gets runtime
+    // disabled while QuEST execution continues (e.g. initQuESTEnv automatically
+    // disabled distribution), and so we must indicate that communication is no longer
+    // active by overwriting comm to NULL. BEWARE that this is "hacky"; we have
+    // updated mpiComm here without MPI_Comm_dup(), but that's fine, because hereafter
+    // MPI will never be used again (illegal to re-init both MPI, and QuEST!)
+    global_mpiComm = MPI_COMM_NULL;
+    global_isMpiUserOwned = false;
 
 #endif
 }
 
 
 int comm_getRank() {
-#if COMPILE_MPI
+#if QUEST_COMPILE_MPI
 
     // if distribution was not runtime enabled (or a validation error was 
-    // triggered), every node (if many MPI processes were launched)
-    // believes it is the root rank
-    if (!comm_isInit())
+    // triggered during distributed initialisation), every process believes
+    // it is the root rank; this may lead to unavoidable error msg spam!
+    if (!comm_isActive())
         return ROOT_RANK;
 
+    // obtain the process rank within the QuEST communicator, which can
+    // differ from the global MPI process rank when users own MPI
     int rank;
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_rank(global_mpiComm, &rank);
     return rank;
 
 #else
 
     // if MPI isn't compiled, we're definitely non-distributed; return main rank 
     return ROOT_RANK;
+
 #endif
 }
 
@@ -155,33 +318,42 @@ bool comm_isRootNode() {
 
 
 int comm_getNumNodes() {
-#if COMPILE_MPI
+#if QUEST_COMPILE_MPI
 
     // if distribution was not runtime enabled (or a validation error was 
-    // triggered), every node (if many MPI processes were launched)
-    // believes it is the one and only node
-    if (!comm_isInit())
+    // triggered during distributed initialisation), every process is told
+    // it is the one and only node; this may lead to error msg spam, but
+    // appears unavoidable!
+    if (!comm_isActive())
         return 1;
 
+    // obtain the number of processes within the QuEST communicator, which
+    // can be smaller than global MPI process count when users own MPI
     int numNodes;
-    MPI_Comm_size(MPI_COMM_WORLD, &numNodes);
+    MPI_Comm_size(global_mpiComm, &numNodes);
     return numNodes;
 
 #else
 
-    // if MPI isn't compiled, we're definitely non-distributed; return single node
+    // if MPI isn't compiled, QuEST is definitely non-distributed and
+    // each process only knows itself (though users may own MPI and
+    // actually have many processes; that's none of our business!)
     return 1;
+
 #endif
 }
 
 
 void comm_sync() {
-#if COMPILE_MPI
+#if QUEST_COMPILE_MPI
 
-    // gracefully handle when not distributed, needed by e.g. pre-MPI-setup validation 
-    if (!comm_isInit())
+    // gracefully handle when not distributed, needed by e.g. pre-MPI-setup validation
+    if (!comm_isActive())
         return;
 
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(global_mpiComm);
+
 #endif
+
+    // do nothing at all when MPI is not compiled (user-owned MPI processes go unsynced)
 }
diff --git a/quest/src/comm/comm_config.hpp b/quest/src/comm/comm_config.hpp
index 444d1dbf0..cc009ab9a 100644
--- a/quest/src/comm/comm_config.hpp
+++ b/quest/src/comm/comm_config.hpp
@@ -10,22 +10,29 @@
 #ifndef COMM_CONFIG_HPP
 #define COMM_CONFIG_HPP
 
-
 constexpr int ROOT_RANK = 0;
 
+// queries of MPI's global/general status (when visible)
 bool comm_isMpiCompiled();
+bool comm_isMpiSubCommCompiled();
 bool comm_isMpiGpuAware();
+bool comm_isMpiInit();
+bool comm_isMpiUserOwned();
 
-void comm_init();
+// control of QuEST's (possibly more limited) MPI env
+bool comm_isActive();
+void comm_init(bool userOwnsMpi);
 void comm_end();
 void comm_sync();
 
+// queries of QuEST's (possibly more limited) MPI env
 int comm_getRank();
 int comm_getNumNodes();
-
-bool comm_isInit();
 bool comm_isRootNode();
 bool comm_isRootNode(int rank);
 
+// Signatures containing MPI types which callers must extern:
+// extern MPI_Comm comm_getMpiComm()
+// extern bool comm_setMpiComm(MPI_Comm newComm, bool userOwnsMpi)
 
-#endif // COMM_CONFIG_HPP
\ No newline at end of file
+#endif // COMM_CONFIG_HPP
diff --git a/quest/src/comm/comm_routines.cpp b/quest/src/comm/comm_routines.cpp
index 19ebcb9f8..cf6956454 100644
--- a/quest/src/comm/comm_routines.cpp
+++ b/quest/src/comm/comm_routines.cpp
@@ -1,12 +1,12 @@
 /** @file
  * Functions for communicating and exchanging amplitudes between compute
  * nodes, when running in distributed mode, using the C MPI standard.
- * Calling these functions when COMPILE_MPI=0, or when the passed Quregs
+ * Calling these functions when QUEST_COMPILE_MPI=0, or when the passed Quregs
  * are not distributed, will throw a runtime internal error. 
  * 
  * @author Tyson Jones
  * @author Jakub Adamski (sped-up large comm by asynch messages)
- * @author Oliver Brown (patched max-message inference, consulted on AR and MPICH support)
+ * @author Oliver Brown (added custom communicators, patched max-message inference, consulted on AR and MPICH support)
  * @author Ania (Anna) Brown (developed QuEST v1 logic)
  */
 
@@ -22,8 +22,9 @@
 #include "quest/src/comm/comm_config.hpp"
 #include "quest/src/comm/comm_indices.hpp"
 
-#if COMPILE_MPI
+#if QUEST_COMPILE_MPI
     #include <mpi.h>
+    extern MPI_Comm comm_getMpiComm(); // comm_config.cpp does not leak MPI_Comm
 #endif
 
 #include <vector>
@@ -108,18 +109,18 @@ qindex MAX_MESSAGE_LENGTH = powerOf2(28);
  */
 
 
-#if COMPILE_MPI
+#if QUEST_COMPILE_MPI
 
     // declare MPI types for qreal and qcomp. We always use the 
     // C macros, even when the deprecated CXX equivalents are 
     // available, to maintain compatibility with modern MPICH
-    #if   (FLOAT_PRECISION == 1)
+    #if   (QUEST_FLOAT_PRECISION == 1)
         #define MPI_QREAL MPI_FLOAT
         #define MPI_QCOMP MPI_C_FLOAT_COMPLEX
-    #elif (FLOAT_PRECISION == 2)
+    #elif (QUEST_FLOAT_PRECISION == 2)
         #define MPI_QREAL MPI_DOUBLE
         #define MPI_QCOMP MPI_C_DOUBLE_COMPLEX
-    #elif (FLOAT_PRECISION == 4)
+    #elif (QUEST_FLOAT_PRECISION == 4)
         #define MPI_QREAL MPI_LONG_DOUBLE
         #define MPI_QCOMP MPI_C_LONG_DOUBLE_COMPLEX
     #else
@@ -136,7 +137,7 @@ qindex MAX_MESSAGE_LENGTH = powerOf2(28);
 
 
 int getMaxNumMessages() {
-#if COMPILE_MPI
+#if QUEST_COMPILE_MPI
 
     // the max supported tag value constrains the total number of messages 
     // we can send in a round of communication, since we uniquely tag
@@ -149,7 +150,7 @@ int getMaxNumMessages() {
     // messages. Beware the max is obtained via a void pointer and might be unset...
     void* tagUpperBoundPtr;
     int isAttribSet;
-    MPI_Comm_get_attr(MPI_COMM_WORLD, MPI_TAG_UB, &tagUpperBoundPtr, &isAttribSet);
+    MPI_Comm_get_attr(comm_getMpiComm(), MPI_TAG_UB, &tagUpperBoundPtr, &isAttribSet);
 
     // if something went wrong with obtaining the tag bound, return the safe minimum
     if (!isAttribSet)
@@ -214,7 +215,9 @@ std::array<qindex,3> dividePayloadIntoMessages(qindex numAmps) {
 
 
 void exchangeArrays(qcomp* send, qcomp* recv, qindex numElems, int pairRank) {
-#if COMPILE_MPI
+#if QUEST_COMPILE_MPI
+
+    MPI_Comm mpiComm = comm_getMpiComm();
 
     // each message is asynchronously dispatched with a final wait, as per arxiv.org/abs/2308.07402
 
@@ -226,8 +229,8 @@ void exchangeArrays(qcomp* send, qcomp* recv, qindex numElems, int pairRank) {
     // so that messages are permitted to arrive out-of-order (supporting UCX adaptive-routing)
     for (qindex m=0; m<numMessages; m++) {
         int tag = static_cast<int>(m); // gauranteed int, but m*messageSize needs qindex
-        MPI_Isend(&send[m*messageSize], messageSize, MPI_QCOMP, pairRank, tag, MPI_COMM_WORLD, &requests[2*m]);
-        MPI_Irecv(&recv[m*messageSize], messageSize, MPI_QCOMP, pairRank, tag, MPI_COMM_WORLD, &requests[2*m+1]);
+        MPI_Irecv(&recv[m*messageSize], messageSize, MPI_QCOMP, pairRank, tag, mpiComm, &requests[2*m]);
+        MPI_Isend(&send[m*messageSize], messageSize, MPI_QCOMP, pairRank, tag, mpiComm, &requests[2*m+1]);
     }
 
     // wait for all exchanges to complete (MPI will automatically free the request memory)
@@ -246,7 +249,9 @@ void exchangeArrays(qcomp* send, qcomp* recv, qindex numElems, int pairRank) {
 
 
 void asynchSendArray(qcomp* send, qindex numElems, int pairRank) {
-#if COMPILE_MPI
+#if QUEST_COMPILE_MPI
+
+    MPI_Comm mpiComm = comm_getMpiComm();
 
     // we will not track nor wait for the asynch send; instead, the caller will later comm_sync()
     MPI_Request nullReq = MPI_REQUEST_NULL;
@@ -257,7 +262,7 @@ void asynchSendArray(qcomp* send, qindex numElems, int pairRank) {
     // asynchronously send the uniquely-tagged messages
     for (qindex m=0; m<numMessages; m++) {
         int tag = static_cast<int>(m); // gauranteed int, but m*messageSize needs qindex
-        MPI_Isend(&send[m*messageSize], messageSize, MPI_QCOMP, pairRank, tag, MPI_COMM_WORLD, &nullReq);
+        MPI_Isend(&send[m*messageSize], messageSize, MPI_QCOMP, pairRank, tag, mpiComm, &nullReq);
     }
 
 #else
@@ -267,7 +272,9 @@ void asynchSendArray(qcomp* send, qindex numElems, int pairRank) {
 
 
 void receiveArray(qcomp* dest, qindex numElems, int pairRank) {
-#if COMPILE_MPI
+#if QUEST_COMPILE_MPI
+
+    MPI_Comm mpiComm = comm_getMpiComm();
 
     // expect the data in multiple messages
     auto [messageSize, numMessages] = dividePow2PayloadIntoMessages(numElems);
@@ -278,7 +285,7 @@ void receiveArray(qcomp* dest, qindex numElems, int pairRank) {
     // listen to receive each uniquely-tagged message asynchronously (as per arxiv.org/abs/2308.07402)
     for (qindex m=0; m<numMessages; m++) {
         int tag = static_cast<int>(m); // gauranteed int, but m*messageSize needs qindex
-        MPI_Irecv(&dest[m*messageSize], messageSize, MPI_QCOMP, pairRank, tag, MPI_COMM_WORLD, &requests[m]);
+        MPI_Irecv(&dest[m*messageSize], messageSize, MPI_QCOMP, pairRank, tag, mpiComm, &requests[m]);
     }
 
     // receivers wait for all messages to be received (while sender asynch proceeds)
@@ -301,8 +308,9 @@ void globallyCombineNonUniformSubArrays(
     vector<qindex> globalRecvIndPerRank, vector<qindex> localSendIndPerRank, vector<qindex> numSendPerRank,
     bool areGpuPtrs
 ) {
-#if COMPILE_MPI
+#if QUEST_COMPILE_MPI
 
+    auto mpiComm = comm_getMpiComm();
     int myRank = comm_getRank();
     int numNodes = comm_getNumNodes();
 
@@ -336,14 +344,14 @@ void globallyCombineNonUniformSubArrays(
         for (int m=0; m<numBigMsgs; m++) {
             qindex recvInd = globalRecvIndPerRank[sendRank] + (m * bigMsgSize);
             requests.push_back(MPI_REQUEST_NULL);
-            MPI_Ibcast(&recv[recvInd], bigMsgSize, MPI_QCOMP, sendRank, MPI_COMM_WORLD, &requests.back());
+            MPI_Ibcast(&recv[recvInd], bigMsgSize, MPI_QCOMP, sendRank, mpiComm, &requests.back());
         }
 
         // and potentially one remaining asynch message 
         if (remMsgSize > 0) {
             qindex recvInd = globalRecvIndPerRank[sendRank] + (numBigMsgs * bigMsgSize);
             requests.push_back(MPI_REQUEST_NULL);
-            MPI_Ibcast(&recv[recvInd], remMsgSize, MPI_QCOMP, sendRank, MPI_COMM_WORLD, &requests.back());
+            MPI_Ibcast(&recv[recvInd], remMsgSize, MPI_QCOMP, sendRank, mpiComm, &requests.back());
         }
     }
 
@@ -357,7 +365,7 @@ void globallyCombineNonUniformSubArrays(
 
 
 void globallyCombineSubArrays(qcomp* recv, qcomp* send, qindex numAmpsPerRank, bool areGpuPtrs) {
-#if COMPILE_MPI
+#if QUEST_COMPILE_MPI
 
     // simply wrap and call the non-uniform case has no performance penalty, 
     // and is only slightly messier than a bespoke power-of-2 msg implementation
@@ -637,9 +645,9 @@ void comm_exchangeAmpsToBuffers(Qureg qureg, int pairRank) {
 
 
 void comm_broadcastAmp(int sendRank, qcomp* sendAmp) {
-#if COMPILE_MPI
+#if QUEST_COMPILE_MPI
 
-    MPI_Bcast(sendAmp, 1, MPI_QCOMP, sendRank, MPI_COMM_WORLD);
+    MPI_Bcast(sendAmp, 1, MPI_QCOMP, sendRank, comm_getMpiComm());
 
 #else
     error_commButEnvNotDistributed();
@@ -648,7 +656,9 @@ void comm_broadcastAmp(int sendRank, qcomp* sendAmp) {
 
 
 void comm_sendAmpsToRoot(int sendRank, qcomp* send, qcomp* recv, qindex numAmps) {
-#if COMPILE_MPI
+#if QUEST_COMPILE_MPI
+
+    MPI_Comm mpiComm = comm_getMpiComm();
 
     // only the sender and root nodes need to continue
     int recvRank = ROOT_RANK;
@@ -665,8 +675,8 @@ void comm_sendAmpsToRoot(int sendRank, qcomp* send, qcomp* recv, qindex numAmps)
     for (qindex m=0; m<numMessages; m++) {
         int tag = static_cast<int>(m);
         (myRank == sendRank)?
-            MPI_Isend(&send[m*messageSize], messageSize, MPI_QCOMP, recvRank, tag, MPI_COMM_WORLD, &requests[m]): // sender
-            MPI_Irecv(&recv[m*messageSize], messageSize, MPI_QCOMP, sendRank, tag, MPI_COMM_WORLD, &requests[m]); // root
+            MPI_Isend(&send[m*messageSize], messageSize, MPI_QCOMP, recvRank, tag, mpiComm, &requests[m]): // sender
+            MPI_Irecv(&recv[m*messageSize], messageSize, MPI_QCOMP, sendRank, tag, mpiComm, &requests[m]); // root
     }
 
     // wait for all exchanges to complete (MPI will automatically free the request memory)
@@ -679,10 +689,10 @@ void comm_sendAmpsToRoot(int sendRank, qcomp* send, qcomp* recv, qindex numAmps)
 
 
 void comm_broadcastIntsFromRoot(int* arr, qindex length) {
-#if COMPILE_MPI
+#if QUEST_COMPILE_MPI
 
     int sendRank = ROOT_RANK;
-    MPI_Bcast(arr, length, MPI_INT, sendRank, MPI_COMM_WORLD);
+    MPI_Bcast(arr, length, MPI_INT, sendRank, comm_getMpiComm());
 
 #else
     error_commButEnvNotDistributed();
@@ -691,10 +701,10 @@ void comm_broadcastIntsFromRoot(int* arr, qindex length) {
 
 
 void comm_broadcastUnsignedsFromRoot(unsigned* arr, qindex length) {
-#if COMPILE_MPI
+#if QUEST_COMPILE_MPI
 
     int sendRank = ROOT_RANK;
-    MPI_Bcast(arr, length, MPI_UNSIGNED, sendRank, MPI_COMM_WORLD);
+    MPI_Bcast(arr, length, MPI_UNSIGNED, sendRank, comm_getMpiComm());
 
 #else
     error_commButEnvNotDistributed();
@@ -719,9 +729,9 @@ void comm_combineSubArrays(qcomp* recv, vector<qindex> recvInds, vector<qindex>
 
 
 void comm_reduceAmp(qcomp* localAmp) {
-#if COMPILE_MPI
+#if QUEST_COMPILE_MPI
 
-    MPI_Allreduce(MPI_IN_PLACE, localAmp, 1, MPI_QCOMP, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(MPI_IN_PLACE, localAmp, 1, MPI_QCOMP, MPI_SUM, comm_getMpiComm());
 
 #else
     error_commButEnvNotDistributed();
@@ -730,9 +740,9 @@ void comm_reduceAmp(qcomp* localAmp) {
 
 
 void comm_reduceReal(qreal* localReal) {
-#if COMPILE_MPI
+#if QUEST_COMPILE_MPI
 
-    MPI_Allreduce(MPI_IN_PLACE, localReal, 1, MPI_QREAL, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(MPI_IN_PLACE, localReal, 1, MPI_QREAL, MPI_SUM, comm_getMpiComm());
 
 #else
     error_commButEnvNotDistributed();
@@ -741,9 +751,9 @@ void comm_reduceReal(qreal* localReal) {
 
 
 void comm_reduceReals(qreal* localReals, qindex numLocalReals) {
-#if COMPILE_MPI
+#if QUEST_COMPILE_MPI
 
-    MPI_Allreduce(MPI_IN_PLACE, localReals, numLocalReals, MPI_QREAL, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(MPI_IN_PLACE, localReals, numLocalReals, MPI_QREAL, MPI_SUM, comm_getMpiComm());
 
 #else
     error_commButEnvNotDistributed();
@@ -752,12 +762,12 @@ void comm_reduceReals(qreal* localReals, qindex numLocalReals) {
 
 
 bool comm_isTrueOnAllNodes(bool val) {
-#if COMPILE_MPI
+#if QUEST_COMPILE_MPI
 
     // perform global AND and broadcast result back to all nodes
     int local = (int) val;
     int global;
-    MPI_Allreduce(&local, &global, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);
+    MPI_Allreduce(&local, &global, 1, MPI_INT, MPI_LAND, comm_getMpiComm());
     return (bool) global;
 
 #else
@@ -768,7 +778,7 @@ bool comm_isTrueOnAllNodes(bool val) {
 
 
 bool comm_isTrueOnRootNode(bool val) {
-    #if COMPILE_MPI
+    #if QUEST_COMPILE_MPI
 
     // this isn't really a reduction - it's a broadcast - but
     // it's semantically relevant to comm_isTrueOnAllNodes()
@@ -791,7 +801,7 @@ bool comm_isTrueOnRootNode(bool val) {
 
 
 vector<string> comm_gatherStringsToRoot(char* localChars, int maxNumLocalChars) {
-#if COMPILE_MPI
+#if QUEST_COMPILE_MPI
 
     // no need to validate array sizes and memory alloc successes;
     // these are trivial O(#nodes)-size arrays containing <20 chars
@@ -803,7 +813,7 @@ vector<string> comm_gatherStringsToRoot(char* localChars, int maxNumLocalChars)
     // all nodes send root all their local chars
     int recvRank = ROOT_RANK;
     MPI_Gather(localChars, maxNumLocalChars, MPI_CHAR, allChars.data(),
-        maxNumLocalChars, MPI_CHAR, recvRank, MPI_COMM_WORLD);
+        maxNumLocalChars, MPI_CHAR, recvRank, comm_getMpiComm());
 
     // divide allChars into stings, delimited by each node's terminal char
     vector<string> out(numNodes);
diff --git a/quest/src/comm/comm_routines.hpp b/quest/src/comm/comm_routines.hpp
index 3d0fc8b23..e75e889f6 100644
--- a/quest/src/comm/comm_routines.hpp
+++ b/quest/src/comm/comm_routines.hpp
@@ -1,7 +1,7 @@
 /** @file
  * Signatures for communicating and exchanging amplitudes between compute
  * nodes, when running in distributed mode, using the C MPI standard.
- * Calling these functions when COMPILE_MPI=0, or when the passed Quregs
+ * Calling these functions when QUEST_COMPILE_MPI=0, or when the passed Quregs
  * are not distributed, will throw a runtime internal error. 
  * 
  * @author Tyson Jones
diff --git a/quest/src/core/accelerator.cpp b/quest/src/core/accelerator.cpp
index 7bdcc1709..677e6c74a 100644
--- a/quest/src/core/accelerator.cpp
+++ b/quest/src/core/accelerator.cpp
@@ -23,16 +23,18 @@
 #include "quest/src/core/errors.hpp"
 #include "quest/src/core/memory.hpp"
 #include "quest/src/core/bitwise.hpp"
+#include "quest/src/core/lists.hpp"
 #include "quest/src/cpu/cpu_config.hpp"
 #include "quest/src/gpu/gpu_config.hpp"
 #include "quest/src/cpu/cpu_subroutines.hpp"
 #include "quest/src/gpu/gpu_subroutines.hpp"
 
+#include <array>
 #include <vector>
 #include <algorithm>
 
-using std::vector;
 using std::min;
+using std::array;
 
 
 
@@ -45,19 +47,16 @@ using std::min;
  * number of controls or targets exceeds that which have optimised compilations, 
  * we fall back to using a generic implementation, indicated by <-1>. In essence,
  * these macros simply call func<ctrls.size()> albeit without illegally passing
- * a runtime variable as a template parameter. Note an awkward use of decltype()
- * is to workaround a GCC <12 bug with implicitly-typed vector initialisations.
- * 
- * BEWARE that these macros are single-line expressions, so they can be used in
- * braceless if/else or ternary operators - but stay vigilant!
+ * a runtime variable as a template parameter.
  */
 
 
-#define GET_FUNC_OPTIMISED_FOR_BOOL(funcname, value) \
+
+#define GET_FUNC_OPTIMISED_FOR_BOOL( funcname, value ) \
     ((value)? funcname<true> : funcname<false>)
 
 
-#define GET_FUNC_OPTIMISED_FOR_TWO_BOOLS(funcname, b1, b2) \
+#define GET_FUNC_OPTIMISED_FOR_TWO_BOOLS( funcname, b1, b2 ) \
     ((b1)? \
         ((b2)? funcname<true, true> : funcname<true, false>) : \
         ((b2)? funcname<false,true> : funcname<false,false>))
@@ -69,61 +68,74 @@ using std::min;
         ((value)? cpu_##funcsuffix<true, fixed1,fixed2,fixed3> : cpu_##funcsuffix<false, fixed1,fixed2,fixed3> ))
 
 
-#if (MAX_OPTIMISED_NUM_CTRLS != 5) || (MAX_OPTIMISED_NUM_TARGS != 5)
+#if (MAX_OPTIMISED_PARAM != 5)
     #error "The number of optimised, templated QuEST functions was inconsistent between accelerator's source and header."
 #endif
 
+#define GET_TEMPLATE_PARAM( param ) /* table index: param, clamped to the <-1> slot */ \
+    std::min((int) (param), MAX_OPTIMISED_PARAM + 1)
 
-#define GET_FUNC_OPTIMISED_FOR_NUM_QUREGS(f, numquregs) \
-    (vector <decltype(&f<0>)> {&f<0>, &f<1>, &f<2>, &f<3>, &f<4>, &f<5>, &f<-1>}) \
-    [std::min((int) numquregs, MAX_OPTIMISED_NUM_QUREGS + 1)]
-
-#define GET_FUNC_OPTIMISED_FOR_NUM_CTRLS(f, numctrls) \
-    (vector <decltype(&f<0>)> {&f<0>, &f<1>, &f<2>, &f<3>, &f<4>, &f<5>, &f<-1>}) \
-    [std::min((int) numctrls, MAX_OPTIMISED_NUM_CTRLS + 1)]
-
-#define GET_FUNC_OPTIMISED_FOR_NUM_TARGS(f, numtargs) \
-    (vector <decltype(&f<0>)> {&f<0>, &f<1>, &f<2>, &f<3>, &f<4>, &f<5>, &f<-1>}) \
-    [std::min((int) numtargs, MAX_OPTIMISED_NUM_TARGS + 1)]
-
-#define GET_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS(f, numctrls, numtargs) \
-    (vector <ARR(f)> { \
-        ARR(f) {&f<0,0>,  &f<0,1>,  &f<0,2>,  &f<0,3>,  &f<0,4>,  &f<0,5>,  &f<0,-1>}, \
-        ARR(f) {&f<1,0>,  &f<1,1>,  &f<1,2>,  &f<1,3>,  &f<1,4>,  &f<1,5>,  &f<1,-1>}, \
-        ARR(f) {&f<2,0>,  &f<2,1>,  &f<2,2>,  &f<2,3>,  &f<2,4>,  &f<2,5>,  &f<2,-1>}, \
-        ARR(f) {&f<3,0>,  &f<3,1>,  &f<3,2>,  &f<3,3>,  &f<3,4>,  &f<3,5>,  &f<3,-1>}, \
-        ARR(f) {&f<4,0>,  &f<4,1>,  &f<4,2>,  &f<4,3>,  &f<4,4>,  &f<4,5>,  &f<4,-1>}, \
-        ARR(f) {&f<5,0>,  &f<5,1>,  &f<5,2>,  &f<5,3>,  &f<5,4>,  &f<5,5>,  &f<5,-1>}, \
-        ARR(f) {&f<-1,0>, &f<-1,1>, &f<-1,2>, &f<-1,3>, &f<-1,4>, &f<-1,5>, &f<-1,-1>}}) \
-    [std::min((int) numctrls, MAX_OPTIMISED_NUM_CTRLS + 1)] \
-    [std::min((int) numtargs, MAX_OPTIMISED_NUM_TARGS + 1)]
-
-#define ARR(f) vector<decltype(&f<0,0>)>
 
+#define GET_ONE_PARAM_TEMPLATED_FUNC_ARRAY( f ) \
+    array {&f<0>, &f<1>, &f<2>, &f<3>, &f<4>, &f<5>, &f<-1>}
 
-#define GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_QUREGS(funcsuffix, qureg, numquregs) \
-    ((qureg.isGpuAccelerated)? \
-        GET_FUNC_OPTIMISED_FOR_NUM_QUREGS( gpu_##funcsuffix, numquregs ) : \
-        GET_FUNC_OPTIMISED_FOR_NUM_QUREGS( cpu_##funcsuffix, numquregs ))
+#define GET_FUNC_OPTIMISED_FOR_ONE_PARAM( outvar, funcname, param ) /* declares outvar */ \
+    static constexpr auto funcname##_ARRAY = GET_ONE_PARAM_TEMPLATED_FUNC_ARRAY( funcname ); \
+    const auto outvar = funcname##_ARRAY[GET_TEMPLATE_PARAM( param )];
 
-#define GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS(funcsuffix, qureg, numctrls) \
-    ((qureg.isGpuAccelerated)? \
-        GET_FUNC_OPTIMISED_FOR_NUM_CTRLS( gpu_##funcsuffix, numctrls ) : \
-        GET_FUNC_OPTIMISED_FOR_NUM_CTRLS( cpu_##funcsuffix, numctrls ))
+#define GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( outvar, funcsuffix, qureg, param ) \
+    GET_FUNC_OPTIMISED_FOR_ONE_PARAM( outvar##_gpu, gpu_##funcsuffix, param ) \
+    GET_FUNC_OPTIMISED_FOR_ONE_PARAM( outvar##_cpu, cpu_##funcsuffix, param ) \
+    const auto outvar = (qureg).isGpuAccelerated ? outvar##_gpu : outvar##_cpu;
 
-#define GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_TARGS(funcsuffix, qureg, numtargs) \
-    ((qureg.isGpuAccelerated)? \
-        GET_FUNC_OPTIMISED_FOR_NUM_TARGS( gpu_##funcsuffix, numtargs ) : \
-        GET_FUNC_OPTIMISED_FOR_NUM_TARGS( cpu_##funcsuffix, numtargs ))
-
-#define GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS(funcsuffix, qureg, numctrls, numtargs) \
-    ((qureg.isGpuAccelerated)? \
-        GET_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( gpu_##funcsuffix, numctrls, numtargs ) : \
-        GET_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( cpu_##funcsuffix, numctrls, numtargs ))
+    
+#define GET_TWO_PARAM_TEMPLATED_FUNC_MATRIX( f ) \
+    array { \
+        array {&f<0,0>,  &f<0,1>,  &f<0,2>,  &f<0,3>,  &f<0,4>,  &f<0,5>,  &f<0,-1>}, \
+        array {&f<1,0>,  &f<1,1>,  &f<1,2>,  &f<1,3>,  &f<1,4>,  &f<1,5>,  &f<1,-1>}, \
+        array {&f<2,0>,  &f<2,1>,  &f<2,2>,  &f<2,3>,  &f<2,4>,  &f<2,5>,  &f<2,-1>}, \
+        array {&f<3,0>,  &f<3,1>,  &f<3,2>,  &f<3,3>,  &f<3,4>,  &f<3,5>,  &f<3,-1>}, \
+        array {&f<4,0>,  &f<4,1>,  &f<4,2>,  &f<4,3>,  &f<4,4>,  &f<4,5>,  &f<4,-1>}, \
+        array {&f<5,0>,  &f<5,1>,  &f<5,2>,  &f<5,3>,  &f<5,4>,  &f<5,5>,  &f<5,-1>}, \
+        array {&f<-1,0>, &f<-1,1>, &f<-1,2>, &f<-1,3>, &f<-1,4>, &f<-1,5>, &f<-1,-1>}}
+
+#define GET_FUNC_OPTIMISED_FOR_TWO_PARAMS( outvar, funcname, param1, param2 ) /* declares outvar */ \
+    static constexpr auto funcname##_MATRIX = GET_TWO_PARAM_TEMPLATED_FUNC_MATRIX( funcname ); \
+    const auto outvar = funcname##_MATRIX[GET_TEMPLATE_PARAM( param1 )][GET_TEMPLATE_PARAM( param2 )];
+
+#define GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_TWO_PARAMS( outvar, funcsuffix, qureg, param1, param2 ) \
+    GET_FUNC_OPTIMISED_FOR_TWO_PARAMS( outvar##_gpu, gpu_##funcsuffix, param1, param2 ) \
+    GET_FUNC_OPTIMISED_FOR_TWO_PARAMS( outvar##_cpu, cpu_##funcsuffix, param1, param2 ) \
+    const auto outvar = (qureg).isGpuAccelerated ? outvar##_gpu : outvar##_cpu;
+
+
+#define GET_TWO_PARAM_TWO_BOOL_SUB_MATRIX( f, b1, b2 ) \
+    array { \
+        array {&f<0,0,b1,b2>,  &f<0,1,b1,b2>,  &f<0,2,b1,b2>,  &f<0,3,b1,b2>,  &f<0,4,b1,b2>,  &f<0,5,b1,b2>,  &f<0,-1,b1,b2>}, \
+        array {&f<1,0,b1,b2>,  &f<1,1,b1,b2>,  &f<1,2,b1,b2>,  &f<1,3,b1,b2>,  &f<1,4,b1,b2>,  &f<1,5,b1,b2>,  &f<1,-1,b1,b2>}, \
+        array {&f<2,0,b1,b2>,  &f<2,1,b1,b2>,  &f<2,2,b1,b2>,  &f<2,3,b1,b2>,  &f<2,4,b1,b2>,  &f<2,5,b1,b2>,  &f<2,-1,b1,b2>}, \
+        array {&f<3,0,b1,b2>,  &f<3,1,b1,b2>,  &f<3,2,b1,b2>,  &f<3,3,b1,b2>,  &f<3,4,b1,b2>,  &f<3,5,b1,b2>,  &f<3,-1,b1,b2>}, \
+        array {&f<4,0,b1,b2>,  &f<4,1,b1,b2>,  &f<4,2,b1,b2>,  &f<4,3,b1,b2>,  &f<4,4,b1,b2>,  &f<4,5,b1,b2>,  &f<4,-1,b1,b2>}, \
+        array {&f<5,0,b1,b2>,  &f<5,1,b1,b2>,  &f<5,2,b1,b2>,  &f<5,3,b1,b2>,  &f<5,4,b1,b2>,  &f<5,5,b1,b2>,  &f<5,-1,b1,b2>}, \
+        array {&f<-1,0,b1,b2>, &f<-1,1,b1,b2>, &f<-1,2,b1,b2>, &f<-1,3,b1,b2>, &f<-1,4,b1,b2>, &f<-1,5,b1,b2>, &f<-1,-1,b1,b2>}}
+
+#define GET_TWO_PARAM_TWO_BOOL_TEMPLATED_FUNC_MATRIX( f ) \
+    array { \
+        array{ GET_TWO_PARAM_TWO_BOOL_SUB_MATRIX( f, 0, 0 ), GET_TWO_PARAM_TWO_BOOL_SUB_MATRIX( f, 0, 1 ) }, \
+        array{ GET_TWO_PARAM_TWO_BOOL_SUB_MATRIX( f, 1, 0 ), GET_TWO_PARAM_TWO_BOOL_SUB_MATRIX( f, 1, 1 ) }}
+
+#define GET_FUNC_OPTIMISED_FOR_TWO_PARAMS_TWO_BOOLS( outvar, funcname, param1, param2, bool1, bool2 ) /* declares outvar */ \
+    static constexpr auto funcname##_MATRIX = GET_TWO_PARAM_TWO_BOOL_TEMPLATED_FUNC_MATRIX( funcname ); \
+    const auto outvar = funcname##_MATRIX[bool1][bool2][GET_TEMPLATE_PARAM( param1 )][GET_TEMPLATE_PARAM( param2 )];
+
+#define GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_TWO_PARAMS_TWO_BOOLS( outvar, funcsuffix, qureg, param1, param2, bool1, bool2 ) \
+    GET_FUNC_OPTIMISED_FOR_TWO_PARAMS_TWO_BOOLS( outvar##_gpu, gpu_##funcsuffix, param1, param2, bool1, bool2 ) \
+    GET_FUNC_OPTIMISED_FOR_TWO_PARAMS_TWO_BOOLS( outvar##_cpu, cpu_##funcsuffix, param1, param2, bool1, bool2 ) \
+    const auto outvar = (qureg).isGpuAccelerated ? outvar##_gpu : outvar##_cpu;
 
 
 /// @todo
-/// GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS as defined below
+/// GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_TWO_PARAMS_TWO_BOOLS as defined above
 /// is used by anyCtrlAnyTargDiagMatr and anyCtrlAnyTargDenseMatr; the 
 /// latter only ever receives numTargs>=3 (due to accelerator redirecting 
 /// fewer targets to faster bespoke functions which e.g. avoid global GPU
@@ -133,40 +145,6 @@ using std::min;
 /// can ergo non-negligibly speed up compilation by avoiding these redundant 
 /// instances at the cost of increased code complexity/asymmetry. Consider!
 
-#define GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS(f, numctrls, numtargs, c, h) \
-    (vector <POWER_CONJ_ARR(f)> { \
-        POWER_CONJ_ARR(f) {&f<0,0,c,h>,  &f<0,1,c,h>,  &f<0,2,c,h>,  &f<0,3,c,h>,  &f<0,4,c,h>,  &f<0,5,c,h>,  &f<0,-1,c,h>}, \
-        POWER_CONJ_ARR(f) {&f<1,0,c,h>,  &f<1,1,c,h>,  &f<1,2,c,h>,  &f<1,3,c,h>,  &f<1,4,c,h>,  &f<1,5,c,h>,  &f<1,-1,c,h>}, \
-        POWER_CONJ_ARR(f) {&f<2,0,c,h>,  &f<2,1,c,h>,  &f<2,2,c,h>,  &f<2,3,c,h>,  &f<2,4,c,h>,  &f<2,5,c,h>,  &f<2,-1,c,h>}, \
-        POWER_CONJ_ARR(f) {&f<3,0,c,h>,  &f<3,1,c,h>,  &f<3,2,c,h>,  &f<3,3,c,h>,  &f<3,4,c,h>,  &f<3,5,c,h>,  &f<3,-1,c,h>}, \
-        POWER_CONJ_ARR(f) {&f<4,0,c,h>,  &f<4,1,c,h>,  &f<4,2,c,h>,  &f<4,3,c,h>,  &f<4,4,c,h>,  &f<4,5,c,h>,  &f<4,-1,c,h>}, \
-        POWER_CONJ_ARR(f) {&f<5,0,c,h>,  &f<5,1,c,h>,  &f<5,2,c,h>,  &f<5,3,c,h>,  &f<5,4,c,h>,  &f<5,5,c,h>,  &f<5,-1,c,h>}, \
-        POWER_CONJ_ARR(f) {&f<-1,0,c,h>, &f<-1,1,c,h>, &f<-1,2,c,h>, &f<-1,3,c,h>, &f<-1,4,c,h>, &f<-1,5,c,h>, &f<-1,-1,c,h>}}) \
-    [std::min((int) numctrls, MAX_OPTIMISED_NUM_CTRLS + 1)] \
-    [std::min((int) numtargs, MAX_OPTIMISED_NUM_TARGS + 1)]
-
-#define POWER_CONJ_ARR(f) vector<decltype(&f<0,0,false,false>)>
-
-#define GET_CPU_OR_GPU_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS(funcsuffix, qureg, numctrls, numtargs, conj, haspower) \
-    ((qureg.isGpuAccelerated)? \
-        ((conj)? \
-            ((haspower)? \
-                GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( gpu_##funcsuffix, numctrls, numtargs, true, true ) : \
-                GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( gpu_##funcsuffix, numctrls, numtargs, true, false ) ) : \
-            ((haspower)? \
-                GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( gpu_##funcsuffix, numctrls, numtargs, false, true ) : \
-                GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( gpu_##funcsuffix, numctrls, numtargs, false, false ) ) ) : \
-        ((conj)? \
-            ((haspower)? \
-                GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( cpu_##funcsuffix, numctrls, numtargs, true, true ) : \
-                GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( cpu_##funcsuffix, numctrls, numtargs, true, false ) ) : \
-            ((haspower)? \
-                GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( cpu_##funcsuffix, numctrls, numtargs, false, true ) : \
-                GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( cpu_##funcsuffix, numctrls, numtargs, false, false ) ) ) )
-
-/// @todo
-/// The above macro spaghetti is diabolical - update using C++ metaprogamming!
-
 
 
 /*
@@ -244,7 +222,7 @@ void accel_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, PauliS
  */
 
 
-qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, vector<int> qubits, vector<int> qubitStates) {
+qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates) {
 
     // we can never pack and swap buffers when there are no constrained qubit states, because we'd 
     // then fill the entire buffer andhave no room to receive the other node's buffer; caller would 
@@ -253,7 +231,7 @@ qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, vector<int> qubits, vector
         error_noCtrlsGivenToBufferPacker();
 
     // note qubits may incidentally be ctrls or targs; it doesn't matter
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_TARGS( statevec_packAmpsIntoBuffer, qureg, qubits.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_packAmpsIntoBuffer, qureg, qubits.size() );
     
     // return the number of packed amps, for caller convenience
     return func(qureg, qubits, qubitStates);
@@ -274,19 +252,19 @@ qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int
  */
 
 
-void accel_statevec_anyCtrlSwap_subA(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2) {
+void accel_statevec_anyCtrlSwap_subA(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevec_anyCtrlSwap_subA, qureg, ctrls.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_anyCtrlSwap_subA, qureg, ctrls.size() );
     func(qureg, ctrls, ctrlStates, targ1, targ2);
 }
-void accel_statevec_anyCtrlSwap_subB(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates) {
+void accel_statevec_anyCtrlSwap_subB(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevec_anyCtrlSwap_subB, qureg, ctrls.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_anyCtrlSwap_subB, qureg, ctrls.size() );
     func(qureg, ctrls, ctrlStates);
 }
-void accel_statevec_anyCtrlSwap_subC(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, int targState) {
+void accel_statevec_anyCtrlSwap_subC(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ, int targState) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevec_anyCtrlSwap_subC, qureg, ctrls.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_anyCtrlSwap_subC, qureg, ctrls.size() );
     func(qureg, ctrls, ctrlStates, targ, targState);
 }
 
@@ -297,28 +275,28 @@ void accel_statevec_anyCtrlSwap_subC(Qureg qureg, vector<int> ctrls, vector<int>
  */
 
 
-void accel_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, CompMatr1 matr) {
+void accel_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ, CompMatr1 matr) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevec_anyCtrlOneTargDenseMatr_subA, qureg, ctrls.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_anyCtrlOneTargDenseMatr_subA, qureg, ctrls.size() );
     func(qureg, ctrls, ctrlStates, targ, matr);
 }
-void accel_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, qcomp fac0, qcomp fac1) {
+void accel_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, qcomp fac0, qcomp fac1) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevec_anyCtrlOneTargDenseMatr_subB, qureg, ctrls.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_anyCtrlOneTargDenseMatr_subB, qureg, ctrls.size() );
     func(qureg, ctrls, ctrlStates, fac0, fac1);
 }
 
 
-void accel_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2, CompMatr2 matr) {
+void accel_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2, CompMatr2 matr) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevec_anyCtrlTwoTargDenseMatr_sub, qureg, ctrls.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_anyCtrlTwoTargDenseMatr_sub, qureg, ctrls.size() );
     func(qureg, ctrls, ctrlStates, targ1, targ2, matr);
 }
 
 
-void accel_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr matr, bool conj, bool transp) {
+void accel_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, CompMatr matr, bool conj, bool transp) {
 
-    auto func = GET_CPU_OR_GPU_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( statevec_anyCtrlAnyTargDenseMatr_sub, qureg, ctrls.size(), targs.size(), conj, transp );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_TWO_PARAMS_TWO_BOOLS( func, statevec_anyCtrlAnyTargDenseMatr_sub, qureg, ctrls.size(), targs.size(), conj, transp );
     func(qureg, ctrls, ctrlStates, targs, matr);
 }
 
@@ -329,25 +307,25 @@ void accel_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls,
  */
 
 
-void accel_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, DiagMatr1 matr) {
+void accel_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ, DiagMatr1 matr) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevec_anyCtrlOneTargDiagMatr_sub, qureg, ctrls.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_anyCtrlOneTargDiagMatr_sub, qureg, ctrls.size() );
     func(qureg, ctrls, ctrlStates, targ, matr);
 }
 
 
-void accel_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2, DiagMatr2 matr) {
+void accel_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2, DiagMatr2 matr) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevec_anyCtrlTwoTargDiagMatr_sub, qureg, ctrls.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_anyCtrlTwoTargDiagMatr_sub, qureg, ctrls.size() );
     func(qureg, ctrls, ctrlStates, targ1, targ2, matr);
 }
 
 
-void accel_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, DiagMatr matr, qcomp exponent, bool conj) {
+void accel_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, DiagMatr matr, qcomp exponent, bool conj) {
 
     bool hasPower = exponent != qcomp(1, 0);
 
-    auto func = GET_CPU_OR_GPU_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( statevec_anyCtrlAnyTargDiagMatr_sub, qureg, ctrls.size(), targs.size(), conj, hasPower );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_TWO_PARAMS_TWO_BOOLS( func, statevec_anyCtrlAnyTargDiagMatr_sub, qureg, ctrls.size(), targs.size(), conj, hasPower );
     func(qureg, ctrls, ctrlStates, targs, matr, exponent);
 }
 
@@ -520,24 +498,24 @@ void accel_densmatr_allTargDiagMatr_subB(Qureg qureg, FullStateDiagMatr matr, qc
  */
 
 
-void accel_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, vector<int> ctrls, vector<int> states, vector<int> x, vector<int> y, vector<int> z, qcomp f0, qcomp f1) {
+void accel_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, ConstList64 ctrls, ConstList64 states, ConstList64 x, ConstList64 y, ConstList64 z, qcomp f0, qcomp f1) {
 
     // only X and Y constitute target qubits (Z merely induces a phase)
     int numTargs = x.size() + y.size();
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( statevector_anyCtrlPauliTensorOrGadget_subA, qureg, ctrls.size(), numTargs );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_TWO_PARAMS( func, statevector_anyCtrlPauliTensorOrGadget_subA, qureg, ctrls.size(), numTargs );
     func(qureg, ctrls, states, x, y, z, f0, f1);
 }
-void accel_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, vector<int> ctrls, vector<int> states, vector<int> x, vector<int> y, vector<int> z, qcomp f0, qcomp f1, qindex mask) {
+void accel_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, ConstList64 ctrls, ConstList64 states, ConstList64 x, ConstList64 y, ConstList64 z, qcomp f0, qcomp f1, qindex mask) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevector_anyCtrlPauliTensorOrGadget_subB, qureg, ctrls.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevector_anyCtrlPauliTensorOrGadget_subB, qureg, ctrls.size() );
     func(qureg, ctrls, states, x, y, z, f0, f1, mask);
 }
 
 
-void accel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, vector<int> ctrls, vector<int> states, vector<int> targs, qcomp f0, qcomp f1) {
+void accel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, ConstList64 ctrls, ConstList64 states, ConstList64 targs, qcomp f0, qcomp f1) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevector_anyCtrlAnyTargZOrPhaseGadget_sub, qureg, ctrls.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevector_anyCtrlAnyTargZOrPhaseGadget_sub, qureg, ctrls.size() );
     func(qureg, ctrls, states, targs, f0, f1);
 }
 
@@ -548,10 +526,10 @@ void accel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, vector<int>
  */
 
 
-void accel_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector<qcomp> coeffs, vector<Qureg> inQuregs) {
+void accel_statevec_setQuregToWeightedSum_sub(Qureg outQureg, std::vector<qcomp> coeffs, std::vector<Qureg> inQuregs) {
 
     // consult outQureg's deployment since others are prior validated to match
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_QUREGS( statevec_setQuregToWeightedSum_sub, outQureg, inQuregs.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_setQuregToWeightedSum_sub, outQureg, inQuregs.size() );
     func(outQureg, coeffs, inQuregs);
 }
 
@@ -845,15 +823,12 @@ void accel_densmatr_oneQubitDamping_subD(Qureg qureg, int qubit, qreal prob) {
  */
 
 
-void accel_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, vector<int> targs, vector<int> pairTargs) {
+void accel_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, ConstList64 targs, ConstList64 pairTargs) {
     assert_partialTraceQuregsAreIdenticallyDeployed(inQureg, outQureg);
 
-    auto cpuFunc = GET_FUNC_OPTIMISED_FOR_NUM_TARGS( cpu_densmatr_partialTrace_sub, targs.size() );
-    auto gpuFunc = GET_FUNC_OPTIMISED_FOR_NUM_TARGS( gpu_densmatr_partialTrace_sub, targs.size() );
-
-    // inQureg == outQureg except for dimension, so use common backend
-    auto useFunc = (inQureg.isGpuAccelerated)? gpuFunc : cpuFunc;
-    useFunc(inQureg, outQureg, targs, pairTargs);
+    // inQureg == outQureg (except for dimension), so use common backend, informed by inQureg
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, densmatr_partialTrace_sub, inQureg, targs.size() );
+    func(inQureg, outQureg, targs, pairTargs);
 }
 
 
@@ -877,26 +852,26 @@ qreal accel_densmatr_calcTotalProb_sub(Qureg qureg) {
 }
 
 
-qreal accel_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes) {
+qreal accel_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_TARGS( statevec_calcProbOfMultiQubitOutcome_sub, qureg, qubits.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_calcProbOfMultiQubitOutcome_sub, qureg, qubits.size() );
     return func(qureg, qubits, outcomes);
 }
-qreal accel_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes) {
+qreal accel_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_TARGS( densmatr_calcProbOfMultiQubitOutcome_sub, qureg, qubits.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, densmatr_calcProbOfMultiQubitOutcome_sub, qureg, qubits.size() );
     return func(qureg, qubits, outcomes);
 }
 
 
-void accel_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, vector<int> qubits) {
+void accel_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, ConstList64 qubits) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_TARGS( statevec_calcProbsOfAllMultiQubitOutcomes_sub, qureg, qubits.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_calcProbsOfAllMultiQubitOutcomes_sub, qureg, qubits.size() );
     func(outProbs, qureg, qubits);
 }
-void accel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, vector<int> qubits) {
+void accel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, ConstList64 qubits) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_TARGS( densmatr_calcProbsOfAllMultiQubitOutcomes_sub, qureg, qubits.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, densmatr_calcProbsOfAllMultiQubitOutcomes_sub, qureg, qubits.size() );
     func(outProbs, qureg, qubits);
 }
 
@@ -982,13 +957,13 @@ qcomp accel_densmatr_calcFidelityWithPureState_sub(Qureg rho, Qureg psi, bool co
  */
 
 
-qreal accel_statevec_calcExpecAnyTargZ_sub(Qureg qureg, vector<int> targs) {
+qreal accel_statevec_calcExpecAnyTargZ_sub(Qureg qureg, ConstList64 targs) {
 
     return (qureg.isGpuAccelerated)?
         gpu_statevec_calcExpecAnyTargZ_sub(qureg, targs):
         cpu_statevec_calcExpecAnyTargZ_sub(qureg, targs);
 }
-qcomp accel_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, vector<int> targs) {
+qcomp accel_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, ConstList64 targs) {
 
     return (qureg.isGpuAccelerated)?
         gpu_densmatr_calcExpecAnyTargZ_sub(qureg, targs):
@@ -996,19 +971,19 @@ qcomp accel_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, vector<int> targs) {
 }
 
 
-qcomp accel_statevec_calcExpecPauliStr_subA(Qureg qureg, vector<int> x, vector<int> y, vector<int> z) {
+qcomp accel_statevec_calcExpecPauliStr_subA(Qureg qureg, ConstList64 x, ConstList64 y, ConstList64 z) {
 
     return (qureg.isGpuAccelerated)?
         gpu_statevec_calcExpecPauliStr_subA(qureg, x, y, z):
         cpu_statevec_calcExpecPauliStr_subA(qureg, x, y, z);
 }
-qcomp accel_statevec_calcExpecPauliStr_subB(Qureg qureg, vector<int> x, vector<int> y, vector<int> z) {
+qcomp accel_statevec_calcExpecPauliStr_subB(Qureg qureg, ConstList64 x, ConstList64 y, ConstList64 z) {
 
     return (qureg.isGpuAccelerated)?
         gpu_statevec_calcExpecPauliStr_subB(qureg, x, y, z):
         cpu_statevec_calcExpecPauliStr_subB(qureg, x, y, z);
 }
-qcomp accel_densmatr_calcExpecPauliStr_sub(Qureg qureg, vector<int> x, vector<int> y, vector<int> z) {
+qcomp accel_densmatr_calcExpecPauliStr_sub(Qureg qureg, ConstList64 x, ConstList64 y, ConstList64 z) {
 
     return (qureg.isGpuAccelerated)?
         gpu_densmatr_calcExpecPauliStr_sub(qureg, x, y, z):
@@ -1110,14 +1085,14 @@ qcomp accel_densmatr_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMa
  */
 
 
-void accel_statevec_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal prob) {
+void accel_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_TARGS( statevec_multiQubitProjector_sub, qureg, qubits.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_multiQubitProjector_sub, qureg, qubits.size() );
     func(qureg, qubits, outcomes, prob);
 }
-void accel_densmatr_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal prob) {
+void accel_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_TARGS( densmatr_multiQubitProjector_sub, qureg, qubits.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, densmatr_multiQubitProjector_sub, qureg, qubits.size() );
     func(qureg, qubits, outcomes, prob);
 }
 
diff --git a/quest/src/core/accelerator.hpp b/quest/src/core/accelerator.hpp
index be50e22da..5a8dc37fb 100644
--- a/quest/src/core/accelerator.hpp
+++ b/quest/src/core/accelerator.hpp
@@ -24,9 +24,9 @@
 #include "quest/include/qureg.h"
 #include "quest/include/matrices.h"
 
-#include <vector>
+#include "quest/src/core/lists.hpp"
 
-using std::vector;
+#include <vector>
 
 
 /*
@@ -42,9 +42,7 @@ using std::vector;
  */
 
 // must match the macros below, and those in accelerator.cpp
-#define MAX_OPTIMISED_NUM_CTRLS 5
-#define MAX_OPTIMISED_NUM_TARGS 5
-#define MAX_OPTIMISED_NUM_QUREGS 5
+#define MAX_OPTIMISED_PARAM 5
 
 
 #define INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS(returntype, funcname, args) \
@@ -82,10 +80,6 @@ using std::vector;
     template returntype funcname <-1,numtargs> args;
 
 
-#define INSTANTIATE_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS(returntype, funcname, args) \
-    private_CONJUGABLE_INSTANTIATE_outer(returntype, funcname, true,  args) \
-    private_CONJUGABLE_INSTANTIATE_outer(returntype, funcname, false, args)
-
 #define private_CONJUGABLE_INSTANTIATE_outer(returntype, funcname, conj, args) \
     private_CONJUGABLE_INSTANTIATE_inner(returntype, funcname, 0, conj, args) \
     private_CONJUGABLE_INSTANTIATE_inner(returntype, funcname, 1, conj, args) \
@@ -175,7 +169,7 @@ void accel_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, PauliS
  * COMMUNICATION BUFFER PACKING
  */
 
-qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, vector<int> qubits, vector<int> qubitStates);
+qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates);
 
 qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2);
 
@@ -184,32 +178,32 @@ qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int
  * SWAPS
  */
 
-void accel_statevec_anyCtrlSwap_subA(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2);
-void accel_statevec_anyCtrlSwap_subB(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates);
-void accel_statevec_anyCtrlSwap_subC(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, int targState);
+void accel_statevec_anyCtrlSwap_subA(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2);
+void accel_statevec_anyCtrlSwap_subB(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates);
+void accel_statevec_anyCtrlSwap_subC(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ, int targState);
 
 
 /*
  * DENSE MATRICES
  */
 
-void accel_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, CompMatr1 matr);
-void accel_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, qcomp fac0, qcomp fac1);
+void accel_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ, CompMatr1 matr);
+void accel_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, qcomp fac0, qcomp fac1);
 
-void accel_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2, CompMatr2 matr);
+void accel_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2, CompMatr2 matr);
 
-void accel_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr matr, bool conj, bool transp);
+void accel_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, CompMatr matr, bool conj, bool transp);
 
 
 /*
  * DIAGONAL MATRICES
  */
 
-void accel_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, DiagMatr1 matr);
+void accel_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ, DiagMatr1 matr);
 
-void accel_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2, DiagMatr2 matr);
+void accel_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2, DiagMatr2 matr);
 
-void accel_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, DiagMatr matr, qcomp exponent, bool conj);
+void accel_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, DiagMatr matr, qcomp exponent, bool conj);
 
 void accel_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent);
 
@@ -222,17 +216,17 @@ void accel_densmatr_allTargDiagMatr_subB(Qureg qureg, FullStateDiagMatr matr, qc
  * PAULI TENSOR AND GADGET
  */
 
-void accel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> z, qcomp ampFac, qcomp pairAmpFac);
+void accel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 z, qcomp ampFac, qcomp pairAmpFac);
 
-void accel_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> x, vector<int> y, vector<int> z, qcomp ampFac, qcomp pairAmpFac);
-void accel_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> x, vector<int> y, vector<int> z, qcomp ampFac, qcomp pairAmpFac, qindex bufferMaskXY);
+void accel_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 x, ConstList64 y, ConstList64 z, qcomp ampFac, qcomp pairAmpFac);
+void accel_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 x, ConstList64 y, ConstList64 z, qcomp ampFac, qcomp pairAmpFac, qindex bufferMaskXY);
 
 
 /*
  * QUREG COMBINATION
  */
 
-void accel_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector<qcomp> coeffs, vector<Qureg> inQuregs);
+void accel_statevec_setQuregToWeightedSum_sub(Qureg outQureg, std::vector<qcomp> coeffs, std::vector<Qureg> inQuregs);
 
 void accel_densmatr_mixQureg_subA(qreal outProb, Qureg out, qreal inProb, Qureg in);
 void accel_densmatr_mixQureg_subB(qreal outProb, Qureg out, qreal inProb, Qureg in);
@@ -273,7 +267,7 @@ void accel_densmatr_oneQubitDamping_subD(Qureg qureg, int qubit, qreal prob);
  * PARTIAL TRACE
  */
 
-void accel_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, vector<int> targs, vector<int> pairTargs);
+void accel_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, ConstList64 targs, ConstList64 pairTargs);
 
 
 /*
@@ -283,11 +277,11 @@ void accel_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, vector<int>
 qreal accel_statevec_calcTotalProb_sub(Qureg qureg);
 qreal accel_densmatr_calcTotalProb_sub(Qureg qureg);
 
-qreal accel_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes);
-qreal accel_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes);
+qreal accel_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes);
+qreal accel_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes);
 
-void accel_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, vector<int> qubits);
-void accel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, vector<int> qubits);
+void accel_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, ConstList64 qubits);
+void accel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, ConstList64 qubits);
 
 
 /*
@@ -305,12 +299,12 @@ qreal accel_densmatr_calcHilbertSchmidtDistance_sub(Qureg quregA, Qureg quregB);
  * EXPECTATION VALUES
  */
 
-qreal accel_statevec_calcExpecAnyTargZ_sub(Qureg qureg, vector<int> sufTargs);
-qcomp accel_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, vector<int> allTargs);;
+qreal accel_statevec_calcExpecAnyTargZ_sub(Qureg qureg, ConstList64 sufTargs);
+qcomp accel_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, ConstList64 allTargs);
 
-qcomp accel_statevec_calcExpecPauliStr_subA(Qureg qureg, vector<int> x, vector<int> y, vector<int> z);
-qcomp accel_statevec_calcExpecPauliStr_subB(Qureg qureg, vector<int> x, vector<int> y, vector<int> z);
-qcomp accel_densmatr_calcExpecPauliStr_sub (Qureg qureg, vector<int> x, vector<int> y, vector<int> z);
+qcomp accel_statevec_calcExpecPauliStr_subA(Qureg qureg, ConstList64 x, ConstList64 y, ConstList64 z);
+qcomp accel_statevec_calcExpecPauliStr_subB(Qureg qureg, ConstList64 x, ConstList64 y, ConstList64 z);
+qcomp accel_densmatr_calcExpecPauliStr_sub (Qureg qureg, ConstList64 x, ConstList64 y, ConstList64 z);
 
 qcomp accel_statevec_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool useRealPow);
 qcomp accel_densmatr_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool useRealPow);
@@ -320,8 +314,8 @@ qcomp accel_densmatr_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMa
  * PROJECTORS 
  */
 
-void accel_statevec_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal prob);
-void accel_densmatr_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal prob);
+void accel_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob);
+void accel_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob);
 
 
 /*
diff --git a/quest/src/core/bitwise.hpp b/quest/src/core/bitwise.hpp
index 4d455c2d8..f5266afa4 100644
--- a/quest/src/core/bitwise.hpp
+++ b/quest/src/core/bitwise.hpp
@@ -163,7 +163,7 @@ INLINE int getBitMaskParity(qindex mask) {
  */
 
 
-INLINE qindex insertBits(qindex number, int* bitIndices, int numIndices, int bitValue) {
+INLINE qindex insertBits(qindex number, const int* bitIndices, int numIndices, int bitValue) {
     
     // bitIndices must be strictly increasing
     for (int i=0; i<numIndices; i++)
@@ -173,7 +173,7 @@ INLINE qindex insertBits(qindex number, int* bitIndices, int numIndices, int bit
 }
 
 
-INLINE qindex setBits(qindex number, int* bitIndices, int numIndices, qindex bitsValue) {
+INLINE qindex setBits(qindex number, const int* bitIndices, int numIndices, qindex bitsValue) {
     
     // bitIndices are arbitrarily ordered, which does not affect number
     for (int i=0; i<numIndices; i++) {
@@ -185,7 +185,7 @@ INLINE qindex setBits(qindex number, int* bitIndices, int numIndices, qindex bit
 }
 
 
-INLINE qindex getValueOfBits(qindex number, int* bitIndices, int numIndices) {
+INLINE qindex getValueOfBits(qindex number, const int* bitIndices, int numIndices) {
 
     // bits are arbitrarily ordered, which affects value
     qindex value = 0;
@@ -205,7 +205,7 @@ INLINE qindex getValueOfBits(qindex number, int* bitIndices, int numIndices) {
  */
 
 
-INLINE qindex insertBitsWithMaskedValues(qindex number, int* bitInds, int numBits, qindex mask) {
+INLINE qindex insertBitsWithMaskedValues(qindex number, const int* bitInds, int numBits, qindex mask) {
 
     // bitInds must be sorted (increasing), and mask must be zero everywhere except bitInds
     return mask | insertBits(number, bitInds, numBits, 0);
@@ -268,7 +268,7 @@ INLINE qindex flipTwoBits(qindex number, int i1, int i0) {
  */
 
 
-INLINE qindex flipBits(qindex number, int* bitIndices, int numIndices) {
+INLINE qindex flipBits(qindex number, const int* bitIndices, int numIndices) {
 
     for (int i=0; i<numIndices; i++)
         number = flipBit(number, bitIndices[i]);
@@ -297,7 +297,7 @@ INLINE int getIndOfNextRightmostZeroBit(qindex mask, int bitInd) {
 }
 
 
-INLINE bool allBitsAreOne(qindex number, int* bitIndices, int numIndices) {
+INLINE bool allBitsAreOne(qindex number, const int* bitIndices, int numIndices) {
     
     for (int i=0; i<numIndices; i++)
         if (!getBit(number, bitIndices[i]))
@@ -307,7 +307,7 @@ INLINE bool allBitsAreOne(qindex number, int* bitIndices, int numIndices) {
 }
 
 
-INLINE qindex getBitMask(int* bitIndices, int* bitValues, int numIndices) {
+INLINE qindex getBitMask(const int* bitIndices, const int* bitValues, int numIndices) {
 
     qindex mask = 0;
     for (int i=0; i<numIndices; i++)
@@ -317,7 +317,7 @@ INLINE qindex getBitMask(int* bitIndices, int* bitValues, int numIndices) {
 }
 
 
-INLINE qindex getBitMask(int* bitIndices, int numIndices) {
+INLINE qindex getBitMask(const int* bitIndices, int numIndices) {
     
     qindex mask = 0;
     for (int i=0; i<numIndices; i++)
@@ -327,7 +327,7 @@ INLINE qindex getBitMask(int* bitIndices, int numIndices) {
 }
 
 
-INLINE qindex removeBits(qindex number, int* bitInds, int numInds) {
+INLINE qindex removeBits(qindex number, const int* bitInds, int numInds) {
 
     // assumes bitIndices are strictly increasing without duplicates
     int numRemoved = 0;
@@ -359,7 +359,7 @@ INLINE int logBase2(qindex powerOf2) {
 }
 
 
-INLINE qindex getIntegerFromBits(int* bits, int numBits) {
+INLINE qindex getIntegerFromBits(const int* bits, int numBits) {
 
     // first bit is treated as least significant
     qindex value = 0;
@@ -371,7 +371,7 @@ INLINE qindex getIntegerFromBits(int* bits, int numBits) {
 }
 
 
-INLINE void getBitsFromInteger(int* bits, qindex number, int numBits) {
+INLINE void setToBitsOfInteger(int* bits, qindex number, int numBits) {
 
     for (int i=0; i<numBits; i++)
         bits[i] = getBit(number, i);
diff --git a/quest/src/core/envvars.cpp b/quest/src/core/envvars.cpp
index c88647e0e..c1d3e81ed 100644
--- a/quest/src/core/envvars.cpp
+++ b/quest/src/core/envvars.cpp
@@ -6,12 +6,14 @@
  * @author Tyson Jones
  */
 
+#include "quest/include/config.h"
 #include "quest/include/precision.h"
 #include "quest/include/types.h"
 
 #include "quest/src/core/errors.hpp"
 #include "quest/src/core/parser.hpp"
 #include "quest/src/core/validation.hpp"
+#include "quest/src/gpu/gpu_config.hpp"
 
 #include <string>
 #include <cstdlib>
@@ -26,8 +28,9 @@ using std::string;
 
 
 namespace envvar_names {
-    string PERMIT_NODES_TO_SHARE_GPU = "PERMIT_NODES_TO_SHARE_GPU";
-    string DEFAULT_VALIDATION_EPSILON = "DEFAULT_VALIDATION_EPSILON";
+    string QUEST_PERMIT_NODES_TO_SHARE_GPU         = "QUEST_PERMIT_NODES_TO_SHARE_GPU";
+    string QUEST_DEFAULT_VALIDATION_EPSILON        = "QUEST_DEFAULT_VALIDATION_EPSILON";
+    string QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK = "QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK";
 }
 
 
@@ -41,11 +44,15 @@ namespace envvar_values {
 
     // by default, do not permit GPU sharing since it sabotages performance
     // and should only ever be carefully, deliberately enabled
-    bool PERMIT_NODES_TO_SHARE_GPU = false;
+    bool QUEST_PERMIT_NODES_TO_SHARE_GPU = false;
 
     // by default, the initial validation epsilon (before being overriden
     // by users at runtime) should depend on qreal (i.e. FLOAT_PRECISION)
-    qreal DEFAULT_VALIDATION_EPSILON = UNSPECIFIED_DEFAULT_VALIDATION_EPSILON;
+    qreal QUEST_DEFAULT_VALIDATION_EPSILON = QUEST_UNSPECIFIED_DEFAULT_VALIDATION_EPSILON;
+
+    // by default, the initial number of GPU threads per block is informed by
+    // the below cmake variable (before being overridden by env-var or at runtime)
+    int QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK = QUEST_UNSPECIFIED_DEFAULT_NUM_GPU_THREADS_PER_BLOCK;
 }
 
 
@@ -94,7 +101,7 @@ void assertEnvVarsAreLoaded() {
 void validateAndSetWhetherGpuSharingIsPermitted(const char* caller) {
 
     // permit unspecified, falling back to default value
-    string name = envvar_names::PERMIT_NODES_TO_SHARE_GPU;
+    string name = envvar_names::QUEST_PERMIT_NODES_TO_SHARE_GPU;
     if (!isEnvVarSpecified(name))
         return;
 
@@ -103,14 +110,14 @@ void validateAndSetWhetherGpuSharingIsPermitted(const char* caller) {
     validate_envVarPermitNodesToShareGpu(value, caller);
 
     // overwrite default env-var value
-    envvar_values::PERMIT_NODES_TO_SHARE_GPU = (value[0] == '1');
+    envvar_values::QUEST_PERMIT_NODES_TO_SHARE_GPU = (value[0] == '1');
 }
 
 
 void validateAndSetDefaultValidationEpsilon(const char* caller) {
 
     // permit unspecified, falling back to the hardcoded precision-specific default
-    string name = envvar_names::DEFAULT_VALIDATION_EPSILON;
+    string name = envvar_names::QUEST_DEFAULT_VALIDATION_EPSILON;
     if (!isEnvVarSpecified(name))
         return;
     
@@ -119,7 +126,22 @@ void validateAndSetDefaultValidationEpsilon(const char* caller) {
     validate_envVarDefaultValidationEpsilon(value, caller);
 
     // overwrite default env-var value
-    envvar_values::DEFAULT_VALIDATION_EPSILON = parser_parseReal(value);    
+    envvar_values::QUEST_DEFAULT_VALIDATION_EPSILON = parser_parseReal(value);    
+}
+
+
+void validateAndSetDefaultNumGpuThreadsPerBlock(const char* caller) {
+
+    // permit unspecified, falling back to the compile-time configured default
+    string name = envvar_names::QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK;
+    if (!isEnvVarSpecified(name))
+        return;
+
+    string value = getSpecifiedEnvVarValue(name);
+    validate_envVarDefaultNumGpuThreadsPerBlockIsAnInt(value, caller);
+
+    // overwrite default env-var value
+    envvar_values::QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK = parser_parseInteger(value);
 }
 
 
@@ -138,6 +160,7 @@ void envvars_validateAndLoadEnvVars(const char* caller) {
     // load all env-vars
     validateAndSetWhetherGpuSharingIsPermitted(caller);
     validateAndSetDefaultValidationEpsilon(caller);
+    validateAndSetDefaultNumGpuThreadsPerBlock(caller);
 
     // ensure no re-loading
     global_areEnvVarsLoaded = true;
@@ -147,12 +170,19 @@ void envvars_validateAndLoadEnvVars(const char* caller) {
 bool envvars_getWhetherGpuSharingIsPermitted() {
     assertEnvVarsAreLoaded();
 
-    return envvar_values::PERMIT_NODES_TO_SHARE_GPU;
+    return envvar_values::QUEST_PERMIT_NODES_TO_SHARE_GPU;
 }
 
 
 qreal envvars_getDefaultValidationEpsilon() {
     assertEnvVarsAreLoaded();
 
-    return envvar_values::DEFAULT_VALIDATION_EPSILON;
+    return envvar_values::QUEST_DEFAULT_VALIDATION_EPSILON;
+}
+
+
+int envvars_getDefaultNumGpuThreadsPerBlock() {
+    assertEnvVarsAreLoaded();
+
+    return envvar_values::QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK;
 }
diff --git a/quest/src/core/envvars.hpp b/quest/src/core/envvars.hpp
index 828d5605e..4862e8d08 100644
--- a/quest/src/core/envvars.hpp
+++ b/quest/src/core/envvars.hpp
@@ -13,8 +13,9 @@
 
 
 namespace envvar_names { 
-    extern std::string PERMIT_NODES_TO_SHARE_GPU;
-    extern std::string DEFAULT_VALIDATION_EPSILON;
+    extern std::string QUEST_PERMIT_NODES_TO_SHARE_GPU;
+    extern std::string QUEST_DEFAULT_VALIDATION_EPSILON;
+    extern std::string QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK;
 }
 
 
@@ -33,5 +34,7 @@ bool envvars_getWhetherGpuSharingIsPermitted();
 
 qreal envvars_getDefaultValidationEpsilon();
 
+int envvars_getDefaultNumGpuThreadsPerBlock();
+
 
 #endif // ENVVARS_HPP
diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp
index 9e72b1e0b..807cad105 100644
--- a/quest/src/core/errors.cpp
+++ b/quest/src/core/errors.cpp
@@ -41,6 +41,8 @@ using std::string;
 
 void raiseInternalError(string errorMsg) {
 
+    printer_sync();
+
     print(string("")
         + "\n\n"
         + "A fatal internal QuEST error occurred. "
@@ -49,6 +51,8 @@ void raiseInternalError(string errorMsg) {
         + "\n"
     );
 
+    printer_sync();
+
     exit(EXIT_FAILURE);
 }
 
@@ -181,6 +185,26 @@ void error_commNumMessagesExceedTagMax() {
     raiseInternalError("A function attempted to communicate via more messages than permitted (since there would be more uniquely-tagged messages than the tag upperbound).");
 }
 
+void error_commAlreadyHasSetMpiComm() {
+  
+    raiseInternalError("An attempt was made to set the QuEST MPI communicator after it had already been set (and changed from MPI_COMM_NULL).");
+}
+
+void error_commMpiCommIsNull() {
+
+    raiseInternalError("The MPI communicator was queried but was unexpectedly MPI_COMM_NULL.");
+}
+
+void error_commNewMpiCommIsNull() {
+
+    raiseInternalError("The MPI communicator was attemptedly set to MPI_COMM_NULL, which validation should have prior caught.");
+}
+
+void error_commActiveButMpiNotInit() {
+
+    raiseInternalError("QuEST believed communication was active, but MPI_Initialized reported MPI was not initialised.");
+}
+
 void assert_commBoundsAreValid(Qureg qureg, qindex sendInd, qindex recvInd, qindex numAmps) {
 
     bool valid = (
@@ -243,11 +267,6 @@ void assert_receiverCanFitSendersEntireElems(Qureg receiver, FullStateDiagMatr s
  * LOCALISER ERRORS
  */
 
-void error_localiserNumCtrlStatesInconsistentWithNumCtrls() {
-
-    raiseInternalError("An inconsistent number of ctrls and ctrlStates were passed to a function in localiser.cpp.");
-}
-
 void error_localiserGivenPauliTensorOrGadgetWithoutXOrY() {
 
     raiseInternalError("The localiser was asked to simulate a Pauli tensor or gadget which contained no X or Y Paulis, which is a special case reserved for phase gadgets.");
@@ -278,6 +297,11 @@ void error_localiserGivenNonUnityGlobalFactorToZTensor() {
     raiseInternalError("A localiser function to apply a PauliStr (as a tensor, not a gadget) was given a PauliStr containing only Z and I, along with a non-unity global factor. This is an illegal combination.");
 }
 
+void error_calcFidStateVecDistribWhileDensMatrLocal() {
+
+    raiseInternalError("A localiser function attempted to compute the fidelity between a local density matrix and a distributed statevector, which is an illegal combination.");
+}
+
 void assert_localiserSuccessfullyAllocatedTempMemory(qcomp* ptr, bool isGpu) {
 
     if (mem_isAllocated(ptr))
@@ -314,9 +338,10 @@ void assert_localiserPartialTraceGivenCompatibleQuregs(Qureg inQureg, Qureg outQ
         raiseInternalError("Inconsistent Qureg sizes and number of traced qubits given to localiser's partial trace function.");
 }
 
-void error_calcFidStateVecDistribWhileDensMatrLocal() {
+void assert_localiserListLengthsAgree(size_t length1, size_t length2) {
 
-    raiseInternalError("A localiser function attempted to compute the fidelity between a local density matrix and a distributed statevector, which is an illegal combination.");
+    if (length1 != length2)
+        raiseInternalError("Two corresponding lists (such as ctrls & ctrlStates, or qubits & outcomes) passed to localiser.cpp differed in length.");
 }
 
 void assert_localiserDistribQuregSpooferGivenValidQuregs(Qureg local, Qureg distrib) {
@@ -625,6 +650,11 @@ void error_gpuUnexpectedlyInaccessible() {
     raiseInternalError("A function internally assumed (as a precondition) that QuEST was compiled with GPU-acceleration enabled, and that one was physically accessible, though this was untrue.");
 }
 
+void error_gpuNumThreadsPerBlockNotSet() {
+
+    raiseInternalError("A function queried the GPU numThreadsPerBlock before it had been set (intendedly by QuESTEnv initialisation).");
+}
+
 void error_gpuMemSyncQueriedButEnvNotGpuAccelerated() {
 
     raiseInternalError("A function checked whether persistent GPU memory (such as in a CompMatr) had been synchronised, but the QuEST environment is not GPU accelerated.");  
@@ -753,6 +783,37 @@ void error_pauliStrSumConjHasIncorrectNumTerms() {
 
 
 
+/*
+ * LIST ERRORS 
+ */
+
+void error_smallListLengthExceededMax() {
+
+    raiseInternalError("A List64 was attemptedly allocated or grown to an illegally large size.");
+}
+
+void error_smallListIndexWasNegative() {
+
+    raiseInternalError("A List64 index was negative.");
+}
+
+void error_smallListIndexExceededLength() {
+
+    raiseInternalError("A List64 index equalled or exceeded the list length.");
+}
+
+void error_smallListWasEmpty() {
+
+    raiseInternalError("A List64 was unexpectedly empty.");
+}
+
+void error_smallListNullPtrWithPositiveLength() {
+
+    raiseInternalError("The List64 constructor was given a nullptr yet a non-zero length.");
+}
+
+
+
 /*
  * UTILITY ERRORS 
  */
@@ -828,6 +889,16 @@ void error_attemptedToParseRealFromInvalidString() {
     raiseInternalError("A function attempted to parse a string to a qreal but the string was not validly formatted. This should have been caught by prior user validation.");
 }
 
+void error_attemptedToParseIntegerFromInvalidString() {
+
+    raiseInternalError("A function attempted to parse a string to an int but the string was not validly formatted. This should have been caught by prior user validation.");
+}
+
+void error_attemptedToParseOutOfRangeInteger() {
+
+    raiseInternalError("A function attempted to parse a string to an integer but the numerical value of the string literal exceeded the range of the integer. This should have been caught by prior validation.");
+}
+
 void error_attemptedToParseOutOfRangeReal() {
 
     raiseInternalError("A function attempted to parse a string to a qreal but the numerical value of the string literal exceeded the range of the qreal. This should have been caught by prior user validation.");
diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp
index 950ac17ed..f91f890b0 100644
--- a/quest/src/core/errors.hpp
+++ b/quest/src/core/errors.hpp
@@ -4,6 +4,12 @@
  * hardware accelerators are behaving as expected, and that runtime
  * deployment is consistent with the compiled deployment modes.
  * 
+ * Some error() functions are explicitly marked as [[noreturn]] so that
+ * the compiler knows code after their invocation is never executed,
+ * avoiding warnings about (e.g.) invalid static array indexing. In
+ * theory, all error() functions can be [[noreturn]], but we only
+ * bother with the ones that make a compile-time difference.
+ * 
  * @author Tyson Jones
  * @author Luc Jaulmes (NUMA & pagesize errors)
  */
@@ -85,6 +91,14 @@ void error_commGivenInconsistentNumSubArraysANodes();
 
 void error_commNumMessagesExceedTagMax();
 
+void error_commAlreadyHasSetMpiComm();
+
+void error_commMpiCommIsNull();
+
+void error_commNewMpiCommIsNull();
+
+void error_commActiveButMpiNotInit();
+
 void assert_commBoundsAreValid(Qureg qureg, qindex sendInd, qindex recvInd, qindex numAmps);
 
 void assert_commPayloadIsPowerOf2(qindex numAmps);
@@ -107,8 +121,6 @@ void assert_receiverCanFitSendersEntireElems(Qureg receiver, FullStateDiagMatr s
  * LOCALISER ERRORS
  */
 
-void error_localiserNumCtrlStatesInconsistentWithNumCtrls();
-
 void error_localiserGivenPauliTensorOrGadgetWithoutXOrY();
 
 void error_localiserPassedStateVecToChannelComCheck();
@@ -121,6 +133,8 @@ void error_localiserGivenPauliStrWithoutXorY();
 
 void error_localiserGivenNonUnityGlobalFactorToZTensor();
 
+void error_calcFidStateVecDistribWhileDensMatrLocal();
+
 void assert_localiserSuccessfullyAllocatedTempMemory(qcomp* ptr, bool isGpu);
 
 void assert_localiserGivenStateVec(Qureg qureg);
@@ -129,7 +143,7 @@ void assert_localiserGivenDensMatr(Qureg qureg);
 
 void assert_localiserPartialTraceGivenCompatibleQuregs(Qureg inQureg, Qureg outQureg, int numTargs);
 
-void error_calcFidStateVecDistribWhileDensMatrLocal();
+void assert_localiserListLengthsAgree(size_t length1, size_t length2);
 
 void assert_localiserDistribQuregSpooferGivenValidQuregs(Qureg local, Qureg distrib);
 
@@ -235,12 +249,16 @@ void error_gpuCopyButMatrixNotGpuAccelerated();
 
 void error_gpuMemSyncQueriedButEnvNotGpuAccelerated();
 
+void error_gpuNumThreadsPerBlockNotSet();
+
 void error_gpuUnexpectedlyInaccessible();
 
 void error_gpuDeadCopyMatrixFunctionCalled();
 
 void error_gpuDenseMatrixConjugatedAndTransposed();
 
+void error_gpuBadNumThreadsPerBlock();
+
 void assert_gpuIsAccessible();
 
 void assert_gpuHasBeenBound(bool isBound);
@@ -301,6 +319,22 @@ void error_pauliStrSumConjHasIncorrectNumTerms();
 
 
 
+/*
+ * LIST ERRORS 
+ */
+
+[[noreturn]] void error_smallListLengthExceededMax();
+
+[[noreturn]] void error_smallListIndexWasNegative();
+
+[[noreturn]] void error_smallListIndexExceededLength();
+
+[[noreturn]] void error_smallListWasEmpty();
+
+[[noreturn]] void error_smallListNullPtrWithPositiveLength();
+
+
+
 /*
  * UTILITY ERRORS 
  */
@@ -335,6 +369,10 @@ void error_attemptedToParseComplexFromInvalidString();
 
 void error_attemptedToParseRealFromInvalidString();
 
+void error_attemptedToParseIntegerFromInvalidString();
+
+void error_attemptedToParseOutOfRangeInteger();
+
 void error_attemptedToParseOutOfRangeReal();
 
 void error_attemptedToParsePauliStringFromInvalidString();
@@ -383,4 +421,4 @@ void error_unexpectedNumLindbladSuperpropTerms();
 
 
 
-#endif // ERRORS_HPP
\ No newline at end of file
+#endif // ERRORS_HPP
diff --git a/quest/src/core/lists.hpp b/quest/src/core/lists.hpp
new file mode 100644
index 000000000..68a15d3ec
--- /dev/null
+++ b/quest/src/core/lists.hpp
@@ -0,0 +1,251 @@
+/** @file
+ * A stack-based list of length <= 64, primarily
+ * for storing qubit indices, as an alternative to
+ * std::vector and associated heap-alloc/copy
+ * overheads. Use of List64 optimises few-qubit
+ * simulation where STL container costs dominate;
+ * and in the GPU backend, use of List64 avoids
+ * CUDA memory writes before kernel launches!
+ * 
+ * This header also defines ConstList64, which is
+ * merely 'const List64&', to avoid superfluous
+ * stack copies when passing non-mutated List64.
+ * 
+ * The functions herein are inlined (in this header-
+ * only file) in the hopes of unbridled compiler
+ * optimisations, but this may prove incompatible
+ * with GPU mode (since INLINE specifies __device__,
+ * which may be incompatible with initialiser lists).
+ * 
+ * @author Tyson Jones
+ */
+
+#ifndef LISTS_HPP
+#define LISTS_HPP
+
+#include "quest/src/core/errors.hpp"
+#include "quest/src/core/inliner.hpp"
+
+#include <cstddef>
+#include <initializer_list>
+#include <type_traits>
+
+
+
+/*
+ * CAPACITY
+ *
+ * Since it is stored on the stack, we must upper-bound the
+ * length of a List64; we choose 64, which is around the maximum
+ * addressable number of qubits by qindex. In theory, we
+ * could permit users to compile-time reduce this length,
+ * restricting their max simulable system but speeding up
+ * List64 copies in function calls - this may have a
+ * measurable benefit for Quregs of 1-8 qubits. But Donald
+ * Knuth knows and sees all, and he won't be happy!
+ */
+
+
+constexpr size_t MAX_LIST_LENGTH = 64;
+
+
+
+/*
+ * LIST64 DECLARATION
+ *
+ * which mimics an STL container so that it is easily
+ * substituted for std::vector in our codebase, but
+ * crucially, remains (almost) POD and with no heap
+ * allocs, and compatible with CUDA kernels 
+ */
+
+
+struct List64 {
+
+private:
+
+    // Keep data private to dissuade inconsistent
+    // access patterns (e.g. .elems vs .data()),
+    // and so users cannot invalidly mutate length.
+    // Readers may wonder why we avoid std::array;
+    // it has a surprise overhead in pass-by-ref!
+    int elems[MAX_LIST_LENGTH];
+
+    // We use size_t, over the arguably internally
+    // natural int, for consistency with STL containers
+    size_t length;
+
+public:
+
+    // Note there is deliberately no constructor!
+    // This keeps the struct trivial and compatible
+    // with CUDA; we must forego initializer ctors
+    // and other syntactic goodies :(
+
+    // let List64 be iterable, e.g. for(auto x : list)
+    INLINE auto begin()       { return elems; }
+    INLINE auto begin() const { return elems; }
+    INLINE auto end()         { return elems + length; }
+    INLINE auto end()   const { return elems + length; }
+
+    // let List64 be indexable, e.g. list[3]
+    INLINE const int& operator[](int index) const {
+
+        if (index < 0)
+            error_smallListIndexWasNegative();
+        if (index >= static_cast<int>(length))
+            error_smallListIndexExceededLength();
+
+        return elems[index];
+    }
+    INLINE int& operator[](int index) {
+
+        return const_cast<int&>(
+            static_cast<const List64&>(*this)[index]);
+    }
+
+    // give List64 all the familiar methods of std::vector
+    INLINE void clear() {
+        length = 0;
+    }
+    INLINE bool empty() const { 
+        return length == 0; 
+    }
+    INLINE size_t size() const { 
+        return length;
+    }
+    INLINE int* data() {
+        return elems;
+    }
+    INLINE const int* data() const {
+        return elems;
+    }
+
+    INLINE void push_back(int elem) {
+
+        if (length >= MAX_LIST_LENGTH)
+            error_smallListLengthExceededMax();
+
+        elems[length++] = elem;
+    }
+
+    INLINE void resize(size_t newLength, int value=0) {
+
+        if (newLength > MAX_LIST_LENGTH)
+            error_smallListLengthExceededMax();
+
+        for (auto i=length; i<newLength; i++)
+            elems[i] = value;
+
+        length = newLength;
+    }
+
+    INLINE const int& back() const {
+
+        if (length == 0)
+            error_smallListWasEmpty();
+
+        return elems[length - 1];
+    }
+    INLINE int& back() {
+
+        return const_cast<int&>(
+            static_cast<const List64&>(*this).back());
+    }
+
+    INLINE void assign(size_t count, int value) {
+
+        if (count > MAX_LIST_LENGTH)
+            error_smallListLengthExceededMax();
+
+        for (size_t i = 0; i < count; i++)
+            elems[i] = value;
+
+        length = count;
+    }
+};
+
+
+
+/*
+ * LIST64 CONSTRUCTORS
+ *
+ * which are separated here because making them actual
+ * constructors stops List64 being POD/trivial, and
+ * makes it incompatible with CUDA kernels
+ */
+
+
+INLINE List64 lists_getEmptyList64() {
+
+    List64 out{};
+    out.clear();
+    return out;
+}
+
+
+INLINE List64 lists_getList64(const int* begin, const int* end) {
+
+    if (end < begin)
+        error_smallListIndexExceededLength();
+
+    auto length = static_cast<size_t>(end - begin);
+    if (length > MAX_LIST_LENGTH)
+        error_smallListLengthExceededMax();
+
+    List64 out = lists_getEmptyList64();
+
+    for (const int* ptr = begin; ptr != end; ++ptr)
+        out.push_back(*ptr);
+
+    return out;
+}
+
+
+INLINE List64 lists_getList64(const int* elems, size_t length) {
+
+    if (elems == nullptr && length > 0)
+        error_smallListNullPtrWithPositiveLength();
+    
+    // no ptr necessary when list is empty
+    if (elems == nullptr)
+        return lists_getEmptyList64();
+
+    return lists_getList64(elems, elems + length); // validates length <= MAX
+}
+
+
+INLINE List64 lists_getList64(std::initializer_list<int> init) {
+
+    return lists_getList64(init.begin(), init.end());
+}
+
+
+
+/*
+ * ASSERT TRIVIAL
+ *
+ * which doesn't really guarantee CUDA compatibility, but may
+ * catch a developer accidentally breaking compatibility
+ */
+
+
+static_assert(std::is_trivially_copyable_v<List64>);
+static_assert(std::is_standard_layout_v<List64>);
+
+
+
+/*
+ * CONST LIST64 DECLARATION
+ * 
+ * Functions can accept ConstList64 (over List64) to avoid
+ * a stack copy. A List64 can always be passed to a
+ * function accepting a ConstList64, but a ConstList64 can never
+ * be returned from a function (duh).
+ */
+
+using ConstList64 = const List64&;
+
+
+
+#endif // LISTS_HPP
diff --git a/quest/src/core/localiser.cpp b/quest/src/core/localiser.cpp
index 9d4dbce09..83a23b921 100644
--- a/quest/src/core/localiser.cpp
+++ b/quest/src/core/localiser.cpp
@@ -18,6 +18,7 @@
 
 #include "quest/src/core/errors.hpp"
 #include "quest/src/core/bitwise.hpp"
+#include "quest/src/core/lists.hpp"
 #include "quest/src/core/utilities.hpp"
 #include "quest/src/core/paulilogic.hpp"
 #include "quest/src/core/localiser.hpp"
@@ -44,31 +45,7 @@ using std::tuple;
  */
 
 
-void assertValidCtrlStates(vector<int> ctrls, vector<int> ctrlStates) {
-
-    // providing no control states is always valid (to invoke default all-on-1)
-    if (ctrlStates.empty())
-        return;
-
-    // otherwise a state must be explicitly given for each ctrl
-    if (ctrlStates.size() != ctrls.size())
-        error_localiserNumCtrlStatesInconsistentWithNumCtrls();
-}
-
-
-void setDefaultCtrlStates(vector<int> ctrls, vector<int> &states) {
-
-    // no states necessary if there are no control qubits
-    if (ctrls.empty())
-        return;
-
-    // default ctrl state is all-1
-    if (states.empty())
-        states.insert(states.end(), ctrls.size(), 1);
-}
-
-
-bool doesGateRequireComm(Qureg qureg, vector<int> targs) {
+bool doesGateRequireComm(Qureg qureg, ConstList64 targs) {
 
     // non-distributed quregs never communicate (duh)
     if (!qureg.isDistributed)
@@ -80,11 +57,11 @@ bool doesGateRequireComm(Qureg qureg, vector<int> targs) {
 
 bool doesGateRequireComm(Qureg qureg, int targ) {
 
-    return doesGateRequireComm(qureg, vector{targ});
+    return doesGateRequireComm(qureg, lists_getList64({targ}));
 }
 
 
-bool doesChannelRequireComm(Qureg qureg, vector<int> ketQubits) {
+bool doesChannelRequireComm(Qureg qureg, ConstList64 ketQubits) {
     if (!qureg.isDensityMatrix)
         error_localiserPassedStateVecToChannelComCheck();
 
@@ -96,11 +73,11 @@ bool doesChannelRequireComm(Qureg qureg, vector<int> ketQubits) {
 
 bool doesChannelRequireComm(Qureg qureg, int ketQubit) {
 
-    return doesChannelRequireComm(qureg, vector{ketQubit});
+    return doesChannelRequireComm(qureg, lists_getList64({ketQubit}));
 }
 
 
-bool doAnyLocalStatesHaveQubitValues(Qureg qureg, vector<int> qubits, vector<int> states) {
+bool doAnyLocalStatesHaveQubitValues(Qureg qureg, ConstList64 qubits, ConstList64 states) {
 
     // this answers the generic question of "do any of the given qubits lie in the
     // prefix substate with node-fixed values inconsistent with the given states?"
@@ -126,25 +103,23 @@ bool doAnyLocalStatesHaveQubitValues(Qureg qureg, vector<int> qubits, vector<int
 }
 
 
-void removePrefixQubitsAndStates(Qureg qureg, vector<int> &qubits, vector<int> &states) {
+tuple<List64,List64> getSuffixQubitsAndStates(Qureg qureg, ConstList64 qubits, ConstList64 states) {
 
-    vector<int> suffixQubits(0);  suffixQubits.reserve(qubits.size());
-    vector<int> suffixStates(0);  suffixStates.reserve(states.size());
+    List64 suffixQubits = lists_getEmptyList64();
+    List64 suffixStates = lists_getEmptyList64();
 
-    // collect suffix qubits/states
-    for (size_t i=0; i<qubits.size(); i++)
+    for (size_t i=0; i<qubits.size(); i++) {
         if (util_isQubitInSuffix(qubits[i], qureg)) {
             suffixQubits.push_back(qubits[i]);
             suffixStates.push_back(states[i]);
         }
+    }
 
-    // overwrite given vectors
-    qubits = suffixQubits;
-    states = suffixStates;
+    return {suffixQubits, suffixStates};
 }
 
 
-auto getCtrlsAndTargsSwappedToMinSuffix(Qureg qureg, vector<int> ctrls, vector<int> targs) {
+auto getCtrlsAndTargsSwappedToMinSuffix(Qureg qureg, ConstList64 ctrls, ConstList64 targs) {
 
     // this function is called by multi-target dense matrix, and is used to find
     // targets in the prefix substate and where they can be swapped into the suffix
@@ -156,19 +131,25 @@ auto getCtrlsAndTargsSwappedToMinSuffix(Qureg qureg, vector<int> ctrls, vector<i
     if (!doesGateRequireComm(qureg, targs))
         return tuple{ctrls, targs};
 
+    // otherwise, prepare lists to modify
+    List64 outCtrls = ctrls;
+    List64 outTargs = targs;
+    const auto numCtrls = ctrls.size();
+    const auto numTargs = targs.size();
+
     // prepare masks to avoid quadratic nested looping
-    qindex targMask = getBitMask(targs.data(), targs.size());
-    qindex ctrlMask = getBitMask(ctrls.data(), ctrls.size());
+    qindex targMask = util_getBitMask(outTargs);
+    qindex ctrlMask = util_getBitMask(outCtrls);
     int minNonTarg = getIndOfNextRightmostZeroBit(targMask, -1);
 
     // prepare map from control qubit to its index in ctrls list (i.e. inverse of ctrls)
     std::unordered_map<int,int> ctrlInds;
-    for (size_t i=0; i<ctrls.size(); i++)
-        ctrlInds[ctrls[i]] = i;
+    for (size_t i=0; i<numCtrls; i++)
+        ctrlInds[outCtrls[i]] = i;
 
-    // check every target in arbitrary order, modifying our copies of targs and ctrls as we go
-    for (size_t i=0; i<targs.size(); i++) {
-        int targ = targs[i];
+    // check every target in arbitrary order, modifying outTargs and outCtrls as we go
+    for (size_t i=0; i<numTargs; i++) {
+        int targ = outTargs[i];
 
         // consider only targs in the prefix substate
         if (util_isQubitInSuffix(targ, qureg))
@@ -179,7 +160,7 @@ auto getCtrlsAndTargsSwappedToMinSuffix(Qureg qureg, vector<int> ctrls, vector<i
 
             // find and swap that ctrl with the old targ
             int ctrlInd = ctrlInds[minNonTarg];
-            ctrls[ctrlInd] = targ;
+            outCtrls[ctrlInd] = targ;
 
             // update our ctrl trackers
             ctrlInds[targ] = ctrlInd;
@@ -188,7 +169,7 @@ auto getCtrlsAndTargsSwappedToMinSuffix(Qureg qureg, vector<int> ctrls, vector<i
         }
 
         // swap the prefix targ with the smallest available suffix targ
-        targs[i] = minNonTarg;
+        outTargs[i] = minNonTarg;
 
         // update our targ trackers
         targMask = flipTwoBits(targMask, targ, minNonTarg);
@@ -196,11 +177,11 @@ auto getCtrlsAndTargsSwappedToMinSuffix(Qureg qureg, vector<int> ctrls, vector<i
     }
 
     // the ordering in ctrls relative to the caller's ctrlStates is unchanged
-    return tuple{ctrls, targs};
+    return tuple{outCtrls, outTargs};
 }
 
 
-auto getQubitsSwappedToMaxSuffix(Qureg qureg, vector<int> qubits) {
+auto getQubitsSwappedToMaxSuffix(Qureg qureg, ConstList64 qubits) {
 
     // this function is called by any-targ partial trace, and is used to find
     // targets in the prefix substate and where they can be swapped into the suffix
@@ -213,20 +194,23 @@ auto getQubitsSwappedToMaxSuffix(Qureg qureg, vector<int> qubits) {
     if (!doesGateRequireComm(qureg, qubits))
         return qubits;
 
+    // otherwise, prepare list to modify
+    List64 outQubits = qubits;
+
     // prepare mask to avoid quadratic nested looping
-    qindex qubitMask = getBitMask(qubits.data(), qubits.size());
+    qindex qubitMask = util_getBitMask(outQubits);
     int maxFreeSuffixQubit = getIndOfNextLeftmostZeroBit(qubitMask, qureg.logNumAmpsPerNode);
 
     // enumerate qubits backward, modifying our copy of qubits as we go
-    for (size_t i=qubits.size(); i-- != 0; ) {
-        int qubit = qubits[i];
+    for (size_t i=outQubits.size(); i-- != 0; ) {
+        int qubit = outQubits[i];
 
         // consider only qubits in the prefix substate
         if (util_isQubitInSuffix(qubit, qureg))
             continue;
 
         // swap the prefix qubit into the largest available suffix position
-        qubits[i] = maxFreeSuffixQubit;
+        outQubits[i] = maxFreeSuffixQubit;
 
         // update trackers
         qubitMask = flipTwoBits(qubitMask, qubit, maxFreeSuffixQubit);
@@ -234,20 +218,21 @@ auto getQubitsSwappedToMaxSuffix(Qureg qureg, vector<int> qubits) {
     }
 
     // return our modified copy
-    return qubits;
+    return outQubits;
 }
 
 
-auto getNonSwappedCtrlsAndStates(vector<int> oldCtrls, vector<int> oldStates, vector<int> newCtrls) {
+auto getNonSwappedCtrlsAndStates(ConstList64 oldCtrls, ConstList64 oldStates, ConstList64 newCtrls) {
 
-    vector<int> sameCtrls(0);   sameCtrls .reserve(oldCtrls.size());
-    vector<int> sameStates(0);  sameStates.reserve(oldStates.size());
+    auto sameCtrls = lists_getEmptyList64();
+    auto sameStates = lists_getEmptyList64();
 
-    for (size_t i=0; i<oldCtrls.size(); i++)
+    for (size_t i=0; i<oldCtrls.size(); i++) {
         if (oldCtrls[i] == newCtrls[i]) {
             sameCtrls .push_back(oldCtrls[i]);
             sameStates.push_back(oldStates[i]);
         }
+    }
 
     return tuple{sameCtrls, sameStates};
 }
@@ -446,7 +431,7 @@ void freeSpoofedLocalStateVec(Qureg spoof, bool wasMemAlloc) {
  */
 
 
-void exchangeAmpsToBuffersWhereQubitsAreInStates(Qureg qureg, int pairRank, vector<int> qubits, vector<int> states) {
+void exchangeAmpsToBuffersWhereQubitsAreInStates(Qureg qureg, int pairRank, ConstList64 qubits, ConstList64 states) {
 
     // when there are no constraining qubits, all amps are exchanged; there is no need to pack the buffer.
     // this is typically triggered when a communicating localiser function is given no control qubits
@@ -839,7 +824,7 @@ void localiser_densmatr_initMixtureOfUniformlyRandomPureStates(Qureg qureg, qind
  */
 
 
-void anyCtrlSwapBetweenPrefixAndPrefix(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2) {
+void anyCtrlSwapBetweenPrefixAndPrefix(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2) {
 
     int prefInd1 = util_getPrefixInd(targ1, qureg);
     int prefInd2 = util_getPrefixInd(targ2, qureg);
@@ -857,15 +842,15 @@ void anyCtrlSwapBetweenPrefixAndPrefix(Qureg qureg, vector<int> ctrls, vector<in
 }
 
 
-void anyCtrlSwapBetweenPrefixAndSuffix(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int suffixTarg, int prefixTarg) {
+void anyCtrlSwapBetweenPrefixAndSuffix(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int suffixTarg, int prefixTarg) {
 
     // every node exchanges at most half its amps; those where suffixTarg bit differs from rank's fixed prefixTarg bit
     int pairRank = util_getRankWithQubitFlipped(prefixTarg, qureg);
     int suffixState =  ! util_getRankBitOfQubit(prefixTarg, qureg);
 
     // pack and exchange only to-be-communicated amps between sub-buffers
-    vector<int> qubits = ctrls;
-    vector<int> states = ctrlStates;
+    auto qubits = ctrls;
+    auto states = ctrlStates;
     qubits.push_back(suffixTarg);
     states.push_back(suffixState);
     exchangeAmpsToBuffersWhereQubitsAreInStates(qureg, pairRank, qubits, states);
@@ -875,10 +860,9 @@ void anyCtrlSwapBetweenPrefixAndSuffix(Qureg qureg, vector<int> ctrls, vector<in
 }
 
 
-void localiser_statevec_anyCtrlSwap(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2) {
-    assertValidCtrlStates(ctrls, ctrlStates);
-    setDefaultCtrlStates(ctrls, ctrlStates);
-
+void localiser_statevec_anyCtrlSwap(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2) {
+    assert_localiserListLengthsAgree(ctrls.size(), ctrlStates.size());
+    
     // ensure targ2 > targ1
     if (targ1 > targ2)
         std::swap(targ1, targ2);
@@ -888,18 +872,18 @@ void localiser_statevec_anyCtrlSwap(Qureg qureg, vector<int> ctrls, vector<int>
         return;
 
     // retain only suffix control qubits as relevant to communication and local amp modification
-    removePrefixQubitsAndStates(qureg, ctrls, ctrlStates);
+    auto [suffixCtrls, suffixCtrlStates] = getSuffixQubitsAndStates(qureg, ctrls, ctrlStates);
 
     // determine necessary communication
     bool comm1 = doesGateRequireComm(qureg, targ1);
     bool comm2 = doesGateRequireComm(qureg, targ2);
 
     if (comm2 && comm1)
-        anyCtrlSwapBetweenPrefixAndPrefix(qureg, ctrls, ctrlStates, targ1, targ2);
+        anyCtrlSwapBetweenPrefixAndPrefix(qureg, suffixCtrls, suffixCtrlStates, targ1, targ2);
     if (comm2 && !comm1)
-        anyCtrlSwapBetweenPrefixAndSuffix(qureg, ctrls, ctrlStates, targ1, targ2);
+        anyCtrlSwapBetweenPrefixAndSuffix(qureg, suffixCtrls, suffixCtrlStates, targ1, targ2);
     if (!comm2 && !comm1)
-        accel_statevec_anyCtrlSwap_subA(qureg, ctrls, ctrlStates, targ1, targ2);
+        accel_statevec_anyCtrlSwap_subA(qureg, suffixCtrls, suffixCtrlStates, targ1, targ2);
 }
 
 
@@ -909,7 +893,7 @@ void localiser_statevec_anyCtrlSwap(Qureg qureg, vector<int> ctrls, vector<int>
  */
 
 
-void anyCtrlMultiSwapBetweenPrefixAndSuffix(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targsA, vector<int> targsB) {
+void anyCtrlMultiSwapBetweenPrefixAndSuffix(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targsA, ConstList64 targsB) {
 
     // this is an internal function called by the below routines which require
     // performing a sequence of SWAPs to reorder qubits, or move them into suffix.
@@ -944,7 +928,7 @@ void anyCtrlMultiSwapBetweenPrefixAndSuffix(Qureg qureg, vector<int> ctrls, vect
  */
 
 
-void anyCtrlOneTargDenseMatrOnPrefix(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, CompMatr1 matr) {
+void anyCtrlOneTargDenseMatrOnPrefix(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ, CompMatr1 matr) {
   
     int pairRank = util_getRankWithQubitFlipped(targ, qureg);
     exchangeAmpsToBuffersWhereQubitsAreInStates(qureg, pairRank, ctrls, ctrlStates);
@@ -959,16 +943,15 @@ void anyCtrlOneTargDenseMatrOnPrefix(Qureg qureg, vector<int> ctrls, vector<int>
 }
 
 
-void localiser_statevec_anyCtrlOneTargDenseMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, CompMatr1 matr, bool conj, bool transp) {
-    assertValidCtrlStates(ctrls, ctrlStates);
-    setDefaultCtrlStates(ctrls, ctrlStates);
+void localiser_statevec_anyCtrlOneTargDenseMatr(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ, CompMatr1 matr, bool conj, bool transp) {
+    assert_localiserListLengthsAgree(ctrls.size(), ctrlStates.size());
 
     // node has nothing to do if all local amps violate control condition
     if (!doAnyLocalStatesHaveQubitValues(qureg, ctrls, ctrlStates))
         return;
 
     // retain only suffix control qubits as relevant to communication and local amp modification
-    removePrefixQubitsAndStates(qureg, ctrls, ctrlStates);
+    auto [suffixCtrls, suffixCtrlStates] = getSuffixQubitsAndStates(qureg, ctrls, ctrlStates);
 
     // only one of conj or transp will be true (but logic is correct if both were true)
     if (conj) 
@@ -978,8 +961,8 @@ void localiser_statevec_anyCtrlOneTargDenseMatr(Qureg qureg, vector<int> ctrls,
 
     // perform embarrassingly parallel routine or communication-inducing swaps
     doesGateRequireComm(qureg, targ)?
-        anyCtrlOneTargDenseMatrOnPrefix(qureg, ctrls, ctrlStates, targ, matr) :
-        accel_statevec_anyCtrlOneTargDenseMatr_subA(qureg, ctrls, ctrlStates, targ, matr);
+        anyCtrlOneTargDenseMatrOnPrefix(qureg, suffixCtrls, suffixCtrlStates, targ, matr) :
+        accel_statevec_anyCtrlOneTargDenseMatr_subA(qureg, suffixCtrls, suffixCtrlStates, targ, matr);
 }
 
 
@@ -992,21 +975,21 @@ void localiser_statevec_anyCtrlOneTargDenseMatr(Qureg qureg, vector<int> ctrls,
  */
 
 
-void anyCtrlTwoOrAnyTargDenseMatrOnSuffix(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr2 matr, bool conj, bool transp) {
+void anyCtrlTwoOrAnyTargDenseMatrOnSuffix(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, CompMatr2 matr, bool conj, bool transp) {
     if (conj) 
         matr = util_getConj(matr);
     if (transp)
         matr = util_getTranspose(matr);
     accel_statevec_anyCtrlTwoTargDenseMatr_sub(qureg, ctrls, ctrlStates, targs[0], targs[1], matr);
 }
-void anyCtrlTwoOrAnyTargDenseMatrOnSuffix(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr  matr, bool conj, bool transp) {
+void anyCtrlTwoOrAnyTargDenseMatrOnSuffix(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, CompMatr  matr, bool conj, bool transp) {
     accel_statevec_anyCtrlAnyTargDenseMatr_sub(qureg, ctrls, ctrlStates, targs, matr, conj, transp);
 }
 
 
 // T can be CompMatr2 or CompMatr
 template <typename T>
-void anyCtrlTwoOrAnyTargDenseMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, T matr, bool conj, bool transp) {
+void anyCtrlTwoOrAnyTargDenseMatr(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, T matr, bool conj, bool transp) {
 
     // node has nothing to do if all local amps violate control condition
     if (!doAnyLocalStatesHaveQubitValues(qureg, ctrls, ctrlStates))
@@ -1016,8 +999,8 @@ void anyCtrlTwoOrAnyTargDenseMatr(Qureg qureg, vector<int> ctrls, vector<int> ct
     if (!doesGateRequireComm(qureg, targs)) {
 
         // using only the suffix ctrls
-        removePrefixQubitsAndStates(qureg, ctrls, ctrlStates);
-        anyCtrlTwoOrAnyTargDenseMatrOnSuffix(qureg, ctrls, ctrlStates, targs, matr, conj, transp);
+        auto [suffixCtrls, suffixCtrlStates] = getSuffixQubitsAndStates(qureg, ctrls, ctrlStates);
+        anyCtrlTwoOrAnyTargDenseMatrOnSuffix(qureg, suffixCtrls, suffixCtrlStates, targs, matr, conj, transp);
         return;
     }
 
@@ -1033,8 +1016,8 @@ void anyCtrlTwoOrAnyTargDenseMatr(Qureg qureg, vector<int> ctrls, vector<int> ct
     /// order to accelerate them (since more ctrls = fewer comm). However, this is strangely not
     /// working; controlling the SWAPs upon these 'meta' control qubits is breaking the unit tests!
     /// Until we better understand this, we disable this optimisation by removing all SWAP controls.
-    unmovedCtrls = {};
-    unmovedCtrlStates = {};
+    unmovedCtrls      = lists_getEmptyList64();
+    unmovedCtrlStates = lists_getEmptyList64();
 
     // perform necessary swaps to move all targets into suffix, invoking communication (swaps are real, so no need to conj)
     anyCtrlMultiSwapBetweenPrefixAndSuffix(qureg, unmovedCtrls, unmovedCtrlStates, targs, newTargs);
@@ -1043,8 +1026,8 @@ void anyCtrlTwoOrAnyTargDenseMatr(Qureg qureg, vector<int> ctrls, vector<int> ct
     if (doAnyLocalStatesHaveQubitValues(qureg, newCtrls, ctrlStates)) {
 
         // perform embarrassingly parallel simulation using only the new suffix ctrls
-        removePrefixQubitsAndStates(qureg, newCtrls, ctrlStates);
-        anyCtrlTwoOrAnyTargDenseMatrOnSuffix(qureg, newCtrls, ctrlStates, newTargs, matr, conj, transp);
+        auto [newSuffixCtrls, suffixCtrlStates] = getSuffixQubitsAndStates(qureg, newCtrls, ctrlStates);
+        anyCtrlTwoOrAnyTargDenseMatrOnSuffix(qureg, newSuffixCtrls, suffixCtrlStates, newTargs, matr, conj, transp);
     }
 
     // undo swaps, again invoking communication
@@ -1052,17 +1035,15 @@ void anyCtrlTwoOrAnyTargDenseMatr(Qureg qureg, vector<int> ctrls, vector<int> ct
 }
 
 
-void localiser_statevec_anyCtrlTwoTargDenseMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2, CompMatr2 matr, bool conj, bool transp) {
-    assertValidCtrlStates(ctrls, ctrlStates);
-    setDefaultCtrlStates(ctrls, ctrlStates);
+void localiser_statevec_anyCtrlTwoTargDenseMatr(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2, CompMatr2 matr, bool conj, bool transp) {
+    assert_localiserListLengthsAgree(ctrls.size(), ctrlStates.size());
 
-    anyCtrlTwoOrAnyTargDenseMatr(qureg, ctrls, ctrlStates, {targ1,targ2}, matr, conj, transp);
+    anyCtrlTwoOrAnyTargDenseMatr(qureg, ctrls, ctrlStates, lists_getList64({targ1,targ2}), matr, conj, transp);
 }
 
 
-void localiser_statevec_anyCtrlAnyTargDenseMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr matr, bool conj, bool transp) {
-    assertValidCtrlStates(ctrls, ctrlStates);
-    setDefaultCtrlStates(ctrls, ctrlStates);
+void localiser_statevec_anyCtrlAnyTargDenseMatr(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, CompMatr matr, bool conj, bool transp) {
+    assert_localiserListLengthsAgree(ctrls.size(), ctrlStates.size());
 
     // despite our use of compile-time templating, the bespoke one-targ routines are still faster 
     // than this any-targ routine when given a single target, because they can leverage a bespoke
@@ -1098,9 +1079,8 @@ void localiser_statevec_anyCtrlAnyTargDenseMatr(Qureg qureg, vector<int> ctrls,
  */
 
 
-void localiser_statevec_anyCtrlOneTargDiagMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, DiagMatr1 matr, bool conj) {
-    assertValidCtrlStates(ctrls, ctrlStates);
-    setDefaultCtrlStates(ctrls, ctrlStates);
+void localiser_statevec_anyCtrlOneTargDiagMatr(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ, DiagMatr1 matr, bool conj) {
+    assert_localiserListLengthsAgree(ctrls.size(), ctrlStates.size());
 
     // node has nothing to do if all local amps violate control condition
     if (!doAnyLocalStatesHaveQubitValues(qureg, ctrls, ctrlStates))
@@ -1109,15 +1089,14 @@ void localiser_statevec_anyCtrlOneTargDiagMatr(Qureg qureg, vector<int> ctrls, v
     if (conj)
         matr = util_getConj(matr);
 
-    // retain only suffix control qubits, as relevant to local amp modification
-    removePrefixQubitsAndStates(qureg, ctrls, ctrlStates);
-    accel_statevec_anyCtrlOneTargDiagMatr_sub(qureg, ctrls, ctrlStates, targ, matr);
+    // only suffix control qubits are relevant to local amp modification
+    auto [suffixCtrls, suffixCtrlStates] = getSuffixQubitsAndStates(qureg, ctrls, ctrlStates);
+    accel_statevec_anyCtrlOneTargDiagMatr_sub(qureg, suffixCtrls, suffixCtrlStates, targ, matr);
 }
 
 
-void localiser_statevec_anyCtrlTwoTargDiagMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2, DiagMatr2 matr, bool conj) {
-    assertValidCtrlStates(ctrls, ctrlStates);
-    setDefaultCtrlStates(ctrls, ctrlStates);
+void localiser_statevec_anyCtrlTwoTargDiagMatr(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2, DiagMatr2 matr, bool conj) {
+    assert_localiserListLengthsAgree(ctrls.size(), ctrlStates.size());
 
     // node has nothing to do if all local amps violate control condition
     if (!doAnyLocalStatesHaveQubitValues(qureg, ctrls, ctrlStates))
@@ -1126,23 +1105,22 @@ void localiser_statevec_anyCtrlTwoTargDiagMatr(Qureg qureg, vector<int> ctrls, v
     if (conj)
         matr = util_getConj(matr);
 
-    // retain only suffix control qubits, as relevant to local amp modification
-    removePrefixQubitsAndStates(qureg, ctrls, ctrlStates);
-    accel_statevec_anyCtrlTwoTargDiagMatr_sub(qureg, ctrls, ctrlStates, targ1, targ2, matr);
+    // only suffix control qubits are relevant to local amp modification
+    auto [suffixCtrls, suffixCtrlStates] = getSuffixQubitsAndStates(qureg, ctrls, ctrlStates);
+    accel_statevec_anyCtrlTwoTargDiagMatr_sub(qureg, suffixCtrls, suffixCtrlStates, targ1, targ2, matr);
 }
 
 
-void localiser_statevec_anyCtrlAnyTargDiagMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, DiagMatr matr, qcomp exponent, bool conj) {
-    assertValidCtrlStates(ctrls, ctrlStates);
-    setDefaultCtrlStates(ctrls, ctrlStates);
+void localiser_statevec_anyCtrlAnyTargDiagMatr(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, DiagMatr matr, qcomp exponent, bool conj) {
+    assert_localiserListLengthsAgree(ctrls.size(), ctrlStates.size());
 
     // node has nothing to do if all local amps violate control condition
     if (!doAnyLocalStatesHaveQubitValues(qureg, ctrls, ctrlStates))
         return;
 
-    // retain only suffix control qubits, as relevant to local amp modification
-    removePrefixQubitsAndStates(qureg, ctrls, ctrlStates);
-    accel_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, ctrlStates, targs, matr, exponent, conj);
+    // only suffix control qubits are relevant to local amp modification
+    auto [suffixCtrls, suffixCtrlStates] = getSuffixQubitsAndStates(qureg, ctrls, ctrlStates);
+    accel_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, suffixCtrls, suffixCtrlStates, targs, matr, exponent, conj);
 }
 
 
@@ -1226,7 +1204,7 @@ void localiser_densmatr_allTargDiagMatr(Qureg qureg, FullStateDiagMatr matr, qco
 
 
 template <class T>
-void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, T matr, bool conj) {
+void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, T matr, bool conj) {
 
     // this function is never invoked by operations whch require transposing matr
     bool transp = false;
@@ -1244,12 +1222,12 @@ void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg qureg, vector<int> ctrls, ve
     if constexpr (util_isCompMatr2<T>()) localiser_statevec_anyCtrlTwoTargDenseMatr(qureg, ctrls, ctrlStates, targs[0], targs[1], matr, conj, transp);
 }
 
-template void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg, vector<int>, vector<int>, vector<int>, DiagMatr,  bool);
-template void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg, vector<int>, vector<int>, vector<int>, DiagMatr1, bool);
-template void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg, vector<int>, vector<int>, vector<int>, DiagMatr2, bool);
-template void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg, vector<int>, vector<int>, vector<int>, CompMatr,  bool);
-template void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg, vector<int>, vector<int>, vector<int>, CompMatr1, bool);
-template void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg, vector<int>, vector<int>, vector<int>, CompMatr2, bool);
+template void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg, ConstList64, ConstList64, ConstList64, DiagMatr,  bool);
+template void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg, ConstList64, ConstList64, ConstList64, DiagMatr1, bool);
+template void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg, ConstList64, ConstList64, ConstList64, DiagMatr2, bool);
+template void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg, ConstList64, ConstList64, ConstList64, CompMatr,  bool);
+template void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg, ConstList64, ConstList64, ConstList64, CompMatr1, bool);
+template void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg, ConstList64, ConstList64, ConstList64, CompMatr2, bool);
 
 
 
@@ -1258,16 +1236,14 @@ template void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg, vector<int>, vecto
  */
 
 
-void anyCtrlZTensorOrGadget(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, bool isGadget, qcomp phase) {     
-    assertValidCtrlStates(ctrls, ctrlStates);
-    setDefaultCtrlStates(ctrls, ctrlStates);
+void anyCtrlZTensorOrGadget(Qureg qureg, ConstList64 allCtrls, ConstList64 allCtrlStates, ConstList64 targs, bool isGadget, qcomp phase) {
 
     // node has nothing to do if all local amps violate control condition
-    if (!doAnyLocalStatesHaveQubitValues(qureg, ctrls, ctrlStates))
+    if (!doAnyLocalStatesHaveQubitValues(qureg, allCtrls, allCtrlStates))
         return;
 
     // retain only suffix control qubits, as relevant to local amp modification
-    removePrefixQubitsAndStates(qureg, ctrls, ctrlStates);
+    auto [suffixCtrls, suffixCtrlStates] = getSuffixQubitsAndStates(qureg, allCtrls, allCtrlStates);
 
     // prefixZ merely applies a node-wide factor to fac0 and fac1
     auto [prefixZ, suffixZ] = util_getPrefixAndSuffixQubits(targs, qureg);
@@ -1278,24 +1254,22 @@ void anyCtrlZTensorOrGadget(Qureg qureg, vector<int> ctrls, vector<int> ctrlStat
     qcomp fac1 = (isGadget)? std::exp(- phase * sign * 1_i) : -1 * sign;
 
     // simulation is always embarrassingly parallel
-    accel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(qureg, ctrls, ctrlStates, suffixZ, fac0, fac1);
+    accel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(qureg, suffixCtrls, suffixCtrlStates, suffixZ, fac0, fac1);
 }
 
 
-void anyCtrlPauliTensorOrGadget(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, PauliStr str, qcomp ampFac, qcomp pairAmpFac) {
-    assertValidCtrlStates(ctrls, ctrlStates);
-    setDefaultCtrlStates(ctrls, ctrlStates);
+void anyCtrlPauliTensorOrGadget(Qureg qureg, ConstList64 allCtrls, ConstList64 allCtrlStates, PauliStr str, qcomp ampFac, qcomp pairAmpFac) {
 
     // this routine is invalid for str=ZI
     if (!paulis_containsXOrY(str))
         error_localiserGivenPauliStrWithoutXorY();
 
     // node has nothing to do if all local amps violate control condition
-    if (!doAnyLocalStatesHaveQubitValues(qureg, ctrls, ctrlStates))
+    if (!doAnyLocalStatesHaveQubitValues(qureg, allCtrls, allCtrlStates))
         return;
 
     // retain only suffix control qubits, as relevant to local amp modification
-    removePrefixQubitsAndStates(qureg, ctrls, ctrlStates);
+    auto [suffixCtrls, suffixCtrlStates] = getSuffixQubitsAndStates(qureg, allCtrls, allCtrlStates);
 
     // partition non-Id Paulis into prefix and suffix, since...
     // - prefix X,Y determine communication, because they apply bit-not to rank
@@ -1311,26 +1285,29 @@ void anyCtrlPauliTensorOrGadget(Qureg qureg, vector<int> ctrls, vector<int> ctrl
 
     // embarrassingly parallel when there is only Z's in prefix
     if (prefixX.empty() && prefixY.empty()) {
-        accel_statevector_anyCtrlPauliTensorOrGadget_subA(qureg, ctrls, ctrlStates, suffixX, suffixY, suffixZ, ampFac, pairAmpFac);
+        accel_statevector_anyCtrlPauliTensorOrGadget_subA(
+            qureg, suffixCtrls, suffixCtrlStates, suffixX, suffixY, suffixZ, ampFac, pairAmpFac);
         return;
     }
 
     // otherwise, we pair-wise communicate amps satisfying ctrls
     auto prefixXY = util_getConcatenated(prefixX, prefixY);
     int pairRank = util_getRankWithQubitsFlipped(prefixXY, qureg);
-    exchangeAmpsToBuffersWhereQubitsAreInStates(qureg, pairRank, ctrls, ctrlStates);
+    exchangeAmpsToBuffersWhereQubitsAreInStates(qureg, pairRank, suffixCtrls, suffixCtrlStates);
 
     // ctrls reduce communicated amps, so received buffer is compacted;
     // we must ergo prepare a no-ctrl XY mask for accessing buffer elems
-    auto sortedCtrls = util_getSorted(ctrls);
+    auto sortedCtrls = util_getSorted(suffixCtrls);
     auto suffixMaskXY = util_getBitMask(util_getConcatenated(suffixX, suffixY));
     auto bufferMaskXY = removeBits(suffixMaskXY, sortedCtrls.data(), sortedCtrls.size());
 
-    accel_statevector_anyCtrlPauliTensorOrGadget_subB(qureg, ctrls, ctrlStates, suffixX, suffixY, suffixZ, ampFac, pairAmpFac, bufferMaskXY);
+    accel_statevector_anyCtrlPauliTensorOrGadget_subB(
+        qureg, suffixCtrls, suffixCtrlStates, suffixX, suffixY, suffixZ, ampFac, pairAmpFac, bufferMaskXY);
 }
 
 
-void localiser_statevec_anyCtrlPauliTensor(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, PauliStr str, qcomp factor) {
+void localiser_statevec_anyCtrlPauliTensor(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, PauliStr str, qcomp factor) {
+    assert_localiserListLengthsAgree(ctrls.size(), ctrlStates.size());
 
     // this function accepts a global factor, so that density matrices can effect conj(pauli)
 
@@ -1353,14 +1330,16 @@ void localiser_statevec_anyCtrlPauliTensor(Qureg qureg, vector<int> ctrls, vecto
 }
 
 
-void localiser_statevec_anyCtrlPhaseGadget(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, qcomp phase) {
+void localiser_statevec_anyCtrlPhaseGadget(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, qcomp phase) {
+    assert_localiserListLengthsAgree(ctrls.size(), ctrlStates.size());
 
     bool isGadget = true;
     anyCtrlZTensorOrGadget(qureg, ctrls, ctrlStates, targs, isGadget, phase);
 }
 
 
-void localiser_statevec_anyCtrlPauliGadget(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, PauliStr str, qcomp phase) {
+void localiser_statevec_anyCtrlPauliGadget(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, PauliStr str, qcomp phase) {
+    assert_localiserListLengthsAgree(ctrls.size(), ctrlStates.size());
 
     // when str=IZ, we must use the above bespoke algorithm
     if (!paulis_containsXOrY(str)) {
@@ -1486,7 +1465,7 @@ void oneQubitDepolarisingOnPrefix(Qureg qureg, int ketQubit, qreal prob) {
     // pack and exchange amps to buffers where local ket qubit and fixed-prefix-bra qubit agree
     int braBit = util_getRankBitOfBraQubit(ketQubit, qureg);
     int pairRank = util_getRankWithBraQubitFlipped(ketQubit, qureg);
-    exchangeAmpsToBuffersWhereQubitsAreInStates(qureg, pairRank, {ketQubit}, {braBit});
+    exchangeAmpsToBuffersWhereQubitsAreInStates(qureg, pairRank, lists_getList64({ketQubit}), lists_getList64({braBit}));
 
     // use received sub-buffer to update local amps
     accel_densmatr_oneQubitDepolarising_subB(qureg, ketQubit, prob);
@@ -1536,7 +1515,8 @@ void twoQubitDepolarisingOnPrefixAndPrefix(Qureg qureg, int ketQb1, int ketQb2,
     int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg);
 
     // pack unscaled amps before subsequent scaling
-    qindex numPacked = accel_statevec_packAmpsIntoBuffer(qureg, {ketQb1,ketQb2}, {braBit1,braBit2});
+    auto ketList = lists_getList64({ketQb1,ketQb2});
+    qindex numPacked = accel_statevec_packAmpsIntoBuffer(qureg, ketList, lists_getList64({braBit1,braBit2}));
 
     // scale all amps
     accel_densmatr_twoQubitDepolarising_subE(qureg, ketQb1, ketQb2, prob);
@@ -1544,7 +1524,7 @@ void twoQubitDepolarisingOnPrefixAndPrefix(Qureg qureg, int ketQb1, int ketQb2,
     // swap the buffer with 3 other nodes to update local amps
     int pairRank1 = util_getRankWithBraQubitFlipped(ketQb1, qureg);
     int pairRank2 = util_getRankWithBraQubitFlipped(ketQb2, qureg);
-    int pairRank3 = util_getRankWithBraQubitsFlipped({ketQb1,ketQb2}, qureg);
+    int pairRank3 = util_getRankWithBraQubitsFlipped(ketList, qureg);
 
     comm_exchangeSubBuffers(qureg, numPacked, pairRank1);
     accel_densmatr_twoQubitDepolarising_subF(qureg, ketQb1, ketQb2, prob);
@@ -1631,7 +1611,7 @@ void oneQubitDampingOnPrefix(Qureg qureg, int ketQubit, qreal prob) {
     if (braBit == 1) {
 
         // pack and async send half the buffer
-        accel_statevec_packAmpsIntoBuffer(qureg, {ketQubit}, {1});
+        accel_statevec_packAmpsIntoBuffer(qureg, lists_getList64({ketQubit}), lists_getList64({1}));
         comm_asynchSendSubBuffer(qureg, numAmps, pairRank);
 
         // scale the local amps which were just sent
@@ -1692,7 +1672,7 @@ CompMatr getSpoofedCompMatrFromSuperOp(SuperOp op) {
 }
 
 
-void localiser_densmatr_superoperator(Qureg qureg, SuperOp op, vector<int> ketTargs) {
+void localiser_densmatr_superoperator(Qureg qureg, SuperOp op, ConstList64 ketTargs) {
     assert_localiserGivenDensMatr(qureg);
 
     // effect the superoperator as a dense matrix on the ket + bra qubits
@@ -1701,11 +1681,12 @@ void localiser_densmatr_superoperator(Qureg qureg, SuperOp op, vector<int> ketTa
     auto braTargs = util_getBraQubits(ketTargs, qureg);
     auto allTargs = util_getConcatenated(ketTargs, braTargs);
     CompMatr matr = getSpoofedCompMatrFromSuperOp(op);
-    localiser_statevec_anyCtrlAnyTargDenseMatr(qureg, {}, {}, allTargs, matr, conj, transp);
+    List64 empty = lists_getEmptyList64();
+    localiser_statevec_anyCtrlAnyTargDenseMatr(qureg, empty, empty, allTargs, matr, conj, transp);
 }
 
 
-void localiser_densmatr_krausMap(Qureg qureg, KrausMap map, vector<int> ketTargs) {
+void localiser_densmatr_krausMap(Qureg qureg, KrausMap map, ConstList64 ketTargs) {
     
     // Kraus map is simulated through its existing superoperator
     localiser_densmatr_superoperator(qureg, map.superop, ketTargs);
@@ -1718,12 +1699,10 @@ void localiser_densmatr_krausMap(Qureg qureg, KrausMap map, vector<int> ketTargs
  */
 
 
-auto getNonTracedQubitOrder(Qureg qureg, vector<int> originalTargs, vector<int> revisedTargs) {
+auto getNonTracedQubitOrder(Qureg qureg, ConstList64 originalTargs, ConstList64 revisedTargs) {
 
-    // prepare a list of all the qureg's qubits when treated as a statevector
-    vector<int> allQubits(2*qureg.numQubits);
-    for (size_t q=0; q<allQubits.size(); q++)
-        allQubits[q] = q;
+    // get a list of all the qureg's qubits when treated as a statevector
+    auto allQubits = util_getRange(2 * qureg.numQubits);
     
     // determine the ordering of all the Qureg's qubits after swaps
     for (size_t i=0; i<originalTargs.size(); i++) {
@@ -1737,8 +1716,7 @@ auto getNonTracedQubitOrder(Qureg qureg, vector<int> originalTargs, vector<int>
     qindex revisedMask = util_getBitMask(revisedTargs);
 
     // retain only non-targeted qubits
-    vector<int> remainingQubits;
-    remainingQubits.reserve(allQubits.size() - originalTargs.size());
+    auto remainingQubits = lists_getEmptyList64();
     for (size_t q=0; q<allQubits.size(); q++)
         if (!getBit(revisedMask, q))
             remainingQubits.push_back(allQubits[q]);
@@ -1758,7 +1736,7 @@ auto getNonTracedQubitOrder(Qureg qureg, vector<int> originalTargs, vector<int>
 }
 
 
-void reorderReducedQureg(Qureg inQureg, Qureg outQureg, vector<int> allTargs, vector<int> suffixTargs) {
+void reorderReducedQureg(Qureg inQureg, Qureg outQureg, ConstList64 allTargs, ConstList64 suffixTargs) {
 
     /// @todo 
     /// this function performs a sequence of SWAPs which are NOT necessarily upon disjoint qubits,
@@ -1770,7 +1748,7 @@ void reorderReducedQureg(Qureg inQureg, Qureg outQureg, vector<int> allTargs, ve
     auto remainingQubits = getNonTracedQubitOrder(inQureg, allTargs, suffixTargs);
 
    // perform additional swaps to re-order the remaining qubits (heuristically starting from back)
-    for (int qubit=(int)remainingQubits.size(); qubit-- != 0; ) {
+    for (int qubit=remainingQubits.size(); qubit-- != 0; ) {
 
         // locate the next qubit which is out of its sorted position
         if (remainingQubits[qubit] == qubit)
@@ -1782,20 +1760,21 @@ void reorderReducedQureg(Qureg inQureg, Qureg outQureg, vector<int> allTargs, ve
             pair++;
         
         // and swap it directly to its required position, triggering any communication scenario (I think)
-        localiser_statevec_anyCtrlSwap(outQureg, {}, {}, qubit, pair);
+        auto empty = lists_getEmptyList64();
+        localiser_statevec_anyCtrlSwap(outQureg, empty, empty, qubit, pair);
         std::swap(remainingQubits[qubit], remainingQubits[pair]);
     }
 }
 
 
-void partialTraceOnSuffix(Qureg inQureg, Qureg outQureg, vector<int> ketTargs) {
+void partialTraceOnSuffix(Qureg inQureg, Qureg outQureg, ConstList64 ketTargs) {
 
     auto braTargs = util_getBraQubits(ketTargs, inQureg);
     accel_densmatr_partialTrace_sub(inQureg, outQureg, ketTargs, braTargs);
 }
 
 
-void partialTraceOnPrefix(Qureg inQureg, Qureg outQureg, vector<int> ketTargs) {
+void partialTraceOnPrefix(Qureg inQureg, Qureg outQureg, ConstList64 ketTargs) {
 
     // all ketTargs (pre-sorted) are in the suffix, but one or more braTargs are in the prefix
     auto braTargs = util_getBraQubits(ketTargs, inQureg); // sorted
@@ -1803,22 +1782,24 @@ void partialTraceOnPrefix(Qureg inQureg, Qureg outQureg, vector<int> ketTargs) {
     auto sufTargs = getQubitsSwappedToMaxSuffix(inQureg, allTargs); // arbitrarily ordered
 
     // swap iniQureg's prefix bra-qubits into suffix, invoking communication
-    anyCtrlMultiSwapBetweenPrefixAndSuffix(inQureg, {}, {}, sufTargs, allTargs);
+    auto empty = lists_getEmptyList64();
+    anyCtrlMultiSwapBetweenPrefixAndSuffix(inQureg, empty, empty, sufTargs, allTargs);
 
     // use the second half of sufTargs as the pair targs, which are now all in the suffix,
-    // to perform embarrassingly parallel overwriting of outQureg
-    vector<int> pairTargs(sufTargs.begin() + ketTargs.size(), sufTargs.end()); // arbitrarily ordered
+    // to perform embarrassingly parallel overwriting of outQureg (they're arbitrarily ordered)
+    auto pairTargs = lists_getList64(sufTargs.begin() + ketTargs.size(), sufTargs.end());
+
     accel_densmatr_partialTrace_sub(inQureg, outQureg, ketTargs, pairTargs);
 
     // restore the relative order of outQureg's remaining qubits using SWAPs
     reorderReducedQureg(inQureg, outQureg, allTargs, sufTargs);
 
     // undo the swaps on inQureg
-    anyCtrlMultiSwapBetweenPrefixAndSuffix(inQureg, {}, {}, sufTargs, allTargs);
+    anyCtrlMultiSwapBetweenPrefixAndSuffix(inQureg, empty, empty, sufTargs, allTargs);
 }
 
 
-void localiser_densmatr_partialTrace(Qureg inQureg, Qureg outQureg, vector<int> targs) {
+void localiser_densmatr_partialTrace(Qureg inQureg, Qureg outQureg, ConstList64 targs) {
     assert_localiserPartialTraceGivenCompatibleQuregs(inQureg, outQureg, targs.size());
 
     // this function requires inQureg and outQureg are both or neither distributed;
@@ -1871,7 +1852,7 @@ qreal localiser_densmatr_calcTotalProb(Qureg qureg) {
 }
 
 
-qreal localiser_statevec_calcProbOfMultiQubitOutcome(Qureg qureg, vector<int> qubits, vector<int> outcomes) {
+qreal localiser_statevec_calcProbOfMultiQubitOutcome(Qureg qureg, ConstList64 qubits, ConstList64 outcomes) {
     assert_localiserGivenStateVec(qureg);
 
     qreal prob = 0;
@@ -1880,8 +1861,8 @@ qreal localiser_statevec_calcProbOfMultiQubitOutcome(Qureg qureg, vector<int> qu
     if (doAnyLocalStatesHaveQubitValues(qureg, qubits, outcomes)) {
 
         // and do so using only the suffix qubits/outcomes
-        removePrefixQubitsAndStates(qureg, qubits, outcomes);
-        prob += accel_statevec_calcProbOfMultiQubitOutcome_sub(qureg, qubits, outcomes);
+        auto [suffixQubits, suffixOutcomes] = getSuffixQubitsAndStates(qureg, qubits, outcomes);
+        prob += accel_statevec_calcProbOfMultiQubitOutcome_sub(qureg, suffixQubits, suffixOutcomes);
     }
 
     // but all nodes must sum their probabilities (unless qureg was cloned per-node), for conensus
@@ -1892,7 +1873,7 @@ qreal localiser_statevec_calcProbOfMultiQubitOutcome(Qureg qureg, vector<int> qu
 }
 
 
-qreal localiser_densmatr_calcProbOfMultiQubitOutcome(Qureg qureg, vector<int> qubits, vector<int> outcomes) {
+qreal localiser_densmatr_calcProbOfMultiQubitOutcome(Qureg qureg, ConstList64 qubits, ConstList64 outcomes) {
     assert_localiserGivenDensMatr(qureg);
 
     qreal prob = 0;
@@ -1905,8 +1886,8 @@ qreal localiser_densmatr_calcProbOfMultiQubitOutcome(Qureg qureg, vector<int> qu
     if (doAnyLocalStatesHaveQubitValues(qureg, braQubits, outcomes)) {
 
         // such nodes need only know the ket qubits/outcomes for which the bra-qubits are in suffix
-        vector<int> ketQubitsWithBraInSuffix;
-        vector<int> ketOutcomesWithBraInSuffix;
+        auto ketQubitsWithBraInSuffix = lists_getEmptyList64();
+        auto ketOutcomesWithBraInSuffix = lists_getEmptyList64();
         for (size_t q=0; q<qubits.size(); q++)
             if (util_isBraQubitInSuffix(qubits[q], qureg)) {
                 ketQubitsWithBraInSuffix.push_back(qubits[q]);
@@ -1925,7 +1906,7 @@ qreal localiser_densmatr_calcProbOfMultiQubitOutcome(Qureg qureg, vector<int> qu
 }
 
 
-void localiser_statevec_calcProbsOfAllMultiQubitOutcomes(qreal* outProbs, Qureg qureg, vector<int> qubits) {
+void localiser_statevec_calcProbsOfAllMultiQubitOutcomes(qreal* outProbs, Qureg qureg, ConstList64 qubits) {
     assert_localiserGivenStateVec(qureg);
 
     /// @todo
@@ -1965,7 +1946,7 @@ void localiser_statevec_calcProbsOfAllMultiQubitOutcomes(qreal* outProbs, Qureg
 }
 
 
-void localiser_densmatr_calcProbsOfAllMultiQubitOutcomes(qreal* outProbs, Qureg qureg, vector<int> qubits) {
+void localiser_densmatr_calcProbsOfAllMultiQubitOutcomes(qreal* outProbs, Qureg qureg, ConstList64 qubits) {
     assert_localiserGivenDensMatr(qureg);
 
     // each node independently populates local outProbs
@@ -1986,7 +1967,7 @@ void localiser_densmatr_calcProbsOfAllMultiQubitOutcomes(qreal* outProbs, Qureg
 PAULI_MASK_TYPE paulis_getKeyOfSameMixedAmpsGroup(PauliStr str);
 
 
-qcomp getStateVecExpecAllSuffixPauliStr(Qureg qureg, vector<int> suffixX, vector<int> suffixY, vector<int> suffixZ) {
+qcomp getStateVecExpecAllSuffixPauliStr(Qureg qureg, ConstList64 suffixX, ConstList64 suffixY, ConstList64 suffixZ) {
     assert_localiserGivenStateVec(qureg);
 
     // optimised scenario when str = I
@@ -2318,7 +2299,8 @@ qreal localiser_densmatr_calcHilbertSchmidtDistance(Qureg quregA, Qureg quregB)
  */
 
 
-void localiser_statevec_multiQubitProjector(Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal prob) {
+void localiser_statevec_multiQubitProjector(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) {
+    assert_localiserListLengthsAgree(qubits.size(), outcomes.size());
 
     // this routine is always embarrassingly parallel; however, we handle the
     // prefix-qubits here so that the backend can receive only the suffix qubits
@@ -2331,15 +2313,15 @@ void localiser_statevec_multiQubitProjector(Qureg qureg, vector<int> qubits, vec
         return;
     }
 
-    // all other nodes has some or all states consistent with suffix outcomes
-    removePrefixQubitsAndStates(qureg, qubits, outcomes);
-    (qubits.empty())?
+    // all other nodes contain some or only basis states consistent with suffix outcomes
+    auto [suffixQubits, suffixOutcomes] = getSuffixQubitsAndStates(qureg, qubits, outcomes);
+    (suffixQubits.empty())?
         localiser_statevec_scaleAmps(qureg, 1/std::sqrt(prob)):
-        accel_statevec_multiQubitProjector_sub(qureg, qubits, outcomes, prob);
+        accel_statevec_multiQubitProjector_sub(qureg, suffixQubits, suffixOutcomes, prob);
 }
 
 
-void localiser_densmatr_multiQubitProjector(Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal prob) {
+void localiser_densmatr_multiQubitProjector(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) {
     assert_localiserGivenDensMatr(qureg);
 
     // always embarrassingly parallel
diff --git a/quest/src/core/localiser.hpp b/quest/src/core/localiser.hpp
index b56ad92a4..0e954ea70 100644
--- a/quest/src/core/localiser.hpp
+++ b/quest/src/core/localiser.hpp
@@ -18,6 +18,8 @@
 #include "quest/include/matrices.h"
 #include "quest/include/channels.h"
 
+#include "quest/src/core/lists.hpp"
+
 #include <vector>
 
 using std::vector;
@@ -78,29 +80,29 @@ void localiser_densmatr_initMixtureOfUniformlyRandomPureStates(Qureg qureg, qind
  * SWAP
  */
 
-void localiser_statevec_anyCtrlSwap(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2);
+void localiser_statevec_anyCtrlSwap(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2);
 
 
 /*
  * DENSE MATRICES
  */
 
-void localiser_statevec_anyCtrlOneTargDenseMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, CompMatr1 matr, bool conj, bool transp);
+void localiser_statevec_anyCtrlOneTargDenseMatr(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ, CompMatr1 matr, bool conj, bool transp);
 
-void localiser_statevec_anyCtrlTwoTargDenseMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2, CompMatr2 matr, bool conj, bool transp);
+void localiser_statevec_anyCtrlTwoTargDenseMatr(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2, CompMatr2 matr, bool conj, bool transp);
 
-void localiser_statevec_anyCtrlAnyTargDenseMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr matr, bool conj, bool transp);
+void localiser_statevec_anyCtrlAnyTargDenseMatr(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, CompMatr matr, bool conj, bool transp);
 
 
 /*
  * DIAGONAL MATRICES
  */
 
-void localiser_statevec_anyCtrlOneTargDiagMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, DiagMatr1 matr, bool conj);
+void localiser_statevec_anyCtrlOneTargDiagMatr(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ, DiagMatr1 matr, bool conj);
 
-void localiser_statevec_anyCtrlTwoTargDiagMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2, DiagMatr2 matr, bool conj);
+void localiser_statevec_anyCtrlTwoTargDiagMatr(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2, DiagMatr2 matr, bool conj);
 
-void localiser_statevec_anyCtrlAnyTargDiagMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, DiagMatr matr, qcomp exponent, bool conj);
+void localiser_statevec_anyCtrlAnyTargDiagMatr(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, DiagMatr matr, qcomp exponent, bool conj);
 
 void localiser_statevec_allTargDiagMatr(Qureg qureg, FullStateDiagMatr matr, qcomp exponent);
 void localiser_densmatr_allTargDiagMatr(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool applyLeft, bool applyRight, bool conjRight);
@@ -111,18 +113,18 @@ void localiser_densmatr_allTargDiagMatr(Qureg qureg, FullStateDiagMatr matr, qco
  */
 
 template <class T>
-void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, T matr, bool conj);
+void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, T matr, bool conj);
 
 
 /*
  * PAULI TENSORS AND GADGETS
  */
 
-void localiser_statevec_anyCtrlPauliTensor(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, PauliStr str, qcomp globalFactor=1);
+void localiser_statevec_anyCtrlPauliTensor(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, PauliStr str, qcomp globalFactor=1);
 
-void localiser_statevec_anyCtrlPauliGadget(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, PauliStr str, qcomp phase);
+void localiser_statevec_anyCtrlPauliGadget(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, PauliStr str, qcomp phase);
 
-void localiser_statevec_anyCtrlPhaseGadget(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, qcomp phase);
+void localiser_statevec_anyCtrlPhaseGadget(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, qcomp phase);
 
 
 /*
@@ -150,16 +152,16 @@ void localiser_densmatr_oneQubitPauliChannel(Qureg qureg, int qubit, qreal pX, q
 
 void localiser_densmatr_oneQubitDamping(Qureg qureg, int qubit, qreal prob);
 
-void localiser_densmatr_superoperator(Qureg qureg, SuperOp op, vector<int> ketTargs);
+void localiser_densmatr_superoperator(Qureg qureg, SuperOp op, ConstList64 ketTargs);
 
-void localiser_densmatr_krausMap(Qureg qureg, KrausMap map, vector<int> qubits);
+void localiser_densmatr_krausMap(Qureg qureg, KrausMap map, ConstList64 qubits);
 
 
 /*
  * PARTIAL TRACE
  */
 
-void localiser_densmatr_partialTrace(Qureg inQureg, Qureg outQureg, vector<int> targs);
+void localiser_densmatr_partialTrace(Qureg inQureg, Qureg outQureg, ConstList64 targs);
 
 
 /*
@@ -169,11 +171,11 @@ void localiser_densmatr_partialTrace(Qureg inQureg, Qureg outQureg, vector<int>
 qreal localiser_statevec_calcTotalProb(Qureg qureg);
 qreal localiser_densmatr_calcTotalProb(Qureg qureg);
 
-qreal localiser_statevec_calcProbOfMultiQubitOutcome(Qureg qureg, vector<int> qubits, vector<int> outcomes);
-qreal localiser_densmatr_calcProbOfMultiQubitOutcome(Qureg qureg, vector<int> qubits, vector<int> outcomes);
+qreal localiser_statevec_calcProbOfMultiQubitOutcome(Qureg qureg, ConstList64 qubits, ConstList64 outcomes);
+qreal localiser_densmatr_calcProbOfMultiQubitOutcome(Qureg qureg, ConstList64 qubits, ConstList64 outcomes);
 
-void localiser_statevec_calcProbsOfAllMultiQubitOutcomes(qreal* outProbs, Qureg qureg, vector<int> qubits);
-void localiser_densmatr_calcProbsOfAllMultiQubitOutcomes(qreal* outProbs, Qureg qureg, vector<int> qubits);
+void localiser_statevec_calcProbsOfAllMultiQubitOutcomes(qreal* outProbs, Qureg qureg, ConstList64 qubits);
+void localiser_densmatr_calcProbsOfAllMultiQubitOutcomes(qreal* outProbs, Qureg qureg, ConstList64 qubits);
 
 
 /*
@@ -205,8 +207,8 @@ qcomp localiser_densmatr_calcExpecFullStateDiagMatr(Qureg qureg, FullStateDiagMa
  * PROJECTORS 
  */
 
-void localiser_statevec_multiQubitProjector(Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal prob);
-void localiser_densmatr_multiQubitProjector(Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal prob);
+void localiser_statevec_multiQubitProjector(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob);
+void localiser_densmatr_multiQubitProjector(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob);
 
 
 #endif // LOCALISER_HPP
\ No newline at end of file
diff --git a/quest/src/core/parser.cpp b/quest/src/core/parser.cpp
index 5448d3862..9d9194a3f 100644
--- a/quest/src/core/parser.cpp
+++ b/quest/src/core/parser.cpp
@@ -82,6 +82,9 @@ namespace patterns {
     // full complex; any format, importantly in order of decreasing specificity. do not consult for captured groups
     string num = group(comp) + "|" + group(imag) + "|" + group(real);
 
+    // full signed integer
+    string signedInt = optSign + "[0-9]+";
+
     // no capturing because 'num' pollutes captured groups, and pauli syntax overlaps real integers
     string pauli = "[" + parser_RECOGNISED_PAULI_CHARS + "]";
     string paulis = group(optSpace + pauli + optSpace) + "+";
@@ -96,6 +99,7 @@ namespace regexes {
     regex imag(patterns::imag);
     regex comp(patterns::comp);
     regex num(patterns::num);
+    regex signedInt(patterns::signedInt);
     regex paulis(patterns::paulis);
     regex weightedPaulis(patterns::weightedPaulis);
 }
@@ -173,6 +177,63 @@ int getNumPaulisInLine(string line) {
 
 
 
+/*
+ * INTEGER PARSING
+ */
+
+
+bool parser_isAnySizedInteger(string str) {
+    // true when all of str matches the signed-integer regex; magnitude is not checked here
+    smatch match;
+    return regex_match(str, match, regexes::signedInt);
+}
+
+
+bool parser_isValidInteger(string str) {
+    // valid means well-formed (matching our signed-integer regex) AND representable as an int;
+    // reject str immediately if it is not even well-formed
+    if (!parser_isAnySizedInteger(str))
+        return false;
+
+    // remove whitespace which stoi() below cannot handle after the sign
+    removeWhiteSpace(str);
+
+    // check number is in-range of int by attempting to parse it, discarding the result
+    try {
+        std::stoi(str);
+    } catch (const out_of_range&) {
+        return false;
+
+    // raise an error if our regex permitted a string which stoi() cannot parse
+    } catch (const invalid_argument&) {
+        error_attemptedToParseIntegerFromInvalidString();
+    }
+
+    return true;
+}
+
+
+int parser_parseInteger(string str) {
+    // check only the format; parser_isValidInteger() also rejects out-of-range numbers
+    if (!parser_isAnySizedInteger(str))
+        error_attemptedToParseIntegerFromInvalidString();
+
+    removeWhiteSpace(str); // stoi() cannot parse whitespace after the sign
+
+    try {
+        return std::stoi(str);
+    } catch (const invalid_argument&) {   // regex accepted a string stoi() cannot parse
+        error_attemptedToParseIntegerFromInvalidString();
+    } catch (const out_of_range&) {       // well-formed, but does not fit within an int
+        error_attemptedToParseOutOfRangeInteger();
+    }
+
+    // unreachable, assuming the error functions above never return
+    return -1;
+}
+
+
+
 /*
  * REAL NUMBER PARSING
  */
@@ -187,9 +248,9 @@ qreal precisionAgnosticStringToFloat(string str) {
     removeWhiteSpace(str);
 
     // below throws exception when the (prefix) of str cannot be/fit into a qreal
-    if (FLOAT_PRECISION == 1) return static_cast<qreal>(std::stof (str));
-    if (FLOAT_PRECISION == 2) return static_cast<qreal>(std::stod (str));
-    if (FLOAT_PRECISION == 4) return static_cast<qreal>(std::stold(str));
+    if (QUEST_FLOAT_PRECISION == 1) return static_cast<qreal>(std::stof (str));
+    if (QUEST_FLOAT_PRECISION == 2) return static_cast<qreal>(std::stod (str));
+    if (QUEST_FLOAT_PRECISION == 4) return static_cast<qreal>(std::stold(str));
 
     // unreachable
     return -1;
diff --git a/quest/src/core/parser.hpp b/quest/src/core/parser.hpp
index 4a9df2d02..3d34588ae 100644
--- a/quest/src/core/parser.hpp
+++ b/quest/src/core/parser.hpp
@@ -20,12 +20,16 @@ using std::string;
  * PARSING NUMBERS
  */
 
+bool parser_isAnySizedInteger(string str);
+bool parser_isValidInteger(string str);
+
 bool parser_isAnySizedReal(string str);
 bool parser_isAnySizedComplex(string str);
 
 bool parser_isValidReal(string str);
 bool parser_isValidComplex(string str);
 
+int parser_parseInteger(string str);
 qreal parser_parseReal(string str);
 qcomp parser_parseComplex(string str);
 
diff --git a/quest/src/core/paulilogic.cpp b/quest/src/core/paulilogic.cpp
index 04e8311ed..58ddc39f8 100644
--- a/quest/src/core/paulilogic.cpp
+++ b/quest/src/core/paulilogic.cpp
@@ -9,6 +9,7 @@
 #include "quest/include/qureg.h"
 
 #include "quest/src/core/paulilogic.hpp"
+#include "quest/src/core/lists.hpp"
 #include "quest/src/core/utilities.hpp"
 #include "quest/src/core/bitwise.hpp"
 #include "quest/src/core/errors.hpp"
@@ -113,7 +114,7 @@ int paulis_getSignOfPauliStrConj(PauliStr str) {
 }
 
 
-int paulis_getPrefixZSign(Qureg qureg, vector<int> prefixZ) {
+int paulis_getPrefixZSign(Qureg qureg, ConstList64 prefixZ) {
 
     int sign = 1;
 
@@ -125,7 +126,7 @@ int paulis_getPrefixZSign(Qureg qureg, vector<int> prefixZ) {
 }
 
 
-qcomp paulis_getPrefixPaulisElem(Qureg qureg, vector<int> prefixY, vector<int> prefixZ) {
+qcomp paulis_getPrefixPaulisElem(Qureg qureg, ConstList64 prefixY, ConstList64 prefixZ) {
 
     // each Z contributes +- 1
     qcomp elem = paulis_getPrefixZSign(qureg, prefixZ);
@@ -138,12 +139,10 @@ qcomp paulis_getPrefixPaulisElem(Qureg qureg, vector<int> prefixY, vector<int> p
 }
 
 
-vector<int> paulis_getTargetInds(PauliStr str) {
+List64 paulis_getTargetInds(PauliStr str) {
 
     int maxInd = paulis_getIndOfLefmostNonIdentityPauli(str);
-
-    vector<int> inds(0);
-    inds.reserve(maxInd+1);
+    auto inds = lists_getEmptyList64();
 
     for (int i=0; i<=maxInd; i++)
         if (paulis_getPauliAt(str, i) != 0) // Id
@@ -170,12 +169,14 @@ qindex paulis_getTargetBitMask(PauliStr str) {
 }
 
 
-std::array<vector<int>,3> paulis_getSeparateInds(PauliStr str) {
+std::array<List64,3> paulis_getSeparateInds(PauliStr str) {
 
-    vector<int> iXYZ = paulis_getTargetInds(str);
-    vector<int> iX, iY, iZ;
+    auto iXYZ = paulis_getTargetInds(str);
+    auto iX = lists_getEmptyList64();
+    auto iY = lists_getEmptyList64();
+    auto iZ = lists_getEmptyList64();
 
-    vector<int>* ptrs[] = {&iX, &iY, &iZ};
+    List64* ptrs[] = {&iX, &iY, &iZ};
 
     for (int i : iXYZ)
         ptrs[paulis_getPauliAt(str, i) - 1]->push_back(i);
diff --git a/quest/src/core/paulilogic.hpp b/quest/src/core/paulilogic.hpp
index b9f1b5fdc..f2b53a6bf 100644
--- a/quest/src/core/paulilogic.hpp
+++ b/quest/src/core/paulilogic.hpp
@@ -12,6 +12,8 @@
 #include "quest/include/paulis.h"
 #include "quest/include/qureg.h"
 
+#include "quest/src/core/lists.hpp"
+
 #include <utility>
 #include <vector>
 #include <array>
@@ -43,13 +45,13 @@ int paulis_getIndOfLefmostNonIdentityPauli(PauliStr* strings, qindex numStrings)
 
 int paulis_getSignOfPauliStrConj(PauliStr str);
 
-int paulis_getPrefixZSign(Qureg qureg, vector<int> prefixZ);
+int paulis_getPrefixZSign(Qureg qureg, ConstList64 prefixZ);
 
-qcomp paulis_getPrefixPaulisElem(Qureg qureg, vector<int> prefixY, vector<int> prefixZ);
+qcomp paulis_getPrefixPaulisElem(Qureg qureg, ConstList64 prefixY, ConstList64 prefixZ);
 
-vector<int> paulis_getTargetInds(PauliStr str);
+List64 paulis_getTargetInds(PauliStr str);
 
-std::array<vector<int>,3> paulis_getSeparateInds(PauliStr str);
+std::array<List64,3> paulis_getSeparateInds(PauliStr str);
 
 qindex paulis_getTargetBitMask(PauliStr str);
 
diff --git a/quest/src/core/printer.cpp b/quest/src/core/printer.cpp
index e4d4cbc32..863317c8f 100644
--- a/quest/src/core/printer.cpp
+++ b/quest/src/core/printer.cpp
@@ -32,6 +32,7 @@
 #include <stdlib.h>
 #include <iostream>
 #include <iomanip>
+#include <cstdio>
 #include <sstream>
 #include <memory>
 #include <vector>
@@ -167,6 +168,26 @@ void printer_setPauliStrFormat(int flag) {
 
 
 
+/*
+ * MULTI-PROCESS MANAGEMENT
+ */
+
+
+void printer_sync() {
+    // flushes this process's C++ and C stdout buffers, then calls comm_sync()
+    // make all participating processes flush, to improve the chance
+    // that user-printing from non-root processes reaches the screen
+    // before QuEST begins to print from the root process
+    std::cout << std::flush; // C++ buffer
+    fflush(stdout);          // C buffer
+
+    // wait for all process flushes to complete, which defers non-root
+    // processes from printing until after root has finished printing
+    comm_sync();
+}
+
+
+
 /*
  * TYPE NAME STRINGS
  */
@@ -228,6 +249,10 @@ inline std::string demangleTypeName(const char* mangledName) {
 // type T can be anything in principle, although it's currently only used for qcomp
 template <typename T>
 std::string getTypeName(T _unused) {
+
+    // Shut those obnoxious compilers right up
+    (void) _unused;
+
     // For MSVC, typeid(T).name() typically returns something like "class Foo"
     // or "struct Foo", but it's still not exactly "Foo".
     // For GCC/Clang, you get a raw "mangled" name, e.g. "N3FooE".
@@ -258,7 +283,7 @@ string printer_getQindexType() {
 
 string printer_getFloatPrecisionFlag() {
 
-    return GET_STR( FLOAT_PRECISION );
+    return GET_STR( QUEST_FLOAT_PRECISION );
 }
 
 
diff --git a/quest/src/core/printer.hpp b/quest/src/core/printer.hpp
index d2ff8274d..b359bb381 100644
--- a/quest/src/core/printer.hpp
+++ b/quest/src/core/printer.hpp
@@ -48,6 +48,14 @@ void printer_setPauliStrFormat(int flag);
 
 
 
+/*
+ * MULTI-PROCESS MANAGEMENT
+ */
+
+void printer_sync();
+
+
+
 /*
  * TYPE NAME STRINGS
  */
diff --git a/quest/src/core/randomiser.cpp b/quest/src/core/randomiser.cpp
index 65c6da4eb..7b35a29fc 100644
--- a/quest/src/core/randomiser.cpp
+++ b/quest/src/core/randomiser.cpp
@@ -66,14 +66,14 @@ void rand_setSeeds(vector<unsigned> seeds) {
 
     // all nodes learn root node's #seeds
     unsigned numRootSeeds = seeds.size();
-    if (comm_isInit())
+    if (comm_isActive())
         comm_broadcastUnsignedsFromRoot(&numRootSeeds, 1);
 
     // all nodes ensure they have space to receive root node's seeds
     seeds.resize(numRootSeeds);
     
     // all nodes receive root seeds
-    if (comm_isInit())
+    if (comm_isActive())
         comm_broadcastUnsignedsFromRoot(seeds.data(), seeds.size());
 
     // all nodes remember seeds (in case user wishes to later recall them)
diff --git a/quest/src/core/utilities.cpp b/quest/src/core/utilities.cpp
index c1e0a6273..7d9d2106b 100644
--- a/quest/src/core/utilities.cpp
+++ b/quest/src/core/utilities.cpp
@@ -19,6 +19,7 @@
 #include "quest/src/core/errors.hpp"
 #include "quest/src/core/bitwise.hpp"
 #include "quest/src/core/memory.hpp"
+#include "quest/src/core/lists.hpp"
 #include "quest/src/core/utilities.hpp"
 #include "quest/src/core/validation.hpp"
 #include "quest/src/cpu/cpu_config.hpp"
@@ -73,7 +74,7 @@ bool util_isQubitInSuffix(int qubit, Qureg qureg) {
     return qubit < qureg.logNumAmpsPerNode;
 }
 
-bool util_areAllQubitsInSuffix(vector<int> qubits, Qureg qureg) {
+bool util_areAllQubitsInSuffix(ConstList64 qubits, Qureg qureg) {
 
     for (int q : qubits)
         if (!util_isQubitInSuffix(q, qureg))
@@ -89,22 +90,21 @@ bool util_isBraQubitInSuffix(int ketQubit, Qureg qureg) {
     return ketQubit < qureg.logNumColsPerNode;
 }
 
-vector<int> getPrefixOrSuffixQubits(vector<int> qubits, Qureg qureg, bool getSuffix) {
+List64 getPrefixOrSuffixQubits(ConstList64 qubits, Qureg qureg, bool getSuffix) {
 
     // note that when the qureg is local/duplicated, 
     // all qubits will be suffix, none will be prefix
 
-    vector<int> subQubits(0);
-    subQubits.reserve(qubits.size());
+    List64 out = lists_getEmptyList64();
 
     for (int qubit : qubits)
         if (util_isQubitInSuffix(qubit, qureg) == getSuffix)
-            subQubits.push_back(qubit);
+            out.push_back(qubit);
 
-    return subQubits;
+    return out;
 }
 
-std::array<vector<int>,2> util_getPrefixAndSuffixQubits(vector<int> qubits, Qureg qureg) {
+std::array<List64,2> util_getPrefixAndSuffixQubits(ConstList64 qubits, Qureg qureg) {
     return {
         getPrefixOrSuffixQubits(qubits, qureg, false), 
         getPrefixOrSuffixQubits(qubits, qureg, true)
@@ -132,7 +132,7 @@ int util_getRankWithQubitFlipped(int prefixKetQubit, Qureg qureg) {
     return rankFlip;
 }
 
-int util_getRankWithQubitsFlipped(vector<int> prefixQubits,  Qureg qureg) {
+int util_getRankWithQubitsFlipped(ConstList64 prefixQubits,  Qureg qureg) {
 
     int rank = qureg.rank;
     for (int qubit : prefixQubits)
@@ -148,7 +148,7 @@ int util_getRankWithBraQubitFlipped(int ketQubit, Qureg qureg) {
     return rankFlip;
 }
 
-int util_getRankWithBraQubitsFlipped(vector<int> ketQubits, Qureg qureg) {
+int util_getRankWithBraQubitsFlipped(ConstList64 ketQubits, Qureg qureg) {
 
     int rank = qureg.rank;
     for (int qubit : ketQubits)
@@ -157,68 +157,118 @@ int util_getRankWithBraQubitsFlipped(vector<int> ketQubits, Qureg qureg) {
     return rank;
 }
 
-vector<int> util_getBraQubits(vector<int> ketQubits, Qureg qureg) {
+List64 util_getBraQubits(ConstList64 ketQubits, Qureg qureg) {
 
-    vector<int> braInds(0);
-    braInds.reserve(ketQubits.size());
+    List64 braQubits = ketQubits;
 
-    for (int qubit : ketQubits)
-        braInds.push_back(util_getBraQubit(qubit, qureg));
+    for (int &qubit : braQubits)
+        qubit = util_getBraQubit(qubit, qureg);
 
-    return braInds;
+    return braQubits;
 }
 
-vector<int> util_getNonTargetedQubits(int* targets, int numTargets, int numQubits) {
+List64 util_getNonTargetedQubits(ConstList64 targets, int numQubits) {
     
-    qindex mask = getBitMask(targets, numTargets);
+    qindex mask = util_getBitMask(targets);
 
-    vector<int> nonTargets;
-    nonTargets.reserve(numQubits - numTargets);
+    List64 out = lists_getEmptyList64();
 
     for (int i=0; i<numQubits; i++)
         if (getBit(mask, i) == 0)
-            nonTargets.push_back(i);
+            out.push_back(i);
 
-    return nonTargets;
+    return out;
 }
 
-vector<int> util_getConcatenated(vector<int> list1, vector<int> list2) {
+List64 util_getConcatenated(ConstList64 list1, ConstList64 list2) {
+
+    auto out = list1;
+    for (auto elem : list2)
+        out.push_back(elem);
 
-    // modify the copy of list1
-    list1.insert(list1.end(), list2.begin(), list2.end());
-    return list1;
+    return out;
 }
 
-vector<int> util_getSorted(vector<int> qubits) {
+List64 util_getSorted(ConstList64 list) {
+
+    // optimise common edgecases
+    if (list.size() < 2)
+        return list;
+    
+    List64 out = list;
+
+    if (out.size() == 2) {
+        if (out[0] > out[1])
+            std::swap(out[0], out[1]);
+        return out;
+    }
 
-    vector<int> copy = qubits;
-    std::sort(copy.begin(), copy.end());
-    return copy;
+    // fallback to inbuilt sort
+    std::sort(out.begin(), out.end());
+    return out;
 }
 
-vector<int> util_getSorted(vector<int> ctrls, vector<int> targs) {
+List64 util_getSorted(ConstList64 ctrls, ConstList64 targs) {
 
     return util_getSorted(util_getConcatenated(ctrls, targs));
 }
 
-qindex util_getBitMask(vector<int> qubits) {
+List64 util_getSorted(ConstList64 ctrls, std::initializer_list<int> targs) {
+
+    return util_getSorted(ctrls, lists_getList64(targs));
+}
+
+List64 util_getRange(int maxExcl) {
+    // returns the ascending list 0, 1, ..., maxExcl-1 (nothing is appended when maxExcl <= 0)
+    List64 out = lists_getEmptyList64();
+
+    for (int i=0; i<maxExcl; i++)
+        out.push_back(i);
+        
+    return out;
+}
+
+List64 util_getConstantList(int elem, int length) {
+    // builds a list via assign(length, elem); presumably 'length' copies of elem, as with std::vector::assign - TODO confirm against List64
+    List64 out = lists_getEmptyList64();
+    out.assign(length, elem);
+    return out;
+}
+
+qindex util_getBitMask(ConstList64 qubits) {
 
     // inserts qubits in state 1
     return getBitMask(qubits.data(), qubits.size());
 }
 
-qindex util_getBitMask(vector<int> qubits, vector<int> states) {
+qindex util_getBitMask(ConstList64 qubits, ConstList64 states) {
 
+    // assumes qubits.size() == states.size()
     return getBitMask(qubits.data(), states.data(), states.size());
 }
 
-qindex util_getBitMask(vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, vector<int> targStates) {
+qindex util_getBitMask(ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, ConstList64 targStates) {
 
     auto qubits = util_getConcatenated(ctrls, targs);
     auto states = util_getConcatenated(ctrlStates, targStates);
     return util_getBitMask(qubits, states);
 }
 
+qindex util_getBitMask(ConstList64 ctrls, ConstList64 ctrlStates, std::initializer_list<int> targs, std::initializer_list<int> targStates) {
+
+    return util_getBitMask(ctrls, ctrlStates, lists_getList64(targs), lists_getList64(targStates));
+}
+
+List64 util_getList64OrAllOnes(const int* elemsOrNullptr, size_t length) {
+    // returns lists_getList64(elemsOrNullptr, length) when the pointer is non-null; else a list built via assign(length, 1), presumably 'length' ones
+    if (elemsOrNullptr != nullptr)
+        return lists_getList64(elemsOrNullptr, length);
+
+    List64 out = lists_getEmptyList64();
+    out.assign(length, 1);
+    return out;
+}
+
 
 
 /*
@@ -1202,7 +1252,7 @@ void util_tryAllocVector(vector<unsigned> &vec, qindex size, std::function<void(
 void util_tryAllocVector(vector<PauliStr> &vec, qindex size, std::function<void()> errFunc) { tryAllocVector(vec, size, errFunc); }
 
 // cuQuantum needs a vector<double> overload, which we additionally define when qreal!=double. Gross!
-#if FLOAT_PRECISION != 2
+#if QUEST_FLOAT_PRECISION != 2
     void util_tryAllocVector(vector<double> &vec, qindex size, std::function<void()> errFunc) { tryAllocVector(vec, size, errFunc); }
 #endif
 
diff --git a/quest/src/core/utilities.hpp b/quest/src/core/utilities.hpp
index a78f66301..8e9509853 100644
--- a/quest/src/core/utilities.hpp
+++ b/quest/src/core/utilities.hpp
@@ -20,6 +20,8 @@
 #include "quest/include/channels.h"
 #include "quest/include/environment.h"
 
+#include "quest/src/core/lists.hpp"
+
 #include <type_traits>
 #include <functional>
 #include <utility>
@@ -29,6 +31,7 @@
 
 using std::is_same_v;
 using std::vector;
+using std::array;
 
 
 
@@ -38,36 +41,44 @@ using std::vector;
 
 bool util_isQubitInSuffix(int qubit, Qureg qureg);
 bool util_isBraQubitInSuffix(int ketQubit, Qureg qureg);
-bool util_areAllQubitsInSuffix(vector<int> qubits, Qureg qureg);
+bool util_areAllQubitsInSuffix(ConstList64 qubits, Qureg qureg);
 
 int util_getBraQubit(int ketQubit, Qureg qureg);
 
 int util_getPrefixInd(int qubit, Qureg qureg);
 int util_getPrefixBraInd(int ketQubit, Qureg qureg);
 
-std::array<vector<int>,2> util_getPrefixAndSuffixQubits(vector<int> qubits, Qureg qureg);
+array<List64,2> util_getPrefixAndSuffixQubits(ConstList64 qubits, Qureg qureg);
 
 int util_getRankBitOfQubit(int ketQubit, Qureg qureg);
 int util_getRankBitOfBraQubit(int ketQubit, Qureg qureg);
 
 int util_getRankWithQubitFlipped(int ketQubit, Qureg qureg);
-int util_getRankWithQubitsFlipped(vector<int> prefixQubits, Qureg qureg);
+int util_getRankWithQubitsFlipped(ConstList64 prefixQubits, Qureg qureg);
 
 int util_getRankWithBraQubitFlipped(int ketQubit, Qureg qureg);
-int util_getRankWithBraQubitsFlipped(vector<int> ketQubits, Qureg qureg);
+int util_getRankWithBraQubitsFlipped(ConstList64 ketQubits, Qureg qureg);
+
+List64 util_getBraQubits(ConstList64 ketQubits, Qureg qureg);
+
+List64 util_getNonTargetedQubits(ConstList64, int numQubits);
+
+List64 util_getConcatenated(ConstList64 list1, ConstList64 list2);
 
-vector<int> util_getBraQubits(vector<int> ketQubits, Qureg qureg);
+List64 util_getRange(int maxExcl);
 
-vector<int> util_getNonTargetedQubits(int* targets, int numTargets, int numQubits);
+List64 util_getConstantList(int elem, int length);
 
-vector<int> util_getConcatenated(vector<int> list1, vector<int> list2);
+List64 util_getSorted(ConstList64 list);
+List64 util_getSorted(ConstList64 ctrls, ConstList64 targs);
+List64 util_getSorted(ConstList64 ctrls, std::initializer_list<int> targs);
 
-vector<int> util_getSorted(vector<int> list);
-vector<int> util_getSorted(vector<int> ctrls, vector<int> targs);
+qindex util_getBitMask(ConstList64 qubits);
+qindex util_getBitMask(ConstList64 qubits, ConstList64 states);
+qindex util_getBitMask(ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, ConstList64 targStates);
+qindex util_getBitMask(ConstList64 ctrls, ConstList64 ctrlStates, std::initializer_list<int> targs, std::initializer_list<int> targStates);
 
-qindex util_getBitMask(vector<int> qubits);
-qindex util_getBitMask(vector<int> qubits, vector<int> states);
-qindex util_getBitMask(vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, vector<int> targStates);
+List64 util_getList64OrAllOnes(const int* elemsOrNullptr, size_t length);
 
 
 
@@ -423,7 +434,7 @@ void util_tryAllocVector(vector<unsigned> &vec, qindex size, std::function<void(
 void util_tryAllocVector(vector<PauliStr> &vec, qindex size, std::function<void()> errFunc);
 
 // cuQuantum needs a vector<double> overload, which we additionally define when qreal!=double. Gross!
-#if FLOAT_PRECISION != 2
+#if QUEST_FLOAT_PRECISION != 2
     void util_tryAllocVector(vector<double> &vec, qindex size, std::function<void()> errFunc);
 #endif
 
diff --git a/quest/src/core/validation.cpp b/quest/src/core/validation.cpp
index 3b6fc18a2..62ff93166 100644
--- a/quest/src/core/validation.cpp
+++ b/quest/src/core/validation.cpp
@@ -99,7 +99,7 @@ namespace report {
         "Cannot distribute QuEST between ${NUM_NODES} nodes; must use a power-of-2 number of nodes.";
 
     string MULTIPLE_NODES_BOUND_TO_SAME_GPU =
-        "Multiple MPI processes (nodes) were bound to the same GPU which is detrimental to performance and almost never intended. Please re-deploy QuEST with no more MPI processes than there are total GPUs. Alternatively, recompile QuEST with macro PERMIT_NODES_TO_SHARE_GPU=1.";
+        "Multiple MPI processes (nodes) were bound to the same GPU which is detrimental to performance and almost never intended. Please re-deploy QuEST with no more MPI processes than there are total GPUs. Alternatively, recompile QuEST with macro QUEST_PERMIT_NODES_TO_SHARE_GPU=1.";
 
     string CUQUANTUM_DEPLOYED_ON_BELOW_CC_GPU =
         "Cannot use cuQuantum on a GPU with compute-capability ${OUR_CC}; a compute-capability of ${MIN_CC} or above is required. Recompile with cuQuantum disabled to fall-back to using Thrust and custom kernels.";
@@ -107,6 +107,21 @@ namespace report {
     string CUQUANTUM_DEPLOYED_ON_GPU_WITHOUT_MEM_POOLS =
         "Cannot use cuQuantum since your GPU does not support memory pools. Recompile with cuQuantum disabled to fall-back to using Thrust and custom kernels.";
 
+    string USER_OWNED_MPI_WAS_NOT_INIT =
+        "User owns MPI but did not prior initialise MPI before initialising QuEST.";
+
+    string USER_GIVEN_MPI_COMMUNICATOR_IS_NULL =
+        "The provided MPI communicator was null (MPI_COMM_NULL).";
+
+    string USER_GIVEN_MPI_COMMUNICATOR_FAILED_TO_SET =
+        "The provided MPI communicator could not be used; MPI_Comm_dup() was not successful.";
+
+    string QUEST_OWNED_MPI_WAS_PRE_INIT =
+        "MPI was already initialised prior to QuESTEnv initialisation, but the user did not declare MPI ownership.";
+
+    string QUEST_IS_NON_DISTRIBUTED_BUT_MPI_WAS_INIT =
+        "QuESTEnv was initialised to be non-distributed but MPI was externally initialised - this is presently unsupported due to a (very minor) technical limitation. If you need this facility, please raise a Github issue!";
+
     
     /*
      * EXISTING QUESTENV
@@ -129,6 +144,9 @@ namespace report {
     string INVALID_NUM_REPORTED_SIG_FIGS =
         "Invalid number of significant figures (${NUM_SIG_FIGS}). Cannot be less than one.";
 
+    string RANDOM_SEEDS_PTR_IS_NULL =
+        "The given seeds list pointer is NULL.";
+
     string INVALID_NUM_RANDOM_SEEDS =
         "Invalid number of random seeds (${NUM_SEEDS}). Must specify one or more. In distributed settings, only the root node needs to pass a valid number of seeds (other node arguments are ignored).";
     
@@ -136,7 +154,7 @@ namespace report {
         "Invalid number of trailing newlines (${NUM_NEWLINES}). Cannot generally be less than zero, and must not be zero when calling multi-line reporting functions like reportQureg().";
 
     string INSUFFICIENT_NUM_REPORTED_NEWLINES =
-        "The number of trailing newlines (set by setNumReportedNewlines()) is zero which is not permitted when calling multi-line reporters.";
+        "The number of trailing newlines (set by setQuESTNumReportedNewlines()) is zero which is not permitted when calling multi-line reporters.";
 
     string INVALID_NUM_NEW_PAULI_CHARS =
         "Given an invalid number of Pauli characters. Must specify precisely four to respectively replace IXYZ.";
@@ -144,6 +162,31 @@ namespace report {
     string INVALID_REPORTED_PAULI_STR_STYLE_FLAG =
         "Given an unrecognised style flag (${FLAG}). Legal flags are 0 and 1.";
 
+    // substrings re-used below
+    string _invalid_num_tpb_prefix =
+        "An invalid number of GPU threads per block (${NUM_TPB}) was passed, or specified via environment variable " + envvar_names::QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK + ", or compiled into the QuEST library through the CMake option of the same name.";
+    string _num_tpb_warp_indivisible_infix =
+        "The specified number does not divide evenly into the warp size of ${CUDA_WARP_SIZE} (NVIDIA GPUs) or ${HIP_WARP_SIZE} (AMD GPUs).";
+    string _num_tpb_warp_negative_infix =
+        "The specified number must be positive.";
+    string _num_tpb_ineffectual_suffix =
+        "Note GPU acceleration is not active so this parameter has no effect anyway.";
+
+    string GPU_NUM_THREADS_PER_BLOCK_IS_NOT_POSITIVE =
+        _invalid_num_tpb_prefix + " " + _num_tpb_warp_negative_infix;
+
+    string GPU_NUM_THREADS_PER_BLOCK_IS_NOT_POSITIVE_BUT_GPU_NOT_ACTIVE_ANYWAY =
+        _invalid_num_tpb_prefix + " " + _num_tpb_warp_negative_infix + " " + _num_tpb_ineffectual_suffix;
+
+    string GPU_NUM_THREADS_PER_BLOCK_IS_NOT_WARP_DIVISIBLE =
+        _invalid_num_tpb_prefix + " " + _num_tpb_warp_indivisible_infix;
+
+    string GPU_NUM_THREADS_PER_BLOCK_IS_NOT_WARP_DIVISIBLE_BUT_GPU_NOT_AVAILABLE_ANYWAY =
+        _invalid_num_tpb_prefix + " " + _num_tpb_warp_indivisible_infix + " " + _num_tpb_ineffectual_suffix;
+
+    string GPU_NUM_THREADS_PER_BLOCK_EXCEEDS_HARDWARE_MAX =
+        _invalid_num_tpb_prefix + " Exceeds the hardware-imposed maximum of ${MAX_TPB}.";
+
 
     /*
      * QUREG CREATION
@@ -719,7 +762,7 @@ namespace report {
         "Line ${LINE_NUMBER} specified ${NUM_LINE_PAULIS} Pauli operators which is inconsistent with the number of Paulis of the previous lines (${NUM_PAULIS}).";
 
     string PARSED_PAULI_STR_SUM_COEFF_EXCEEDS_QCOMP_RANGE =
-        "The coefficient of line ${LINE_NUMBER} is a valid floating-point number but exceeds the range which can be stored in a qcomp. Consider increasing FLOAT_PRECISION.";
+        "The coefficient of line ${LINE_NUMBER} is a valid floating-point number but exceeds the range which can be stored in a qcomp. Consider increasing QUEST_FLOAT_PRECISION.";
 
     string PARSED_STRING_IS_EMPTY =
         "The given string was empty (contained only whitespace characters) and could not be parsed.";
@@ -1121,17 +1164,24 @@ namespace report {
      * ENVIRONMENT VARIABLES
      */
 
-    string INVALID_PERMIT_NODES_TO_SHARE_GPU_ENV_VAR =
-        "The optional, boolean '" + envvar_names::PERMIT_NODES_TO_SHARE_GPU + "' environment variable was specified to an invalid value. The variable can be unspecified, or set to '', '0' or '1'.";
+    string INVALID_QUEST_PERMIT_NODES_TO_SHARE_GPU_ENV_VAR =
+        "The optional, boolean '" + envvar_names::QUEST_PERMIT_NODES_TO_SHARE_GPU + "' environment variable was specified to an invalid value. The variable can be unspecified, or set to '', '0' or '1'.";
 
     string DEFAULT_EPSILON_ENV_VAR_NOT_A_REAL =
-        "The optional '" + envvar_names::DEFAULT_VALIDATION_EPSILON + "' environment variable was not a recognisable real number.";
+        "The optional '" + envvar_names::QUEST_DEFAULT_VALIDATION_EPSILON + "' environment variable was not a recognisable real number.";
 
     string DEFAULT_EPSILON_ENV_VAR_EXCEEDS_QREAL_RANGE = 
-        "The optional '" + envvar_names::DEFAULT_VALIDATION_EPSILON + "' environment variable was larger (in magnitude) than the maximum value which can be stored in a qreal.";
+        "The optional '" + envvar_names::QUEST_DEFAULT_VALIDATION_EPSILON + "' environment variable was larger (in magnitude) than the maximum value which can be stored in a qreal.";
 
     string DEFAULT_EPSILON_ENV_VAR_IS_NEGATIVE =
-        "The optional '" + envvar_names::DEFAULT_VALIDATION_EPSILON + "' environment variable was negative. The value must be zero or positive.";
+        "The optional '" + envvar_names::QUEST_DEFAULT_VALIDATION_EPSILON + "' environment variable was negative. The value must be zero or positive.";
+
+    string DEFAULT_NUM_GPU_THREADS_PER_BLOCK_ENV_VAR_NOT_AN_INT =
+        "The optional '" + envvar_names::QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK + "' environment variable was not a recognisable integer.";
+
+    string DEFAULT_NUM_GPU_THREADS_PER_BLOCK_ENV_VAR_EXCEEDS_INT_RANGE =
+        "The optional '" + envvar_names::QUEST_DEFAULT_NUM_GPU_THREADS_PER_BLOCK + "' environment variable was larger (in magnitude) than the maximum value which can be stored in an integer.";
+
 }
 
 
@@ -1142,9 +1192,13 @@ namespace report {
 
 void default_inputErrorHandler(const char* func, const char* msg) {
 
+    // force a std-flush and comm-sync so that the error message is not (well, is less likely
+    // to be) interrupted by users printing from a non-root process
+    printer_sync();
+
     // safe to call even before MPI has been setup, and ignores user-set trailing newlines.
     // It begins with \n to interrupt half-printed lines (when trailing newlines are set to
-    // 0 via setNumReportedNewlines(0)), for visual clarity. Note that user's overriding
+    // 0 via setQuESTNumReportedNewlines(0)), for visual clarity. Note that user's overriding
     // functions might not think to print an initial newline but oh well!
     print(string("\n")
         + "QuEST encountered a validation error during function " 
@@ -1153,11 +1207,13 @@ void default_inputErrorHandler(const char* func, const char* msg) {
 
     // force a synch because otherwise non-main nodes may exit before print, and MPI
     // will then attempt to instantly abort all nodes, losing the error message.
-    comm_sync();
+    printer_sync();
 
-    // finalise MPI before error-exit to avoid scaring user with giant MPI error message
-    if (comm_isInit())
-        comm_end();
+    // finalise QuEST-owned MPI before error-exit to avoid scaring user with giant MPI crash
+    // message. note user-owned MPI is NOT killed because it's possible only SOME processes
+    // reach here, and attempting to sync/kill them would result in an MPI hang/crash anyway
+    if (comm_isActive())
+        comm_end(); // keeps user-owned MPI alive
 
     // simply exit, interrupting any other process (potentially leaking)
     exit(EXIT_FAILURE);
@@ -1220,8 +1276,8 @@ qreal REDUCTION_EPSILON_FACTOR = 100;
  */
 
 // the default epsilon is not known until runtime since the macro
-// UNSPECIFIED_DEFAULT_VALIDATION_EPSILON may be overriden by the
-// DEFAULT_VALIDATION_EPSILON environment variable. We do not read
+// UNSPECIFIED_QUEST_DEFAULT_VALIDATION_EPSILON may be overriden by the
+// QUEST_DEFAULT_VALIDATION_EPSILON environment variable. We do not read
 // the env-var immediately since it may malformed; we must wait for
 // initQuESTEnv() to validate and potentially throw an error
 static qreal global_validationEpsilon = -1; // must be overriden
@@ -1339,7 +1395,7 @@ void assertAllNodesAgreeThat(bool valid, string msg, tokenSubs vars, const char*
     // when performing validation that may be non-uniform between nodes. For
     // example, mallocs may succeed on one node but fail on another due to
     // inhomogeneous loads.
-    if (comm_isInit())
+    if (comm_isActive())
         valid = comm_isTrueOnAllNodes(valid);
 
     // prepare error message only if validation will fail
@@ -1400,12 +1456,18 @@ bool doQuregsHaveIdenticalMemoryLayouts(Qureg a, Qureg b) {
 
 void validate_envNeverInit(bool isQuESTInit, bool isQuESTFinal, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(!isQuESTInit, report::QUEST_ENV_ALREADY_INIT, caller);
     assertThat(!isQuESTFinal, report::QUEST_ENV_ALREADY_FINAL, caller);
 }
 
 void validate_newEnvDeploymentMode(int isDistrib, int isGpuAccel, int isMultithread, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // deployment flags must be boolean or auto
     tokenSubs vars = {{"${AUTO_DEPLOYMENT_FLAG}", modeflag::USE_AUTO}};
     assertThat(isDistrib     == 0 || isDistrib     == 1 || isDistrib     == modeflag::USE_AUTO, report::INVALID_OPTION_FOR_ENV_IS_DISTRIB,     vars, caller);
@@ -1435,6 +1497,9 @@ void validate_newEnvDeploymentMode(int isDistrib, int isGpuAccel, int isMultithr
 
 void validate_newEnvDistributedBetweenPower2Nodes(const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // note that we do NOT finalize MPI before erroring below, because that would necessitate
     // every node (launched by mpirun) serially print the error message, causing spam.
     // Instead, we permit the evil of every MPI process calling exit() and MPI aborting when
@@ -1448,12 +1513,18 @@ void validate_newEnvDistributedBetweenPower2Nodes(const char* caller) {
 
 void validate_newEnvNodesEachHaveUniqueGpu(const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     bool sharedGpus = gpu_areAnyNodesBoundToSameGpu();
     assertAllNodesAgreeThat(!sharedGpus, report::MULTIPLE_NODES_BOUND_TO_SAME_GPU, caller);
 }
 
 void validate_gpuIsCuQuantumCompatible(const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     int minCC = 70;
     int ourCC = gpu_getComputeCapability();
     tokenSubs vars = {
@@ -1466,6 +1537,53 @@ void validate_gpuIsCuQuantumCompatible(const char* caller) {
     assertAllNodesAgreeThat(hasMemPools, report::CUQUANTUM_DEPLOYED_ON_GPU_WITHOUT_MEM_POOLS, caller);
 }
 
+void validate_mpiInitStatus(bool useDistrib, bool userOwnsMpi, const char* caller) {
+
+    // Validation prior to this function confirms init(Custom*)QuESTEnv is only ever called
+    // once, but we must additionally confirm the user has interacted with MPI legally
+
+    if (!global_isValidationEnabled)
+        return;
+
+    // We consult whether MPI itself has been initialised, NOT whether QuEST is using it
+    bool isMpiInit = comm_isMpiInit();
+
+    // (A) If the user does not declare ownership of MPI, they are forbidden to initialise it,
+    //     even when they are not distributing QuEST (i.e. useDistrib=0), just for clarity!
+    if (!userOwnsMpi)
+        assertThat(!isMpiInit, report::QUEST_OWNED_MPI_WAS_PRE_INIT, caller);
+
+    // (B) If QuEST will use MPI owned by the user, the user must have pre-initialised it
+    if (useDistrib && userOwnsMpi)
+        assertThat(isMpiInit, report::USER_OWNED_MPI_WAS_NOT_INIT, caller);
+    
+    // Confirmation that all 8 scenarios are handled:
+    //     useDistrib=0, userOwnsMpi=0, isMpiInit=0 (legal: nobody wants MPI)
+    // (A) useDistrib=0, userOwnsMpi=0, isMpiInit=1 (illegal: user lied about ownership)
+    //     useDistrib=0, userOwnsMpi=1, isMpiInit=0 (legal: user owns MPI but does nothing!)
+    //     useDistrib=0, userOwnsMpi=1, isMpiInit=1 (legal: user owns MPI, QuEST won't use it)
+    //     useDistrib=1, userOwnsMpi=0, isMpiInit=0 (legal: QuEST will init MPI)
+    // (A) useDistrib=1, userOwnsMpi=0, isMpiInit=1 (illegal: user lied about ownership)
+    // (B) useDistrib=1, userOwnsMpi=1, isMpiInit=0 (illegal: user has responsibility to pre-init)
+    //     useDistrib=1, userOwnsMpi=1, isMpiInit=1 (legal: user fulfilled responsibility to pre-init)
+}
+
+void validate_mpiSubCommIsNonNull(bool isNonNull, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertThat(isNonNull, report::USER_GIVEN_MPI_COMMUNICATOR_IS_NULL, caller);
+}
+
+void validate_mpiSubCommSetSucceeded(bool success, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertThat(success, report::USER_GIVEN_MPI_COMMUNICATOR_FAILED_TO_SET, caller);
+}
+
 
 
 /*
@@ -1474,6 +1592,9 @@ void validate_gpuIsCuQuantumCompatible(const char* caller) {
 
 void validate_envIsInit(const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(isQuESTEnvInit(), report::QUEST_ENV_NOT_INIT, caller);
 }
 
@@ -1485,45 +1606,69 @@ void validate_envIsInit(const char* caller) {
 
 void validate_randomSeeds(unsigned* seeds, int numSeeds, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // only the root node's seeds are consulted, so we permit all non-root
     // nodes to have invalid parameters. All nodes however must know/agree
     // when the root node's seeds are invalid, to synchronise validation
-
+    int isNull = (seeds == nullptr);
     int numRootSeeds = numSeeds;
-    if (getQuESTEnv().isDistributed)
+    if (getQuESTEnv().isDistributed) {
+        comm_broadcastIntsFromRoot(&isNull, 1);
         comm_broadcastIntsFromRoot(&numRootSeeds, 1);
+    }
 
+    assertThat(!isNull, report::RANDOM_SEEDS_PTR_IS_NULL, caller);
     assertThat(numRootSeeds > 0, report::INVALID_NUM_RANDOM_SEEDS, {{"${NUM_SEEDS}", numSeeds}}, caller);
 }
 
 void validate_newEpsilonValue(qreal eps, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(eps >= 0, report::INVALID_NEW_EPSILON, {{"${NEW_EPS}", eps}}, caller);
 }
 
 void validate_newMaxNumReportedScalars(qindex numRows, qindex numCols, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(numRows >= 0, report::INVALID_NUM_REPORTED_SCALARS, {{"${NUM_ITEMS}", numRows}}, caller);
     assertThat(numCols >= 0, report::INVALID_NUM_REPORTED_SCALARS, {{"${NUM_ITEMS}", numCols}}, caller);
 }
 
 void validate_newMaxNumReportedSigFigs(int numSigFigs, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(numSigFigs >= 1, report::INVALID_NUM_REPORTED_SIG_FIGS, {{"${NUM_SIG_FIGS}", numSigFigs}}, caller);
 }
 
 void validate_newNumReportedNewlines(int numNewlines, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(numNewlines >= 0, report::INVALID_NUM_REPORTED_NEWLINES, {{"${NUM_NEWLINES}", numNewlines}}, caller);
 }
 
 void validate_numReportedNewlinesAboveZero(const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(printer_getNumTrailingNewlines() > 0, report::INSUFFICIENT_NUM_REPORTED_NEWLINES, caller);
 }
 
 void validate_numPauliChars(const char* paulis, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // check position of terminal char, else default to numChars=5 (illegal)
     int numChars = 0;
     for (int i=0; i<5 && paulis[i] != '\0'; i++)
@@ -1534,9 +1679,55 @@ void validate_numPauliChars(const char* paulis, const char* caller) {
 
 void validate_reportedPauliStrStyleFlag(int flag, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(flag==0 || flag==1, report::INVALID_REPORTED_PAULI_STR_STYLE_FLAG, {{"${FLAG}",flag}}, caller);
 }
 
+void validate_numGpuThreadsPerBlock(int numTPB, bool isGpuActive, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    // var 'isGpuActive' indicates that the GPU backend is compiled, a physical
+    // GPU is available, AND that the QuESTEnv has GPU-acceleration enabled, i.e.
+    // isGpuActive = gpu_isGpuCompiled() && gpu_isGpuAvailable() && env.isGpuAccelerated,
+    // though is established before QuESTEnv initialisation has completed.
+
+    // validate numTPB > 0 with an error message that points out TPB may be redundant
+    tokenSubs vars = {{"${NUM_TPB}", numTPB}};
+    auto errorMsg = isGpuActive? 
+        report::GPU_NUM_THREADS_PER_BLOCK_IS_NOT_POSITIVE :
+        report::GPU_NUM_THREADS_PER_BLOCK_IS_NOT_POSITIVE_BUT_GPU_NOT_ACTIVE_ANYWAY;
+    assertThat(numTPB > 0, errorMsg, vars, caller);
+
+    // prepare to validate TPB is warp-divisible, again pointing out redundancy...
+    vars["${CUDA_WARP_SIZE}"] = gpu_CUDA_WARP_SIZE;
+    vars["${HIP_WARP_SIZE}"] = gpu_HIP_WARP_SIZE;
+    errorMsg = isGpuActive? 
+        report::GPU_NUM_THREADS_PER_BLOCK_IS_NOT_WARP_DIVISIBLE :
+        report::GPU_NUM_THREADS_PER_BLOCK_IS_NOT_WARP_DIVISIBLE_BUT_GPU_NOT_AVAILABLE_ANYWAY;
+
+    // ... but note that when the GPU backend isn't compiled, we don't know whether the
+    // user has an NVIDIA or AMD GPU, which have distinct warps of 32 (CUDA) and 64 (HIP),
+    // and so choose the smaller divisor (32,CUDA), ergo potentially permitting warp TPB
+    // that are incompatible with HIP. An extremely unimportant subtlety!
+    static_assert(gpu_HIP_WARP_SIZE >= gpu_CUDA_WARP_SIZE);
+    int warpSize = gpu_isHipCompiled()? gpu_HIP_WARP_SIZE : gpu_CUDA_WARP_SIZE;
+    assertThat(numTPB % warpSize == 0, errorMsg, vars, caller);
+
+    // the final check of max numTPB requires querying the hardware device, which obviously
+    // isn't possible if not available (and is pointless if available but we're not using!)
+    if (!isGpuActive)
+        return;
+
+    // otherwise, we verify numTPB doesn't exceed the hardware-declared maximum
+    auto maxNumTPB = gpu_getMaxNumThreadsPerBlock();
+    vars = {{"${NUM_TPB}", numTPB}, {"${MAX_TPB}", maxNumTPB}};
+    assertThat(numTPB <= maxNumTPB, report::GPU_NUM_THREADS_PER_BLOCK_EXCEEDS_HARDWARE_MAX, vars, caller);
+}
+
 
 
 /*
@@ -1707,8 +1898,6 @@ void assertQuregFitsInGpuMem(int numQubits, int isDensMatr, int isDistrib, int i
 
 void validate_newQuregParams(int numQubits, int isDensMatr, int isDistrib, int isGpuAccel, int isMultithread, QuESTEnv env, const char* caller) {
 
-    // some of the below validation involves getting distributed node consensus, which
-    // can be an expensive synchronisation, which we avoid if validation is anyway disabled
     if (!global_isValidationEnabled)
         return;
 
@@ -1724,6 +1913,9 @@ void validate_newQuregParams(int numQubits, int isDensMatr, int isDistrib, int i
 
 void validate_newQuregAllocs(Qureg qureg, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // this validation is called AFTER the caller has checked for failed
     // allocs and (in that scenario) freed every pointer, but does not 
     // overwrite any pointers to nullptr, so the failed alloc is known.
@@ -1752,6 +1944,9 @@ void validate_newQuregAllocs(Qureg qureg, const char* caller) {
 
 void validate_quregFields(Qureg qureg, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // attempt to detect the Qureg was not initialised with createQureg by the 
     // struct fields being randomised, and ergo being dimensionally incompatible
     bool valid = true;
@@ -1781,11 +1976,17 @@ void validate_quregFields(Qureg qureg, const char* caller) {
 
 void validate_quregIsStateVector(Qureg qureg, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(!qureg.isDensityMatrix, report::QUREG_NOT_STATE_VECTOR, caller);
 }
 
 void validate_quregIsDensityMatrix(Qureg qureg, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(qureg.isDensityMatrix, report::QUREG_NOT_DENSITY_MATRIX, caller);
 }
 
@@ -2025,6 +2226,10 @@ void assertNewMatrixParamsAreValid(int numQubits, int useDistrib, int useGpu, in
 }
 
 void validate_newCompMatrParams(int numQubits, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
     validate_envIsInit(caller);
 
     // CompMatr can never be distributed nor multithreaded
@@ -2040,6 +2245,10 @@ void validate_newCompMatrParams(int numQubits, const char* caller) {
     assertNewMatrixParamsAreValid(numQubits, useDistrib, useGpu, useMultithread, isDenseType, caller);
 }
 void validate_newDiagMatrParams(int numQubits, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
     validate_envIsInit(caller);
 
     // DiagMatr can never be distributed nor multithreaded
@@ -2055,6 +2264,10 @@ void validate_newDiagMatrParams(int numQubits, const char* caller) {
     assertNewMatrixParamsAreValid(numQubits, useDistrib, useGpu, useMultithread, isDenseType, caller);
 }
 void validate_newFullStateDiagMatrParams(int numQubits, int useDistrib, int useGpu, int useMultithread, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
     validate_envIsInit(caller);
 
     // FullStateDiagMatr stores only the diagonals
@@ -2127,6 +2340,9 @@ void assertNewMatrixAllocsSucceeded(T matr, size_t numBytes, const char* caller)
 
 void validate_newMatrixAllocs(CompMatr matr, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     bool isDenseMatrix = true;
     int numNodes = 1;
     size_t numBytes = mem_getLocalMatrixMemoryRequired(matr.numQubits, isDenseMatrix, numNodes);
@@ -2134,6 +2350,9 @@ void validate_newMatrixAllocs(CompMatr matr, const char* caller) {
 }
 void validate_newMatrixAllocs(DiagMatr matr, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     bool isDenseMatrix = false;
     int numNodes = 1;
     size_t numBytes = mem_getLocalMatrixMemoryRequired(matr.numQubits, isDenseMatrix, numNodes);
@@ -2141,6 +2360,9 @@ void validate_newMatrixAllocs(DiagMatr matr, const char* caller) {
 }
 void validate_newMatrixAllocs(FullStateDiagMatr matr, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     bool isDenseMatrix = false;
     int numNodes = (matr.isDistributed)? comm_getNumNodes() : 1;
     size_t numBytes = mem_getLocalMatrixMemoryRequired(matr.numQubits, isDenseMatrix, numNodes);
@@ -2155,6 +2377,9 @@ void validate_newMatrixAllocs(FullStateDiagMatr matr, const char* caller) {
 
 void validate_matrixNumNewElems(int numQubits, vector<vector<qcomp>> elems, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // CompMatr accept 2D elems   
     qindex dim = powerOf2(numQubits);
     tokenSubs vars = {
@@ -2176,6 +2401,9 @@ void validate_matrixNumNewElems(int numQubits, vector<vector<qcomp>> elems, cons
 }
 void validate_matrixNumNewElems(int numQubits, vector<qcomp> elems, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // DiagMatr accept 1D elems
     qindex dim = powerOf2(numQubits);
     tokenSubs vars = {
@@ -2188,11 +2416,17 @@ void validate_matrixNumNewElems(int numQubits, vector<qcomp> elems, const char*
 
 void validate_matrixNewElemsPtrNotNull(qcomp* elems, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(mem_isAllocated(elems), report::DIAG_MATR_NEW_ELEMS_NULL_PTR, caller);
 }
 
 void validate_matrixNewElemsPtrNotNull(qcomp** elems, qindex numRows, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // messages are suitable for all dense matrices, including SuperOp
 
     assertThat(mem_isOuterAllocated(elems), report::DENSE_MATR_NEW_ELEMS_OUTER_NULL_PTR, caller);
@@ -2203,6 +2437,9 @@ void validate_matrixNewElemsPtrNotNull(qcomp** elems, qindex numRows, const char
 
 void validate_fullStateDiagMatrNewElems(FullStateDiagMatr matr, qindex startInd, qindex numElems, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(
         startInd >= 0 && startInd < matr.numElems, 
         report::FULL_STATE_DIAG_MATR_NEW_ELEMS_INVALID_START_INDEX, 
@@ -2234,6 +2471,9 @@ void validate_fullStateDiagMatrNewElems(FullStateDiagMatr matr, qindex startInd,
 
 void validate_matrixNumQubitsMatchesParam(int numMatrQubits, int numSetterQubits, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     tokenSubs vars = {
         {"${NUM_SETTER_QUBITS}", numSetterQubits},
         {"${NUM_MATRIX_QUBITS}", numMatrQubits}};
@@ -2243,6 +2483,9 @@ void validate_matrixNumQubitsMatchesParam(int numMatrQubits, int numSetterQubits
 
 void validate_declaredNumElemsMatchesVectorLength(qindex numElems, qindex vecLength, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     tokenSubs vars = {
         {"${NUM_ELEMS}", numElems},
         {"${VEC_LENGTH}", vecLength}};
@@ -2252,6 +2495,9 @@ void validate_declaredNumElemsMatchesVectorLength(qindex numElems, qindex vecLen
 
 void validate_multiVarFuncQubits(int numMatrQubits, int* numQubitsPerVar, int numVars, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(numVars > 0, report::MULTI_VAR_FUNC_INVALID_NUM_VARS, {{"${NUM_VARS}", numVars}}, caller);
 
     for (int v=0; v<numVars; v++)
@@ -2267,11 +2513,17 @@ void validate_multiVarFuncQubits(int numMatrQubits, int* numQubitsPerVar, int nu
 
 void validate_funcVarSignedFlag(int areSigned, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(areSigned == 0 || areSigned == 1, report::MULTI_VAR_FUNC_INVALID_ARE_SIGNED_FLAG, {{"${ARE_SIGNED}", areSigned}}, caller);
 }
 
 void validate_matrixRowsAllSameSize(vector<vector<qcomp>> matrix, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     if (matrix.empty())
         return;
 
@@ -2396,13 +2648,55 @@ void assertMatrixFieldsAreValid(T matr, int expectedNumQb, string badFieldMsg, c
     // no risk that they're wrong (because they're const so users cannot modify them) unless 
     // the struct was unitialised, which we have already validated against
 }
-void validate_matrixFields(CompMatr1 m, const char* caller) { assertMatrixFieldsAreValid(m, 1,           report::INVALID_COMP_MATR_1_FIELDS, caller); }
-void validate_matrixFields(CompMatr2 m, const char* caller) { assertMatrixFieldsAreValid(m, 2,           report::INVALID_COMP_MATR_2_FIELDS, caller); }
-void validate_matrixFields(CompMatr  m, const char* caller) { assertMatrixFieldsAreValid(m, m.numQubits, report::INVALID_COMP_MATR_FIELDS,   caller); }
-void validate_matrixFields(DiagMatr1 m, const char* caller) { assertMatrixFieldsAreValid(m, 1,           report::INVALID_DIAG_MATR_1_FIELDS, caller); }
-void validate_matrixFields(DiagMatr2 m, const char* caller) { assertMatrixFieldsAreValid(m, 2,           report::INVALID_DIAG_MATR_2_FIELDS, caller); }
-void validate_matrixFields(DiagMatr  m, const char* caller) { assertMatrixFieldsAreValid(m, m.numQubits, report::INVALID_DIAG_MATR_FIELDS,   caller); }
-void validate_matrixFields(FullStateDiagMatr m, const char* caller) { assertMatrixFieldsAreValid(m, m.numQubits, report::INVALID_FULL_STATE_DIAG_MATR_FIELDS, caller); }
+void validate_matrixFields(CompMatr1 m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixFieldsAreValid(m, 1, report::INVALID_COMP_MATR_1_FIELDS, caller);
+}
+void validate_matrixFields(CompMatr2 m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixFieldsAreValid(m, 2, report::INVALID_COMP_MATR_2_FIELDS, caller);
+}
+void validate_matrixFields(CompMatr  m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixFieldsAreValid(m, m.numQubits, report::INVALID_COMP_MATR_FIELDS,   caller);
+}
+void validate_matrixFields(DiagMatr1 m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixFieldsAreValid(m, 1, report::INVALID_DIAG_MATR_1_FIELDS, caller);
+}
+void validate_matrixFields(DiagMatr2 m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixFieldsAreValid(m, 2, report::INVALID_DIAG_MATR_2_FIELDS, caller);
+}
+void validate_matrixFields(DiagMatr  m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixFieldsAreValid(m, m.numQubits, report::INVALID_DIAG_MATR_FIELDS,   caller);
+}
+void validate_matrixFields(FullStateDiagMatr m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixFieldsAreValid(m, m.numQubits, report::INVALID_FULL_STATE_DIAG_MATR_FIELDS, caller);
+}
 
 // type T can be CompMatr, DiagMatr or FullStateDiagMatr
 template <class T>
@@ -2417,9 +2711,27 @@ void assertMatrixIsSynced(T matr, string errMsg, const char* caller) {
     // NOT GPU-accelerated and ergo the GPU memory is not consulted. It's best to build the habit in the user!
     assertThat(*(matr.wasGpuSynced) == 1, errMsg, caller);
 }
-void validate_matrixIsSynced(CompMatr matr, const char* caller) { assertMatrixIsSynced(matr, report::COMP_MATR_NOT_SYNCED_TO_GPU, caller);}
-void validate_matrixIsSynced(DiagMatr matr, const char* caller) { assertMatrixIsSynced(matr, report::DIAG_MATR_NOT_SYNCED_TO_GPU, caller); }
-void validate_matrixIsSynced(FullStateDiagMatr matr, const char* caller) { assertMatrixIsSynced(matr, report::FULL_STATE_DIAG_MATR_NOT_SYNCED_TO_GPU, caller); }
+void validate_matrixIsSynced(CompMatr matr, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsSynced(matr, report::COMP_MATR_NOT_SYNCED_TO_GPU, caller);
+}
+void validate_matrixIsSynced(DiagMatr matr, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsSynced(matr, report::DIAG_MATR_NOT_SYNCED_TO_GPU, caller);
+}
+void validate_matrixIsSynced(FullStateDiagMatr matr, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsSynced(matr, report::FULL_STATE_DIAG_MATR_NOT_SYNCED_TO_GPU, caller);
+}
 
 // type T can be CompMatr1, CompMatr2, CompMatr, DiagMatr1, DiagMatr2, DiagMatr, FullStateDiagMatr
 template <class T> 
@@ -2439,13 +2751,55 @@ void assertMatrixIsUnitary(T matr, const char* caller) {
     // may overwrite matr.isApproxUnitary of heap matrices, otherwise ignores epsilon
     assertThat(util_isUnitary(matr, global_validationEpsilon), report::MATRIX_NOT_UNITARY, caller);
 }
-void validate_matrixIsUnitary(CompMatr1 m, const char* caller) { assertMatrixIsUnitary(m, caller); }
-void validate_matrixIsUnitary(CompMatr2 m, const char* caller) { assertMatrixIsUnitary(m, caller); }
-void validate_matrixIsUnitary(CompMatr  m, const char* caller) { assertMatrixIsUnitary(m, caller); }
-void validate_matrixIsUnitary(DiagMatr1 m, const char* caller) { assertMatrixIsUnitary(m, caller); }
-void validate_matrixIsUnitary(DiagMatr2 m, const char* caller) { assertMatrixIsUnitary(m, caller); }
-void validate_matrixIsUnitary(DiagMatr  m, const char* caller) { assertMatrixIsUnitary(m, caller); }
-void validate_matrixIsUnitary(FullStateDiagMatr m, const char* caller) { assertMatrixIsUnitary(m, caller); }
+void validate_matrixIsUnitary(CompMatr1 m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsUnitary(m, caller);
+}
+void validate_matrixIsUnitary(CompMatr2 m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsUnitary(m, caller);
+}
+void validate_matrixIsUnitary(CompMatr  m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsUnitary(m, caller);
+}
+void validate_matrixIsUnitary(DiagMatr1 m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsUnitary(m, caller);
+}
+void validate_matrixIsUnitary(DiagMatr2 m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsUnitary(m, caller);
+}
+void validate_matrixIsUnitary(DiagMatr  m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsUnitary(m, caller);
+}
+void validate_matrixIsUnitary(FullStateDiagMatr m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsUnitary(m, caller);
+}
 
 void validate_unitaryExponentIsReal(qcomp exponent, const char* caller) {
 
@@ -2479,30 +2833,72 @@ void assertMatrixIsHermitian(T matr, const char* caller) {
     // may overwrite matr.isApproxHermitian of heap matrices, otherwise ignores epsilon
     assertThat(util_isHermitian(matr, global_validationEpsilon), report::MATRIX_NOT_HERMITIAN, caller);
 }
-void validate_matrixIsHermitian(CompMatr1 m, const char* caller) { assertMatrixIsHermitian(m, caller); }
-void validate_matrixIsHermitian(CompMatr2 m, const char* caller) { assertMatrixIsHermitian(m, caller); }
-void validate_matrixIsHermitian(CompMatr  m, const char* caller) { assertMatrixIsHermitian(m, caller); }
-void validate_matrixIsHermitian(DiagMatr1 m, const char* caller) { assertMatrixIsHermitian(m, caller); }
-void validate_matrixIsHermitian(DiagMatr2 m, const char* caller) { assertMatrixIsHermitian(m, caller); }
-void validate_matrixIsHermitian(DiagMatr  m, const char* caller) { assertMatrixIsHermitian(m, caller); }
-void validate_matrixIsHermitian(FullStateDiagMatr m, const char* caller) { assertMatrixIsHermitian(m, caller); }
+void validate_matrixIsHermitian(CompMatr1 m, const char* caller) {
 
-// type T can be DiagMatr, FullStateDiagMatr
-template <class T> 
-void assertMatrExpIsNonDiverging(T matr, qcomp exponent, const char* caller) {
+    if (!global_isValidationEnabled)
+        return;
 
-    validate_matrixFields(matr, caller);
-    validate_matrixIsSynced(matr, caller);
+    assertMatrixIsHermitian(m, caller);
+}
+void validate_matrixIsHermitian(CompMatr2 m, const char* caller) {
 
-    // avoid exepensive and epsilon-dependent validation below (do not overwrite matr.isApproxNonZero)
-    if (isNumericalValidationDisabled())
+    if (!global_isValidationEnabled)
         return;
 
-    // divergences are only validated when the imaginary component is strictly 
-    // zero, otherwise alternate complex exponentiation is sometimes performed
-    // with a more complicated numerical stability
-    if (std::imag(exponent) != 0)
-        return;
+    assertMatrixIsHermitian(m, caller);
+}
+void validate_matrixIsHermitian(CompMatr  m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsHermitian(m, caller);
+}
+void validate_matrixIsHermitian(DiagMatr1 m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsHermitian(m, caller);
+}
+void validate_matrixIsHermitian(DiagMatr2 m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsHermitian(m, caller);
+}
+void validate_matrixIsHermitian(DiagMatr  m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsHermitian(m, caller);
+}
+void validate_matrixIsHermitian(FullStateDiagMatr m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsHermitian(m, caller);
+}
+
+// type T can be DiagMatr, FullStateDiagMatr
+template <class T> 
+void assertMatrExpIsNonDiverging(T matr, qcomp exponent, const char* caller) {
+
+    validate_matrixFields(matr, caller);
+    validate_matrixIsSynced(matr, caller);
+
+    // avoid expensive and epsilon-dependent validation below (do not overwrite matr.isApproxNonZero)
+    if (isNumericalValidationDisabled())
+        return;
+
+    // divergences are only validated when the imaginary component is strictly 
+    // zero, otherwise alternate complex exponentiation is sometimes performed
+    // with a more complicated numerical stability
+    if (std::imag(exponent) != 0)
+        return;
 
     // when the real exponent is STRICTLY less than zero, it is required that every
     // matrix element's magnitude is APPROX non-zero, to avoid 1/0 divergences. 
@@ -2512,8 +2908,20 @@ void assertMatrExpIsNonDiverging(T matr, qcomp exponent, const char* caller) {
     if (std::real(exponent) < 0)
         assertThat(util_isApproxNonZero(matr, global_validationEpsilon), report::DIAG_MATR_APPROX_ZERO_WHILE_EXPONENT_REAL_AND_NEGATIVE, caller);
 }
-void validate_matrixExpIsNonDiverging(DiagMatr          m, qcomp p, const char* caller) { assertMatrExpIsNonDiverging(m, p, caller); }
-void validate_matrixExpIsNonDiverging(FullStateDiagMatr m, qcomp p, const char* caller) { assertMatrExpIsNonDiverging(m, p, caller); }
+void validate_matrixExpIsNonDiverging(DiagMatr m, qcomp p, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrExpIsNonDiverging(m, p, caller);
+}
+void validate_matrixExpIsNonDiverging(FullStateDiagMatr m, qcomp p, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrExpIsNonDiverging(m, p, caller);
+}
 
 // type T can be DiagMatr, FullStateDiagMatr
 template <class T> 
@@ -2551,8 +2959,20 @@ void assertMatrExpIsHermitian(T matr, qreal exponent, const char* caller) {
     // result tends to 1 so does not vanish or blow up unexpectedly. All fine!
 }
 
-void validate_matrixExpIsHermitian(DiagMatr          m, qreal p, const char* caller) { assertMatrExpIsHermitian(m, p, caller); }
-void validate_matrixExpIsHermitian(FullStateDiagMatr m, qreal p, const char* caller) { assertMatrExpIsHermitian(m, p, caller); }
+void validate_matrixExpIsHermitian(DiagMatr m, qreal p, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrExpIsHermitian(m, p, caller);
+}
+void validate_matrixExpIsHermitian(FullStateDiagMatr m, qreal p, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrExpIsHermitian(m, p, caller);
+}
 
 template <class T>
 void assertMatrixDimMatchesTargs(T matr, int numTargs, const char* caller) {
@@ -2573,15 +2993,54 @@ void assertMatrixDimMatchesTargs(T matr, int numTargs, const char* caller) {
     assertThat(numMatrQubits == numTargs, report::MATRIX_SIZE_MISMATCHES_NUM_TARGETS, vars, caller);
 }
 
-void validate_matrixDimMatchesTargets(CompMatr1 matr, int numTargs, const char* caller) { assertMatrixDimMatchesTargs(matr, numTargs, caller); }
-void validate_matrixDimMatchesTargets(CompMatr2 matr, int numTargs, const char* caller) { assertMatrixDimMatchesTargs(matr, numTargs, caller); }
-void validate_matrixDimMatchesTargets(CompMatr  matr, int numTargs, const char* caller) { assertMatrixDimMatchesTargs(matr, numTargs, caller); }
-void validate_matrixDimMatchesTargets(DiagMatr1 matr, int numTargs, const char* caller) { assertMatrixDimMatchesTargs(matr, numTargs, caller); }
-void validate_matrixDimMatchesTargets(DiagMatr2 matr, int numTargs, const char* caller) { assertMatrixDimMatchesTargs(matr, numTargs, caller); }
-void validate_matrixDimMatchesTargets(DiagMatr  matr, int numTargs, const char* caller) { assertMatrixDimMatchesTargs(matr, numTargs, caller); }
+void validate_matrixDimMatchesTargets(CompMatr1 matr, int numTargs, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixDimMatchesTargs(matr, numTargs, caller);
+}
+void validate_matrixDimMatchesTargets(CompMatr2 matr, int numTargs, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixDimMatchesTargs(matr, numTargs, caller);
+}
+void validate_matrixDimMatchesTargets(CompMatr  matr, int numTargs, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixDimMatchesTargs(matr, numTargs, caller);
+}
+void validate_matrixDimMatchesTargets(DiagMatr1 matr, int numTargs, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixDimMatchesTargs(matr, numTargs, caller);
+}
+void validate_matrixDimMatchesTargets(DiagMatr2 matr, int numTargs, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixDimMatchesTargs(matr, numTargs, caller);
+}
+void validate_matrixDimMatchesTargets(DiagMatr  matr, int numTargs, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixDimMatchesTargs(matr, numTargs, caller);
+}
 
 void validate_matrixAndQuregAreCompatible(FullStateDiagMatr matr, Qureg qureg, bool expecOnly, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // we do not need to define this function for the other matrix types,
     // since their validation will happen through validation of the
     // user-given list of target qubits. But we do need to define it for
@@ -2707,8 +3166,6 @@ void assertSuperOpFitsInGpuMem(int numQubits, int isEnvGpuAccel, bool isInKrausM
 
 void validate_newSuperOpParams(int numQubits, const char* caller) {
 
-    // some of the below validation involves getting distributed node consensus, which
-    // can be an expensive synchronisation, which we avoid if validation is anyway disabled
     if (!global_isValidationEnabled)
         return;
 
@@ -2761,13 +3218,15 @@ void assertNewSuperOpAllocs(SuperOp op, bool isInKrausMap, const char* caller) {
 
 void validate_newSuperOpAllocs(SuperOp op, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     bool isInKrausMap = false;
     assertNewSuperOpAllocs(op, isInKrausMap, caller);
 }
 
 void validate_newInlineSuperOpDimMatchesVectors(int numDeclaredQubits, vector<vector<qcomp>> matrix, const char* caller) {
 
-    // avoid potentially expensive matrix enumeration if validation is anyway disabled
     if (!global_isValidationEnabled)
         return;
 
@@ -2796,7 +3255,6 @@ void validate_newInlineSuperOpDimMatchesVectors(int numDeclaredQubits, vector<ve
 
 void validate_superOpNewMatrixDims(SuperOp op, vector<vector<qcomp>> matrix, const char* caller) {
 
-    // avoid potentially expensive matrix enumeration if validation is anyway disabled
     if (!global_isValidationEnabled)
         return;
 
@@ -2818,6 +3276,9 @@ void validate_superOpNewMatrixDims(SuperOp op, vector<vector<qcomp>> matrix, con
 
 void validate_superOpFieldsMatchPassedParams(SuperOp op, int numQb, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     tokenSubs vars = {
         {"${NUM_PASSED_QUBITS}", numQb},
         {"${NUM_OP_QUBITS}",     op.numQubits}};
@@ -2864,12 +3325,18 @@ void assertSuperOpFieldsAreValid(SuperOp op, bool isInKrausMap, const char* call
 
 void validate_superOpFields(SuperOp op, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     bool isInKrausMap = false;
     assertSuperOpFieldsAreValid(op, isInKrausMap, caller);
 }
 
 void validate_superOpIsSynced(SuperOp op, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // we don't need to perform any sync check in CPU-only mode
     if (!mem_isAllocated(util_getGpuMemPtr(op)))
         return;
@@ -2880,6 +3347,9 @@ void validate_superOpIsSynced(SuperOp op, const char* caller) {
 
 void validate_superOpDimMatchesTargs(SuperOp op, int numTargets, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
-    tokenSubs vars = {{"${OP_QUBITS}", op.numQubits}, {"{NUM_TARGS}", numTargets}};
+    tokenSubs vars = {{"${OP_QUBITS}", op.numQubits}, {"${NUM_TARGS}", numTargets}};
     assertThat(op.numQubits == numTargets, report::SUPER_OP_SIZE_MISMATCHES_NUM_TARGETS, vars, caller);
 }
@@ -2915,8 +3385,6 @@ void assertKrausMapValidNumMatrices(int numQubits, int numMatrices, const char*
 
 void validate_newKrausMapParams(int numQubits, int numMatrices, const char* caller) {
 
-    // some of the below validation involves getting distributed node consensus, which
-    // can be an expensive synchronisation, which we avoid if validation is anyway disabled
     if (!global_isValidationEnabled)
         return;
 
@@ -2944,6 +3412,9 @@ void validate_newKrausMapParams(int numQubits, int numMatrices, const char* call
 
 void validate_newKrausMapAllocs(KrausMap map, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // unlike other post-creation allocation validation, this function
     // expects that when allocation failed and the heap fields have already
     // been cleared, that any nested field (like map.matrices) has had the
@@ -2954,11 +3425,6 @@ void validate_newKrausMapAllocs(KrausMap map, const char* caller) {
     // (and is nullptr), so we must check it last so as not to false report 
     // it as the cause of the failure!
 
-    // we expensively get node consensus about malloc failure, in case of heterogeneous hardware/loads,
-    // but we avoid this if validation is anyway disabled
-    if (!global_isValidationEnabled)
-        return;
-
     // prior validation gaurantees this will not overflow
     qindex matrListMem = map.numMatrices * mem_getLocalMatrixMemoryRequired(map.numQubits, true, 1);
     tokenSubs vars = {
@@ -2981,7 +3447,6 @@ void validate_newKrausMapAllocs(KrausMap map, const char* caller) {
 
 void validate_newInlineKrausMapDimMatchesVectors(int numQubits, int numOperators, vector<vector<vector<qcomp>>> matrices, const char* caller) {
 
-    // avoid potentially expensive matrix enumeration if validation is anyway disabled
     if (!global_isValidationEnabled)
         return;
 
@@ -3012,10 +3477,9 @@ void validate_newInlineKrausMapDimMatchesVectors(int numQubits, int numOperators
 
 void validate_krausMapNewMatrixDims(KrausMap map, vector<vector<vector<qcomp>>> matrices, const char* caller) {
 
-    // avoid potentially expensive matrix enumeration if validation is anyway disabled
     if (!global_isValidationEnabled)
         return;
-    
+
     assertThat(map.numMatrices == (int) matrices.size(), report::KRAUS_MAP_INCOMPATIBLE_NUM_NEW_MATRICES,
         {{"${NUM_GIVEN}", matrices.size()}, {"${NUM_EXPECTED}", map.numMatrices}}, caller);
 
@@ -3035,6 +3499,9 @@ void validate_krausMapNewMatrixDims(KrausMap map, vector<vector<vector<qcomp>>>
 
 void validate_krausMapFieldsMatchPassedParams(KrausMap map, int numQb, int numOps, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     tokenSubs vars = {
         {"${NUM_MAP_QUBITS}",    map.numQubits},
         {"${NUM_MAP_OPS}",       map.numMatrices},
@@ -3053,6 +3520,9 @@ void validate_krausMapFieldsMatchPassedParams(KrausMap map, int numQb, int numOp
 
 void validate_krausMapFields(KrausMap map, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     tokenSubs vars = {
         {"${NUM_QUBITS}",   map.numQubits},
         {"${NUM_MATRICES}", map.numMatrices},
@@ -3087,6 +3557,9 @@ void validate_krausMapFields(KrausMap map, const char* caller) {
 
 void validate_krausMapIsSynced(KrausMap map, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // we don't need to perform any sync check in CPU-only mode
     if (!mem_isAllocated(util_getGpuMemPtr(map.superop)))
         return;
@@ -3096,6 +3569,10 @@ void validate_krausMapIsSynced(KrausMap map, const char* caller) {
 }
 
 void validate_krausMapIsCPTP(KrausMap map, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
     validate_krausMapFields(map, caller);
     validate_krausMapIsSynced(map, caller);
 
@@ -3109,6 +3586,9 @@ void validate_krausMapIsCPTP(KrausMap map, const char* caller) {
 
 void validate_krausMapMatchesTargets(KrausMap map, int numTargets, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     tokenSubs vars = {{"${KRAUS_QUBITS}", map.numQubits}, {"${TARG_QUBITS}", numTargets}};
     assertThat(map.numQubits == numTargets, report::KRAUS_MAP_SIZE_MISMATCHES_TARGETS, vars, caller);
 }
@@ -3179,6 +3659,9 @@ void assertValidNewPauliIndices(int* indices, int numInds, int maxIndExcl, const
 
 void validate_newPauliStrNumPaulis(int numPaulis, int maxNumPaulis, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     tokenSubs vars = {{"${NUM_PAULIS}", numPaulis}};
     assertThat(numPaulis > 0, report::NEW_PAULI_STR_NON_POSITIVE_NUM_PAULIS, vars, caller);
 
@@ -3188,6 +3671,9 @@ void validate_newPauliStrNumPaulis(int numPaulis, int maxNumPaulis, const char*
 
 void validate_newPauliStrParams(const char* paulis, int* indices, int numPaulis, int maxNumPaulis, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     validate_newPauliStrNumPaulis(numPaulis, maxNumPaulis, caller);
     assertCorrectNumPauliCharsBeforeTerminationChar(paulis, numPaulis, caller);
     assertRecognisedNewPaulis(paulis, numPaulis, caller);
@@ -3195,6 +3681,9 @@ void validate_newPauliStrParams(const char* paulis, int* indices, int numPaulis,
 }
 void validate_newPauliStrParams(int* paulis, int* indices, int numPaulis, int maxNumPaulis, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     validate_newPauliStrNumPaulis(numPaulis, maxNumPaulis, caller);
     assertValidNewPauliCodes(paulis, numPaulis, caller);
     assertValidNewPauliIndices(indices, numPaulis, maxNumPaulis, caller);
@@ -3202,6 +3691,9 @@ void validate_newPauliStrParams(int* paulis, int* indices, int numPaulis, int ma
 
 void validate_newPauliStrNumChars(int numPaulis, int numIndices, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // this is a C++-only validation, because only std::string gaurantees we can know
     // the passed string length (C char arrays might not contain termination char)
     tokenSubs vars = {{"${NUM_PAULIS}", numPaulis}, {"${NUM_INDS}", numIndices}};
@@ -3216,6 +3708,9 @@ void validate_newPauliStrNumChars(int numPaulis, int numIndices, const char* cal
 
 void validate_pauliStrTargets(Qureg qureg, PauliStr str, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // avoid producing a list of targets which requires enumerating all bits
     int maxTarg = paulis_getIndOfLefmostNonIdentityPauli(str);
 
@@ -3225,6 +3720,9 @@ void validate_pauliStrTargets(Qureg qureg, PauliStr str, const char* caller) {
 
 void validate_controlsAndPauliStrTargets(Qureg qureg, int* ctrls, int numCtrls, PauliStr str, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // validate targets and controls in isolation
     validate_pauliStrTargets(qureg, str, caller);
     validate_controls(qureg, ctrls, numCtrls, caller);
@@ -3237,6 +3735,9 @@ void validate_controlsAndPauliStrTargets(Qureg qureg, int* ctrls, int numCtrls,
 
 void validate_controlAndPauliStrTargets(Qureg qureg, int ctrl, PauliStr str, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     validate_controlsAndPauliStrTargets(qureg, &ctrl, 1, str, caller);
 }
 
@@ -3248,6 +3749,9 @@ void validate_controlAndPauliStrTargets(Qureg qureg, int ctrl, PauliStr str, con
 
 void validate_newPauliStrSumParams(qindex numTerms, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(numTerms > 0, report::NEW_PAULI_STR_SUM_NON_POSITIVE_NUM_STRINGS, {{"${NUM_TERMS}", numTerms}}, caller);
 
     // assert that the total memory required does not overflow
@@ -3278,12 +3782,18 @@ void validate_newPauliStrSumParams(qindex numTerms, const char* caller) {
 
 void validate_newPauliStrSumMatchingListLens(qindex numStrs, qindex numCoeffs, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     tokenSubs vars = {{"${NUM_STRS}", numStrs}, {"${NUM_COEFFS}", numCoeffs}};
     assertThat(numStrs == numCoeffs, report::NEW_PAULI_STR_SUM_DIFFERENT_NUM_STRINGS_AND_COEFFS, vars, caller);
 }
 
 void validate_newPauliStrSumAllocs(PauliStrSum sum, qindex numBytesStrings, qindex numBytesCoeffs, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // this validation is called AFTER the caller has checked for failed
     // allocs and (in that scenario) freed every pointer, but does not 
     // overwrite any pointers to nullptr, so the failed alloc is known.
@@ -3311,7 +3821,11 @@ void validate_newPauliStrSumAllocs(PauliStrSum sum, qindex numBytesStrings, qind
 
 void validate_parsedPauliStrSumLineIsInterpretable(bool isInterpretable, string line, qindex lineIndex, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     /// @todo we cannot yet report 'line' because tokenSubs so far only accepts integers :(
+    (void) line;
 
     tokenSubs vars = {{"${LINE_NUMBER}", lineIndex + 1}}; // line numbers begin at 1
     assertThat(isInterpretable, report::PARSED_PAULI_STR_SUM_UNINTERPRETABLE_LINE, vars, caller);
@@ -3319,7 +3833,11 @@ void validate_parsedPauliStrSumLineIsInterpretable(bool isInterpretable, string
 
 void validate_parsedPauliStrSumLineHasConsistentNumPaulis(int numPaulis, int numLinePaulis, string line, qindex lineIndex, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     /// @todo we cannot yet report 'line' because tokenSubs so far only accepts integers :(
+    (void) line;
 
     tokenSubs vars = {
         {"${NUM_PAULIS}",      numPaulis},
@@ -3330,7 +3848,11 @@ void validate_parsedPauliStrSumLineHasConsistentNumPaulis(int numPaulis, int num
 
 void validate_parsedPauliStrSumCoeffWithinQcompRange(bool isCoeffValid, string line, qindex lineIndex, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     /// @todo we cannot yet report 'line' because tokenSubs so far only accepts integers :(
+    (void) line;
 
     tokenSubs vars = {{"${LINE_NUMBER}", lineIndex + 1}}; // lines begin at 1
     assertThat(isCoeffValid, report::PARSED_PAULI_STR_SUM_COEFF_EXCEEDS_QCOMP_RANGE, vars, caller);
@@ -3338,6 +3860,9 @@ void validate_parsedPauliStrSumCoeffWithinQcompRange(bool isCoeffValid, string l
 
 void validate_parsedStringIsNotEmpty(bool stringIsNotEmpty, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(stringIsNotEmpty, report::PARSED_STRING_IS_EMPTY, caller);
 }
 
@@ -3351,6 +3876,9 @@ bool areQubitsDisjoint(qindex qubitsMaskA, int* qubitsB, int numQubitsB);
 
 void validate_pauliStrSumFields(PauliStrSum sum, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(sum.numTerms > 0, report::INVALID_PAULI_STR_SUM_FIELDS, {{"${NUM_TERMS}", sum.numTerms}}, caller);
 
     assertThat(mem_isAllocated(sum.coeffs),  report::INVALID_PAULI_STR_HEAP_PTR, caller);
@@ -3378,6 +3906,9 @@ void validate_pauliStrSumIsHermitian(PauliStrSum sum, const char* caller) {
 
 void validate_pauliStrSumTargets(PauliStrSum sum, Qureg qureg, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     int maxInd = paulis_getIndOfLefmostNonIdentityPauli(sum);
     int minNumQb = maxInd + 1;
 
@@ -3391,6 +3922,9 @@ void validate_pauliStrSumTargets(PauliStrSum sum, Qureg qureg, const char* calle
 
 void validate_controlsAndPauliStrSumTargets(Qureg qureg, int* ctrls, int numCtrls, PauliStrSum sum, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // validate targets and controls in isolation
     validate_pauliStrSumTargets(sum, qureg, caller);
     validate_controls(qureg, ctrls, numCtrls, caller);
@@ -3402,11 +3936,17 @@ void validate_controlsAndPauliStrSumTargets(Qureg qureg, int* ctrls, int numCtrl
 
 void validate_controlAndPauliStrSumTargets(Qureg qureg, int ctrl, PauliStrSum sum, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     validate_controlsAndPauliStrSumTargets(qureg, &ctrl, 1, sum, caller);
 }
 
 void validate_pauliStrSumCanInitMatrix(FullStateDiagMatr matr, PauliStrSum sum, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(!paulis_containsXOrY(sum), report::PAULI_STR_SUM_NOT_ALL_I_Z, caller);
 
     int maxInd = paulis_getIndOfLefmostNonIdentityPauli(sum);
@@ -3428,6 +3968,9 @@ void validate_pauliStrSumCanInitMatrix(FullStateDiagMatr matr, PauliStrSum sum,
 
 void validate_basisStateIndex(Qureg qureg, qindex ind, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     qindex maxIndExcl = powerOf2(qureg.numQubits);
 
     tokenSubs vars = {
@@ -3440,6 +3983,9 @@ void validate_basisStateIndex(Qureg qureg, qindex ind, const char* caller) {
 
 void validate_basisStateRowCol(Qureg qureg, qindex row, qindex col, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     qindex maxIndExcl = powerOf2(qureg.numQubits);
 
     tokenSubs vars = {
@@ -3454,6 +4000,9 @@ void validate_basisStateRowCol(Qureg qureg, qindex row, qindex col, const char*
 
 void validate_basisStateIndices(Qureg qureg, qindex startInd, qindex numInds, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(
         startInd >= 0 && startInd < qureg.numAmps, 
         report::INVALID_STARTING_BASIS_STATE_INDEX, 
@@ -3482,6 +4031,9 @@ void validate_basisStateIndices(Qureg qureg, qindex startInd, qindex numInds, co
 
 void validate_basisStateRowCols(Qureg qureg, qindex startRow, qindex startCol, qindex numRows, qindex numCols, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     qindex maxRowOrColExcl = powerOf2(qureg.numQubits);
 
     assertThat(
@@ -3519,6 +4071,9 @@ void validate_basisStateRowCols(Qureg qureg, qindex startRow, qindex startCol, q
 
 void validate_localAmpIndices(Qureg qureg, qindex localStartInd, qindex numInds, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // note that localStartInd and numInds can validly DIFFER between nodes,
     // so we use assertAllNodesAgreeThat() in lieu of assertThat()
 
@@ -3616,11 +4171,17 @@ void assertValidQubits(
 
 void validate_target(Qureg qureg, int target, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertValidQubit(qureg, target, report::INVALID_TARGET_QUBIT, caller);
 }
 
 void validate_targets(Qureg qureg, int* targets, int numTargets, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // must always have at least 1 target
     bool numCanBeZero = false;
 
@@ -3631,12 +4192,18 @@ void validate_targets(Qureg qureg, int* targets, int numTargets, const char* cal
 }
 void validate_twoTargets(Qureg qureg, int target1, int target2, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     int targs[] = {target1, target2};
     validate_targets(qureg, targs, 2, caller);
 }
 
 void validate_controls(Qureg qureg, int* ctrls, int numCtrls, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // it is fine to have zero controls
     bool numCanBeZero = true;
 
@@ -3648,6 +4215,9 @@ void validate_controls(Qureg qureg, int* ctrls, int numCtrls, const char* caller
 
 void validate_controlsAndTargets(Qureg qureg, int* ctrls, int numCtrls, int* targs, int numTargs, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // validate controls and targets in isolation
     validate_targets(qureg, targs, numTargs, caller);
     validate_controls(qureg, ctrls, numCtrls, caller);
@@ -3657,29 +4227,47 @@ void validate_controlsAndTargets(Qureg qureg, int* ctrls, int numCtrls, int* tar
 }
 void validate_controlAndTarget(Qureg qureg, int ctrl, int targ, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     validate_controlsAndTargets(qureg, &ctrl, 1, &targ, 1, caller);
 }
 void validate_controlAndTargets(Qureg qureg, int ctrl, int* targs, int numTargs, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     validate_controlsAndTargets(qureg, &ctrl, 1, targs, numTargs, caller);
 }
 void validate_controlsAndTarget(Qureg qureg, int* ctrls, int numCtrls, int targ, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     validate_controlsAndTargets(qureg, ctrls, numCtrls, &targ, 1, caller);
 }
 void validate_controlAndTwoTargets(Qureg qureg, int ctrl, int targ1, int targ2, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     int targs[] = {targ1, targ2};
     validate_controlsAndTargets(qureg, &ctrl, 1, targs, 2, caller);
 }
 void validate_controlsAndTwoTargets(Qureg qureg, int* ctrls, int numCtrls, int targ1, int targ2, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     int targs[] = {targ1, targ2};
     validate_controlsAndTargets(qureg, ctrls, numCtrls, targs, 2, caller);
 }
 
 void validate_controlStates(int* states, int numCtrls, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // states is permittedly unallocated (nullptr) even when numCtrls != 0
     if (!mem_isAllocated(states))
         return;
@@ -3690,6 +4278,9 @@ void validate_controlStates(int* states, int numCtrls, const char* caller) {
 
 void validate_controlsMatchStates(int numCtrls, int numStates, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // only invocable by the C++ interface
     tokenSubs vars = {
         {"${NUM_CTRLS}",  numCtrls},
@@ -3706,11 +4297,17 @@ void validate_controlsMatchStates(int numCtrls, int numStates, const char* calle
 
 void validate_measurementOutcomeIsValid(int outcome, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(outcome == 0 || outcome == 1, report::ONE_QUBIT_MEASUREMENT_OUTCOME_INVALID, {{"${OUTCOME}", outcome}}, caller);
 }
 
 void validate_measurementOutcomesAreValid(int* outcomes, int numOutcomes, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // no need to validate numOutcomes; it is already validated by caller (e.g. through numTargets)
 
     for (int i=0; i<numOutcomes; i++)
@@ -3742,6 +4339,9 @@ void validate_measurementOutcomesProbNotZero(int* outcomes, int numQubits, qreal
 
 void validate_measurementOutcomesFitInGpuMem(Qureg qureg, int numQubits, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // only GPU backend needs temp memory
     if (!qureg.isGpuAccelerated)
         return;
@@ -3774,6 +4374,9 @@ void validate_measurementProbsAreNormalised(vector<qreal> probs, const char* cal
 
 void validate_measurementOutcomesMatchTargets(int numQubits, int numOutcomes, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // invoked only by the C++ user interface
     tokenSubs vars = {
         {"${NUM_QUBITS}",    numQubits},
@@ -3800,6 +4403,9 @@ void validate_rotationAxisNotZeroVector(qreal x, qreal y, qreal z, const char* c
 
 void validate_mixedAmpsFitInNode(Qureg qureg, int numTargets, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // only relevant to distributed quregs
     if (!qureg.isDistributed)
         return;
@@ -3829,7 +4435,10 @@ void validate_mixedAmpsFitInNode(Qureg qureg, int numTargets, const char* caller
  * TROTTERISATION PARAMETERS
  */
 
-void validate_trotterParams(Qureg qureg, int order, int reps, const char* caller) {
+void validate_trotterParams(int order, int reps, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
 
     bool isEven = (order % 2) == 0;
     assertThat(order > 0 && (isEven || order==1), report::INVALID_TROTTER_ORDER, {{"${ORDER}", order}}, caller);
@@ -3844,6 +4453,9 @@ void validate_trotterParams(Qureg qureg, int order, int reps, const char* caller
 
 void validate_lindbladJumpOps(PauliStrSum* jumps, int numJumps, Qureg qureg, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(numJumps >= 0, report::NEGATIVE_NUM_LINDBLAD_JUMP_OPS, caller);
 
     // @todo
@@ -3860,6 +4472,9 @@ void validate_lindbladJumpOps(PauliStrSum* jumps, int numJumps, Qureg qureg, con
 
 void validate_lindbladDampingRates(qreal* damps, int numJumps, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // possibly repeated from jump op validation, for safety
     assertThat(numJumps >= 0, report::NEGATIVE_NUM_LINDBLAD_JUMP_OPS, caller);
 
@@ -3874,6 +4489,9 @@ void validate_lindbladDampingRates(qreal* damps, int numJumps, const char* calle
 
 void validate_numLindbladSuperPropagatorTerms(qindex numSuperTerms, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(numSuperTerms != 0, report::NUM_LINDBLAD_SUPER_PROPAGATOR_TERMS_OVERFLOWED, caller);
 
     // attempt to fetch RAM, and simply return if we fail; if we unknowingly
@@ -3888,7 +4506,6 @@ void validate_numLindbladSuperPropagatorTerms(qindex numSuperTerms, const char*
     // check whether the superpropagator fits in memory
     bool fits = mem_canPauliStrSumFitInMemory(numSuperTerms, memPerNode);
     assertThat(fits, report::NEW_LINDBLAD_SUPER_PROPAGATOR_CANNOT_FIT_INTO_CPU_MEM, {{"${NUM_TERMS}", numSuperTerms}, {"${NUM_BYTES}", memPerNode}}, caller);
-
 }
 
 
@@ -3899,6 +4516,9 @@ void validate_numLindbladSuperPropagatorTerms(qindex numSuperTerms, const char*
 
 void validate_probability(qreal prob, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     /// @todo report 'prob' once validation reporting can handle floats
 
     /// @todo 
@@ -3910,6 +4530,9 @@ void validate_probability(qreal prob, const char* caller) {
 
 void validate_probabilities(qreal* probs, int numProbs, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // we assume that numProbs>0 was prior validated
 
     /// @todo like above, should we permit -eps <= prob <= 1+eps?
@@ -3931,6 +4554,9 @@ void validate_probabilities(qreal* probs, int numProbs, const char* caller) {
 
 void validate_oneQubitDepashingProb(qreal prob, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     /// @todo report 'prob' once validation reporting can handle floats
 
     validate_probability(prob, caller);
@@ -3941,6 +4567,9 @@ void validate_oneQubitDepashingProb(qreal prob, const char* caller) {
 
 void validate_twoQubitDepashingProb(qreal prob, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     /// @todo report 'prob' once validation reporting can handle floats
 
     validate_probability(prob, caller);
@@ -3951,6 +4580,9 @@ void validate_twoQubitDepashingProb(qreal prob, const char* caller) {
 
 void validate_oneQubitDepolarisingProb(qreal prob, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     /// @todo report 'prob' once validation reporting can handle floats
 
     validate_probability(prob, caller);
@@ -3961,6 +4593,9 @@ void validate_oneQubitDepolarisingProb(qreal prob, const char* caller) {
 
 void validate_twoQubitDepolarisingProb(qreal prob, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     /// @todo report 'prob' once validation reporting can handle floats
 
     validate_probability(prob, caller);
@@ -3971,6 +4606,9 @@ void validate_twoQubitDepolarisingProb(qreal prob, const char* caller) {
 
 void validate_oneQubitDampingProb(qreal prob, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     /// @todo report 'prob' once validation reporting can handle floats
 
     // permit one-qubit amplitude damping of any valid probability, 
@@ -3980,6 +4618,9 @@ void validate_oneQubitDampingProb(qreal prob, const char* caller) {
 
 void validate_oneQubitPauliChannelProbs(qreal pX, qreal pY, qreal pZ, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     validate_probability(pX, caller);
     validate_probability(pY, caller);
     validate_probability(pZ, caller);
@@ -4005,6 +4646,9 @@ void validate_oneQubitPauliChannelProbs(qreal pX, qreal pY, qreal pZ, const char
 
 void validate_quregCanBeWorkspace(Qureg qureg, Qureg workspace, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(
         doQuregsHaveIdenticalMemoryLayouts(qureg, workspace),
         report::QUREG_IS_INCOMPATIBLE_WITH_WORKSPACE, caller);
@@ -4015,11 +4659,17 @@ void validate_quregCanBeWorkspace(Qureg qureg, Qureg workspace, const char* call
 
 void validate_numQuregsInSum(int numQuregs, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(numQuregs > 0, report::NON_POSITIVE_NUM_QUREGS_IN_SUM, {{"${NUM_QUREGS}", numQuregs}}, caller);
 }
 
 void validate_quregsCanBeSummed(Qureg out, Qureg* in, int numIn, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     for (int i=0; i<numIn; i++)
         validate_quregFields(in[i], caller);
 
@@ -4032,6 +4682,9 @@ void validate_quregsCanBeSummed(Qureg out, Qureg* in, int numIn, const char* cal
 
 void validate_quregsCanBeMixed(Qureg out, Qureg* in, int numIn, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // mixing in multiple quregs (done here) is much stricter than when 
     // only one pair is being mixed in, which is handled below
 
@@ -4050,6 +4703,9 @@ void validate_quregsCanBeMixed(Qureg out, Qureg* in, int numIn, const char* call
 
 void validate_quregPairCanBeMixed(Qureg quregOut, Qureg quregIn, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // mixing must be mathematically possible; dims are compatible, but quregIn can be a statevector
     assertThat(quregOut.isDensityMatrix, report::MIXED_QUREG_NOT_DENSITY_MATRIX, caller);
     assertThat(
@@ -4068,6 +4724,9 @@ void validate_quregPairCanBeMixed(Qureg quregOut, Qureg quregIn, const char* cal
 
 void validate_numQuregsMatchesCoeffs(size_t numQuregs, size_t numCoeffs, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     tokenSubs vars = {
         {"${NUM_QUREGS}", numQuregs},
         {"${NUM_COEFFS}", numCoeffs}
@@ -4077,6 +4736,9 @@ void validate_numQuregsMatchesCoeffs(size_t numQuregs, size_t numCoeffs, const c
 
 void validate_numQuregsMatchesProbs(size_t numQuregs, size_t numProbs, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     tokenSubs vars = {
         {"${NUM_QUREGS}", numQuregs},
         {"${NUM_PROBS}",  numProbs}
@@ -4108,6 +4770,9 @@ void validateStateVecCanBeInitialisedToPureState(Qureg qureg, Qureg pure, const
 
 void validate_quregCanBeInitialisedToPureState(Qureg qureg, Qureg pure, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(!pure.isDensityMatrix, report::INIT_PURE_STATE_IS_DENSMATR, caller);
 
     // quregs must have the same number of qubits, regardless of dimension
@@ -4124,6 +4789,9 @@ void validate_quregCanBeInitialisedToPureState(Qureg qureg, Qureg pure, const ch
 
 void validate_quregsCanBeCloned(Qureg quregA, Qureg quregB, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // quregs must have identical sizes... 
     assertThat(
         quregA.numQubits == quregB.numQubits, report::CLONED_QUREGS_DIFFER_IN_NUM_QUBITS, 
@@ -4146,7 +4814,10 @@ void validate_quregsCanBeCloned(Qureg quregA, Qureg quregB, const char* caller)
 
 void validate_quregsCanBeProducted(Qureg quregA, Qureg quregB, const char* caller) {
 
-   // number of qubits must always match
+    if (!global_isValidationEnabled)
+        return;
+
+    // number of qubits must always match
     assertThat(
         quregA.numQubits == quregB.numQubits, 
         report::PRODUCTED_QUREGS_HAVE_DIFFERENT_NUM_QUBITS,
@@ -4177,6 +4848,9 @@ void validate_quregsCanBeProducted(Qureg quregA, Qureg quregB, const char* calle
 
 void validate_throwErrorBecauseCalcFidOfDensMatrNotYetImplemented(const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(false, report::CALC_FIDELITY_OF_DENSITY_MATRICES_NOT_YET_SUPPORTED, caller);
 }
 
@@ -4236,6 +4910,9 @@ void validate_quregRenormProbIsNotZero(qreal prob, const char* caller) {
 
 void validate_numInitRandomPureStates(qindex numPureStates,  const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(numPureStates >= 1, report::INVALID_NUM_INIT_PURE_STATES, {{"${NUM_STATES}", numPureStates}}, caller);
 }
 
@@ -4302,6 +4979,9 @@ void validate_densMatrExpecDiagMatrValueIsReal(qcomp value, qcomp exponent, cons
 
 void validate_quregCanBeReduced(Qureg qureg, int numTraceQubits, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // 0 < numTraceQubits <= numQubits is assured by validate_targets(), but
     // numTraceQubits == numQubtis is permitted there though forbidden here
     assertThat(numTraceQubits < qureg.numQubits, report::NUM_TRACE_QUBITS_EQUALS_QUREG_SIZE, caller);
@@ -4328,6 +5008,9 @@ void validate_quregCanBeReduced(Qureg qureg, int numTraceQubits, const char* cal
 
 void validate_quregCanBeSetToReducedDensMatr(Qureg out, Qureg in, int numTraceQubits, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     int numRemainingQubits = in.numQubits - numTraceQubits;
 
     tokenSubs vars = {
@@ -4350,6 +5033,9 @@ void validate_quregCanBeSetToReducedDensMatr(Qureg out, Qureg in, int numTraceQu
 
 void validate_canReadFile(string fn, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     /// @todo embed filename into error message when tokenSubs is updated to permit strings
     assertThat(parser_canReadFile(fn), report::CANNOT_READ_FILE, caller);
 }
@@ -4362,6 +5048,9 @@ void validate_canReadFile(string fn, const char* caller) {
 
 void validate_tempListAllocSucceeded(bool succeeded, qindex numElems, qindex numBytesPerElem, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // avoid showing total bytes in case it overflows
     tokenSubs vars = {
         {"${NUM_ELEMS}", numElems},
@@ -4372,6 +5061,9 @@ void validate_tempListAllocSucceeded(bool succeeded, qindex numElems, qindex num
 
 void validate_tempAllocSucceeded(bool succeeded, size_t numBytes, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(succeeded, report::TEMP_ALLOC_FAILED, {{"${NUM_BYTES}", numBytes}}, caller);
 }
 
@@ -4383,17 +5075,43 @@ void validate_tempAllocSucceeded(bool succeeded, size_t numBytes, const char* ca
 
 void validate_envVarPermitNodesToShareGpu(string varValue, const char* caller) {
 
+    // this guard is presently never triggered; environment variables are
+    // loaded during QuESTEnv initialisation, before which there is no way
+    // to disable validation... but we keep it for clarity/consistency!
+    if (!global_isValidationEnabled)
+        return;
+
     // though caller should gaurantee varValue contains at least one character, 
     // we'll still check to avoid a segfault if this gaurantee is broken
     bool isValid = (varValue.size() == 1) && (varValue[0] == '0' || varValue[0] == '1');
-    assertThat(isValid, report::INVALID_PERMIT_NODES_TO_SHARE_GPU_ENV_VAR, caller);
+    assertThat(isValid, report::INVALID_QUEST_PERMIT_NODES_TO_SHARE_GPU_ENV_VAR, caller);
 }
 
 void validate_envVarDefaultValidationEpsilon(string varValue, const char* caller) {
 
+    // this guard is presently never triggered; environment variables are
+    // loaded during QuESTEnv initialisation, before which there is no way
+    // to disable validation... but we keep it for clarity/consistency!
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(parser_isAnySizedReal(varValue), report::DEFAULT_EPSILON_ENV_VAR_NOT_A_REAL, caller);
     assertThat(parser_isValidReal(varValue), report::DEFAULT_EPSILON_ENV_VAR_EXCEEDS_QREAL_RANGE, caller);
 
     qreal eps = parser_parseReal(varValue);
     assertThat(eps >= 0, report::DEFAULT_EPSILON_ENV_VAR_IS_NEGATIVE, caller);
 }
+
+void validate_envVarDefaultNumGpuThreadsPerBlockIsAnInt(string varValue, const char* caller) {
+
+    // this guard is presently never triggered; environment variables are
+    // loaded during QuESTEnv initialisation, before which there is no way
+    // to disable validation... but we keep it for clarity/consistency!
+    if (!global_isValidationEnabled)
+        return;
+
+    // here we only validate that the value is an integer which fits in int;
+    // its GPU-compatibility is separately validated by another function
+    assertThat(parser_isAnySizedInteger(varValue), report::DEFAULT_NUM_GPU_THREADS_PER_BLOCK_ENV_VAR_NOT_AN_INT, caller);
+    assertThat(parser_isValidInteger(varValue), report::DEFAULT_NUM_GPU_THREADS_PER_BLOCK_ENV_VAR_EXCEEDS_INT_RANGE, caller);
+}
diff --git a/quest/src/core/validation.hpp b/quest/src/core/validation.hpp
index 66fb8f546..87f81a0d6 100644
--- a/quest/src/core/validation.hpp
+++ b/quest/src/core/validation.hpp
@@ -77,6 +77,12 @@ void validate_newEnvNodesEachHaveUniqueGpu(const char* caller);
 
 void validate_gpuIsCuQuantumCompatible(const char* caller);
 
+void validate_mpiInitStatus(bool useDistrib, bool userOwnsMpi, const char* caller);
+
+void validate_mpiSubCommIsNonNull(bool isNonNull, const char* caller);
+
+void validate_mpiSubCommSetSucceeded(bool success, const char* caller);
+
 
 
 /*
@@ -107,6 +113,8 @@ void validate_numPauliChars(const char* paulis, const char* caller);
 
 void validate_reportedPauliStrStyleFlag(int flag, const char* caller);
 
+void validate_numGpuThreadsPerBlock(int numTBP, bool isGpuActive, const char* caller);
+
 
 
 /*
@@ -420,7 +428,7 @@ void validate_mixedAmpsFitInNode(Qureg qureg, int numTargets, const char* caller
  * TROTTERISATION PARAMETERS
  */
 
-void validate_trotterParams(Qureg qureg, int order, int reps, const char* caller);
+void validate_trotterParams(int order, int reps, const char* caller);
 
 
 
@@ -548,6 +556,8 @@ void validate_envVarPermitNodesToShareGpu(string varValue, const char* caller);
 
 void validate_envVarDefaultValidationEpsilon(string varValue, const char* caller);
 
+void validate_envVarDefaultNumGpuThreadsPerBlockIsAnInt(string varValue, const char* caller);
+
 
 
 #endif // VALIDATION_HPP
\ No newline at end of file
diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp
index c11ec224d..bd51236bb 100644
--- a/quest/src/cpu/cpu_config.cpp
+++ b/quest/src/cpu/cpu_config.cpp
@@ -22,14 +22,14 @@
 using std::vector;
 
 
-// when COMPILE_OPENMP=1, the compiler expects arguments like -fopenmp
+// when QUEST_COMPILE_OMP=1, the compiler expects arguments like -fopenmp
 // which cause _OPENMP to be defined, which we check to ensure that
-// COMPILE_OPENMP has been set correctly. Note that HIP compilers do
+// QUEST_COMPILE_OMP has been set correctly. Note that HIP compilers do
 // not define _OPENMP even when parsing OpenMP, and it's possible that
 // the user is compiling all the source code (including this file) with
 // HIP; we tolerate _OPENMP being undefined in that instance
 
-#if COMPILE_OPENMP && !defined(_OPENMP) && !defined(__HIP__)
+#if QUEST_COMPILE_OMP && !defined(_OPENMP) && !defined(__HIP__)
     #error "Attempted to compile in multithreaded mode without enabling OpenMP in the compiler flags."
 #endif
 
@@ -40,16 +40,16 @@ using std::vector;
 /// Windows? This validation protects against enabling NUMA awareness
 /// on Windows but silently recieving no benefit due to no NUMA API calls
 
-#if NUMA_AWARE && defined(_WIN32)
+#if QUEST_ENABLE_NUMA && defined(_WIN32)
     #error "NUMA awareness is not currently supported on non-POSIX systems like Windows."
 #endif
 
 
-#if COMPILE_OPENMP
+#if QUEST_COMPILE_OMP
     #include <omp.h>
 #endif
 
-#if NUMA_AWARE && ! defined(_WIN32)
+#if QUEST_ENABLE_NUMA && ! defined(_WIN32)
     #include <sys/mman.h>
     #include <numaif.h>
     #include <numa.h>
@@ -71,17 +71,15 @@ using std::vector;
 
 
 bool cpu_isOpenmpCompiled() {
-    return (bool) COMPILE_OPENMP;
+    return (bool) QUEST_COMPILE_OMP;
 }
 
 
 int cpu_getAvailableNumThreads() {
-#if COMPILE_OPENMP
+#if QUEST_COMPILE_OMP
     int n = -1;
 
-    #pragma omp parallel shared(n)
-    #pragma omp single
-    n = omp_get_num_threads();
+    n = omp_get_max_threads();
 
     return n;
 #else
@@ -92,7 +90,7 @@ int cpu_getAvailableNumThreads() {
 
 
 int cpu_getNumOpenmpProcessors() {
-#if COMPILE_OPENMP
+#if QUEST_COMPILE_OMP
     return omp_get_num_procs();
 #else
     error_cpuThreadsQueriedButEnvNotMultithreaded();
@@ -112,7 +110,7 @@ int cpu_getNumOpenmpProcessors() {
 
 
 int cpu_getOpenmpThreadInd() {
-#if COMPILE_OPENMP
+#if QUEST_COMPILE_OMP
     return omp_get_thread_num();
 #else
     return 0;
@@ -121,7 +119,7 @@ int cpu_getOpenmpThreadInd() {
 
 
 int cpu_getCurrentNumThreads() {
-#if COMPILE_OPENMP
+#if QUEST_COMPILE_OMP
     return omp_get_num_threads();
 #else
     return 1;
@@ -182,7 +180,7 @@ qcomp* cpu_allocArray(qindex length) {
 
 
 qcomp* cpu_allocNumaArray(qindex length) {
-#if ! NUMA_AWARE
+#if ! QUEST_ENABLE_NUMA
     return cpu_allocArray(length);
 
 #elif defined(_WIN32)
@@ -267,7 +265,7 @@ void cpu_deallocNumaArray(qcomp* arr, qindex length) {
     if (arr == nullptr)
         return;
 
-#if ! NUMA_AWARE
+#if ! QUEST_ENABLE_NUMA
     cpu_deallocArray(arr);
 
 #elif defined(_WIN32)
diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp
index 5a7baade5..59df946e9 100644
--- a/quest/src/cpu/cpu_subroutines.cpp
+++ b/quest/src/cpu/cpu_subroutines.cpp
@@ -216,7 +216,7 @@ void cpu_fullstatediagmatr_setElemsFromMultiVarFunc(FullStateDiagMatr out, qcomp
 
 
 template <int NumQubits>
-qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, vector<int> qubitInds, vector<int> qubitStates) {
+qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates) {
 
     assert_numQubitsMatchesQubitStatesAndTemplateParam(qubitInds.size(), qubitStates.size(), NumQubits);
 
@@ -281,7 +281,7 @@ qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu
 }
 
 
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qindex, cpu_statevec_packAmpsIntoBuffer, (Qureg, vector<int>, vector<int>) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qindex, cpu_statevec_packAmpsIntoBuffer, (Qureg, ConstList64, ConstList64) )
 
 
 
@@ -291,7 +291,7 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qindex, cpu_statevec_packAmpsIntoBuffe
 
 
 template <int NumCtrls>
-void cpu_statevec_anyCtrlSwap_subA(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2) {
+void cpu_statevec_anyCtrlSwap_subA(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2) {
 
     assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
 
@@ -321,7 +321,7 @@ void cpu_statevec_anyCtrlSwap_subA(Qureg qureg, vector<int> ctrls, vector<int> c
 
 
 template <int NumCtrls>
-void cpu_statevec_anyCtrlSwap_subB(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates) {
+void cpu_statevec_anyCtrlSwap_subB(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates) {
 
     assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
 
@@ -357,7 +357,7 @@ void cpu_statevec_anyCtrlSwap_subB(Qureg qureg, vector<int> ctrls, vector<int> c
 
 
 template <int NumCtrls>
-void cpu_statevec_anyCtrlSwap_subC(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, int targState) {
+void cpu_statevec_anyCtrlSwap_subC(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ, int targState) {
 
     assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
 
@@ -393,9 +393,9 @@ void cpu_statevec_anyCtrlSwap_subC(Qureg qureg, vector<int> ctrls, vector<int> c
 }
 
 
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, cpu_statevec_anyCtrlSwap_subA, (Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2) )
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, cpu_statevec_anyCtrlSwap_subB, (Qureg qureg, vector<int> ctrls, vector<int> ctrlStates) )
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, cpu_statevec_anyCtrlSwap_subC, (Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, int targState) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, cpu_statevec_anyCtrlSwap_subA, (Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, cpu_statevec_anyCtrlSwap_subB, (Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, cpu_statevec_anyCtrlSwap_subC, (Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ, int targState) )
 
 
 
@@ -405,7 +405,7 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, cpu_statevec_anyCtrlSwap_subC, (
 
 
 template <int NumCtrls>
-void cpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, CompMatr1 matr) {
+void cpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ, CompMatr1 matr) {
 
     assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
 
@@ -441,7 +441,7 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector<int> ctrls, v
 
 
 template <int NumCtrls>
-void cpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, qcomp fac0, qcomp fac1) {
+void cpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, qcomp fac0, qcomp fac1) {
 
     assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
 
@@ -477,8 +477,8 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, vector<int> ctrls, v
 }
 
 
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, cpu_statevec_anyCtrlOneTargDenseMatr_subA, (Qureg, vector<int>, vector<int>, int, CompMatr1) )
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, cpu_statevec_anyCtrlOneTargDenseMatr_subB, (Qureg, vector<int>, vector<int>, qcomp, qcomp) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, cpu_statevec_anyCtrlOneTargDenseMatr_subA, (Qureg, ConstList64, ConstList64, int, CompMatr1) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, cpu_statevec_anyCtrlOneTargDenseMatr_subB, (Qureg, ConstList64, ConstList64, qcomp, qcomp) )
 
 
 
@@ -488,7 +488,7 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, cpu_statevec_anyCtrlOneTargDense
 
 
 template <int NumCtrls> 
-void cpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2, CompMatr2 matr) {
+void cpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2, CompMatr2 matr) {
 
     assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
 
@@ -530,7 +530,7 @@ void cpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
 }
 
 
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, cpu_statevec_anyCtrlTwoTargDenseMatr_sub, (Qureg, vector<int>, vector<int>, int, int, CompMatr2) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, cpu_statevec_anyCtrlTwoTargDenseMatr_sub, (Qureg, ConstList64, ConstList64, int, int, CompMatr2) )
 
 
 
@@ -540,7 +540,7 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, cpu_statevec_anyCtrlTwoTargDense
 
 
 template <int NumCtrls, int NumTargs, bool ApplyConj, bool ApplyTransp>
-void cpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr matr) {
+void cpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, CompMatr matr) {
     
     assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
     assert_numTargsMatchesTemplateParam(targs.size(), NumTargs);
@@ -572,7 +572,7 @@ void cpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
 
     // prepare a mask which yields ctrls in specified state, and targs in all-zero
     auto sortedQubits   = util_getSorted(ctrls, targs);
-    auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, targs, vector<int>(targs.size(),0));
+    auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, targs, util_getConstantList(0,targs.size()));
 
     // attempt to use compile-time variables to automatically optimise/unroll dependent loops
     SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.size());
@@ -642,7 +642,7 @@ void cpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
 }
 
 
-INSTANTIATE_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( void, cpu_statevec_anyCtrlAnyTargDenseMatr_sub, (Qureg, vector<int>, vector<int>, vector<int>, CompMatr) )
+INSTANTIATE_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( void, cpu_statevec_anyCtrlAnyTargDenseMatr_sub, (Qureg, ConstList64, ConstList64, ConstList64, CompMatr) )
 
 
 
@@ -652,7 +652,7 @@ INSTANTIATE_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( void, cpu_statevec_
 
 
 template <int NumCtrls>
-void cpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, DiagMatr1 matr) {
+void cpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ, DiagMatr1 matr) {
 
     assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
 
@@ -684,7 +684,7 @@ void cpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vec
 }
 
 
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, cpu_statevec_anyCtrlOneTargDiagMatr_sub, (Qureg, vector<int>, vector<int>, int, DiagMatr1) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, cpu_statevec_anyCtrlOneTargDiagMatr_sub, (Qureg, ConstList64, ConstList64, int, DiagMatr1) )
 
 
 
@@ -694,7 +694,7 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, cpu_statevec_anyCtrlOneTargDiagM
 
 
 template <int NumCtrls>
-void cpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2, DiagMatr2 matr) {
+void cpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2, DiagMatr2 matr) {
 
     assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
 
@@ -726,7 +726,7 @@ void cpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vec
 }
 
 
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, cpu_statevec_anyCtrlTwoTargDiagMatr_sub, (Qureg, vector<int>, vector<int>, int, int, DiagMatr2) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, cpu_statevec_anyCtrlTwoTargDiagMatr_sub, (Qureg, ConstList64, ConstList64, int, int, DiagMatr2) )
 
 
 
@@ -736,7 +736,7 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, cpu_statevec_anyCtrlTwoTargDiagM
 
 
 template <int NumCtrls, int NumTargs, bool ApplyConj, bool HasPower>
-void cpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, DiagMatr matr, qcomp exponent) {
+void cpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, DiagMatr matr, qcomp exponent) {
     
     assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
     assert_numTargsMatchesTemplateParam(targs.size(), NumTargs);
@@ -788,7 +788,7 @@ void cpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vec
 }
 
 
-INSTANTIATE_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( void, cpu_statevec_anyCtrlAnyTargDiagMatr_sub, (Qureg, vector<int>, vector<int>, vector<int>, DiagMatr, qcomp) )
+INSTANTIATE_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( void, cpu_statevec_anyCtrlAnyTargDiagMatr_sub, (Qureg, ConstList64, ConstList64, ConstList64, DiagMatr, qcomp) )
 
 
 /// @todo
@@ -961,8 +961,8 @@ INLINE void applyPauliUponAmpPair(
 
 template <int NumCtrls, int NumTargs>
 void cpu_statevector_anyCtrlPauliTensorOrGadget_subA(
-    Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, 
-    vector<int> x, vector<int> y, vector<int> z, qcomp ampFac, qcomp pairAmpFac
+    Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, 
+    ConstList64 x, ConstList64 y, ConstList64 z, qcomp ampFac, qcomp pairAmpFac
 ) {
     assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
     assert_numTargsMatchesTemplateParam(x.size() + y.size(), NumTargs);
@@ -976,11 +976,11 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subA(
     cpu_qcomp f1 = getCpuQcomp(pairAmpFac);
     
     // only X and Y count as targets
-    vector<int> sortedTargsXY = util_getSorted(util_getConcatenated(x, y));
+    auto sortedTargsXY = util_getSorted(util_getConcatenated(x, y));
 
     // prepare a mask which yields ctrls in specified state, and X-Y targs in all-zero
     auto sortedQubits   = util_getSorted(ctrls, sortedTargsXY);
-    auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, sortedTargsXY, vector<int>(sortedTargsXY.size(),0));
+    auto qubitStateMask = util_getBitMask(ctrls, ctrlStates, sortedTargsXY, util_getConstantList(0, sortedTargsXY.size()));
 
     // prepare masks for extracting Pauli parities
     auto maskXY = util_getBitMask(sortedTargsXY);
@@ -1044,8 +1044,8 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subA(
 
 template <int NumCtrls>
 void cpu_statevector_anyCtrlPauliTensorOrGadget_subB(
-    Qureg qureg, vector<int> ctrls, vector<int> ctrlStates,
-    vector<int> x, vector<int> y, vector<int> z, qcomp ampFac, qcomp pairAmpFac, qindex bufferMaskXY
+    Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates,
+    ConstList64 x, ConstList64 y, ConstList64 z, qcomp ampFac, qcomp pairAmpFac, qindex bufferMaskXY
 ) {
     assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
 
@@ -1091,8 +1091,8 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subB(
 }
 
 
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( void, cpu_statevector_anyCtrlPauliTensorOrGadget_subA, (Qureg, vector<int>, vector<int>, vector<int>, vector<int>, vector<int>, qcomp, qcomp) )
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, cpu_statevector_anyCtrlPauliTensorOrGadget_subB, (Qureg, vector<int>, vector<int>, vector<int>, vector<int>, vector<int>, qcomp, qcomp, qindex) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( void, cpu_statevector_anyCtrlPauliTensorOrGadget_subA, (Qureg, ConstList64, ConstList64, ConstList64, ConstList64, ConstList64, qcomp, qcomp) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, cpu_statevector_anyCtrlPauliTensorOrGadget_subB, (Qureg, ConstList64, ConstList64, ConstList64, ConstList64, ConstList64, qcomp, qcomp, qindex) )
 
 
 
@@ -1103,7 +1103,7 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, cpu_statevector_anyCtrlPauliTens
 
 template <int NumCtrls>
 void cpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(
-    Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, 
+    Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, 
     qcomp fac0, qcomp fac1
 ) {
     assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
@@ -1135,7 +1135,7 @@ void cpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(
 }
 
 
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, cpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub, (Qureg, vector<int>, vector<int>, vector<int>, qcomp, qcomp) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, cpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub, (Qureg, ConstList64, ConstList64, ConstList64, qcomp, qcomp) )
 
 
 
@@ -1854,7 +1854,7 @@ void cpu_densmatr_oneQubitDamping_subD(Qureg qureg, int qubit, qreal prob) {
 
 
 template <int NumTargs>
-void cpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, vector<int> targs, vector<int> pairTargs) {
+void cpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, ConstList64 targs, ConstList64 pairTargs) {
 
     assert_numTargsMatchesTemplateParam(targs.size(), NumTargs);
 
@@ -1908,7 +1908,7 @@ void cpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, vector<int> ta
 }
 
 
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_densmatr_partialTrace_sub, (Qureg, Qureg, vector<int>, vector<int>) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_densmatr_partialTrace_sub, (Qureg, Qureg, ConstList64, ConstList64) )
 
 
 
@@ -1990,7 +1990,7 @@ qreal cpu_densmatr_calcTotalProb_sub(Qureg qureg) {
 
 
 template <int NumQubits>
-qreal cpu_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes) {
+qreal cpu_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes) {
 
     assert_numTargsMatchesTemplateParam(qubits.size(), NumQubits);
 
@@ -2023,7 +2023,7 @@ qreal cpu_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> qubi
 
 
 template <int NumQubits>
-qreal cpu_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes) {
+qreal cpu_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes) {
 
     assert_numTargsMatchesTemplateParam(qubits.size(), NumQubits);
 
@@ -2063,7 +2063,7 @@ qreal cpu_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> qubi
 
 
 template <int NumQubits>
-void cpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, vector<int> qubits) {
+void cpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, ConstList64 qubits) {
 
     assert_numTargsMatchesTemplateParam(qubits.size(), NumQubits);
 
@@ -2104,7 +2104,7 @@ void cpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu
 
 
 template <int NumQubits>
-void cpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, vector<int> qubits) {
+void cpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, ConstList64 qubits) {
 
     assert_numTargsMatchesTemplateParam(qubits.size(), NumQubits);
 
@@ -2148,10 +2148,10 @@ void cpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu
 }
 
 
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qreal, cpu_statevec_calcProbOfMultiQubitOutcome_sub, (Qureg, vector<int>, vector<int>) )
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qreal, cpu_densmatr_calcProbOfMultiQubitOutcome_sub, (Qureg, vector<int>, vector<int>) )
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub, (qreal* outProbs, Qureg, vector<int>) )
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub, (qreal* outProbs, Qureg, vector<int>) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qreal, cpu_statevec_calcProbOfMultiQubitOutcome_sub, (Qureg, ConstList64, ConstList64) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qreal, cpu_densmatr_calcProbOfMultiQubitOutcome_sub, (Qureg, ConstList64, ConstList64) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub, (qreal* outProbs, Qureg, ConstList64) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub, (qreal* outProbs, Qureg, ConstList64) )
 
 
 
@@ -2259,7 +2259,7 @@ template qcomp cpu_densmatr_calcFidelityWithPureState_sub<false>(Qureg, Qureg);
  */
 
 
-qreal cpu_statevec_calcExpecAnyTargZ_sub(Qureg qureg, vector<int> targs) {
+qreal cpu_statevec_calcExpecAnyTargZ_sub(Qureg qureg, ConstList64 targs) {
 
     // use cpu_qcomp arithmetic overloads (avoid qcomp's)
     cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps);
@@ -2283,7 +2283,7 @@ qreal cpu_statevec_calcExpecAnyTargZ_sub(Qureg qureg, vector<int> targs) {
 }
 
 
-qcomp cpu_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, vector<int> targs) {
+qcomp cpu_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, ConstList64 targs) {
 
     // use cpu_qcomp arithmetic overloads (avoid qcomp's)
     cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps);
@@ -2318,7 +2318,7 @@ qcomp cpu_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, vector<int> targs) {
 }
 
 
-qcomp cpu_statevec_calcExpecPauliStr_subA(Qureg qureg, vector<int> x, vector<int> y, vector<int> z) {
+qcomp cpu_statevec_calcExpecPauliStr_subA(Qureg qureg, ConstList64 x, ConstList64 y, ConstList64 z) {
 
     // use cpu_qcomp arithmetic overloads (avoid qcomp's)
     cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps);
@@ -2352,7 +2352,7 @@ qcomp cpu_statevec_calcExpecPauliStr_subA(Qureg qureg, vector<int> x, vector<int
 }
 
 
-qcomp cpu_statevec_calcExpecPauliStr_subB(Qureg qureg, vector<int> x, vector<int> y, vector<int> z) {
+qcomp cpu_statevec_calcExpecPauliStr_subB(Qureg qureg, ConstList64 x, ConstList64 y, ConstList64 z) {
 
     // use cpu_qcomp arithmetic overloads (avoid qcomp's)
     cpu_qcomp* amps   = getCpuQcompPtr(qureg.cpuAmps);
@@ -2395,7 +2395,7 @@ qcomp cpu_statevec_calcExpecPauliStr_subB(Qureg qureg, vector<int> x, vector<int
 }
 
 
-qcomp cpu_densmatr_calcExpecPauliStr_sub(Qureg qureg, vector<int> x, vector<int> y, vector<int> z) {
+qcomp cpu_densmatr_calcExpecPauliStr_sub(Qureg qureg, ConstList64 x, ConstList64 y, ConstList64 z) {
 
     // use cpu_qcomp arithmetic overloads (avoid qcomp's)
     cpu_qcomp* amps = getCpuQcompPtr(qureg.cpuAmps);
@@ -2563,7 +2563,7 @@ template qcomp cpu_densmatr_calcExpecFullStateDiagMatr_sub<false,true >(Qureg, F
 
 
 template <int NumQubits>
-void cpu_statevec_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal prob) {
+void cpu_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) {
 
     // all qubits are in suffix
     assert_numTargsMatchesTemplateParam(qubits.size(), NumQubits);
@@ -2594,7 +2594,7 @@ void cpu_statevec_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, vecto
 
 
 template <int NumQubits>
-void cpu_densmatr_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal prob) {
+void cpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) {
 
     // this function is merely an optimisation to avoid calling the above
     // cpu_statevec_multiQubitProjector_sub() twice upon a density matrix;
@@ -2635,8 +2635,8 @@ void cpu_densmatr_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, vecto
 }
 
 
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_statevec_multiQubitProjector_sub, (Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal prob) )
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_densmatr_multiQubitProjector_sub, (Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal prob) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_statevec_multiQubitProjector_sub, (Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_densmatr_multiQubitProjector_sub, (Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) )
 
 
 
diff --git a/quest/src/cpu/cpu_subroutines.hpp b/quest/src/cpu/cpu_subroutines.hpp
index 9da8fe199..3dbae057b 100644
--- a/quest/src/cpu/cpu_subroutines.hpp
+++ b/quest/src/cpu/cpu_subroutines.hpp
@@ -44,7 +44,7 @@ void cpu_fullstatediagmatr_setElemsFromMultiVarFunc(FullStateDiagMatr out, qcomp
  * COMMUNICATION BUFFER PACKING
  */
 
-template <int NumQubits> qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, vector<int> qubitInds, vector<int> qubitStates);
+template <int NumQubits> qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, ConstList64 qubitStates);
 
 qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2);
 
@@ -53,32 +53,32 @@ qindex cpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu
  * SWAPS
  */
 
-template <int NumCtrls> void cpu_statevec_anyCtrlSwap_subA(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2);
-template <int NumCtrls> void cpu_statevec_anyCtrlSwap_subB(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates);
-template <int NumCtrls> void cpu_statevec_anyCtrlSwap_subC(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, int targState);
+template <int NumCtrls> void cpu_statevec_anyCtrlSwap_subA(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2);
+template <int NumCtrls> void cpu_statevec_anyCtrlSwap_subB(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates);
+template <int NumCtrls> void cpu_statevec_anyCtrlSwap_subC(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ, int targState);
 
 
 /*
  * DENSE MATRIX
  */
 
-template <int NumCtrls> void cpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, CompMatr1 matr);
-template <int NumCtrls> void cpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, qcomp fac0, qcomp fac1);
+template <int NumCtrls> void cpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ, CompMatr1 matr);
+template <int NumCtrls> void cpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, qcomp fac0, qcomp fac1);
 
-template <int NumCtrls> void cpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2, CompMatr2 matr);
+template <int NumCtrls> void cpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2, CompMatr2 matr);
 
-template <int NumCtrls, int NumTargs, bool ApplyConj, bool ApplyTransp> void cpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr matr);
+template <int NumCtrls, int NumTargs, bool ApplyConj, bool ApplyTransp> void cpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, CompMatr matr);
 
 
 /*
  * DIAGONAL MATRIX
  */
 
-template <int NumCtrls> void cpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, DiagMatr1 matr);
+template <int NumCtrls> void cpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ, DiagMatr1 matr);
 
-template <int NumCtrls> void cpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2, DiagMatr2 matr);
+template <int NumCtrls> void cpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2, DiagMatr2 matr);
 
-template <int NumCtrls, int NumTargs, bool ApplyConj, bool HasPower> void cpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, DiagMatr matr, qcomp exponent);
+template <int NumCtrls, int NumTargs, bool ApplyConj, bool HasPower> void cpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, DiagMatr matr, qcomp exponent);
 
 template <bool HasPower> void cpu_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent);
 
@@ -89,11 +89,11 @@ template <bool HasPower, bool ApplyLeft, bool ApplyRight, bool ConjRight> void c
  * PAULI TENSOR AND GADGET
  */
 
-template <int NumCtrls, int NumTargs> void cpu_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> x, vector<int> y, vector<int> z, qcomp ampFac, qcomp pairAmpFac);
+template <int NumCtrls, int NumTargs> void cpu_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 x, ConstList64 y, ConstList64 z, qcomp ampFac, qcomp pairAmpFac);
 
-template <int NumCtrls> void cpu_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> x, vector<int> y, vector<int> z, qcomp ampFac, qcomp pairAmpFac, qindex bufferMaskXY);
+template <int NumCtrls> void cpu_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 x, ConstList64 y, ConstList64 z, qcomp ampFac, qcomp pairAmpFac, qindex bufferMaskXY);
 
-template <int NumCtrls> void cpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, qcomp fac0, qcomp fac1);
+template <int NumCtrls> void cpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, qcomp fac0, qcomp fac1);
 
 
 /*
@@ -140,7 +140,7 @@ void cpu_densmatr_oneQubitDamping_subD(Qureg qureg, int qubit, qreal prob);
  * PARTIAL TRACE
  */
 
-template <int NumTargs> void cpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, vector<int> targs, vector<int> pairTargs);
+template <int NumTargs> void cpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, ConstList64 targs, ConstList64 pairTargs);
 
 
 /*
@@ -150,11 +150,11 @@ template <int NumTargs> void cpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg
 qreal cpu_statevec_calcTotalProb_sub(Qureg qureg);
 qreal cpu_densmatr_calcTotalProb_sub(Qureg qureg);
 
-template <int NumQubits> qreal cpu_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes);
-template <int NumQubits> qreal cpu_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes);
+template <int NumQubits> qreal cpu_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes);
+template <int NumQubits> qreal cpu_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes);
 
-template <int NumQubits> void cpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, vector<int> qubits);
-template <int NumQubits> void cpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, vector<int> qubits);
+template <int NumQubits> void cpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, ConstList64 qubits);
+template <int NumQubits> void cpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, ConstList64 qubits);
 
 
 /*
@@ -172,12 +172,12 @@ template <bool Conj> qcomp cpu_densmatr_calcFidelityWithPureState_sub(Qureg rho,
  * EXPECTATION VALUES
  */
 
-qreal cpu_statevec_calcExpecAnyTargZ_sub(Qureg qureg, vector<int> targs);
-qcomp cpu_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, vector<int> targs);
+qreal cpu_statevec_calcExpecAnyTargZ_sub(Qureg qureg, ConstList64 targs);
+qcomp cpu_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, ConstList64 targs);
 
-qcomp cpu_statevec_calcExpecPauliStr_subA(Qureg qureg, vector<int> x, vector<int> y, vector<int> z);
-qcomp cpu_statevec_calcExpecPauliStr_subB(Qureg qureg, vector<int> x, vector<int> y, vector<int> z);
-qcomp cpu_densmatr_calcExpecPauliStr_sub (Qureg qureg, vector<int> x, vector<int> y, vector<int> z);
+qcomp cpu_statevec_calcExpecPauliStr_subA(Qureg qureg, ConstList64 x, ConstList64 y, ConstList64 z);
+qcomp cpu_statevec_calcExpecPauliStr_subB(Qureg qureg, ConstList64 x, ConstList64 y, ConstList64 z);
+qcomp cpu_densmatr_calcExpecPauliStr_sub (Qureg qureg, ConstList64 x, ConstList64 y, ConstList64 z);
 
 template <bool HasPower, bool UseRealPow> qcomp cpu_statevec_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent);
 template <bool HasPower, bool UseRealPow> qcomp cpu_densmatr_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent);
@@ -187,8 +187,8 @@ template <bool HasPower, bool UseRealPow> qcomp cpu_densmatr_calcExpecFullStateD
  * PROJECTORS
  */
 
-template <int NumQubits> void cpu_statevec_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal prob);
-template <int NumQubits> void cpu_densmatr_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal prob);
+template <int NumQubits> void cpu_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob);
+template <int NumQubits> void cpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob);
 
 
 /*
diff --git a/quest/src/gpu/CMakeLists.txt b/quest/src/gpu/CMakeLists.txt
index 9a872580c..3085ee41b 100644
--- a/quest/src/gpu/CMakeLists.txt
+++ b/quest/src/gpu/CMakeLists.txt
@@ -6,7 +6,7 @@ target_sources(QuEST
   gpu_subroutines.cpp
 )
 
-if (ENABLE_CUDA)
+if (QUEST_ENABLE_CUDA)
   set_source_files_properties(
     gpu_config.cpp
     gpu_subroutines.cpp
@@ -16,7 +16,7 @@ if (ENABLE_CUDA)
   )
 endif()
 
-if (ENABLE_HIP)
+if (QUEST_ENABLE_HIP)
   set_source_files_properties(
     gpu_config.cpp
     gpu_subroutines.cpp
diff --git a/quest/src/gpu/gpu_config.cpp b/quest/src/gpu/gpu_config.cpp
index c7db834b7..001cc62c0 100644
--- a/quest/src/gpu/gpu_config.cpp
+++ b/quest/src/gpu/gpu_config.cpp
@@ -26,18 +26,18 @@
 #include <algorithm>
 
 
-#if COMPILE_CUDA && ! (defined(__NVCC__) || defined(__HIP__))
+#if QUEST_COMPILE_CUDA && ! (defined(__NVCC__) || defined(__HIP__))
     #error \
         "Attempted to compile gpu_config.cpp in GPU-accelerated mode with a non-GPU compiler. "\
         "Please compile this file with a CUDA (nvcc) or ROCm (hipcc) compiler."
 #endif
 
 
-#if COMPILE_CUDA && defined(__NVCC__)
+#if QUEST_COMPILE_CUDA && defined(__NVCC__)
     #include <cuda.h>
     #include <cuda_runtime.h>
 #endif
-#if COMPILE_CUDA && defined(__HIP__)
+#if QUEST_COMPILE_CUDA && defined(__HIP__)
     #include "quest/src/gpu/cuda_to_hip.hpp"
 #endif
 
@@ -50,7 +50,7 @@
  * when encountering issues through use of the CUDA API
  */
 
-#if COMPILE_CUDA
+#if QUEST_COMPILE_CUDA
 
 void assertCudaCallSucceeded(int result, const char* call, const char* caller, const char* file, int line) {
 
@@ -99,14 +99,14 @@ void clearPossibleCudaError() {
  * CUQUANTUM MANAGEMENT
  *
  * these functions are defined in gpu_cuquantum.hpp when
- * COMPILE_CUQUANTUM is 1, but are otherwise defaulted to
+ * QUEST_COMPILE_CUQUANTUM is 1, but are otherwise defaulted to
  * the internal errors below. This slight inelegance
  * enables us to keep gpu_cuquantum.hpp as a single header
  * file, without exposing it to code beyond gpu/
  */
 
 
-#if ! COMPILE_CUQUANTUM
+#if ! QUEST_COMPILE_CUQUANTUM
 
 void gpu_initCuQuantum() {
     error_cuQuantumInitOrFinalizedButNotCompiled();
@@ -135,7 +135,7 @@ bool hasGpuBeenBound = false;
 
 
 int getBoundGpuId() {
-#if COMPILE_CUDA
+#if QUEST_COMPILE_CUDA
     assert_gpuHasBeenBound(hasGpuBeenBound);
 
     int id;
@@ -150,7 +150,7 @@ int getBoundGpuId() {
 
 
 int gpu_getComputeCapability() {
-#if COMPILE_CUDA
+#if QUEST_COMPILE_CUDA
     assert_gpuHasBeenBound(hasGpuBeenBound);
 
     cudaDeviceProp props;
@@ -165,17 +165,22 @@ int gpu_getComputeCapability() {
 
 
 bool gpu_isGpuCompiled() {
-    return (bool) COMPILE_CUDA;
+    return (bool) QUEST_COMPILE_CUDA;
 }
 
 
 bool gpu_isCuQuantumCompiled() {
-    return (bool) COMPILE_CUQUANTUM;
+    return (bool) QUEST_COMPILE_CUQUANTUM;
+}
+
+
+bool gpu_isHipCompiled() {
+    return (bool) (QUEST_COMPILE_CUDA && QUEST_COMPILE_HIP); // HIP is reported only when GPU compilation is also enabled
+}
 
 
 int gpu_getNumberOfLocalGpus() {
-#if COMPILE_CUDA
+#if QUEST_COMPILE_CUDA
 
     // HIP throws an error when a CUDA API function
     // is called but no devices exist, which we handle
@@ -197,7 +202,7 @@ int gpu_getNumberOfLocalGpus() {
 
 
 bool gpu_isGpuAvailable() {
-#if COMPILE_CUDA
+#if QUEST_COMPILE_CUDA
 
     int numDevices = gpu_getNumberOfLocalGpus();
     if (numDevices == 0)
@@ -234,7 +239,7 @@ bool gpu_isGpuAvailable() {
 
 
 bool gpu_isDirectGpuCommPossible() {
-#if COMPILE_CUDA
+#if QUEST_COMPILE_CUDA
 
     if (!comm_isMpiGpuAware())
         return false;
@@ -256,7 +261,7 @@ bool gpu_isDirectGpuCommPossible() {
 
 
 size_t gpu_getCurrentAvailableMemoryInBytes() {
-#if COMPILE_CUDA
+#if QUEST_COMPILE_CUDA
     assert_gpuHasBeenBound(hasGpuBeenBound);
 
     // note that in distributed settings, all GPUs
@@ -275,7 +280,7 @@ size_t gpu_getCurrentAvailableMemoryInBytes() {
 
 
 size_t gpu_getTotalMemoryInBytes() {
-#if COMPILE_CUDA
+#if QUEST_COMPILE_CUDA
     assert_gpuHasBeenBound(hasGpuBeenBound);
 
     size_t free, total;
@@ -290,7 +295,7 @@ size_t gpu_getTotalMemoryInBytes() {
 
 
 bool gpu_doesGpuSupportMemPools() {
-#if COMPILE_CUDA
+#if QUEST_COMPILE_CUDA
     assert_gpuHasBeenBound(hasGpuBeenBound);
 
     int supports;
@@ -305,7 +310,7 @@ bool gpu_doesGpuSupportMemPools() {
 
 
 qindex gpu_getMaxNumConcurrentThreads() {
-#if COMPILE_CUDA
+#if QUEST_COMPILE_CUDA
     assert_gpuHasBeenBound(hasGpuBeenBound);
 
     int deviceId = getBoundGpuId();
@@ -331,8 +336,42 @@ qindex gpu_getMaxNumConcurrentThreads() {
  */
 
 
+// the default numTPB is not known until runtime since the initial value
+// (provided either by the CMake var, or the environment variable) must
+// be validated during QuEST initialisation.
+static int global_numThreadsPerBlock = -1;
+
+
+int gpu_getNumThreadsPerBlock() {
+    if (global_numThreadsPerBlock == -1) // -1 is the initial, not-yet-set value of the global
+        error_gpuNumThreadsPerBlockNotSet();
+
+    return global_numThreadsPerBlock;
+}
+
+
+void gpu_setNumThreadsPerBlock(int newNumTPB) {
+
+    global_numThreadsPerBlock = newNumTPB; // stored as given; no validation is performed in this function
+}
+
+
+int gpu_getMaxNumThreadsPerBlock() {
+#if QUEST_COMPILE_CUDA
+    // validate the query; an unchecked failure would leave prop uninitialised
+    cudaDeviceProp prop;
+    CUDA_CHECK( cudaGetDeviceProperties(&prop, getBoundGpuId()) );
+    return prop.maxThreadsPerBlock; // HIP compatible
+
+#else
+    error_gpuQueriedButGpuNotCompiled();
+    return -1;
+#endif
+}
+
+
 std::array<char,17> getBoundGpuUuid() {
-#if COMPILE_CUDA
+#if QUEST_COMPILE_CUDA
     assert_gpuHasBeenBound(hasGpuBeenBound);
 
     constexpr int numUuidChars = 16;
@@ -368,7 +407,7 @@ std::array<char,17> getBoundGpuUuid() {
 
 
 void gpu_bindLocalGPUsToNodes() {
-#if COMPILE_CUDA
+#if QUEST_COMPILE_CUDA
 
     // distribute local MPI processes across local GPUs;
     int numLocalGpus = gpu_getNumberOfLocalGpus();
@@ -392,10 +431,10 @@ void gpu_bindLocalGPUsToNodes() {
 
 
 bool gpu_areAnyNodesBoundToSameGpu() {
-#if COMPILE_CUDA
+#if QUEST_COMPILE_CUDA
     assert_gpuHasBeenBound(hasGpuBeenBound);
 
-    if (!comm_isInit())
+    if (!comm_isActive())
         return false;
 
     // obtain bound GPU's UUID; a unique identifier 16-char identifier
@@ -418,7 +457,7 @@ bool gpu_areAnyNodesBoundToSameGpu() {
 
 
 void gpu_sync() {
-#if COMPILE_CUDA
+#if QUEST_COMPILE_CUDA
 
     CUDA_CHECK( cudaDeviceSynchronize() );
 
@@ -435,7 +474,7 @@ void gpu_sync() {
 
 
 qcomp* gpu_allocArray(qindex length) {
-#if COMPILE_CUDA
+#if QUEST_COMPILE_CUDA
 
     size_t numBytes = mem_getLocalQuregMemoryRequired(length);
 
@@ -467,7 +506,7 @@ qcomp* gpu_allocArray(qindex length) {
 
 
 void gpu_deallocArray(qcomp* amps) {
-#if COMPILE_CUDA
+#if QUEST_COMPILE_CUDA
 
     // cudaFree on nullptr is fine
     CUDA_CHECK( cudaFree(amps) );
@@ -492,7 +531,7 @@ enum CopyDirection {
 
 
 void copyArrayIfGpuCompiled(qcomp* cpuArr, qcomp* gpuArr, qindex numElems, enum CopyDirection direction) {
-#if COMPILE_CUDA
+#if QUEST_COMPILE_CUDA
 
     // must ensure gpu amps are up to date
     gpu_sync();
@@ -515,7 +554,7 @@ void copyArrayIfGpuCompiled(qcomp* cpuArr, qcomp* gpuArr, qindex numElems, enum
 
 
 void copyMatrixIfGpuCompiled(qcomp** cpuMatr, qcomp* gpuArr, qindex matrDim, enum CopyDirection direction) {
-#if COMPILE_CUDA
+#if QUEST_COMPILE_CUDA
 
     // NOTE:
     // this function copies a 2D CPU matrix into a 1D row-major GPU array,
@@ -564,7 +603,7 @@ void assertHeapObjectGpuMemIsAllocated(T obj) {
 
 
 void gpu_copyArray(qcomp* dest, qcomp* src, qindex dim) {
-#if COMPILE_CUDA
+#if QUEST_COMPILE_CUDA
 
     // ensure src and dest aren't being modified
     gpu_sync();
@@ -682,7 +721,7 @@ qindex gpuCacheLen = 0;
 
 
 qcomp* gpu_getCacheOfSize(qindex numElemsPerThread, qindex numThreads) {
-#if COMPILE_CUDA
+#if QUEST_COMPILE_CUDA
 
     // do not interfere with existing kernels using the cache
     gpu_sync();
@@ -708,7 +747,7 @@ qcomp* gpu_getCacheOfSize(qindex numElemsPerThread, qindex numThreads) {
 
 
 void gpu_clearCache() {
-#if COMPILE_CUDA
+#if QUEST_COMPILE_CUDA
 
     // do not interfere with existing kernels using the cache
     gpu_sync();
diff --git a/quest/src/gpu/gpu_config.hpp b/quest/src/gpu/gpu_config.hpp
index 1b3be6295..98cb9c8a3 100644
--- a/quest/src/gpu/gpu_config.hpp
+++ b/quest/src/gpu/gpu_config.hpp
@@ -20,11 +20,20 @@
 
 
 
+/*
+ * CONSTANTS
+ */
+
+constexpr int gpu_CUDA_WARP_SIZE = 32;
+constexpr int gpu_HIP_WARP_SIZE = 64;
+
+
+
 /*
  * CUDA ERROR HANDLING
  */
 
-#if COMPILE_CUDA
+#if QUEST_COMPILE_CUDA
 
 #define CUDA_CHECK(cmd) \
     assertCudaCallSucceeded((int) (cmd), #cmd, __func__, __FILE__, __LINE__)
@@ -43,6 +52,8 @@ bool gpu_isGpuCompiled();
 
 bool gpu_isCuQuantumCompiled();
 
+bool gpu_isHipCompiled();
+
 bool gpu_isGpuAvailable();
 
 bool gpu_isDirectGpuCommPossible();
@@ -65,6 +76,12 @@ qindex gpu_getMaxNumConcurrentThreads();
  * ENVIRONMENT MANAGEMENT
  */
 
+int gpu_getNumThreadsPerBlock();
+
+void gpu_setNumThreadsPerBlock(int newThreadsPerBlock);
+
+int gpu_getMaxNumThreadsPerBlock();
+
 void gpu_bindLocalGPUsToNodes();
 
 bool gpu_areAnyNodesBoundToSameGpu();
@@ -76,7 +93,6 @@ void gpu_initCuQuantum();
 void gpu_finalizeCuQuantum();
 
 
-
 /*
  * MEMORY MANAGEMENT
  */
@@ -122,4 +138,4 @@ size_t gpu_getCacheMemoryInBytes();
 
 
 
-#endif // GPU_CONFIG_HPP
\ No newline at end of file
+#endif // GPU_CONFIG_HPP
diff --git a/quest/src/gpu/gpu_cuquantum.cuh b/quest/src/gpu/gpu_cuquantum.cuh
index 0fec5ca57..6323f549f 100644
--- a/quest/src/gpu/gpu_cuquantum.cuh
+++ b/quest/src/gpu/gpu_cuquantum.cuh
@@ -2,7 +2,7 @@
  * Subroutines which invoke cuStateVec, which are alternatives to the
  * kernels defined in gpu_kernels.cuh, as invoked by gpu_subroutines.cpp
  * 
- * This file is only ever included when COMPILE_CUQUANTUM=1 and COMPILE_CUDA=1
+ * This file is only ever included when QUEST_COMPILE_CUQUANTUM=1 and QUEST_COMPILE_CUDA=1
  * so it can safely invoke CUDA signatures without guards. Note that many of 
  * the statevector functions herein will be re-leveraged by QuEST's density
  * matrix simulation, so it important we do not pass Qureg.numQubits to the 
@@ -29,11 +29,11 @@
 // compile errors (though we must still obtain the preprocessors from config.h)
 #include "quest/include/config.h"
 
-#if ! COMPILE_CUQUANTUM
+#if ! QUEST_COMPILE_CUQUANTUM
     #error "A file being compiled somehow included gpu_cuquantum.hpp despite QuEST not being compiled in cuQuantum mode."
 #endif
 
-#if ! COMPILE_CUDA
+#if ! QUEST_COMPILE_CUDA
     #error "A file being compiled somehow included gpu_cuquantum.hpp despite QuEST not being compiled in GPU-accelerated mode."
 #endif
 
@@ -44,6 +44,7 @@
 
 #include "quest/include/precision.h"
 
+#include "quest/src/core/lists.hpp"
 #include "quest/src/core/utilities.hpp"
 #include "quest/src/gpu/gpu_config.hpp"
 #include "quest/src/gpu/gpu_qcomp.cuh"
@@ -63,10 +64,10 @@ using std::vector;
  * because QuEST uses only a single qcomp type for both in the API.
  */
 
-#if (FLOAT_PRECISION == 1)
+#if (QUEST_FLOAT_PRECISION == 1)
     #define CUQUANTUM_QCOMP CUDA_C_32F
 
-#elif (FLOAT_PRECISION == 2)
+#elif (QUEST_FLOAT_PRECISION == 2)
     #define CUQUANTUM_QCOMP CUDA_C_64F
 
 #else
@@ -174,7 +175,7 @@ void gpu_finalizeCuQuantum() {
  */
 
 
-void cuquantum_statevec_anyCtrlSwap_subA(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2) {
+void cuquantum_statevec_anyCtrlSwap_subA(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2) {
 
     // our SWAP targets are bundled into pairs
     int2 targPairs[] = {{targ1, targ2}};;
@@ -199,7 +200,7 @@ void cuquantum_statevec_anyCtrlSwap_subA(Qureg qureg, vector<int> ctrls, vector<
  */
 
 
-void cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, gpu_qcomp* flatMatrElems, bool applyAdj) {
+void cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, gpu_qcomp* flatMatrElems, bool applyAdj) {
 
     // this funciton is called 'subA' instead of just 'sub', because it is also called in 
     // the one-target case whereby it is strictly the embarrassingly parallel _subA scenario
@@ -222,7 +223,7 @@ void cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(Qureg qureg, vector<int>
 // there is no bespoke cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subB()
 
 
-void cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, gpu_qcomp* flatMatrElems, bool conj) {
+void cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, gpu_qcomp* flatMatrElems, bool conj) {
 
     // beware that despite diagonal matrices being embarrassingly parallel,
     // the target qubits must still all be suffix-only to avoid a cuStateVec error
@@ -262,10 +263,11 @@ void cuquantum_densmatr_oneQubitDephasing_subA(Qureg qureg, int qubit, qreal pro
     gpu_qcomp a = {1,        0};
     gpu_qcomp b = {1-2*prob, 0};
     gpu_qcomp elems[] = {a, b, b, a};
-    vector<int> targs {qubit, util_getBraQubit(qubit,qureg)};
+    auto targs = lists_getList64({qubit, util_getBraQubit(qubit,qureg)});
 
     bool conj = false;
-    cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, {}, {}, targs, elems, conj);
+    auto empty = lists_getEmptyList64();
+    cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, empty, empty, targs, elems, conj);
 }
 
 
@@ -283,7 +285,10 @@ void cuquantum_densmatr_oneQubitDephasing_subB(Qureg qureg, int ketQubit, qreal
     int targ = qureg.logNumAmpsPerNode - 1; // leftmost suffix bra qubit
 
     bool conj = false;
-    cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, {ketQubit}, {!braBit}, {targ}, elems, conj);
+    auto ctrls  = lists_getList64({ketQubit});
+    auto states = lists_getList64({!braBit});
+    auto targs  = lists_getList64({targ});
+    cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, states, targs, elems, conj);
 }
 
 
@@ -299,10 +304,11 @@ void cuquantum_densmatr_twoQubitDephasing_subA(Qureg qureg, int qubitA, int qubi
     gpu_qcomp a = {1,          0};
     gpu_qcomp b = {1-4*prob/3, 0};
     gpu_qcomp elems[] = {a,b,b,b, b,a,b,b, b,b,a,b, b,b,b,a};
-    vector<int> targs {qubitA, qubitB, util_getBraQubit(qubitA,qureg), util_getBraQubit(qubitB,qureg)};
+    auto targs = lists_getList64({qubitA, qubitB, util_getBraQubit(qubitA,qureg), util_getBraQubit(qubitB,qureg)});
 
     bool conj = false;
-    cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, {}, {}, targs, elems, conj);
+    auto empty = lists_getEmptyList64();
+    cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, empty, empty, targs, elems, conj);
 }
 
 
@@ -339,7 +345,7 @@ qreal cuquantum_statevec_calcTotalProb_sub(Qureg qureg) {
 }
 
 
-qreal cuquantum_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes) {
+qreal cuquantum_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes) {
 
     // cuQuantum probabilities are always double
     double prob;
@@ -353,11 +359,11 @@ qreal cuquantum_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int
 }
 
 
-void cuquantum_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, vector<int> qubits) {
+void cuquantum_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, ConstList64 qubits) {
 
     // cuQuantum can accept a host-pointer (like outProbs), but only
     // double precision; if qreal != double, we use temporary memory
-    #if (FLOAT_PRECISION == 2)
+    #if (QUEST_FLOAT_PRECISION == 2)
         double* outPtr = outProbs;
     #else
         vector<double> tmpProbs;
@@ -371,7 +377,7 @@ void cuquantum_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qu
         outPtr, qubits.data(), qubits.size(), nullptr, nullptr, 0) );
 
     // serially cast and copy output probabilities, if necessary
-    #if (FLOAT_PRECISION != 2)
+    #if (QUEST_FLOAT_PRECISION != 2)
         for (size_t i=0; i<tmpProbs.size(); i++)
             outProbs[i] = static_cast<qreal>(tmpProbs[i]);
     #endif
@@ -384,12 +390,12 @@ void cuquantum_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qu
  */
 
 
-qreal cuquantum_statevec_calcExpecPauliStr_subA(Qureg qureg, vector<int> x, vector<int> y, vector<int> z) {
+qreal cuquantum_statevec_calcExpecPauliStr_subA(Qureg qureg, ConstList64 x, ConstList64 y, ConstList64 z) {
 
     // prepare term (XX...YY...ZZ...)
     size_t numPaulis = x.size() + y.size() + z.size();
     vector<custatevecPauli_t> paulis; 
-    vector<int32_t> targs; 
+    vector<int32_t> targs; // forego List64 for symmetry
     
     paulis.reserve(numPaulis);
     targs.reserve(numPaulis);
@@ -416,9 +422,10 @@ qreal cuquantum_statevec_calcExpecPauliStr_subA(Qureg qureg, vector<int> x, vect
 }
 
 
-qreal cuquantum_statevec_calcExpecAnyTargZ_sub(Qureg qureg, vector<int> targs) {
+qreal cuquantum_statevec_calcExpecAnyTargZ_sub(Qureg qureg, ConstList64 targs) {
 
-    return cuquantum_statevec_calcExpecPauliStr_subA(qureg, {}, {}, targs);
+    auto empty = lists_getEmptyList64(); // passed as both the x and y lists, so only z targets are given
+    return cuquantum_statevec_calcExpecPauliStr_subA(qureg, empty, empty, targs);
 }
 
 
@@ -428,7 +435,7 @@ qreal cuquantum_statevec_calcExpecAnyTargZ_sub(Qureg qureg, vector<int> targs) {
  */
 
 
-void cuquantum_statevec_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal prob) {
+void cuquantum_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) {
 
     CUDA_CHECK( custatevecCollapseByBitString(
         config.handle,
diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh
index 30377850a..b6954f701 100644
--- a/quest/src/gpu/gpu_kernels.cuh
+++ b/quest/src/gpu/gpu_kernels.cuh
@@ -3,8 +3,8 @@
  * when there is no equivalent utility in Thrust (or cuQuantum, when it is
  * targeted). 
  * 
- * This file is only ever included when COMPILE_CUDA=1 so it can safely invoke 
+ * This file is only ever included when QUEST_COMPILE_CUDA=1 so it can safely invoke
  * CUDA signatures without guards. Some kernels are templated to compile-time 
  * optimise their bitwise and indexing logic depending on the number of qubits.
  * This file is a header since only ever included by gpu_subroutines.cpp.
  * 
@@ -25,7 +25,7 @@
 #include "quest/src/core/fastmath.hpp"
 #include "quest/src/gpu/gpu_qcomp.cuh"
 
-#if ! COMPILE_CUDA
+#if ! QUEST_COMPILE_CUDA
     #error "A file being compiled somehow included gpu_kernels.hpp despite QuEST not being compiled in GPU-accelerated mode."
 #endif
 
@@ -42,23 +42,19 @@
  * THREAD MANAGEMENT
  */
 
-
-const int NUM_THREADS_PER_BLOCK = 128;
-
-
 __forceinline__ __device__ qindex getThreadInd() {
     return blockIdx.x*blockDim.x + threadIdx.x;
 }
 
 
-__host__ qindex getNumBlocks(qindex numThreads) {
+__host__ qindex getNumBlocks(qindex numThreads, int numThreadsPerBlock) {
 
     /// @todo
     /// improve this with cudaOccupancyMaxPotentialBlockSize(),
     /// making it function specific
 
-    // CUDA ceil
-    return ceil(numThreads / static_cast<qreal>(NUM_THREADS_PER_BLOCK));
+    // exact integer ceil-division; a qreal division can round when qreal is single-precision
+    return (numThreads + numThreadsPerBlock - 1) / numThreadsPerBlock;
 }
 
 
@@ -305,7 +301,11 @@ __global__ void kernel_statevec_anyCtrlFewTargDenseMatr(
     // must be strictly through compile-time-known indices, otherwise it will auto-
     // spill to local memory). Hence, this _subA() function is not a subroutine 
     // despite some logic being common to non-compile-time _subB(), and hence
-    // why the loops below are explicitly compile-time unrolled
+    // why the loops below are explicitly compile-time unrolled. Beware that when
+    // numThreadsPerBlock is increased from 128, this kernel will still behave
+    // correctly, but privateCache below will spill over into local memory at a
+    // performance penalty for NumTargs <= 5, with spillage occurring for fewer
+    // NumTargs as numThreadsPerBlock increases.
     REGISTER gpu_qcomp privateCache[1 << NumTargs];
 
     // we know NumTargs <= 5, though NumCtrls is permitted anything (including -1)
diff --git a/quest/src/gpu/gpu_qcomp.cuh b/quest/src/gpu/gpu_qcomp.cuh
index 1fffb5e53..df391f445 100644
--- a/quest/src/gpu/gpu_qcomp.cuh
+++ b/quest/src/gpu/gpu_qcomp.cuh
@@ -24,11 +24,11 @@
 #include "quest/src/core/inliner.hpp"
 #include "quest/src/core/base_qcomp.hpp"
 
-#if ! COMPILE_CUDA
+#if ! QUEST_COMPILE_CUDA
     #error "A file being compiled somehow included gpu_qcomp.hpp despite QuEST not being compiled in GPU-accelerated mode."
 #endif
 
-#if (FLOAT_PRECISION == 4)
+#if (QUEST_FLOAT_PRECISION == 4)
     #error "Build bug; precision.h should have prevented non-float non-double qcomp precision on GPU."
 #endif
 
diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp
index cd473ee14..9b8e819b5 100644
--- a/quest/src/gpu/gpu_subroutines.cpp
+++ b/quest/src/gpu/gpu_subroutines.cpp
@@ -22,8 +22,8 @@
  * cpu_subroutines.cpp) and moving it out of the aptly-named
  * accelerator.cpp file.
  * 
- * Despite COMPILE_CUDA=1 whenever COMPILE_CUQUANTUM=1, we will
- * still use superfluous (COMPILE_CUDA || COMPILE_CUQUANTUM) guards
+ * Despite QUEST_COMPILE_CUDA=1 whenever QUEST_COMPILE_CUQUANTUM=1, we will
+ * still use superfluous (QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM) guards
  * to communicate when there is no bespoke cuQuantum routine.
  *
  * When compiling for AMD GPUs, the CUDA symbols invoked herein are
@@ -35,8 +35,8 @@
 // obtain preprocessors from config.h prior to validation
 #include "quest/include/config.h"
 
-#if (COMPILE_CUQUANTUM && ! COMPILE_CUDA)
-    #error "Cannot define COMPILE_CUQUANTUM=1 without simultaneously defining COMPILE_CUDA=1"
+#if (QUEST_COMPILE_CUQUANTUM && ! QUEST_COMPILE_CUDA)
+    #error "Cannot define QUEST_COMPILE_CUQUANTUM=1 without simultaneously defining QUEST_COMPILE_CUDA=1"
 #endif
 
 #include "quest/include/types.h"
@@ -52,13 +52,13 @@
 #include "quest/src/gpu/gpu_config.hpp"
 #include "quest/src/gpu/gpu_subroutines.hpp"
 
-#if COMPILE_CUDA
+#if QUEST_COMPILE_CUDA
     #include "quest/src/gpu/gpu_qcomp.cuh"
     #include "quest/src/gpu/gpu_kernels.cuh"
     #include "quest/src/gpu/gpu_thrust.cuh"
 #endif
 
-#if COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUQUANTUM
     #include "quest/src/gpu/gpu_cuquantum.cuh"
 #endif
 
@@ -66,7 +66,6 @@
 using std::vector;
 
 
-
 /*
  * GETTERS
  */
@@ -74,7 +73,7 @@ using std::vector;
 
 qcomp gpu_statevec_getAmp_sub(Qureg qureg, qindex ind) {
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     // this bespoke function exists (in lieu of caller 
     // just calling copyGpuToCpu() directly) mostly for
@@ -105,7 +104,7 @@ qcomp gpu_statevec_getAmp_sub(Qureg qureg, qindex ind) {
 
 void gpu_densmatr_setAmpsToPauliStrSum_sub(Qureg qureg, PauliStrSum sum) {
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     thrust_densmatr_setAmpsToPauliStrSum_sub(qureg, sum);
 
@@ -117,7 +116,7 @@ void gpu_densmatr_setAmpsToPauliStrSum_sub(Qureg qureg, PauliStrSum sum) {
 
 void gpu_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, PauliStrSum in) {
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     thrust_fullstatediagmatr_setElemsToPauliStrSum(out, in);
 
@@ -134,20 +133,21 @@ void gpu_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, PauliStr
 
 
 template <int NumQubits>
-qindex gpu_statevec_packAmpsIntoBuffer(Qureg qureg, vector<int> qubits, vector<int> qubitStates) {
+qindex gpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates) {
 
     assert_numQubitsMatchesQubitStatesAndTemplateParam(qubits.size(), qubitStates.size(), NumQubits);
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(qubits.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex sendInd = getSubBufferSendInd(qureg);
 
-    devints sortedQubits = util_getSorted(qubits);
+    devints sortedQubits = getDevInts(util_getSorted(qubits));
     qindex qubitStateMask  = util_getBitMask(qubits, qubitStates);
 
-    kernel_statevec_packAmpsIntoBuffer <NumQubits> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_packAmpsIntoBuffer <NumQubits> <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + sendInd, numThreads, 
         getPtr(sortedQubits), qubits.size(), qubitStateMask
     );
@@ -166,13 +166,14 @@ qindex gpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu
 
     assert_bufferPackerGivenIncreasingQubits(qubit1, qubit2, qubit3);
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 8;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex sendInd = getSubBufferSendInd(qureg);
 
-    kernel_statevec_packPairSummedAmpsIntoBuffer <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_packPairSummedAmpsIntoBuffer <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + sendInd, numThreads, 
         qubit1, qubit2, qubit3, bit2
     );
@@ -187,7 +188,7 @@ qindex gpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu
 }
 
 
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qindex, gpu_statevec_packAmpsIntoBuffer, (Qureg, vector<int>, vector<int>) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qindex, gpu_statevec_packAmpsIntoBuffer, (Qureg, ConstList64, ConstList64) )
 
 
 
@@ -197,23 +198,24 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qindex, gpu_statevec_packAmpsIntoBuffe
 
 
 template <int NumCtrls> 
-void gpu_statevec_anyCtrlSwap_subA(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2) {
+void gpu_statevec_anyCtrlSwap_subA(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2) {
 
     assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
 
-#if COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUQUANTUM
 
     cuquantum_statevec_anyCtrlSwap_subA(qureg, ctrls, ctrlStates, targ1, targ2);
 
-#elif COMPILE_CUDA
+#elif QUEST_COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(2 + ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
-    devints sortedQubits = util_getSorted(ctrls, {targ2, targ1});
+    devints sortedQubits = getDevInts(util_getSorted(ctrls, {targ2, targ1}));
     qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ2, targ1}, {0, 1});
 
-    kernel_statevec_anyCtrlSwap_subA <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlSwap_subA <NumCtrls> <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads, 
         getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ1, targ2
     );
@@ -225,20 +227,21 @@ void gpu_statevec_anyCtrlSwap_subA(Qureg qureg, vector<int> ctrls, vector<int> c
 
 
 template <int NumCtrls> 
-void gpu_statevec_anyCtrlSwap_subB(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates) {
+void gpu_statevec_anyCtrlSwap_subB(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates) {
 
     assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex recvInd = getBufferRecvInd();
 
-    devints sortedCtrls = util_getSorted(ctrls);
+    devints sortedCtrls = getDevInts(util_getSorted(ctrls));
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
 
-    kernel_statevec_anyCtrlSwap_subB <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlSwap_subB <NumCtrls> <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, 
         getPtr(sortedCtrls), ctrls.size(), ctrlStateMask
     );
@@ -250,20 +253,21 @@ void gpu_statevec_anyCtrlSwap_subB(Qureg qureg, vector<int> ctrls, vector<int> c
 
 
 template <int NumCtrls> 
-void gpu_statevec_anyCtrlSwap_subC(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, int targState) {
+void gpu_statevec_anyCtrlSwap_subC(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ, int targState) {
 
     assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(1 + ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex recvInd = getBufferRecvInd();
 
-    devints sortedQubits = util_getSorted(ctrls, {targ});
+    devints sortedQubits = getDevInts(util_getSorted(ctrls, {targ}));
     qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {targState});
 
-    kernel_statevec_anyCtrlSwap_subC <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlSwap_subC <NumCtrls> <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, 
         getPtr(sortedQubits), ctrls.size(), qubitStateMask
     );
@@ -274,9 +278,9 @@ void gpu_statevec_anyCtrlSwap_subC(Qureg qureg, vector<int> ctrls, vector<int> c
 }
 
 
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlSwap_subA, (Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2) )
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlSwap_subB, (Qureg qureg, vector<int> ctrls, vector<int> ctrlStates) )
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlSwap_subC, (Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, int targState) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlSwap_subA, (Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlSwap_subB, (Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlSwap_subC, (Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ, int targState) )
 
 
 
@@ -286,27 +290,29 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlSwap_subC, (
 
 
 template <int NumCtrls>
-void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, CompMatr1 matr) {
+void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ, CompMatr1 matr) {
 
     assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
 
-#if COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUQUANTUM
 
     bool applyAdj = false;
+    auto targsList = lists_getList64({targ});
     auto arr = getFlattenedGpuQcompMatrix<2>(matr.elems); // explicit template for MSVC, grr!
-    cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(qureg, ctrls, ctrlStates, {targ}, arr.data(), applyAdj);
+    cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(qureg, ctrls, ctrlStates, targsList, arr.data(), applyAdj);
 
-#elif COMPILE_CUDA
+#elif QUEST_COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 1);
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
-    devints sortedQubits = util_getSorted(ctrls, {targ});
+    devints sortedQubits = getDevInts(util_getSorted(ctrls, {targ}));
     qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0});
 
     auto [m00, m01, m10, m11] = getFlattenedGpuQcompMatrix<2>(matr.elems); // explicit template for MSVC, grr!
 
-    kernel_statevec_anyCtrlOneTargDenseMatr_subA <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlOneTargDenseMatr_subA <NumCtrls> <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads, 
         getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ, 
         m00, m01, m10, m11
@@ -319,20 +325,21 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector<int> ctrls, v
 
 
 template <int NumCtrls>
-void gpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, qcomp fac0, qcomp fac1) {
+void gpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, qcomp fac0, qcomp fac1) {
 
     assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex recvInd = getBufferRecvInd();
 
-    devints sortedCtrls = util_getSorted(ctrls);
+    devints sortedCtrls = getDevInts(util_getSorted(ctrls));
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
 
-    kernel_statevec_anyCtrlOneTargDenseMatr_subB <NumCtrls> <<<numBlocks,NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlOneTargDenseMatr_subB <NumCtrls> <<<numBlocks,numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, 
         getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, 
         getGpuQcomp(fac0), getGpuQcomp(fac1)
@@ -344,8 +351,8 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, vector<int> ctrls, v
 }
 
 
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlOneTargDenseMatr_subA, (Qureg, vector<int>, vector<int>, int, CompMatr1) )
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlOneTargDenseMatr_subB, (Qureg, vector<int>, vector<int>, qcomp, qcomp) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlOneTargDenseMatr_subA, (Qureg, ConstList64, ConstList64, int, CompMatr1) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlOneTargDenseMatr_subB, (Qureg, ConstList64, ConstList64, qcomp, qcomp) )
 
 
 
@@ -355,28 +362,30 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlOneTargDense
 
 
 template <int NumCtrls> 
-void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2, CompMatr2 matr) {
+void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2, CompMatr2 matr) {
 
     assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
 
-#if COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUQUANTUM
 
     bool applyAdj = false;
+    auto targsList = lists_getList64({targ1, targ2});
     auto arr = getFlattenedGpuQcompMatrix<4>(matr.elems); // explicit template for MSVC, grr!
-    cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(qureg, ctrls, ctrlStates, {targ1, targ2}, arr.data(), applyAdj);
+    cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(qureg, ctrls, ctrlStates, targsList, arr.data(), applyAdj);
 
-#elif COMPILE_CUDA
+#elif QUEST_COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 2);
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
-    devints sortedQubits = util_getSorted(ctrls, {targ1,targ2});
+    devints sortedQubits = getDevInts(util_getSorted(ctrls, {targ1,targ2}));
     qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ1,targ2}, {0,0});
 
     // unpack matrix elems which are more efficiently accessed by kernels as args than shared mem (... maybe...)
     auto m = getFlattenedGpuQcompMatrix<4>(matr.elems); // explicit template for MSVC, grr!
 
-    kernel_statevec_anyCtrlTwoTargDenseMatr_sub <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlTwoTargDenseMatr_sub <NumCtrls> <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads, 
         getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ1, targ2,
         m[0], m[1], m[2],  m[3],  m[4],  m[5],  m[6],  m[7],
@@ -388,7 +397,7 @@ void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
 #endif
 }
 
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlTwoTargDenseMatr_sub, (Qureg, vector<int>, vector<int>, int, int, CompMatr2) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlTwoTargDenseMatr_sub, (Qureg, ConstList64, ConstList64, int, int, CompMatr2) )
 
 
 
@@ -398,12 +407,12 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlTwoTargDense
 
 
 template <int NumCtrls, int NumTargs, bool ApplyConj, bool ApplyTransp>
-void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr matr) {
+void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, CompMatr matr) {
 
     assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
     assert_numTargsMatchesTemplateParam(targs.size(), NumTargs);
 
-#if COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUQUANTUM
 
     auto matrElemsPtr = getGpuQcompPtr(matr.gpuElemsFlat);
     auto matrElemsLen = matr.numRows * matr.numRows;
@@ -425,16 +434,16 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
     if (ApplyConj || ApplyTransp)
         thrust_setElemsToConjugate(matrElemsPtr, matrElemsLen);
 
-#elif COMPILE_CUDA
+#elif QUEST_COMPILE_CUDA
 
     // a 'batch' refers to 2^N amps which become mixed by the matrix,
     // distinguished in this kernel from 'numThreads' since we may
     // task each thread with processing more than a single batch
     qindex numBatches = qureg.numAmpsPerNode / powerOf2(ctrls.size() + targs.size());
 
-    devints deviceTargs = targs;
-    devints deviceQubits = util_getSorted(ctrls, targs);
-    qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, targs, vector<int>(targs.size(),0));
+    devints deviceTargs = getDevInts(targs);
+    devints deviceQubits = getDevInts(util_getSorted(ctrls, targs));
+    qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, targs, util_getConstantList(0,targs.size()));
 
     // unpacking args (to better distinguish below signatures)
     auto ampsPtr   = getGpuQcompPtr(qureg.gpuAmps);
@@ -453,9 +462,12 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
     if constexpr (NumTargs != -1) {
 
         // when NumTargs <= 5, each thread has a private array stored in the registers,
-        // enabling rapid IO. Given NUM_THREADS_PER_BLOCK = 128, the maximum size of 
-        // this array per-block is 16 * 128 * 2^5 B = 64 KiB which exceeds shared
-        // memory capacity, but does NOT exceed maximum register capacity.
+        // enabling rapid IO. When using the default numThreadsPerBlock = 128, the max
+        // size of this array per-block is 16 * 128 * 2^5 B = 64 KiB which exceeds shared
+        // memory capacity, but does NOT exceed maximum register capacity. When the user
+        // increases numThreadsPerBlock, the thread-private array in the below kernel
+        // will spill from registers into local memory, degrading performance, but
+        // behaving correctly and stably.
 
         /// @todo
         /// We should really check the above claims, otherwise the thread-private arrays could
@@ -463,11 +475,12 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
         /// global memory) and greatly sabotage performance on some GPUs.
 
         qindex numThreads = numBatches;
-        qindex numBlocks = getNumBlocks(numThreads);
+        int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+        qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
         kernel_statevec_anyCtrlFewTargDenseMatr
             <NumCtrls, NumTargs, ApplyConj, ApplyTransp> 
-            <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+            <<<numBlocks, numThreadsPerBlock>>> (
                 ampsPtr, numThreads, 
                 qubitsPtr, nCtrls, qubitStateMask, 
                 targsPtr, matrPtr
@@ -486,6 +499,7 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
         // where we assign one-block per multiprocessor because we are anyway memory-
         // bandwidth bound (so we don't expect many interweaved blocks per MP).
         qindex numThreads = gpu_getMaxNumConcurrentThreads();
+        int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
         
         // use strictly 2^# threads to maintain precondition of all kernels
         if (!isPowerOf2(numThreads))
@@ -497,15 +511,15 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
 
         // evenly distribute the batches between threads, and the threads unevenly between blocks
         qindex numBatchesPerThread = numBatches / numThreads; // divides evenly
-        qindex numBlocks = getNumBlocks(numThreads);
+        qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
         // expand the cache if necessary
-        qindex numKernelInvocations = numBlocks * NUM_THREADS_PER_BLOCK;
+        qindex numKernelInvocations = numBlocks * numThreadsPerBlock;
         qcomp* cache = gpu_getCacheOfSize(powerOf2(targs.size()), numKernelInvocations);
 
         kernel_statevec_anyCtrlManyTargDenseMatr 
             <NumCtrls, ApplyConj, ApplyTransp> 
-            <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+            <<<numBlocks, numThreadsPerBlock>>> (
                 getGpuQcompPtr(cache),
                 ampsPtr, numThreads, numBatchesPerThread, 
                 qubitsPtr, nCtrls, qubitStateMask, 
@@ -519,7 +533,7 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
 }
 
 
-INSTANTIATE_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( void, gpu_statevec_anyCtrlAnyTargDenseMatr_sub, (Qureg, vector<int>, vector<int>, vector<int>, CompMatr) )
+INSTANTIATE_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( void, gpu_statevec_anyCtrlAnyTargDenseMatr_sub, (Qureg, ConstList64, ConstList64, ConstList64, CompMatr) )
 
 
 
@@ -529,7 +543,7 @@ INSTANTIATE_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( void, gpu_statevec_
 
 
 template <int NumCtrls> 
-void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, DiagMatr1 matr) {
+void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ, DiagMatr1 matr) {
 
     assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
 
@@ -542,7 +556,7 @@ void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vec
     // (in this function, only one) are within the suffix substate, otherwise
     // we fall back to using our custom kernels which never require comm.
 
-#if COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUQUANTUM
 
     if (util_isQubitInSuffix(targ, qureg)) {
 
@@ -550,7 +564,8 @@ void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vec
         bool conj = false;
 
         // we can pass 1D CPU .elems array directly to cuQuantum which will recognise host pointers
-        cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, ctrlStates, {targ}, getGpuQcompPtr(matr.elems), conj);
+        cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(
+            qureg, ctrls, ctrlStates, lists_getList64({targ}), getGpuQcompPtr(matr.elems), conj);
         
         // explicitly return to avoid re-simulation below
         return;
@@ -559,20 +574,21 @@ void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vec
 #endif
 
 // note preprocessors are not exclusive
-#if COMPILE_CUDA
+#if QUEST_COMPILE_CUDA
 
     /// @todo
     /// when NumCtrls==0, a Thrust functor would be undoubtedly more
     /// efficient (because of improved parallelisation granularity) 
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
-    devints deviceCtrls = util_getSorted(ctrls);
+    devints deviceCtrls = getDevInts(util_getSorted(ctrls));
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
     auto elems = getGpuQcompArray<2>(matr.elems); // explicit template for MSVC, grr!
 
-    kernel_statevec_anyCtrlOneTargDiagMatr_sub <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlOneTargDiagMatr_sub <NumCtrls> <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode,
         getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, targ, elems[0], elems[1]
     );
@@ -587,7 +603,7 @@ void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vec
 }
 
 
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlOneTargDiagMatr_sub, (Qureg, vector<int>, vector<int>, int, DiagMatr1) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlOneTargDiagMatr_sub, (Qureg, ConstList64, ConstList64, int, DiagMatr1) )
 
 
 
@@ -597,7 +613,7 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlOneTargDiagM
 
 
 template <int NumCtrls> 
-void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2, DiagMatr2 matr) {
+void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2, DiagMatr2 matr) {
 
     assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
 
@@ -610,15 +626,17 @@ void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vec
     // are both within the suffix substate, otherwise we fall back to using 
     // our custom kernels which never require comm.
 
-#if COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUQUANTUM
+
+    auto targsList = lists_getList64({targ1, targ2});
 
-    if (util_areAllQubitsInSuffix({targ1,targ2}, qureg)) {
+    if (util_areAllQubitsInSuffix(targsList, qureg)) {
 
         // we never conjugate DiagMatr2 at this level; the caller will have already conjugated
         bool conj = false;
 
         // we can pass 1D CPU array directly to cuQuantum, and it will recognise host pointers
-        cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, ctrlStates, {targ1, targ2}, getGpuQcompPtr(matr.elems), conj);
+        cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, ctrls, ctrlStates, targsList, getGpuQcompPtr(matr.elems), conj);
 
         // explicitly return to avoid re-simulation below
         return;
@@ -627,20 +645,21 @@ void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vec
 #endif 
 
 // note preprocessors are not exclusive
-#if COMPILE_CUDA
+#if QUEST_COMPILE_CUDA
 
     /// @todo
     /// when NumCtrls==0, a Thrust functor would be undoubtedly more
     /// efficient (because of improved parallelisation granularity) 
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
-    devints deviceCtrls = util_getSorted(ctrls);
+    devints deviceCtrls = getDevInts(util_getSorted(ctrls));
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
     auto elems = getGpuQcompArray<4>(matr.elems); // explicit template for MSVC, grr!
 
-    kernel_statevec_anyCtrlTwoTargDiagMatr_sub <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlTwoTargDiagMatr_sub <NumCtrls> <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode,
         getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, targ1, targ2,
         elems[0], elems[1], elems[2], elems[3]
@@ -656,7 +675,7 @@ void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vec
 }
 
 
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlTwoTargDiagMatr_sub, (Qureg, vector<int>, vector<int>, int, int, DiagMatr2) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlTwoTargDiagMatr_sub, (Qureg, ConstList64, ConstList64, int, int, DiagMatr2) )
 
 
 
@@ -666,7 +685,7 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlTwoTargDiagM
 
 
 template <int NumCtrls, int NumTargs, bool ApplyConj, bool HasPower>
-void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, DiagMatr matr, qcomp exponent) {
+void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, DiagMatr matr, qcomp exponent) {
 
     assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
     assert_numTargsMatchesTemplateParam(targs.size(), NumTargs);
@@ -682,7 +701,7 @@ void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vec
     // our custom kernels which never require comm. Furthermore, cuQuantum
     // cannot handle when exponent != 1, for which we also fallback to custom.
 
-#if COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUQUANTUM
 
     // cuQuantum cannot handle HasPower, in which case we fall back to custom kernel
     if (!HasPower && util_areAllQubitsInSuffix(targs, qureg)) {
@@ -695,20 +714,21 @@ void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vec
 #endif
 
 // note preprocessors are not exclusive
-#if COMPILE_CUDA
+#if QUEST_COMPILE_CUDA
 
     /// @todo
     /// when NumCtrls==0, a Thrust functor would be undoubtedly more
     /// efficient (because of improved parallelisation granularity) 
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
-    devints deviceTargs = targs;
-    devints deviceCtrls = util_getSorted(ctrls);
+    devints deviceTargs = getDevInts(targs);
+    devints deviceCtrls = getDevInts(util_getSorted(ctrls));
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
 
-    kernel_statevec_anyCtrlAnyTargDiagMatr_sub <NumCtrls, NumTargs, ApplyConj, HasPower> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlAnyTargDiagMatr_sub <NumCtrls, NumTargs, ApplyConj, HasPower> <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode,
         getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, getPtr(deviceTargs), targs.size(), 
         getGpuQcompPtr(util_getGpuMemPtr(matr)), getGpuQcomp(exponent)
@@ -724,7 +744,7 @@ void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vec
 }
 
 
-INSTANTIATE_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( void, gpu_statevec_anyCtrlAnyTargDiagMatr_sub, (Qureg, vector<int>, vector<int>, vector<int>, DiagMatr, qcomp) )
+INSTANTIATE_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( void, gpu_statevec_anyCtrlAnyTargDiagMatr_sub, (Qureg, ConstList64, ConstList64, ConstList64, DiagMatr, qcomp) )
 
 
 
@@ -738,7 +758,7 @@ void gpu_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp
 
     assert_exponentMatchesTemplateParam(exponent, HasPower);
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     // we always use Thrust because we are doubtful that cuQuantum's
     // diagonal-matrix facilities are optimised for the all-qubit case
@@ -756,14 +776,15 @@ void gpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp
 
     assert_exponentMatchesTemplateParam(exponent, HasPower);
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     kernel_densmatr_allTargDiagMatr_sub 
         <HasPower, ApplyLeft, ApplyRight, ConjRight> 
-        <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+        <<<numBlocks, numThreadsPerBlock>>> (
             getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode,
             getGpuQcompPtr(util_getGpuMemPtr(matr)), matr.numElems, getGpuQcomp(exponent)
     );
@@ -792,7 +813,7 @@ template void gpu_densmatr_allTargDiagMatr_sub<true,  false, true,  false> (Qure
 
 
 template <int NumCtrls, int NumTargs> 
-void gpu_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> x, vector<int> y, vector<int> z, qcomp ampFac, qcomp pairAmpFac) {
+void gpu_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 x, ConstList64 y, ConstList64 z, qcomp ampFac, qcomp pairAmpFac) {
 
     assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
     assert_numTargsMatchesTemplateParam(x.size() + y.size(), NumTargs);
@@ -804,16 +825,16 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, vector<int> ct
     // This is true even if we passed down the gadget phase to this function; cuStateVec would
     // exact amp -> a amp + b other_amp for the wrong b, which we cannot thereafter remedy.
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qcomp powI   = util_getPowerOfI(y.size());
     auto targsXY = util_getConcatenated(x, y);
     auto maskXY  = util_getBitMask(targsXY);
     auto maskYZ  = util_getBitMask(util_getConcatenated(y, z));
 
-    devints deviceTargs   = targsXY;
-    devints deviceQubits  = util_getSorted(ctrls, targsXY);
-    qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, targsXY, vector<int>(targsXY.size(),0));
+    devints deviceTargs   = getDevInts(targsXY);
+    devints deviceQubits  = getDevInts(util_getSorted(ctrls, targsXY));
+    qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, targsXY, util_getConstantList(0,targsXY.size()));
 
     // unlike the analogous cpu routine, this function has only a single parallelisation
     // granularity; where every pair-of-amps is modified by an independent thread, despite
@@ -821,8 +842,9 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, vector<int> ct
     // faster than when giving threads many pair-amps to modify, due to memory movements
 
     qindex numThreads = (qureg.numAmpsPerNode / powerOf2(ctrls.size())) / 2; // divides evenly
-    qindex numBlocks = getNumBlocks(numThreads);
-    kernel_statevector_anyCtrlPauliTensorOrGadget_subA <NumCtrls, NumTargs> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
+    kernel_statevector_anyCtrlPauliTensorOrGadget_subA <NumCtrls, NumTargs> <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads,
         getPtr(deviceQubits), ctrls.size(), qubitStateMask, 
         getPtr(deviceTargs), deviceTargs.size(),
@@ -836,24 +858,25 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, vector<int> ct
 
 
 template <int NumCtrls> 
-void gpu_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> x, vector<int> y, vector<int> z, qcomp ampFac, qcomp pairAmpFac, qindex bufferMaskXY) {
+void gpu_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 x, ConstList64 y, ConstList64 z, qcomp ampFac, qcomp pairAmpFac, qindex bufferMaskXY) {
 
     assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex recvInd = getBufferRecvInd();
 
     qcomp powI = util_getPowerOfI(y.size());
     auto maskXY = util_getBitMask(util_getConcatenated(x, y));
     auto maskYZ = util_getBitMask(util_getConcatenated(y, z));
 
-    devints sortedCtrls = util_getSorted(ctrls);
+    devints sortedCtrls = getDevInts(util_getSorted(ctrls));
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
 
-    kernel_statevector_anyCtrlPauliTensorOrGadget_subB <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevector_anyCtrlPauliTensorOrGadget_subB <NumCtrls> <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, 
         getPtr(sortedCtrls), ctrls.size(), ctrlStateMask,
         maskXY, maskYZ, bufferMaskXY,
@@ -866,8 +889,8 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, vector<int> ct
 }
 
 
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( void, gpu_statevector_anyCtrlPauliTensorOrGadget_subA, (Qureg, vector<int>, vector<int>, vector<int>, vector<int>, vector<int>, qcomp, qcomp) )
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevector_anyCtrlPauliTensorOrGadget_subB, (Qureg, vector<int>, vector<int>, vector<int>, vector<int>, vector<int>, qcomp, qcomp, qindex) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( void, gpu_statevector_anyCtrlPauliTensorOrGadget_subA, (Qureg, ConstList64, ConstList64, ConstList64, ConstList64, ConstList64, qcomp, qcomp) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevector_anyCtrlPauliTensorOrGadget_subB, (Qureg, ConstList64, ConstList64, ConstList64, ConstList64, ConstList64, qcomp, qcomp, qindex) )
 
 
 
@@ -877,20 +900,21 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevector_anyCtrlPauliTens
 
 
 template <int NumCtrls> 
-void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, qcomp fac0, qcomp fac1) {
+void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, qcomp fac0, qcomp fac1) {
 
     assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
-    devints sortedCtrls = util_getSorted(ctrls);
+    devints sortedCtrls = getDevInts(util_getSorted(ctrls));
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
     qindex targMask = util_getBitMask(targs);
 
-    kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub <NumCtrls> <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads,
         getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, targMask,
         getGpuQcomp(fac0), getGpuQcomp(fac1)
@@ -902,7 +926,7 @@ void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, vector<int> c
 }
 
 
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub, (Qureg, vector<int>, vector<int>, vector<int>, qcomp, qcomp) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub, (Qureg, ConstList64, ConstList64, ConstList64, qcomp, qcomp) )
 
 
 
@@ -914,10 +938,11 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevector_anyCtrlAnyTargZO
 template <int NumQuregs> 
 void gpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector<qcomp> coeffs, vector<Qureg> inQuregs) {
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = outQureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     // extract amp ptrs from qureg list
     vector<gpu_qcomp*> ptrs;
@@ -929,7 +954,7 @@ void gpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector<qcomp> coeffs
     devgpuqcompptrs devQuregAmps = ptrs;
     devcomps devCoeffs = coeffs;
 
-    kernel_statevec_setQuregToWeightedSum_sub <NumQuregs> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_setQuregToWeightedSum_sub <NumQuregs> <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(outQureg.gpuAmps), numThreads,
         getPtr(devCoeffs), getPtr(devQuregAmps), inQuregs.size()
     );
@@ -942,7 +967,7 @@ void gpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector<qcomp> coeffs
 
 void gpu_densmatr_mixQureg_subA(qreal outProb, Qureg outQureg, qreal inProb, Qureg inQureg) {
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     thrust_densmatr_mixQureg_subA(outProb, outQureg, inProb, inQureg);
 
@@ -954,12 +979,13 @@ void gpu_densmatr_mixQureg_subA(qreal outProb, Qureg outQureg, qreal inProb, Qur
 
 void gpu_densmatr_mixQureg_subB(qreal outProb, Qureg outQureg, qreal inProb, Qureg inQureg) {
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = outQureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
-    kernel_densmatr_mixQureg_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_mixQureg_subB <<<numBlocks, numThreadsPerBlock>>> (
         outProb, getGpuQcompPtr(outQureg.gpuAmps), inProb, getGpuQcompPtr(inQureg.gpuAmps),
         numThreads, inQureg.numAmps
     );
@@ -972,12 +998,13 @@ void gpu_densmatr_mixQureg_subB(qreal outProb, Qureg outQureg, qreal inProb, Qur
 
 void gpu_densmatr_mixQureg_subC(qreal outProb, Qureg outQureg, qreal inProb) {
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = outQureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
-    kernel_densmatr_mixQureg_subC <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_mixQureg_subC <<<numBlocks, numThreadsPerBlock>>> (
         outProb, getGpuQcompPtr(outQureg.gpuAmps), inProb, getGpuQcompPtr(outQureg.gpuCommBuffer),
         numThreads, outQureg.rank, powerOf2(outQureg.numQubits), outQureg.logNumAmpsPerNode        
     );
@@ -999,20 +1026,21 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_QUREGS( void, gpu_statevec_setQuregToWeighted
 
 void gpu_densmatr_oneQubitDephasing_subA(Qureg qureg, int ketQubit, qreal prob) {
 
-#if COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUQUANTUM
 
     // gauranteed that corresponding braQubit is in suffix, so always safe to call cuQuantum
     cuquantum_densmatr_oneQubitDephasing_subA(qureg, ketQubit, prob);
 
-#elif COMPILE_CUDA
+#elif QUEST_COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     auto fac = util_getOneQubitDephasingFactor(prob);
     int braQubit = util_getBraQubit(ketQubit, qureg);
 
-    kernel_densmatr_oneQubitDephasing_subA <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDephasing_subA <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braQubit, fac
     );
 
@@ -1024,21 +1052,22 @@ void gpu_densmatr_oneQubitDephasing_subA(Qureg qureg, int ketQubit, qreal prob)
 
 void gpu_densmatr_oneQubitDephasing_subB(Qureg qureg, int ketQubit, qreal prob) {
 
-#if COMPILE_CUQUANTUM 
+#if QUEST_COMPILE_CUQUANTUM
 
     // gauranteed that corresponding braQubit is in prefix; however, cuQuantum effects
     // the gate as a phase*Id gate on any qubit, so just picks one in suffix
     cuquantum_densmatr_oneQubitDephasing_subB(qureg, ketQubit, prob);
 
-#elif COMPILE_CUDA
+#elif QUEST_COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     auto fac = util_getOneQubitDephasingFactor(prob);
     int braBit = util_getRankBitOfBraQubit(ketQubit, qureg);
 
-    kernel_densmatr_oneQubitDephasing_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDephasing_subB <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braBit, fac
     );
 
@@ -1056,12 +1085,12 @@ void gpu_densmatr_oneQubitDephasing_subB(Qureg qureg, int ketQubit, qreal prob)
 
 void gpu_densmatr_twoQubitDephasing_subA(Qureg qureg, int ketQubitA, int ketQubitB, qreal prob) {
 
-#if COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUQUANTUM
 
     // gauranteed that both corresponding braQubits are in prefix, so safe to invoke cuQuantum
     cuquantum_densmatr_twoQubitDephasing_subA(qureg, ketQubitA, ketQubitB, prob);
 
-#elif COMPILE_CUDA
+#elif QUEST_COMPILE_CUDA
 
     // the rank-agnostic version is identical to the subB algorithm below, because the
     // queried bits of the global index i below will always be in the suffix substate.
@@ -1075,16 +1104,17 @@ void gpu_densmatr_twoQubitDephasing_subA(Qureg qureg, int ketQubitA, int ketQubi
 
 void gpu_densmatr_twoQubitDephasing_subB(Qureg qureg, int ketQubitA, int ketQubitB, qreal prob) {
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM 
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     auto term = util_getTwoQubitDephasingTerm(prob);
     int braQubitA = util_getBraQubit(ketQubitA, qureg);
     int braQubitB = util_getBraQubit(ketQubitB, qureg);
 
-    kernel_densmatr_twoQubitDephasing_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_twoQubitDephasing_subB <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, // numAmps, not numCols
         ketQubitA, ketQubitB, braQubitA, braQubitB, term
     );
@@ -1103,15 +1133,16 @@ void gpu_densmatr_twoQubitDephasing_subB(Qureg qureg, int ketQubitA, int ketQubi
 
 void gpu_densmatr_oneQubitDepolarising_subA(Qureg qureg, int ketQubit, qreal prob) {
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQubit = util_getBraQubit(ketQubit, qureg);
     auto factors = util_getOneQubitDepolarisingFactors(prob);
 
-    kernel_densmatr_oneQubitDepolarising_subA <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDepolarising_subA <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braQubit, factors.c1, factors.c2, factors.c3
     );
 
@@ -1123,16 +1154,17 @@ void gpu_densmatr_oneQubitDepolarising_subA(Qureg qureg, int ketQubit, qreal pro
 
 void gpu_densmatr_oneQubitDepolarising_subB(Qureg qureg, int ketQubit, qreal prob) {
     
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex recvInd = getBufferRecvInd();
 
     int braBit = util_getRankBitOfBraQubit(ketQubit, qureg);
     auto factors = util_getOneQubitDepolarisingFactors(prob);
 
-    kernel_densmatr_oneQubitDepolarising_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDepolarising_subB <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, 
         ketQubit, braBit, factors.c1, factors.c2, factors.c3
     );
@@ -1151,16 +1183,17 @@ void gpu_densmatr_oneQubitDepolarising_subB(Qureg qureg, int ketQubit, qreal pro
 
 void gpu_densmatr_twoQubitDepolarising_subA(Qureg qureg, int ketQb1, int ketQb2, qreal prob) {
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQb1 = util_getBraQubit(ketQb1, qureg);
     int braQb2 = util_getBraQubit(ketQb2, qureg);
     auto c3 = util_getTwoQubitDepolarisingFactors(prob).c3;
 
-    kernel_densmatr_twoQubitDepolarising_subA <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_twoQubitDepolarising_subA <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads,
         ketQb1, ketQb2, braQb1, braQb2, c3
     );
@@ -1173,10 +1206,11 @@ void gpu_densmatr_twoQubitDepolarising_subA(Qureg qureg, int ketQb1, int ketQb2,
 
 void gpu_densmatr_twoQubitDepolarising_subB(Qureg qureg, int ketQb1, int ketQb2, qreal prob) {
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 16;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQb1 = util_getBraQubit(ketQb1, qureg);
     int braQb2 = util_getBraQubit(ketQb2, qureg);
@@ -1185,7 +1219,7 @@ void gpu_densmatr_twoQubitDepolarising_subB(Qureg qureg, int ketQb1, int ketQb2,
     // each kernel invocation sums all 4 amps together, so adjusts c1
     qreal altc1 = factors.c1 - factors.c2;
 
-    kernel_densmatr_twoQubitDepolarising_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_twoQubitDepolarising_subB <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads,
         ketQb1, ketQb2, braQb1, braQb2, altc1, factors.c2
     );
@@ -1198,16 +1232,17 @@ void gpu_densmatr_twoQubitDepolarising_subB(Qureg qureg, int ketQb1, int ketQb2,
 
 void gpu_densmatr_twoQubitDepolarising_subC(Qureg qureg, int ketQb1, int ketQb2, qreal prob) {
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQb1 = util_getBraQubit(ketQb1, qureg);
     int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg);
     auto c3 = util_getTwoQubitDepolarisingFactors(prob).c3;
 
-    kernel_densmatr_twoQubitDepolarising_subC <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_twoQubitDepolarising_subC <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads,
         ketQb1, ketQb2, braQb1, braBit2, c3
     );
@@ -1220,17 +1255,18 @@ void gpu_densmatr_twoQubitDepolarising_subC(Qureg qureg, int ketQb1, int ketQb2,
 
 void gpu_densmatr_twoQubitDepolarising_subD(Qureg qureg, int ketQb1, int ketQb2, qreal prob) {
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 8;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex offset = getBufferRecvInd();
 
     int braQb1 = util_getBraQubit(ketQb1, qureg);
     int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg);
     auto factors = util_getTwoQubitDepolarisingFactors(prob);
 
-    kernel_densmatr_twoQubitDepolarising_subD <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_twoQubitDepolarising_subD <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + offset, numThreads,
         ketQb1, ketQb2, braQb1, braBit2, factors.c1, factors.c2
     );
@@ -1243,10 +1279,11 @@ void gpu_densmatr_twoQubitDepolarising_subD(Qureg qureg, int ketQb1, int ketQb2,
 
 void gpu_densmatr_twoQubitDepolarising_subE(Qureg qureg, int ketQb1, int ketQb2, qreal prob) {
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braBit1 = util_getRankBitOfBraQubit(ketQb1, qureg);
     int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg);
@@ -1255,7 +1292,7 @@ void gpu_densmatr_twoQubitDepolarising_subE(Qureg qureg, int ketQb1, int ketQb2,
     qreal fac0 = 1 + factors.c3;
     qreal fac1 = factors.c1 - fac0;
 
-    kernel_densmatr_twoQubitDepolarising_subE <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_twoQubitDepolarising_subE <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads,
         ketQb1, ketQb2, braBit1, braBit2, fac0, fac1
     );
@@ -1268,17 +1305,18 @@ void gpu_densmatr_twoQubitDepolarising_subE(Qureg qureg, int ketQb1, int ketQb2,
 
 void gpu_densmatr_twoQubitDepolarising_subF(Qureg qureg, int ketQb1, int ketQb2, qreal prob) {
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex offset = getBufferRecvInd();
 
     int braBit1 = util_getRankBitOfBraQubit(ketQb1, qureg);
     int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg);
     auto c2 = util_getTwoQubitDepolarisingFactors(prob).c2;
 
-    kernel_densmatr_twoQubitDepolarising_subF <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_twoQubitDepolarising_subF <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + offset, numThreads,
         ketQb1, ketQb2, braBit1, braBit2, c2
     );
@@ -1297,15 +1335,16 @@ void gpu_densmatr_twoQubitDepolarising_subF(Qureg qureg, int ketQb1, int ketQb2,
 
 void gpu_densmatr_oneQubitPauliChannel_subA(Qureg qureg, int ketQubit, qreal pI, qreal pX, qreal pY, qreal pZ) {
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQubit = util_getBraQubit(ketQubit, qureg);
     auto factors = util_getOneQubitPauliChannelFactors(pI, pX, pY, pZ);
 
-    kernel_densmatr_oneQubitPauliChannel_subA <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitPauliChannel_subA <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braQubit, 
         factors.c1, factors.c2, factors.c3, factors.c4
     );
@@ -1318,16 +1357,17 @@ void gpu_densmatr_oneQubitPauliChannel_subA(Qureg qureg, int ketQubit, qreal pI,
 
 void gpu_densmatr_oneQubitPauliChannel_subB(Qureg qureg, int ketQubit, qreal pI, qreal pX, qreal pY, qreal pZ) {
     
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex recvInd = getBufferRecvInd();
 
     int braBit = util_getRankBitOfBraQubit(ketQubit, qureg);
     auto factors = util_getOneQubitPauliChannelFactors(pI, pX, pY, pZ);
 
-    kernel_densmatr_oneQubitPauliChannel_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitPauliChannel_subB <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, 
         ketQubit, braBit, factors.c1, factors.c2, factors.c3, factors.c4
     );
@@ -1346,15 +1386,16 @@ void gpu_densmatr_oneQubitPauliChannel_subB(Qureg qureg, int ketQubit, qreal pI,
 
 void gpu_densmatr_oneQubitDamping_subA(Qureg qureg, int ketQubit, qreal prob) {
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQubit = util_getBraQubit(ketQubit, qureg);
     auto factors = util_getOneQubitDampingFactors(prob);
 
-    kernel_densmatr_oneQubitDamping_subA <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDamping_subA <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads,
         ketQubit, braQubit, prob, factors.c1, factors.c2
     );
@@ -1367,14 +1408,15 @@ void gpu_densmatr_oneQubitDamping_subA(Qureg qureg, int ketQubit, qreal prob) {
 
 void gpu_densmatr_oneQubitDamping_subB(Qureg qureg, int qubit, qreal prob) {
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     auto c2 = util_getOneQubitDampingFactors(prob).c2;
 
-    kernel_densmatr_oneQubitDamping_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDamping_subB <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads, qubit, c2
     );
 
@@ -1386,15 +1428,16 @@ void gpu_densmatr_oneQubitDamping_subB(Qureg qureg, int qubit, qreal prob) {
 
 void gpu_densmatr_oneQubitDamping_subC(Qureg qureg, int ketQubit, qreal prob) {
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     auto braBit = util_getRankBitOfBraQubit(ketQubit, qureg);
     auto c1 = util_getOneQubitDampingFactors(prob).c1;
 
-    kernel_densmatr_oneQubitDamping_subC <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDamping_subC <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), numThreads, ketQubit, braBit, c1
     );
 
@@ -1406,13 +1449,14 @@ void gpu_densmatr_oneQubitDamping_subC(Qureg qureg, int ketQubit, qreal prob) {
 
 void gpu_densmatr_oneQubitDamping_subD(Qureg qureg, int qubit, qreal prob) {
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     qindex recvInd = getBufferRecvInd();
 
-    kernel_densmatr_oneQubitDamping_subD <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDamping_subD <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, 
         qubit, prob
     );
@@ -1430,20 +1474,21 @@ void gpu_densmatr_oneQubitDamping_subD(Qureg qureg, int qubit, qreal prob) {
 
 
 template <int NumTargs> 
-void gpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, vector<int> targs, vector<int> pairTargs) {
+void gpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, ConstList64 targs, ConstList64 pairTargs) {
 
     assert_numTargsMatchesTemplateParam(targs.size(), NumTargs);
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qindex numThreads = outQureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
-    devints devTargs = targs;
-    devints devPairTargs = pairTargs;
-    devints devAllTargs = util_getSorted(targs, pairTargs);
+    devints devTargs = getDevInts(targs);
+    devints devPairTargs = getDevInts(pairTargs);
+    devints devAllTargs = getDevInts(util_getSorted(targs, pairTargs));
 
-    kernel_densmatr_partialTrace_sub <NumTargs> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_partialTrace_sub <NumTargs> <<<numBlocks, numThreadsPerBlock>>> (
         getGpuQcompPtr(inQureg.gpuAmps), getGpuQcompPtr(outQureg.gpuAmps), numThreads,
         getPtr(devTargs), getPtr(devPairTargs), getPtr(devAllTargs), targs.size()
     );
@@ -1454,7 +1499,7 @@ void gpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, vector<int> ta
 }
 
 
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, gpu_densmatr_partialTrace_sub, (Qureg, Qureg, vector<int>, vector<int>) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, gpu_densmatr_partialTrace_sub, (Qureg, Qureg, ConstList64, ConstList64) )
 
 
 
@@ -1465,10 +1510,10 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, gpu_densmatr_partialTrace_sub, (
 
 qreal gpu_statevec_calcTotalProb_sub(Qureg qureg) {
 
-#if COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUQUANTUM
     return cuquantum_statevec_calcTotalProb_sub(qureg);
 
-#elif COMPILE_CUDA
+#elif QUEST_COMPILE_CUDA
     return thrust_statevec_calcTotalProb_sub(qureg);
 
 #else
@@ -1480,7 +1525,7 @@ qreal gpu_statevec_calcTotalProb_sub(Qureg qureg) {
 
 qreal gpu_densmatr_calcTotalProb_sub(Qureg qureg) {
 
-#if COMPILE_CUQUANTUM || COMPILE_CUDA
+#if QUEST_COMPILE_CUQUANTUM || QUEST_COMPILE_CUDA
     return thrust_densmatr_calcTotalProb_sub(qureg);
 
 #else
@@ -1491,16 +1536,16 @@ qreal gpu_densmatr_calcTotalProb_sub(Qureg qureg) {
 
 
 template <int NumQubits> 
-qreal gpu_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes) {
+qreal gpu_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes) {
 
     assert_numTargsMatchesTemplateParam(qubits.size(), NumQubits);
 
-#if COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUQUANTUM
 
     // cuQuantum disregards NumQubits compile-time param
     return cuquantum_statevec_calcProbOfMultiQubitOutcome_sub(qureg, qubits, outcomes);
 
-#elif COMPILE_CUDA 
+#elif QUEST_COMPILE_CUDA
 
     return thrust_statevec_calcProbOfMultiQubitOutcome_sub<NumQubits>(qureg, qubits, outcomes);
 
@@ -1512,11 +1557,11 @@ qreal gpu_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> qubi
 
 
 template <int NumQubits> 
-qreal gpu_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes) {
+qreal gpu_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes) {
 
     assert_numTargsMatchesTemplateParam(qubits.size(), NumQubits);
 
-#if COMPILE_CUQUANTUM || COMPILE_CUDA 
+#if QUEST_COMPILE_CUQUANTUM || QUEST_COMPILE_CUDA
 
     return thrust_densmatr_calcProbOfMultiQubitOutcome_sub<NumQubits>(qureg, qubits, outcomes);
 
@@ -1528,11 +1573,11 @@ qreal gpu_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> qubi
 
 
 template <int NumQubits> 
-void gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, vector<int> qubits) {
+void gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, ConstList64 qubits) {
 
     assert_numTargsMatchesTemplateParam(qubits.size(), NumQubits);
 
-#if COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUQUANTUM
 
     /// @todo
     /// cuQuantum assumes all qubits are local (since it does not consult rank) 
@@ -1554,16 +1599,17 @@ void gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu
 #endif
 
 // note preprocessors are not exclusive
-#if COMPILE_CUDA
+#if QUEST_COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     // allocate exponentially-big temporary memory (error if failed)
-    devints devQubits = qubits;
+    devints devQubits = getDevInts(qubits);
     devreals devProbs = getDeviceRealsVec(powerOf2(qubits.size())); // throws
 
-    kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub<NumQubits> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub<NumQubits> <<<numBlocks, numThreadsPerBlock>>> (
         getPtr(devProbs), getGpuQcompPtr(qureg.gpuAmps), numThreads, 
         qureg.rank, qureg.logNumAmpsPerNode, getPtr(devQubits), devQubits.size()
     );
@@ -1582,25 +1628,26 @@ void gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu
 
 
 template <int NumQubits> 
-void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, vector<int> qubits) {
+void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, ConstList64 qubits) {
 
     assert_numTargsMatchesTemplateParam(qubits.size(), NumQubits);
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     // we decouple numColsPerNode and numThreads for clarity
     // (and in case parallelisation granularity ever changes);
     qindex numThreads = powerOf2(qureg.logNumColsPerNode);
-    qindex numBlocks = getNumBlocks(numThreads);
+    int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
     
     qindex firstDiagInd = util_getLocalIndexOfFirstDiagonalAmp(qureg);
     qindex numAmpsPerCol = powerOf2(qureg.numQubits);
 
     // allocate exponentially-big temporary memory (error if failed)
-    devints devQubits = qubits;
+    devints devQubits = getDevInts(qubits);
     devreals devProbs = getDeviceRealsVec(powerOf2(qubits.size())); // throws
 
-    kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub<NumQubits> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub<NumQubits> <<<numBlocks, numThreadsPerBlock>>> (
         getPtr(devProbs), getGpuQcompPtr(qureg.gpuAmps), 
         numThreads, firstDiagInd, numAmpsPerCol,
         qureg.rank, qureg.logNumAmpsPerNode, 
@@ -1616,11 +1663,11 @@ void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu
 }
 
 
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qreal, gpu_statevec_calcProbOfMultiQubitOutcome_sub, (Qureg, vector<int>, vector<int>) )
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qreal, gpu_densmatr_calcProbOfMultiQubitOutcome_sub, (Qureg, vector<int>, vector<int>) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qreal, gpu_statevec_calcProbOfMultiQubitOutcome_sub, (Qureg, ConstList64, ConstList64) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( qreal, gpu_densmatr_calcProbOfMultiQubitOutcome_sub, (Qureg, ConstList64, ConstList64) )
 
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub, (qreal* outProbs, Qureg, vector<int>) )
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub, (qreal* outProbs, Qureg, vector<int>) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub, (qreal* outProbs, Qureg, ConstList64) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub, (qreal* outProbs, Qureg, ConstList64) )
 
 
 
@@ -1631,7 +1678,7 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, gpu_densmatr_calcProbsOfAllMulti
 
 qcomp gpu_statevec_calcInnerProduct_sub(Qureg quregA, Qureg quregB) {
 
-#if COMPILE_CUQUANTUM || COMPILE_CUDA
+#if QUEST_COMPILE_CUQUANTUM || QUEST_COMPILE_CUDA
 
     gpu_qcomp prod = thrust_statevec_calcInnerProduct_sub(quregA, quregB);
     return getQcomp(prod);
@@ -1645,7 +1692,7 @@ qcomp gpu_statevec_calcInnerProduct_sub(Qureg quregA, Qureg quregB) {
 
 qreal gpu_densmatr_calcHilbertSchmidtDistance_sub(Qureg quregA, Qureg quregB) {
 
-#if COMPILE_CUQUANTUM || COMPILE_CUDA
+#if QUEST_COMPILE_CUQUANTUM || QUEST_COMPILE_CUDA
 
     return thrust_densmatr_calcHilbertSchmidtDistance_sub(quregA, quregB);
 
@@ -1659,7 +1706,7 @@ qreal gpu_densmatr_calcHilbertSchmidtDistance_sub(Qureg quregA, Qureg quregB) {
 template <bool Conj>
 qcomp gpu_densmatr_calcFidelityWithPureState_sub(Qureg rho, Qureg psi) {
 
-#if COMPILE_CUQUANTUM || COMPILE_CUDA
+#if QUEST_COMPILE_CUQUANTUM || QUEST_COMPILE_CUDA
 
     gpu_qcomp fid = thrust_densmatr_calcFidelityWithPureState_sub<Conj>(rho, psi);
     return getQcomp(fid);
@@ -1681,13 +1728,13 @@ template qcomp gpu_densmatr_calcFidelityWithPureState_sub<false>(Qureg, Qureg);
  */
 
 
-qreal gpu_statevec_calcExpecAnyTargZ_sub(Qureg qureg, vector<int> targs) {
+qreal gpu_statevec_calcExpecAnyTargZ_sub(Qureg qureg, ConstList64 targs) {
 
-#if COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUQUANTUM
 
     return cuquantum_statevec_calcExpecAnyTargZ_sub(qureg, targs);
 
-#elif COMPILE_CUDA
+#elif QUEST_COMPILE_CUDA
 
     return thrust_statevec_calcExpecAnyTargZ_sub(qureg, targs);
 
@@ -1698,9 +1745,9 @@ qreal gpu_statevec_calcExpecAnyTargZ_sub(Qureg qureg, vector<int> targs) {
 }
 
 
-qcomp gpu_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, vector<int> targs) {
+qcomp gpu_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, ConstList64 targs) {
 
-#if COMPILE_CUQUANTUM || COMPILE_CUDA
+#if QUEST_COMPILE_CUQUANTUM || QUEST_COMPILE_CUDA
 
     gpu_qcomp value = thrust_densmatr_calcExpecAnyTargZ_sub(qureg, targs);
     return getQcomp(value);
@@ -1712,13 +1759,13 @@ qcomp gpu_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, vector<int> targs) {
 }
 
 
-qcomp gpu_statevec_calcExpecPauliStr_subA(Qureg qureg, vector<int> x, vector<int> y, vector<int> z) {
+qcomp gpu_statevec_calcExpecPauliStr_subA(Qureg qureg, ConstList64 x, ConstList64 y, ConstList64 z) {
 
-#if COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUQUANTUM
 
     return cuquantum_statevec_calcExpecPauliStr_subA(qureg, x, y, z);
 
-#elif COMPILE_CUDA
+#elif QUEST_COMPILE_CUDA
 
     gpu_qcomp value = thrust_statevec_calcExpecPauliStr_subA(qureg, x, y, z);
     return getQcomp(value);
@@ -1730,9 +1777,9 @@ qcomp gpu_statevec_calcExpecPauliStr_subA(Qureg qureg, vector<int> x, vector<int
 }
 
 
-qcomp gpu_statevec_calcExpecPauliStr_subB(Qureg qureg, vector<int> x, vector<int> y, vector<int> z) {
+qcomp gpu_statevec_calcExpecPauliStr_subB(Qureg qureg, ConstList64 x, ConstList64 y, ConstList64 z) {
 
-#if COMPILE_CUQUANTUM || COMPILE_CUDA
+#if QUEST_COMPILE_CUQUANTUM || QUEST_COMPILE_CUDA
 
     gpu_qcomp value = thrust_statevec_calcExpecPauliStr_subB(qureg, x, y, z);
     return getQcomp(value);
@@ -1744,9 +1791,9 @@ qcomp gpu_statevec_calcExpecPauliStr_subB(Qureg qureg, vector<int> x, vector<int
 }
 
 
-qcomp gpu_densmatr_calcExpecPauliStr_sub(Qureg qureg, vector<int> x, vector<int> y, vector<int> z) {
+qcomp gpu_densmatr_calcExpecPauliStr_sub(Qureg qureg, ConstList64 x, ConstList64 y, ConstList64 z) {
     
-#if COMPILE_CUQUANTUM || COMPILE_CUDA
+#if QUEST_COMPILE_CUQUANTUM || QUEST_COMPILE_CUDA
 
     gpu_qcomp value = thrust_densmatr_calcExpecPauliStr_sub(qureg, x, y, z);
     return getQcomp(value);
@@ -1768,7 +1815,7 @@ template <bool HasPower, bool UseRealPow>
 qcomp gpu_statevec_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent) {
     assert_exponentMatchesTemplateParam(exponent, HasPower, UseRealPow);
 
-#if COMPILE_CUQUANTUM || COMPILE_CUDA
+#if QUEST_COMPILE_CUQUANTUM || QUEST_COMPILE_CUDA
 
     gpu_qcomp expo = getGpuQcomp(exponent);
     gpu_qcomp value = thrust_statevec_calcExpecFullStateDiagMatr_sub<HasPower,UseRealPow>(qureg, matr, expo);
@@ -1785,7 +1832,7 @@ template <bool HasPower, bool UseRealPow>
 qcomp gpu_densmatr_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent) {
     assert_exponentMatchesTemplateParam(exponent, HasPower, UseRealPow);
 
-#if COMPILE_CUQUANTUM || COMPILE_CUDA
+#if QUEST_COMPILE_CUQUANTUM || QUEST_COMPILE_CUDA
 
     gpu_qcomp expo = getGpuQcomp(exponent);
     gpu_qcomp value = thrust_densmatr_calcExpecFullStateDiagMatr_sub<HasPower,UseRealPow>(qureg, matr, expo);
@@ -1815,17 +1862,17 @@ template qcomp gpu_densmatr_calcExpecFullStateDiagMatr_sub<false,true >(Qureg, F
 
 
 template <int NumQubits> 
-void gpu_statevec_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal prob) {
+void gpu_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) {
 
     // all qubits are in suffix
     assert_numTargsMatchesTemplateParam(qubits.size(), NumQubits);
 
-#if COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUQUANTUM
 
     // cuQuantum disregards NumQubits template param
     cuquantum_statevec_multiQubitProjector_sub(qureg, qubits, outcomes, prob);
 
-#elif COMPILE_CUDA
+#elif QUEST_COMPILE_CUDA
 
     qreal renorm = 1 / std::sqrt(prob);
     thrust_statevec_multiQubitProjector_sub<NumQubits>(qureg, qubits, outcomes, renorm);
@@ -1837,12 +1884,12 @@ void gpu_statevec_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, vecto
 
 
 template <int NumQubits> 
-void gpu_densmatr_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal prob) {
+void gpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) {
 
     // qubits are unconstrained, and can include prefix qubits
     assert_numTargsMatchesTemplateParam(qubits.size(), NumQubits);
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     qreal renorm = 1 / prob;
     thrust_densmatr_multiQubitProjector_sub<NumQubits>(qureg, qubits, outcomes, renorm);
@@ -1853,8 +1900,8 @@ void gpu_densmatr_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, vecto
 }
 
 
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, gpu_statevec_multiQubitProjector_sub, (Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal prob) )
-INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, gpu_densmatr_multiQubitProjector_sub, (Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal prob) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, gpu_statevec_multiQubitProjector_sub, (Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) )
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, gpu_densmatr_multiQubitProjector_sub, (Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob) )
 
 
 
@@ -1864,7 +1911,7 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, gpu_densmatr_multiQubitProjector
 
 
 void gpu_statevec_initUniformState_sub(Qureg qureg, qcomp amp) {
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     thrust_statevec_initUniformState(qureg, getGpuQcomp(amp));
 
@@ -1875,7 +1922,7 @@ void gpu_statevec_initUniformState_sub(Qureg qureg, qcomp amp) {
 
 
 void gpu_statevec_initDebugState_sub(Qureg qureg) {
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     thrust_statevec_initDebugState_sub(qureg);
 
@@ -1887,7 +1934,7 @@ void gpu_statevec_initDebugState_sub(Qureg qureg) {
 
 void gpu_statevec_initUnnormalisedUniformlyRandomPureStateAmps_sub(Qureg qureg) {
 
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
+#if QUEST_COMPILE_CUDA || QUEST_COMPILE_CUQUANTUM
 
     thrust_statevec_initUnnormalisedUniformlyRandomPureStateAmps_sub(qureg);
 
diff --git a/quest/src/gpu/gpu_subroutines.hpp b/quest/src/gpu/gpu_subroutines.hpp
index ff42c2239..029e0e871 100644
--- a/quest/src/gpu/gpu_subroutines.hpp
+++ b/quest/src/gpu/gpu_subroutines.hpp
@@ -12,6 +12,8 @@
 #include "quest/include/paulis.h"
 #include "quest/include/matrices.h"
 
+#include "quest/src/core/lists.hpp"
+
 #include <vector>
 
 using std::vector;
@@ -37,7 +39,7 @@ void gpu_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, PauliStr
  * COMMUNICATION BUFFER PACKING
  */
 
-template <int NumQubits> qindex gpu_statevec_packAmpsIntoBuffer(Qureg qureg, vector<int> qubits, vector<int> qubitStates);
+template <int NumQubits> qindex gpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstList64 qubitStates);
 
 qindex gpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2);
 
@@ -46,32 +48,32 @@ qindex gpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu
  * SWAPS
  */
 
-template <int NumCtrls> void gpu_statevec_anyCtrlSwap_subA(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2);
-template <int NumCtrls> void gpu_statevec_anyCtrlSwap_subB(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates);
-template <int NumCtrls> void gpu_statevec_anyCtrlSwap_subC(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, int targState);
+template <int NumCtrls> void gpu_statevec_anyCtrlSwap_subA(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2);
+template <int NumCtrls> void gpu_statevec_anyCtrlSwap_subB(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates);
+template <int NumCtrls> void gpu_statevec_anyCtrlSwap_subC(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ, int targState);
 
 
 /*
  * DENSE MATRIX
  */
 
-template <int NumCtrls> void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, CompMatr1 matr);
-template <int NumCtrls> void gpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, qcomp fac0, qcomp fac1);
+template <int NumCtrls> void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ, CompMatr1 matr);
+template <int NumCtrls> void gpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, qcomp fac0, qcomp fac1);
 
-template <int NumCtrls> void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2, CompMatr2 matr);
+template <int NumCtrls> void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2, CompMatr2 matr);
 
-template <int NumCtrls, int NumTargs, bool ApplyConj, bool ApplyTransp> void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr matr);
+template <int NumCtrls, int NumTargs, bool ApplyConj, bool ApplyTransp> void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, CompMatr matr);
 
 
 /*
  * DIAGONAL MATRIX
  */
 
-template <int NumCtrls> void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, DiagMatr1 matr);
+template <int NumCtrls> void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ, DiagMatr1 matr);
 
-template <int NumCtrls> void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2, DiagMatr2 matr);
+template <int NumCtrls> void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, int targ1, int targ2, DiagMatr2 matr);
 
-template <int NumCtrls, int NumTargs, bool ApplyConj, bool HasPower> void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, DiagMatr matr, qcomp exponent);
+template <int NumCtrls, int NumTargs, bool ApplyConj, bool HasPower> void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, DiagMatr matr, qcomp exponent);
 
 template <bool HasPower> void gpu_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent);
 
@@ -82,11 +84,11 @@ template <bool HasPower, bool ApplyLeft, bool ApplyRight, bool ConjRight> void g
  * PAULI TENSOR AND GADGET
  */
 
-template <int NumCtrls, int NumTargs> void gpu_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> x, vector<int> y, vector<int> z, qcomp ampFac, qcomp pairAmpFac);
+template <int NumCtrls, int NumTargs> void gpu_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 x, ConstList64 y, ConstList64 z, qcomp ampFac, qcomp pairAmpFac);
 
-template <int NumCtrls> void gpu_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> x, vector<int> y, vector<int> z, qcomp ampFac, qcomp pairAmpFac, qindex bufferMaskXY);
+template <int NumCtrls> void gpu_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 x, ConstList64 y, ConstList64 z, qcomp ampFac, qcomp pairAmpFac, qindex bufferMaskXY);
 
-template <int NumCtrls> void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, qcomp fac0, qcomp fac1);
+template <int NumCtrls> void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, ConstList64 ctrls, ConstList64 ctrlStates, ConstList64 targs, qcomp fac0, qcomp fac1);
 
 
 /*
@@ -133,7 +135,7 @@ void gpu_densmatr_oneQubitDamping_subD(Qureg qureg, int qubit, qreal prob);
  * PARTIAL TRACE
  */
 
-template <int NumTargs> void gpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, vector<int> targs, vector<int> pairTargs);
+template <int NumTargs> void gpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, ConstList64 targs, ConstList64 pairTargs);
 
 
 /*
@@ -143,11 +145,11 @@ template <int NumTargs> void gpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg
 qreal gpu_statevec_calcTotalProb_sub(Qureg qureg);
 qreal gpu_densmatr_calcTotalProb_sub(Qureg qureg);
 
-template <int NumQubits> qreal gpu_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes);
-template <int NumQubits> qreal gpu_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes);
+template <int NumQubits> qreal gpu_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes);
+template <int NumQubits> qreal gpu_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes);
 
-template <int NumQubits> void gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, vector<int> qubits);
-template <int NumQubits> void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, vector<int> qubits);
+template <int NumQubits> void gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, ConstList64 qubits);
+template <int NumQubits> void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, ConstList64 qubits);
 
 
 /*
@@ -165,13 +167,13 @@ template <bool Conj> qcomp gpu_densmatr_calcFidelityWithPureState_sub(Qureg rho,
  * EXPECTATION VALUES
  */
 
-qreal gpu_statevec_calcExpecAnyTargZ_sub(Qureg qureg, vector<int> targs);
-qcomp gpu_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, vector<int> targs);
+qreal gpu_statevec_calcExpecAnyTargZ_sub(Qureg qureg, ConstList64 targs);
+qcomp gpu_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, ConstList64 targs);
 
 
-qcomp gpu_statevec_calcExpecPauliStr_subA(Qureg qureg, vector<int> x, vector<int> y, vector<int> z);
-qcomp gpu_statevec_calcExpecPauliStr_subB(Qureg qureg, vector<int> x, vector<int> y, vector<int> z);
-qcomp gpu_densmatr_calcExpecPauliStr_sub (Qureg qureg, vector<int> x, vector<int> y, vector<int> z);
+qcomp gpu_statevec_calcExpecPauliStr_subA(Qureg qureg, ConstList64 x, ConstList64 y, ConstList64 z);
+qcomp gpu_statevec_calcExpecPauliStr_subB(Qureg qureg, ConstList64 x, ConstList64 y, ConstList64 z);
+qcomp gpu_densmatr_calcExpecPauliStr_sub (Qureg qureg, ConstList64 x, ConstList64 y, ConstList64 z);
 
 template <bool HasPower, bool UseRealPow> qcomp gpu_statevec_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent);
 template <bool HasPower, bool UseRealPow> qcomp gpu_densmatr_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent);
@@ -181,8 +183,8 @@ template <bool HasPower, bool UseRealPow> qcomp gpu_densmatr_calcExpecFullStateD
  * PROJECTORS
  */
 
-template <int NumQubits> void gpu_statevec_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal prob);
-template <int NumQubits> void gpu_densmatr_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal prob);
+template <int NumQubits> void gpu_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob);
+template <int NumQubits> void gpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal prob);
 
 
 /*
diff --git a/quest/src/gpu/gpu_thrust.cuh b/quest/src/gpu/gpu_thrust.cuh
index 07a650547..864cca5f8 100644
--- a/quest/src/gpu/gpu_thrust.cuh
+++ b/quest/src/gpu/gpu_thrust.cuh
@@ -1,6 +1,6 @@
 /** @file
  * Subroutines which invoke Thrust. This file is only ever included
- * when COMPILE_CUDA=1 so it can safely invoke CUDA signatures without 
+ * when QUEST_COMPILE_CUDA=1 so it can safely invoke CUDA signatures without
  * guards. Further, as it is entirely a header, it can declare templated
  * times without explicitly instantiating them across all parameter values.
  * 
@@ -24,7 +24,7 @@
 // obtain preprocessors from config.h prior to validation
 #include "quest/include/config.h"
 
-#if ! COMPILE_CUDA
+#if ! QUEST_COMPILE_CUDA
     #error "A file being compiled somehow included gpu_thrust.hpp despite QuEST not being compiled in GPU-accelerated mode."
 #endif
 
@@ -37,6 +37,7 @@
 #include "quest/src/core/errors.hpp"
 #include "quest/src/core/bitwise.hpp"
 #include "quest/src/core/constants.hpp"
+#include "quest/src/core/lists.hpp"
 #include "quest/src/core/utilities.hpp"
 #include "quest/src/core/randomiser.hpp"
 #include "quest/src/core/fastmath.hpp"
@@ -64,12 +65,22 @@
  * copy constructor (devicevec d_vec = hostvec). The pointer 
  * to the data (d_vec.data()) can be cast into a raw pointer
  * and passed directly to CUDA kernels (though qcomp must be
- * reinterpreted to gpu_qcomp)
+ * reinterpreted to gpu_qcomp).
  */
 
 
 using devints = thrust::device_vector<int>;
 
+devints getDevInts(ConstList64 h_list) {
+
+    // Copies the host list into a new thrust device vector via a temporary std::vector<int>.
+    // DEBUG: this is a placeholder! James' GPU refactor should make it redundant, letting us
+    // pass List64 directly to a CUDA kernel, paying no heap allocs, nor CUDA memcpy costs
+
+    devints d_list = std::vector<int>(h_list.data(), h_list.data() + h_list.size());
+    return d_list;
+}
+
 int* getPtr(devints& qubits) {
 
     return thrust::raw_pointer_cast(qubits.data());
@@ -779,9 +790,9 @@ qreal thrust_densmatr_calcTotalProb_sub(Qureg qureg) {
 
 
 template <int NumQubits>
-qreal thrust_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes) {
+qreal thrust_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes) {
 
-    devints sortedQubits = util_getSorted(qubits);
+    devints sortedQubits = getDevInts(util_getSorted(qubits));
     qindex valueMask = util_getBitMask(qubits, outcomes);
 
     auto indFunctor = functor_insertBits<NumQubits>(getPtr(sortedQubits), valueMask, qubits.size());
@@ -799,11 +810,11 @@ qreal thrust_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> q
 
 
 template <int NumQubits>
-qreal thrust_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes) {
+qreal thrust_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes) {
 
     // cannot move these into functor_insertBits constructor, since the memory
     // would dangle - and we cannot bind deviceints as an attribute - it's host-only!
-    devints sortedQubits = util_getSorted(qubits);
+    devints sortedQubits = getDevInts(util_getSorted(qubits));
     qindex valueMask = util_getBitMask(qubits, outcomes);
 
     auto basisIndFunctor = functor_insertBits<NumQubits>(getPtr(sortedQubits), valueMask, qubits.size());
@@ -878,7 +889,7 @@ gpu_qcomp thrust_densmatr_calcFidelityWithPureState_sub(Qureg rho, Qureg psi) {
  */
 
 
-qreal thrust_statevec_calcExpecAnyTargZ_sub(Qureg qureg, vector<int> targs) {
+qreal thrust_statevec_calcExpecAnyTargZ_sub(Qureg qureg, ConstList64 targs) {
 
     qindex mask = util_getBitMask(targs);
     auto functor = functor_getExpecStateVecZTerm(mask);
@@ -893,7 +904,7 @@ qreal thrust_statevec_calcExpecAnyTargZ_sub(Qureg qureg, vector<int> targs) {
 }
 
 
-gpu_qcomp thrust_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, vector<int> targs) {
+gpu_qcomp thrust_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, ConstList64 targs) {
 
     qindex dim = powerOf2(qureg.numQubits);
     qindex ind = util_getLocalIndexOfFirstDiagonalAmp(qureg);
@@ -908,7 +919,7 @@ gpu_qcomp thrust_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, vector<int> targs)
 }
 
 
-gpu_qcomp thrust_statevec_calcExpecPauliStr_subA(Qureg qureg, vector<int> x, vector<int> y, vector<int> z) {
+gpu_qcomp thrust_statevec_calcExpecPauliStr_subA(Qureg qureg, ConstList64 x, ConstList64 y, ConstList64 z) {
 
     qindex maskXY = util_getBitMask(util_getConcatenated(x, y));
     qindex maskYZ = util_getBitMask(util_getConcatenated(y, z));
@@ -925,7 +936,7 @@ gpu_qcomp thrust_statevec_calcExpecPauliStr_subA(Qureg qureg, vector<int> x, vec
 }
 
 
-gpu_qcomp thrust_statevec_calcExpecPauliStr_subB(Qureg qureg, vector<int> x, vector<int> y, vector<int> z) {
+gpu_qcomp thrust_statevec_calcExpecPauliStr_subB(Qureg qureg, ConstList64 x, ConstList64 y, ConstList64 z) {
 
     qindex maskXY = util_getBitMask(util_getConcatenated(x, y));
     qindex maskYZ = util_getBitMask(util_getConcatenated(y, z));
@@ -943,7 +954,7 @@ gpu_qcomp thrust_statevec_calcExpecPauliStr_subB(Qureg qureg, vector<int> x, vec
 }
 
 
-gpu_qcomp thrust_densmatr_calcExpecPauliStr_sub(Qureg qureg, vector<int> x, vector<int> y, vector<int> z) {
+gpu_qcomp thrust_densmatr_calcExpecPauliStr_sub(Qureg qureg, ConstList64 x, ConstList64 y, ConstList64 z) {
 
     qindex mXY = util_getBitMask(util_getConcatenated(x, y));
     qindex mYZ = util_getBitMask(util_getConcatenated(y, z));
@@ -1005,9 +1016,9 @@ gpu_qcomp thrust_densmatr_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateD
 
 
 template <int NumQubits>
-void thrust_statevec_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal renorm) {
+void thrust_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal renorm) {
 
-    devints devQubits = qubits;
+    devints devQubits = getDevInts(qubits);
     qindex retainValue = getIntegerFromBits(outcomes.data(), outcomes.size());
     auto projFunctor = functor_projectStateVec<NumQubits>(
         getPtr(devQubits), qubits.size(), retainValue, renorm);
@@ -1021,9 +1032,9 @@ void thrust_statevec_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, ve
 
 
 template <int NumQubits>
-void thrust_densmatr_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal renorm) {
+void thrust_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, ConstList64 outcomes, qreal renorm) {
 
-    devints devQubits = qubits;
+    devints devQubits = getDevInts(qubits);
     qindex retainValue = getIntegerFromBits(outcomes.data(), outcomes.size());
     auto projFunctor = functor_projectDensMatr<NumQubits>(
         getPtr(devQubits), qubits.size(), qureg.rank, qureg.numQubits,
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 50fc50bfc..4d5050e51 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,4 +1,5 @@
 # @author Oliver Thomson Brown
+# @author Tyson Jones (patched MSVC and test discovery)
 
 add_executable(tests
   main.cpp
@@ -6,6 +7,10 @@ add_executable(tests
 target_link_libraries(tests PRIVATE QuEST::QuEST Catch2::Catch2)
 target_compile_features(tests PUBLIC cxx_std_20)
 
+if (QUEST_ENABLE_MPI AND QUEST_ENABLE_SUBCOMM)
+  target_link_libraries(tests PRIVATE MPI::MPI_CXX)
+endif()
+
 # extend the MSVC max object size
 if (MSVC)
   target_compile_options(tests PRIVATE /bigobj)
@@ -15,9 +20,9 @@ add_subdirectory(unit)
 add_subdirectory(utils)
 add_subdirectory(integration)
 
-if (ENABLE_DEPRECATED_API)
+if (QUEST_ENABLE_DEPRECATED_API)
   add_subdirectory(deprecated)
 endif()
 
-# let Catch2 register all tests with CTest
-catch_discover_tests(tests)
+# defer test discovery, so that (e.g.) MPI libs aren't loaded during build
+catch_discover_tests(tests DISCOVERY_MODE PRE_TEST)
diff --git a/tests/deprecated/CMakeLists.txt b/tests/deprecated/CMakeLists.txt
index f9132c74a..570561332 100644
--- a/tests/deprecated/CMakeLists.txt
+++ b/tests/deprecated/CMakeLists.txt
@@ -1,5 +1,6 @@
 # @author Oliver Thomson Brown
 # @author Erich Essmann (patched MPI)
+# @author Tyson Jones (deferred test discovery)
 
 add_executable(dep_tests
     test_main.cpp
@@ -14,8 +15,8 @@ add_executable(dep_tests
 )
 target_link_libraries(dep_tests PUBLIC QuEST::QuEST Catch2::Catch2)
 
-if (ENABLE_DISTRIBUTION)
+if (QUEST_ENABLE_MPI)
     target_link_libraries(dep_tests PRIVATE MPI::MPI_CXX)
 endif()
 
-catch_discover_tests(dep_tests)
\ No newline at end of file
+catch_discover_tests(dep_tests DISCOVERY_MODE PRE_TEST)
\ No newline at end of file
diff --git a/tests/deprecated/test_calculations.cpp b/tests/deprecated/test_calculations.cpp
index 0f02a6dea..9ce963fb2 100644
--- a/tests/deprecated/test_calculations.cpp
+++ b/tests/deprecated/test_calculations.cpp
@@ -386,9 +386,9 @@ TEST_CASE( "calcExpecPauliProd", "[calculations]" ) {
             // (get real, since we start in a non-Hermitian state, hence diagonal isn't real)
 
             // disable validation during call, because result is non-real and will upset post-check
-            setValidationOff();
+            setQuESTValidationOff();
             qreal res = calcExpecPauliProd(mat, targs, paulis.data(), numTargs, matWork);
-            setValidationOn();
+            setQuESTValidationOn();
 
             REQUIRE( res == Approx(tr).margin(10*REAL_EPS) );
         }
diff --git a/tests/deprecated/test_decoherence.cpp b/tests/deprecated/test_decoherence.cpp
index edf1d9f61..d4a626d47 100644
--- a/tests/deprecated/test_decoherence.cpp
+++ b/tests/deprecated/test_decoherence.cpp
@@ -32,7 +32,7 @@ using std::vector;
     initDebugState(qureg); \
     QMatrix ref = toQMatrix(qureg); \
     assertQuregAndRefInDebugState(qureg, ref); \
-    setValidationEpsilon(REAL_EPS);
+    setQuESTValidationEpsilon(REAL_EPS);
 
 /* allows concise use of ContainsSubstring in catch's REQUIRE_THROWS_WITH */
 using Catch::Matchers::ContainsSubstring;
diff --git a/tests/deprecated/test_main.cpp b/tests/deprecated/test_main.cpp
index 35ba37477..628a9c8d8 100644
--- a/tests/deprecated/test_main.cpp
+++ b/tests/deprecated/test_main.cpp
@@ -41,7 +41,7 @@ extern "C" void validationErrorHandler(const char* errFunc, const char* errMsg)
 int main(int argc, char* argv[]) {
 
   initQuESTEnv();
-  setInputErrorHandler(validationErrorHandler);
+  setQuESTInputErrorHandler(validationErrorHandler);
   setRandomTestStateSeeds();
 
   int result = Catch::Session().run( argc, argv );
diff --git a/tests/deprecated/test_unitaries.cpp b/tests/deprecated/test_unitaries.cpp
index f0bb2f5aa..6cfd9e803 100644
--- a/tests/deprecated/test_unitaries.cpp
+++ b/tests/deprecated/test_unitaries.cpp
@@ -31,13 +31,13 @@
     QMatrix refMatr = toQMatrix(quregMatr); \
     assertQuregAndRefInDebugState(quregVec, refVec); \
     assertQuregAndRefInDebugState(quregMatr, refMatr); \
-    setValidationEpsilon(REAL_EPS);
+    setQuESTValidationEpsilon(REAL_EPS);
 
 /** Destroys the data structures made by PREPARE_TEST */
 #define CLEANUP_TEST(quregVec, quregMatr) \
     destroyQureg(quregVec); \
     destroyQureg(quregMatr); \
-    setValidationEpsilon(REAL_EPS);
+    setQuESTValidationEpsilon(REAL_EPS);
 
 /* allows concise use of ContainsSubstring in catch's REQUIRE_THROWS_WITH */
 using Catch::Matchers::ContainsSubstring;
diff --git a/tests/deprecated/test_utilities.cpp b/tests/deprecated/test_utilities.cpp
index 81be43525..09e289e2a 100644
--- a/tests/deprecated/test_utilities.cpp
+++ b/tests/deprecated/test_utilities.cpp
@@ -17,15 +17,15 @@
 #include <algorithm>
 #include <bitset>
 
-#if COMPILE_MPI 
+#if QUEST_COMPILE_MPI
 
     #include <mpi.h>
 
-    #if (FLOAT_PRECISION == 1)
+    #if (QUEST_FLOAT_PRECISION == 1)
         #define MPI_QCOMP MPI_CXX_FLOAT_COMPLEX
-    #elif (FLOAT_PRECISION == 2)
+    #elif (QUEST_FLOAT_PRECISION == 2)
         #define MPI_QCOMP MPI_CXX_DOUBLE_COMPLEX
-    #elif (FLOAT_PRECISION == 4) && defined(MPI_CXX_LONG_DOUBLE_COMPLEX)
+    #elif (QUEST_FLOAT_PRECISION == 4) && defined(MPI_CXX_LONG_DOUBLE_COMPLEX)
         #define MPI_QCOMP MPI_CXX_LONG_DOUBLE_COMPLEX
     #else
         #define MPI_QCOMP MPI_C_LONG_DOUBLE_COMPLEX
@@ -203,7 +203,7 @@ void setRandomTestStateSeeds() {
     unsigned seed = cspnrg();
     
     // broadcast to ensure node consensus
-#if COMPILE_MPI
+#if QUEST_COMPILE_MPI
     int sendRank = 0;
     MPI_Bcast(&seed, 1, MPI_UNSIGNED, sendRank, MPI_COMM_WORLD);
 #endif
@@ -1020,7 +1020,7 @@ bool areEqual(Qureg qureg1, Qureg qureg2, qreal precision) {
             
     // if one node's partition wasn't equal, all-nodes must report not-equal
     int allAmpsAgree = ampsAgree;
-#if COMPILE_MPI
+#if QUEST_COMPILE_MPI
     MPI_Allreduce(&ampsAgree, &allAmpsAgree, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);
 #endif
 
@@ -1064,7 +1064,7 @@ bool areEqual(Qureg qureg, QVector vec, qreal precision) {
             
     // if one node's partition wasn't equal, all-nodes must report not-equal
     int allAmpsAgree = ampsAgree;
-#if COMPILE_MPI
+#if QUEST_COMPILE_MPI
     MPI_Allreduce(&ampsAgree, &allAmpsAgree, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);
 #endif
     
@@ -1127,7 +1127,7 @@ bool areEqual(Qureg qureg, QMatrix matr, qreal precision) {
     
     // if one node's partition wasn't equal, all-nodes must report not-equal
     int allAmpsAgree = ampsAgree;
-#if COMPILE_MPI
+#if QUEST_COMPILE_MPI
     MPI_Allreduce(&ampsAgree, &allAmpsAgree, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);
 #endif
         
@@ -1214,7 +1214,7 @@ QMatrix toQMatrix(CompMatr src) {
 
 QMatrix toQMatrix(Qureg qureg) {
     DEMAND( qureg.isDensityMatrix );
-#if COMPILE_MPI
+#if QUEST_COMPILE_MPI
     DEMAND( qureg.numAmps < MPI_MAX_AMPS_IN_MSG );
 #endif
     
@@ -1226,7 +1226,7 @@ QMatrix toQMatrix(Qureg qureg) {
     qcomp* allAmps = qureg.cpuAmps;
     
     // in distributed mode, give every node the full state vector
-#if COMPILE_MPI
+#if QUEST_COMPILE_MPI
     if (qureg.isDistributed) {
         allAmps = (qcomp*) malloc(qureg.numAmps * sizeof *allAmps);
         MPI_Allgather(
@@ -1249,7 +1249,7 @@ QMatrix toQMatrix(Qureg qureg) {
 
 QVector toQVector(Qureg qureg) {
     DEMAND( !qureg.isDensityMatrix );
-#if COMPILE_MPI
+#if QUEST_COMPILE_MPI
     DEMAND( qureg.numAmps < MPI_MAX_AMPS_IN_MSG );
 #endif
     
@@ -1260,7 +1260,7 @@ QVector toQVector(Qureg qureg) {
     qcomp* allAmps = qureg.cpuAmps;
     
     // in distributed mode, give every node the full state vector
-#if COMPILE_MPI
+#if QUEST_COMPILE_MPI
     if (qureg.isDistributed) {
         allAmps = (qcomp*) malloc(qureg.numAmps * sizeof *allAmps);
 
@@ -1289,7 +1289,7 @@ QVector toQVector(DiagMatr matr) {
 
 QVector toQVector(FullStateDiagMatr matr) {
 
-#if COMPILE_MPI
+#if QUEST_COMPILE_MPI
     DEMAND( matr.numElems < MPI_MAX_AMPS_IN_MSG );
 #endif
 
@@ -1297,7 +1297,7 @@ QVector toQVector(FullStateDiagMatr matr) {
 
     // in distributed mode, give every node the full diagonal operator
     if (matr.isDistributed) {
-        #if COMPILE_MPI
+        #if QUEST_COMPILE_MPI
             MPI_Allgather(
                 matr.cpuElems, matr.numElemsPerNode, MPI_QCOMP,
                 vec.data(),    matr.numElemsPerNode, MPI_QCOMP, MPI_COMM_WORLD);
diff --git a/tests/deprecated/test_utilities.hpp b/tests/deprecated/test_utilities.hpp
index 8145c3f65..94d301bf9 100644
--- a/tests/deprecated/test_utilities.hpp
+++ b/tests/deprecated/test_utilities.hpp
@@ -33,11 +33,11 @@ using std::vector;
 
 // replace REAL_EPS macro with constant
 #undef REAL_EPS
-#if FLOAT_PRECISION == 1
+#if QUEST_FLOAT_PRECISION == 1
     constexpr qreal REAL_EPS = 1E-1;
-#elif FLOAT_PRECISION == 2
+#elif QUEST_FLOAT_PRECISION == 2
     constexpr qreal REAL_EPS = 1E-8;
-#elif FLOAT_PRECISION == 4
+#elif QUEST_FLOAT_PRECISION == 4
     constexpr qreal REAL_EPS = 1E-10;
 #endif
 
diff --git a/tests/main.cpp b/tests/main.cpp
index fca57f5ff..05e54a8fa 100644
--- a/tests/main.cpp
+++ b/tests/main.cpp
@@ -89,7 +89,7 @@ class startListener : public Catch::EventListenerBase {
         QuESTEnv env = getQuESTEnv();
         std::cout << std::endl;
         std::cout << "QuEST execution environment:" << std::endl;
-        std::cout << "  precision:       " << FLOAT_PRECISION         << std::endl;
+        std::cout << "  precision:       " << QUEST_FLOAT_PRECISION   << std::endl;
         std::cout << "  multithreaded:   " << env.isMultithreaded     << std::endl;
         std::cout << "  distributed:     " << env.isDistributed       << std::endl;
         std::cout << "  GPU-accelerated: " << env.isGpuAccelerated    << std::endl;
@@ -125,7 +125,7 @@ int main(int argc, char* argv[]) {
     // prepare QuEST before anything else, since many
     // testing utility functions repurpose QuEST ones
     initQuESTEnv();
-    setInputErrorHandler(validationErrorHandler);
+    setQuESTInputErrorHandler(validationErrorHandler);
 
     // ensure RNG consensus among all nodes
     setRandomTestStateSeeds();
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index d617ba8df..59341759f 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -7,6 +7,7 @@ target_sources(tests
   debug.cpp
   decoherence.cpp
   environment.cpp
+  experimental.cpp
   initialisations.cpp
   matrices.cpp
   multiplication.cpp
diff --git a/tests/unit/debug.cpp b/tests/unit/debug.cpp
index 07a967493..421cf55ea 100644
--- a/tests/unit/debug.cpp
+++ b/tests/unit/debug.cpp
@@ -46,7 +46,7 @@ using std::vector;
  */
 
 
-TEST_CASE( "setInputErrorHandler", TEST_CATEGORY ) {
+TEST_CASE( "setQuESTInputErrorHandler", TEST_CATEGORY ) {
 
     /// @todo
     /// We can test this by saving the current handler,
@@ -62,7 +62,7 @@ TEST_CASE( "setInputErrorHandler", TEST_CATEGORY ) {
 }
 
 
-TEST_CASE( "setMaxNumReportedSigFigs", TEST_CATEGORY ) {
+TEST_CASE( "setQuESTMaxNumReportedSigFigs", TEST_CATEGORY ) {
 
     SECTION( LABEL_CORRECTNESS ) {
 
@@ -77,11 +77,11 @@ TEST_CASE( "setMaxNumReportedSigFigs", TEST_CATEGORY ) {
         };
 
         // disable auto \n after lines
-        setNumReportedNewlines(0);
+        setQuESTNumReportedNewlines(0);
 
         for (size_t numSigFigs=1; numSigFigs<=refs.size(); numSigFigs++) {
 
-            setMaxNumReportedSigFigs(numSigFigs);
+            setQuESTMaxNumReportedSigFigs(numSigFigs);
 
             // redirect stdout to buffer
             std::stringstream buffer;
@@ -103,22 +103,22 @@ TEST_CASE( "setMaxNumReportedSigFigs", TEST_CATEGORY ) {
 
             int num = GENERATE( -1, 0 );
 
-            REQUIRE_THROWS_WITH( setMaxNumReportedSigFigs(num), ContainsSubstring("Cannot be less than one") );
+            REQUIRE_THROWS_WITH( setQuESTMaxNumReportedSigFigs(num), ContainsSubstring("Cannot be less than one") );
         }
     }
 
     // restore to QuEST default for future tests
-    setMaxNumReportedSigFigs(5);
+    setQuESTMaxNumReportedSigFigs(5);
 }
 
 
-TEST_CASE( "setNumReportedNewlines", TEST_CATEGORY ) {
+TEST_CASE( "setQuESTNumReportedNewlines", TEST_CATEGORY ) {
 
     SECTION( LABEL_CORRECTNESS ) {
 
         for (int numNewlines=0; numNewlines<3; numNewlines++) {
 
-            setNumReportedNewlines(numNewlines);
+            setQuESTNumReportedNewlines(numNewlines);
 
             // redirect stdout to buffer
             std::stringstream buffer;
@@ -138,23 +138,23 @@ TEST_CASE( "setNumReportedNewlines", TEST_CATEGORY ) {
 
         SECTION( "number" ) {
 
-            REQUIRE_THROWS_WITH( setNumReportedNewlines(-1), ContainsSubstring("Cannot generally be less than zero") );
+            REQUIRE_THROWS_WITH( setQuESTNumReportedNewlines(-1), ContainsSubstring("Cannot generally be less than zero") );
         }
 
         SECTION( "multine number" ) {
 
-            setNumReportedNewlines(0);
+            setQuESTNumReportedNewlines(0);
 
             REQUIRE_THROWS_WITH( reportQuESTEnv(), ContainsSubstring("zero") && ContainsSubstring("not permitted when calling multi-line") );
         }
     }
 
     // restore to QuEST default for future tests
-    setNumReportedNewlines(2);
+    setQuESTNumReportedNewlines(2);
 }
 
 
-TEST_CASE( "setSeeds", TEST_CATEGORY ) {
+TEST_CASE( "setQuESTSeeds", TEST_CATEGORY ) {
 
     SECTION( LABEL_CORRECTNESS ) {
 
@@ -173,7 +173,7 @@ TEST_CASE( "setSeeds", TEST_CATEGORY ) {
                     const int numReps = 5;
 
                     // set an arbitrary fixed seed...
-                    setSeeds(seeds, numSeeds);
+                    setQuESTSeeds(seeds, numSeeds);
 
                     // generate and remember a random state
                     initRandomMixedState(qureg, numMixedStates);
@@ -188,7 +188,7 @@ TEST_CASE( "setSeeds", TEST_CATEGORY ) {
                     for (int r=0; r<numReps; r++) {
 
                         // reset the seed
-                        setSeeds(seeds, numSeeds);
+                        setQuESTSeeds(seeds, numSeeds);
 
                         // and confirm all random states are re-produced
                         initRandomMixedState(qureg, numMixedStates);
@@ -207,14 +207,14 @@ TEST_CASE( "setSeeds", TEST_CATEGORY ) {
                     const int ampInd = 0;
 
                     // set arbitrary seed and collect random-state amp
-                    setSeeds(seeds, numSeeds);
+                    setQuESTSeeds(seeds, numSeeds);
                     initRandomPureState(qureg);
                     qcomp amp1 = getDensityQuregAmp(qureg, ampInd, ampInd);
 
                     // change one passed seed and re-collect random-state amp
                     int i = GENERATE_COPY( range(0,numSeeds) );
                     seeds[i] = 987654321;
-                    setSeeds(seeds, numSeeds);
+                    setQuESTSeeds(seeds, numSeeds);
                     initRandomPureState(qureg);
                     qcomp amp2 = getDensityQuregAmp(qureg, ampInd, ampInd);
 
@@ -235,20 +235,28 @@ TEST_CASE( "setSeeds", TEST_CATEGORY ) {
 
         SECTION( "number of seeds" ) {
 
+            unsigned seeds[] = { 0 };
             int numSeeds = GENERATE( -1, 0 );
 
-            REQUIRE_THROWS_WITH( setSeeds(nullptr, numSeeds), ContainsSubstring("Invalid number of random seeds") );
+            REQUIRE_THROWS_WITH( setQuESTSeeds(seeds, numSeeds), ContainsSubstring("Invalid number of random seeds") );
+        }
+
+        SECTION( "invalid pointer" ) {
+
+            int numSeeds = GENERATE( 1, 2 );
+
+            REQUIRE_THROWS_WITH( setQuESTSeeds(nullptr, numSeeds), ContainsSubstring("The given seeds list pointer is NULL") );
         }
 
         // inconsistency between nodes is permitted
     }
 
     // re-randomise seeds for remaining tests
-    setSeedsToDefault();
+    setQuESTSeedsToDefault();
 }
 
 
-TEST_CASE( "setSeedsToDefault", TEST_CATEGORY ) {
+TEST_CASE( "setQuESTSeedsToDefault", TEST_CATEGORY ) {
 
     SECTION( LABEL_CORRECTNESS ) {
 
@@ -264,12 +272,12 @@ TEST_CASE( "setSeedsToDefault", TEST_CATEGORY ) {
                     const int ampInd = 0;
 
                     // randomise seed and collect random-state amp
-                    setSeedsToDefault();
+                    setQuESTSeedsToDefault();
                     initRandomPureState(qureg);
                     qcomp amp1 = getDensityQuregAmp(qureg, ampInd, ampInd);
 
                     // re-randomise seed and collect new random-state amp
-                    setSeedsToDefault();
+                    setQuESTSeedsToDefault();
                     initRandomPureState(qureg);
                     qcomp amp2 = getDensityQuregAmp(qureg, ampInd, ampInd);
 
@@ -290,22 +298,22 @@ TEST_CASE( "setSeedsToDefault", TEST_CATEGORY ) {
     }
 
     // re-randomise seeds for remaining tests
-    setSeedsToDefault();
+    setQuESTSeedsToDefault();
 }
 
 
-TEST_CASE( "getSeeds", TEST_CATEGORY ) {
+TEST_CASE( "getQuESTSeeds", TEST_CATEGORY ) {
 
     SECTION( LABEL_CORRECTNESS ) {
 
         SECTION( "can be called immediately" ) {
 
-            REQUIRE_NOTHROW( getNumSeeds() );
+            REQUIRE_NOTHROW( getQuESTNumSeeds() );
 
-            int numSeeds = getNumSeeds();
+            int numSeeds = getQuESTNumSeeds();
             vector<unsigned> out(numSeeds);
 
-            REQUIRE_NOTHROW( getSeeds(out.data()) );
+            REQUIRE_NOTHROW( getQuESTSeeds(out.data()) );
         }
 
         SECTION( "correct output" ) {
@@ -319,11 +327,11 @@ TEST_CASE( "getSeeds", TEST_CATEGORY ) {
                 in[i] = static_cast<unsigned>(getRandomInt(0, 99999));
 
             // pass seeds to QuEST
-            setSeeds(in.data(), numSeeds);
+            setQuESTSeeds(in.data(), numSeeds);
 
             // check we get them back
             vector<unsigned> out(numSeeds);
-            getSeeds(out.data());
+            getQuESTSeeds(out.data());
             for (int i=0; i<numSeeds; i++)
                 REQUIRE( in[i] == out[i] );
         }
@@ -339,17 +347,17 @@ TEST_CASE( "getSeeds", TEST_CATEGORY ) {
     }
 
     // re-randomise seeds for remaining tests
-    setSeedsToDefault();
+    setQuESTSeedsToDefault();
 }
 
 
-TEST_CASE( "getNumSeeds", TEST_CATEGORY ) {
+TEST_CASE( "getQuESTNumSeeds", TEST_CATEGORY ) {
 
     SECTION( LABEL_CORRECTNESS ) {
 
         SECTION( "can be called immediately" ) {
 
-            REQUIRE_NOTHROW( getNumSeeds() );
+            REQUIRE_NOTHROW( getQuESTNumSeeds() );
         }
 
         SECTION( "correct output" ) {
@@ -363,10 +371,10 @@ TEST_CASE( "getNumSeeds", TEST_CATEGORY ) {
                 in[i] = static_cast<unsigned>(getRandomInt(0, 99999));
 
             // pass seeds to QuEST
-            setSeeds(in.data(), numSeeds);
+            setQuESTSeeds(in.data(), numSeeds);
 
             // confirm we get out correct number
-            REQUIRE( getNumSeeds() == numSeeds );
+            REQUIRE( getQuESTNumSeeds() == numSeeds );
         }
     }
 
@@ -380,20 +388,20 @@ TEST_CASE( "getNumSeeds", TEST_CATEGORY ) {
     }
 
     // re-randomise seeds for remaining tests
-    setSeedsToDefault();
+    setQuESTSeedsToDefault();
 }
 
 
-TEST_CASE( "setValidationOn", TEST_CATEGORY ) {
+TEST_CASE( "setQuESTValidationOn", TEST_CATEGORY ) {
 
     SECTION( LABEL_CORRECTNESS ) {
 
         // always safe to call
         for (int i=0; i<3; i++)
-            REQUIRE_NOTHROW( setValidationOn() );
+            REQUIRE_NOTHROW( setQuESTValidationOn() );
 
         // illegal and caught
-        REQUIRE_THROWS( setSeeds(nullptr, -99) );
+        REQUIRE_THROWS( setQuESTSeeds(nullptr, -99) );
     }
 
     SECTION( LABEL_VALIDATION ) {
@@ -404,13 +412,13 @@ TEST_CASE( "setValidationOn", TEST_CATEGORY ) {
 }
 
 
-TEST_CASE( "setValidationOff", TEST_CATEGORY ) {
+TEST_CASE( "setQuESTValidationOff", TEST_CATEGORY ) {
 
     SECTION( LABEL_CORRECTNESS ) {
 
         // confirm always safe to call
         for (int i=0; i<3; i++)
-            REQUIRE_NOTHROW( setValidationOff() );
+            REQUIRE_NOTHROW( setQuESTValidationOff() );
 
         // prepare non-unitary matrix
         CompMatr1 m = getCompMatr1({{1,2},{3,4}});
@@ -420,7 +428,7 @@ TEST_CASE( "setValidationOff", TEST_CATEGORY ) {
         REQUIRE_NOTHROW( applyCompMatr1(qureg, 0, m) );
 
         // which otherwise triggers
-        setValidationOn();
+        setQuESTValidationOn();
         REQUIRE_THROWS( applyCompMatr1(qureg, 0, m) );
 
         destroyQureg(qureg);
@@ -433,11 +441,11 @@ TEST_CASE( "setValidationOff", TEST_CATEGORY ) {
     }
 
     // ensure validation is on for remaining tests
-    setValidationOn();
+    setQuESTValidationOn();
 }
 
 
-TEST_CASE( "setValidationEpsilon", TEST_CATEGORY ) {
+TEST_CASE( "setQuESTValidationEpsilon", TEST_CATEGORY ) {
 
     SECTION( LABEL_CORRECTNESS ) {
 
@@ -454,14 +462,14 @@ TEST_CASE( "setValidationEpsilon", TEST_CATEGORY ) {
                 REQUIRE_THROWS( applyCompMatr1(qureg, 0, m) );
 
                 // confirm setting = 0 disables epsilon errors...
-                setValidationEpsilon(0);
+                setQuESTValidationEpsilon(0);
                 REQUIRE_NOTHROW( applyCompMatr1(qureg, 0, m) );
 
                 // but does not disable absolute errors
                 REQUIRE_THROWS( applyCompMatr1(qureg, -1, m) );
 
                 // confirm non-zero (forgive all) works
-                setValidationEpsilon(9999); // bigger than dist of m*conj(m) from identity squared
+                setQuESTValidationEpsilon(9999); // bigger than dist of m*conj(m) from identity squared
                 REQUIRE_NOTHROW( applyCompMatr1(qureg, 0, m) );
             }
 
@@ -483,7 +491,7 @@ TEST_CASE( "setValidationEpsilon", TEST_CATEGORY ) {
                 *(m.isApproxUnitary)   = 1;
                 *(m.isApproxHermitian) = 1;
 
-                setValidationEpsilon(.1);
+                setQuESTValidationEpsilon(.1);
                 REQUIRE( *(m.isApproxUnitary)   == -1 );
                 REQUIRE( *(m.isApproxHermitian) == -1 );
 
@@ -497,7 +505,7 @@ TEST_CASE( "setValidationEpsilon", TEST_CATEGORY ) {
                 *(m.isApproxHermitian) = 0;
                 *(m.isApproxNonZero)   = 1;
 
-                setValidationEpsilon(.1);
+                setQuESTValidationEpsilon(.1);
                 REQUIRE( *(m.isApproxUnitary)   == -1 );
                 REQUIRE( *(m.isApproxHermitian) == -1 );
                 REQUIRE( *(m.isApproxNonZero)   == -1 );
@@ -512,7 +520,7 @@ TEST_CASE( "setValidationEpsilon", TEST_CATEGORY ) {
                 *(m.isApproxHermitian) = 0;
                 *(m.isApproxNonZero)   = 1;
 
-                setValidationEpsilon(.1);
+                setQuESTValidationEpsilon(.1);
                 REQUIRE( *(m.isApproxUnitary)   == -1 );
                 REQUIRE( *(m.isApproxHermitian) == -1 );
                 REQUIRE( *(m.isApproxNonZero)   == -1 );
@@ -525,7 +533,7 @@ TEST_CASE( "setValidationEpsilon", TEST_CATEGORY ) {
                 KrausMap k = createKrausMap(1, 3);
                 *(k.isApproxCPTP) = 1;
 
-                setValidationEpsilon(.1);
+                setQuESTValidationEpsilon(.1);
                 REQUIRE( *(k.isApproxCPTP) == -1 );
 
                 destroyKrausMap(k);
@@ -539,30 +547,30 @@ TEST_CASE( "setValidationEpsilon", TEST_CATEGORY ) {
 
             qreal eps = GENERATE( -0.5, -1, -100 );
 
-            REQUIRE_THROWS_WITH( setValidationEpsilon(eps), ContainsSubstring("positive number") );
+            REQUIRE_THROWS_WITH( setQuESTValidationEpsilon(eps), ContainsSubstring("positive number") );
         }
     }
 
     // ensure validation epsilon is default for remaining tests
-    setValidationEpsilonToDefault();
+    setQuESTValidationEpsilonToDefault();
 }
 
 
-TEST_CASE( "getValidationEpsilon", TEST_CATEGORY ) {
+TEST_CASE( "getQuESTValidationEpsilon", TEST_CATEGORY ) {
 
     SECTION( LABEL_CORRECTNESS ) {
 
         // confirm always safe to call
         for (int i=0; i<3; i++)
-            REQUIRE_NOTHROW( getValidationEpsilon() ); // ignores output
+            REQUIRE_NOTHROW( getQuESTValidationEpsilon() ); // ignores output
 
         GENERATE( range(0,10) );
 
         // confirm set correctly
         qreal eps = getRandomReal(0, 99999);
-        setValidationEpsilon(eps);
+        setQuESTValidationEpsilon(eps);
 
-        REQUIRE( getValidationEpsilon() == eps );
+        REQUIRE( getQuESTValidationEpsilon() == eps );
     }
 
     SECTION( LABEL_VALIDATION ) {
@@ -572,18 +580,18 @@ TEST_CASE( "getValidationEpsilon", TEST_CATEGORY ) {
     }
 
     // ensure validation epsilon is default for remaining tests
-    setValidationEpsilonToDefault();
+    setQuESTValidationEpsilonToDefault();
 }
 
 
-TEST_CASE( "setValidationEpsilonToDefault", TEST_CATEGORY ) {
+TEST_CASE( "setQuESTValidationEpsilonToDefault", TEST_CATEGORY ) {
 
     SECTION( LABEL_CORRECTNESS ) {
 
         SECTION( "always safe to call" ) {
 
             for (int i=0; i<3; i++)
-                REQUIRE_NOTHROW( setValidationEpsilonToDefault() );
+                REQUIRE_NOTHROW( setQuESTValidationEpsilonToDefault() );
         }
 
         SECTION( "affects validation" ) {
@@ -596,11 +604,11 @@ TEST_CASE( "setValidationEpsilonToDefault", TEST_CATEGORY ) {
             REQUIRE_THROWS( applyCompMatr1(qureg, 0, m) );
 
             // confirm setting = 0 disables epsilon errors...
-            setValidationEpsilon(0);
+            setQuESTValidationEpsilon(0);
             REQUIRE_NOTHROW( applyCompMatr1(qureg, 0, m) );
 
             // which returns when stored to default
-            setValidationEpsilonToDefault();
+            setQuESTValidationEpsilonToDefault();
             REQUIRE_THROWS( applyCompMatr1(qureg, 0, m) );
 
             destroyQureg(qureg);
@@ -614,7 +622,7 @@ TEST_CASE( "setValidationEpsilonToDefault", TEST_CATEGORY ) {
                 *(m.isApproxUnitary)   = 1;
                 *(m.isApproxHermitian) = 1;
 
-                setValidationEpsilonToDefault();
+                setQuESTValidationEpsilonToDefault();
                 REQUIRE( *(m.isApproxUnitary)   == -1 );
                 REQUIRE( *(m.isApproxHermitian) == -1 );
 
@@ -628,7 +636,7 @@ TEST_CASE( "setValidationEpsilonToDefault", TEST_CATEGORY ) {
                 *(m.isApproxHermitian) = 0;
                 *(m.isApproxNonZero)   = 1;
 
-                setValidationEpsilonToDefault();
+                setQuESTValidationEpsilonToDefault();
                 REQUIRE( *(m.isApproxUnitary)   == -1 );
                 REQUIRE( *(m.isApproxHermitian) == -1 );
                 REQUIRE( *(m.isApproxNonZero)   == -1 );
@@ -643,7 +651,7 @@ TEST_CASE( "setValidationEpsilonToDefault", TEST_CATEGORY ) {
                 *(m.isApproxHermitian) = 0;
                 *(m.isApproxNonZero)   = 1;
 
-                setValidationEpsilonToDefault();
+                setQuESTValidationEpsilonToDefault();
                 REQUIRE( *(m.isApproxUnitary)   == -1 );
                 REQUIRE( *(m.isApproxHermitian) == -1 );
                 REQUIRE( *(m.isApproxNonZero)   == -1 );
@@ -656,7 +664,7 @@ TEST_CASE( "setValidationEpsilonToDefault", TEST_CATEGORY ) {
                 KrausMap k = createKrausMap(1, 3);
                 *(k.isApproxCPTP) = 1;
 
-                setValidationEpsilonToDefault();
+                setQuESTValidationEpsilonToDefault();
                 REQUIRE( *(k.isApproxCPTP) == -1 );
 
                 destroyKrausMap(k);
@@ -674,21 +682,21 @@ TEST_CASE( "setValidationEpsilonToDefault", TEST_CATEGORY ) {
     }
 
     // ensure validation epsilon is default for remaining tests
-    setValidationEpsilonToDefault();
+    setQuESTValidationEpsilonToDefault();
 }
 
 
-TEST_CASE( "getGpuCacheSize", TEST_CATEGORY ) {
+TEST_CASE( "getQuESTGpuCacheSize", TEST_CATEGORY ) {
 
     SECTION( LABEL_CORRECTNESS ) {
 
         // confirm cache begins empty
-        clearGpuCache();
-        REQUIRE( getGpuCacheSize() == 0 );
+        clearQuESTGpuCache();
+        REQUIRE( getQuESTGpuCacheSize() == 0 );
 
         // hackily detect cuQuantum
         char envStr[200];
-        getEnvironmentString(envStr);
+        getQuESTEnvironmentString(envStr);
         bool usingCuQuantum = std::string(envStr).find("cuQuantum=0") == std::string::npos;
 
         // proceed only if we're ever using our own GPU cache
@@ -716,7 +724,7 @@ TEST_CASE( "getGpuCacheSize", TEST_CATEGORY ) {
                 // confirm it expanded, OR stayed the same, which happens when
                 // the total number of simultaneous threads needed hits/exceeds
                 // the number available in the hardware
-                qindex newSize = getGpuCacheSize();
+                qindex newSize = getQuESTGpuCacheSize();
                 CAPTURE( cacheSize, newSize );
                 REQUIRE( newSize >= cacheSize );
 
@@ -746,10 +754,10 @@ TEST_CASE( "getGpuCacheSize", TEST_CATEGORY ) {
  */
 
 
-void setMaxNumReportedItems(qindex numRows, qindex numCols);
+void setQuESTMaxNumReportedItems(qindex numRows, qindex numCols);
 
-void getEnvironmentString(char str[200]);
+void getQuESTEnvironmentString(char str[200]);
 
-void setReportedPauliChars(const char* paulis);
+void setQuESTReportedPauliChars(const char* paulis);
 
-void setReportedPauliStrStyle(int style);
+void setQuESTReportedPauliStrStyle(int style);
diff --git a/tests/unit/decoherence.cpp b/tests/unit/decoherence.cpp
index f36c491bb..60b4cd640 100644
--- a/tests/unit/decoherence.cpp
+++ b/tests/unit/decoherence.cpp
@@ -38,7 +38,8 @@ using std::vector;
  */
 
 
-#define TEST_CATEGORY "[unit][decoherence]"
+#define TEST_CATEGORY \
+    LABEL_UNIT_TAG "[decoherence]"
 
 
 void TEST_ON_CACHED_QUREGS(auto apiFunc, vector<int> targs, vector<qmatrix> kraus) {
diff --git a/tests/unit/environment.cpp b/tests/unit/environment.cpp
index 6d4efb80d..9ecf8e376 100644
--- a/tests/unit/environment.cpp
+++ b/tests/unit/environment.cpp
@@ -83,6 +83,24 @@ TEST_CASE( "initCustomQuESTEnv", TEST_CATEGORY ) {
 }
 
 
+TEST_CASE( "initCustomMpiQuESTEnv", TEST_CATEGORY ) {
+
+    SECTION( LABEL_CORRECTNESS ) {
+
+        // cannot be meaningfully tested, since the env is already initialised before any test runs
+        SUCCEED( );
+    }
+
+    SECTION( LABEL_VALIDATION ) {
+
+        REQUIRE_THROWS_WITH( initCustomMpiQuESTEnv(0,0,0,0), ContainsSubstring( "already been initialised") );
+
+        // the arguments themselves cannot be validated here, since the
+        // env-already-initialised validation is performed first
+    }
+}
+
+
 TEST_CASE( "finalizeQuESTEnv", TEST_CATEGORY ) {
 
     SECTION( LABEL_CORRECTNESS ) {
@@ -140,12 +158,6 @@ TEST_CASE( "getQuESTEnv", TEST_CATEGORY ) {
 
         QuESTEnv env = getQuESTEnv();
 
-        REQUIRE( (env.isMultithreaded     == 0 || env.isMultithreaded     == 1) );
-        REQUIRE( (env.isGpuAccelerated    == 0 || env.isGpuAccelerated    == 1) );
-        REQUIRE( (env.isDistributed       == 0 || env.isDistributed       == 1) );
-        REQUIRE( (env.isCuQuantumEnabled  == 0 || env.isCuQuantumEnabled  == 1) );
-        REQUIRE( (env.isGpuSharingEnabled == 0 || env.isGpuSharingEnabled == 1) );
-        
         REQUIRE( env.rank     >= 0 );
         REQUIRE( env.numNodes >= 0 );
         
diff --git a/tests/unit/experimental.cpp b/tests/unit/experimental.cpp
new file mode 100644
index 000000000..943645831
--- /dev/null
+++ b/tests/unit/experimental.cpp
@@ -0,0 +1,133 @@
+/** @file
+ * Unit tests of the experimental module.
+ *
+ * @author Oliver Brown
+ * @author Tyson Jones
+ * 
+ * @defgroup unitexperi Experimental
+ * @ingroup unittests
+ */
+
+#include "quest.h"
+
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/matchers/catch_matchers_string.hpp>
+#include <catch2/generators/catch_generators_range.hpp>
+
+#include "tests/utils/macros.hpp"
+#include "tests/utils/config.hpp"
+
+using Catch::Matchers::ContainsSubstring;
+
+
+
+/*
+ * UTILITIES
+ */
+
+#define TEST_CATEGORY \
+    LABEL_UNIT_TAG "[experimental]"
+
+
+
+/** 
+ * TESTS
+ * 
+ * @ingroup unitexperi
+ * @{
+ */
+
+
+TEST_CASE( "setQuESTNumGpuThreadsPerBlock", TEST_CATEGORY ) {
+
+    // remember the initial number for later restoration (static, so it is queried only once)
+    static int initNumTPB = getQuESTNumGpuThreadsPerBlock();
+
+    SECTION( LABEL_CORRECTNESS ) {
+
+        // begin at 64 (AMD min, larger than NVIDIA min of 32),
+        // stop at 1024 (should be less than dev-specific max)
+        int inNumTPB = GENERATE( 64, 128, 256, 512, 1024 ); 
+        setQuESTNumGpuThreadsPerBlock(inNumTPB);
+
+        int outNumTPB = getQuESTNumGpuThreadsPerBlock();
+        REQUIRE( inNumTPB == outNumTPB );
+        
+        // BEWARE that we do not here test whether all QuEST
+        // operators succeed with the various numTPB; that must
+        // be assessed ad hoc by updating the numTPB env-var
+        // before launching the entirety of the tests
+    }
+
+    SECTION( LABEL_VALIDATION ) {
+
+        SECTION( "Negative" ) {
+
+            int badNumTPB = GENERATE( 0, -1, -9999 );
+            REQUIRE_THROWS_WITH( setQuESTNumGpuThreadsPerBlock(badNumTPB), ContainsSubstring( "must be positive" ) );
+        }
+
+        SECTION( "Indivisible by warp size" ) {
+
+            // If HIP status was attached to QuESTEnv, we could do:
+            //     QuESTEnv env = getQuESTEnv();
+            //     int warpSize = (env.isGpuAccelerated && env.isHipCompiled)? 64 : 32;
+            // Since this currently isn't the case, we assume a warp size of 32,
+            // which means that when this test is run on AMD GPUs, the below tested
+            // badNumTPB won't be as interestingly/rigorously spread
+            int warpSize = 32;
+
+            int badNumTPB = GENERATE_COPY( warpSize - 1, warpSize + 1, warpSize + warpSize/2, 3*warpSize + warpSize/2 );
+
+            REQUIRE_THROWS_WITH( setQuESTNumGpuThreadsPerBlock(badNumTPB), ContainsSubstring( "does not divide evenly into the warp size" ) );
+        }
+
+        SECTION( "Exceeds device maximum" ) {
+
+            int badNumTPB = 999999; // exceeds expected 1024 max
+
+            // Cannot be tested (since validation not imposed) when GPU is not actively used
+            if (getQuESTEnv().isGpuAccelerated)
+                REQUIRE_THROWS_WITH( setQuESTNumGpuThreadsPerBlock(badNumTPB), ContainsSubstring( "Exceeds the hardware-imposed maximum" ) );
+
+            SUCCEED( );
+        }
+    }
+
+    // restore numTPB, so as not to interfere with other tests
+    setQuESTNumGpuThreadsPerBlock(initNumTPB);
+}
+
+
+TEST_CASE( "getQuESTNumGpuThreadsPerBlock", TEST_CATEGORY ) {
+
+    SECTION( LABEL_CORRECTNESS ) {
+
+        // check initial value matches either the env-var (if set),
+        // or the fixed default in the codebase (hardcoded in test utils)
+        int defaultNum = getDefaultNumGpuThreadsPerBlock(); // test util via env-var
+        int reportedNum = getQuESTNumGpuThreadsPerBlock();  // QuEST API
+
+        REQUIRE( defaultNum == reportedNum );
+
+        // further testing of this function appears in setQuESTNumGpuThreadsPerBlock()
+    }
+
+    SECTION( LABEL_VALIDATION ) {
+
+        // there is none (except, presumably, that the env is initialised, which is untestable here)
+        SUCCEED( );
+    }
+}
+
+
+/** @} (end defgroup) */
+
+
+
+/**
+ * @todo
+ * UNTESTED FUNCTIONS
+ */
+
+// nothing! :^)
diff --git a/tests/unit/initialisations.cpp b/tests/unit/initialisations.cpp
index ac1f1abd4..175ec633b 100644
--- a/tests/unit/initialisations.cpp
+++ b/tests/unit/initialisations.cpp
@@ -249,8 +249,15 @@ TEST_CASE( "setQuregAmps", TEST_CATEGORY ) {
     SECTION( LABEL_CORRECTNESS ) {
 
         int numTotalAmps = getPow2(getNumCachedQubits());
-        int numSetAmps = GENERATE_COPY( range(0,numTotalAmps+1) ); 
-        int startInd = GENERATE_COPY( range(0,numTotalAmps-numSetAmps) );
+        int numSetAmps = GENERATE_COPY( range(0,numTotalAmps+1) );
+
+        // Bounds-checking causes GENERATE_COPY( range(0,0) ) to fail
+        // when tests are compiled in Debug 
+        int startInd = 0;
+        if (numTotalAmps - numSetAmps > 0) {
+            startInd = GENERATE_COPY( range(0,numTotalAmps-numSetAmps) );
+        }
+        
         qvector amps = getRandomVector(numSetAmps);
 
         auto testFunc = [&](Qureg qureg) {
diff --git a/tests/unit/operations.cpp b/tests/unit/operations.cpp
index 0e33220db..80b75b9c2 100644
--- a/tests/unit/operations.cpp
+++ b/tests/unit/operations.cpp
@@ -744,8 +744,8 @@ void testOperationCorrectness(auto operation, auto matrixRefGen) {
     // upon few qubits are single-precision. So we disable completely until
     // we re-implement 'input validation' checks which force us to fix thresholds
     (Args == compmatr)?
-        setValidationEpsilon(0):
-        setValidationEpsilonToDefault();
+        setQuESTValidationEpsilon(0):
+        setQuESTValidationEpsilonToDefault();
 
     // prepare test function which will receive both statevectors and density matrices
     auto testFunc = [&](Qureg qureg, auto& stateRef) -> void { 
@@ -777,7 +777,7 @@ void testOperationCorrectness(auto operation, auto matrixRefGen) {
 
     // free any heap-alloated API matrices and restore epsilon
     freeRemainingArgs<Targs,Args>(furtherArgs);
-    setValidationEpsilonToDefault();
+    setQuESTValidationEpsilonToDefault();
 }
 
 
@@ -1724,8 +1724,8 @@ TEST_CASE( "applyForcedQubitMeasurement", TEST_CATEGORY_OPS ) {
         // below validation tests assume qubit 0 can collapse to either outcome
         // (which does not require normalisation; qureg can be in the debug state)
         initDebugState(qureg);
-        REQUIRE( calcProbOfQubitOutcome(qureg, 0, 0) > getValidationEpsilon() );
-        REQUIRE( calcProbOfQubitOutcome(qureg, 0, 1) > getValidationEpsilon() );
+        REQUIRE( calcProbOfQubitOutcome(qureg, 0, 0) > getQuESTValidationEpsilon() );
+        REQUIRE( calcProbOfQubitOutcome(qureg, 0, 1) > getQuESTValidationEpsilon() );
 
         SECTION( "qureg uninitialised" ) {
 
@@ -1778,7 +1778,7 @@ TEST_CASE( "applyForcedQubitMeasurement", TEST_CATEGORY_OPS ) {
             qreal goodTheta = 0.1;
             applyRotateX(qureg, 0, goodTheta);
             REQUIRE( 
-                calcProbOfQubitOutcome(qureg, 0, badOutcome) > getValidationEpsilon() 
+                calcProbOfQubitOutcome(qureg, 0, badOutcome) > getQuESTValidationEpsilon() 
             );
             REQUIRE_NOTHROW(
                 applyForcedQubitMeasurement(qureg, 0, badOutcome)
@@ -1806,7 +1806,7 @@ TEST_CASE( "applyForcedMultiQubitMeasurement", TEST_CATEGORY_OPS ) {
         // this test may randomly request a measurement outcome which
         // is illegally unlikely, triggering validation; we merely
         // disable such validation and hope divergences don't break the test!
-        setValidationEpsilon(0);
+        setQuESTValidationEpsilon(0);
 
         auto testFunc = [&](Qureg qureg, auto& ref) {
 
@@ -1830,7 +1830,7 @@ TEST_CASE( "applyForcedMultiQubitMeasurement", TEST_CATEGORY_OPS ) {
         SECTION( LABEL_STATEVEC ) { TEST_ON_CACHED_QUREGS(statevecQuregs, statevecRef, testFunc); }
         SECTION( LABEL_DENSMATR ) { TEST_ON_CACHED_QUREGS(densmatrQuregs, densmatrRef, testFunc); }
 
-        setValidationEpsilonToDefault();
+        setQuESTValidationEpsilonToDefault();
     }
 
     SECTION( LABEL_VALIDATION ) {
@@ -1842,7 +1842,7 @@ TEST_CASE( "applyForcedMultiQubitMeasurement", TEST_CATEGORY_OPS ) {
 
         // below validation tests assume the above parameters are valid (not impossibly unlikely)
         initDebugState(qureg);
-        REQUIRE( calcProbOfMultiQubitOutcome(qureg, targets, outcomes, numTargets) > getValidationEpsilon() );
+        REQUIRE( calcProbOfMultiQubitOutcome(qureg, targets, outcomes, numTargets) > getQuESTValidationEpsilon() );
 
         SECTION( "qureg uninitialised" ) {
 
@@ -1920,7 +1920,7 @@ TEST_CASE( "applyForcedMultiQubitMeasurement", TEST_CATEGORY_OPS ) {
             applyRotateX(qureg, targets[2], goodTheta);
             int goodOutcomes[] = {0, 0, 1};
             REQUIRE( 
-                calcProbOfMultiQubitOutcome(qureg, targets, goodOutcomes, numTargets) > getValidationEpsilon() 
+                calcProbOfMultiQubitOutcome(qureg, targets, goodOutcomes, numTargets) > getQuESTValidationEpsilon() 
             );
             REQUIRE_NOTHROW(
                 applyForcedMultiQubitMeasurement(qureg, targets, goodOutcomes, numTargets)
@@ -2291,7 +2291,7 @@ TEST_CASE( "applyFullStateDiagMatrPower", TEST_CATEGORY_OPS LABEL_MIXED_DEPLOY_T
         GENERATE( range(0, getNumTestedMixedDeploymentRepetitions()) );
 
         if (!testRealExp)
-            setValidationEpsilon(0);
+            setQuESTValidationEpsilon(0);
 
         SECTION( LABEL_STATEVEC ) {
 
@@ -2313,7 +2313,7 @@ TEST_CASE( "applyFullStateDiagMatrPower", TEST_CATEGORY_OPS LABEL_MIXED_DEPLOY_T
             TEST_ON_CACHED_QUREG_AND_MATRIX( cachedDM, cachedMatrs, apiFunc, refDM, refMatr, refFunc);
         }
 
-        setValidationEpsilonToDefault();
+        setQuESTValidationEpsilonToDefault();
     }
 
     /// @todo input validation
diff --git a/tests/unit/paulis.cpp b/tests/unit/paulis.cpp
index 1b36373c6..7cfbea5cd 100644
--- a/tests/unit/paulis.cpp
+++ b/tests/unit/paulis.cpp
@@ -372,7 +372,7 @@ TEST_CASE( "createInlinePauliStrSum", TEST_CATEGORY ) {
 
         SECTION( "coefficient parsing" ) {
 
-            // beware that when FLOAT_PRECISION=1, qcomp cannot store smaller than 1E-37 (triggering a validation error)
+            // beware that when QUEST_FLOAT_PRECISION=1, qcomp cannot store magnitudes smaller than 1E-37 (triggering a validation error)
             vector<std::string> strs = {"1 X", "0 X", "0.1 X", "5E2-1i X", "-1E-25i X",  "1 - 6E-5i X", "-1.5E-15  -   5.123E-30i  0"};
             vector<qcomp> coeffs     = { 1,     0,     0.1,     5E2-1_i,   -(1E-25)*1_i,  1 -(6E-5)*1_i, qcomp(-1.5E-15, -5.123E-30) };
 
@@ -429,7 +429,7 @@ TEST_CASE( "createInlinePauliStrSum", TEST_CATEGORY ) {
 
         SECTION( "out of range" ) {
 
-            // the max/min qcomp depend upon FLOAT_PRECISION but we'll lazily use something even quad-prec cannot store
+            // the max/min qcomp depend upon QUEST_FLOAT_PRECISION but we'll lazily use something even quad-prec cannot store
             REQUIRE_THROWS_WITH( createInlinePauliStrSum("-1E-9999 XYZ"), ContainsSubstring("exceeds the range which can be stored in a qcomp") );
         }
 
diff --git a/tests/unit/trotterisation.cpp b/tests/unit/trotterisation.cpp
index 5e264ad53..6d8c6ff67 100644
--- a/tests/unit/trotterisation.cpp
+++ b/tests/unit/trotterisation.cpp
@@ -268,8 +268,8 @@ TEST_CASE( "applyTrotterizedUnitaryTimeEvolution", TEST_CATEGORY ) {
         //  - 1E-5 at single precision
         //  - 1E-12 at double precision
         //  - 1E-13 at quad precision
-        qreal initialValidationEps = getValidationEpsilon();
-        setValidationEpsilon(2 * initialValidationEps);
+        qreal initialValidationEps = getQuESTValidationEpsilon();
+        setQuESTValidationEpsilon(2 * initialValidationEps);
 
         const int NUM_QUBITS = 8;
         qreal dt = 0.1;
@@ -331,7 +331,7 @@ TEST_CASE( "applyTrotterizedUnitaryTimeEvolution", TEST_CATEGORY ) {
         }
 
         // Restore validation epsilon
-        setValidationEpsilon(initialValidationEps);
+        setQuESTValidationEpsilon(initialValidationEps);
 
         destroyPauliStrSum(hamil);
         destroyPauliStrSum(observ);
@@ -462,7 +462,7 @@ TEST_CASE( "applyTrotterizedImaginaryTimeEvolution", TEST_CATEGORY ) {
         };
        
 
-#if FLOAT_PRECISION == 4
+#if QUEST_FLOAT_PRECISION == 4
         /*
          * The numerical exponent is sufficiently inaccurate to breach the default
          * tolerances at quad precision, so we apply the following kludge to prevent irritating test failures.
@@ -553,7 +553,7 @@ TEST_CASE( "applyTrotterizedImaginaryTimeEvolution", TEST_CATEGORY ) {
             destroyPauliStrSum(ising);
         }
 
-#if FLOAT_PRECISION == 4
+#if QUEST_FLOAT_PRECISION == 4
         setTestEpsilon(initialEps);
 #endif
     }
diff --git a/tests/utils/compare.cpp b/tests/utils/compare.cpp
index d6d6824e8..a1d1e5aa6 100644
--- a/tests/utils/compare.cpp
+++ b/tests/utils/compare.cpp
@@ -33,13 +33,13 @@ using namespace Catch::Matchers;
  */
 
 
-#if FLOAT_PRECISION == 1
+#if QUEST_FLOAT_PRECISION == 1
     qreal absoluteEpsilon = 1E-2; // default...
     qreal relativeEpsilon = 1E-2;
-#elif FLOAT_PRECISION == 2
+#elif QUEST_FLOAT_PRECISION == 2
     qreal absoluteEpsilon = 1E-8;
     qreal relativeEpsilon = 1E-8;
-#elif FLOAT_PRECISION == 4
+#elif QUEST_FLOAT_PRECISION == 4
     qreal absoluteEpsilon = 1E-10;
     qreal relativeEpsilon = 1E-10;
 #endif
diff --git a/tests/utils/config.cpp b/tests/utils/config.cpp
index c5362e899..d8eeab605 100644
--- a/tests/utils/config.cpp
+++ b/tests/utils/config.cpp
@@ -40,37 +40,52 @@ int getIntEnvVarValueOrDefault(string name, int defaultValue) {
 
 
 /*
- * PUBLIC
- *
- * which each call std::getenv only once
+ * PUBLIC TEST ENV VARS
  */
 
 int getNumQubitsInUnitTestedQuregs() {
 
-    static int value = getIntEnvVarValueOrDefault("TEST_NUM_QUBITS_IN_QUREG", 6);
+    static int value = getIntEnvVarValueOrDefault("QUEST_TEST_NUM_QUBITS_IN_QUREG", 6);
     return value;
 }
 
 int getMaxNumTestedQubitPermutations() {
 
-    static int value = getIntEnvVarValueOrDefault("TEST_MAX_NUM_QUBIT_PERMUTATIONS", 0);
+    static int value = getIntEnvVarValueOrDefault("QUEST_TEST_MAX_NUM_QUBIT_PERMUTATIONS", 0);
     return value;
 }
 
 int getMaxNumTestedSuperoperatorTargets() {
 
-    static int value = getIntEnvVarValueOrDefault("TEST_MAX_NUM_SUPEROP_TARGETS", 4);
+    static int value = getIntEnvVarValueOrDefault("QUEST_TEST_MAX_NUM_SUPEROP_TARGETS", 4);
     return value;
 }
 
 int getNumTestedMixedDeploymentRepetitions() {
 
-    static int value = getIntEnvVarValueOrDefault("TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS", 10);
+    static int value = getIntEnvVarValueOrDefault("QUEST_TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS", 10);
     return value;
 }
 
 bool getWhetherToTestAllDeployments() {
 
-    static bool value = getIntEnvVarValueOrDefault("TEST_ALL_DEPLOYMENTS", 1);
+    static bool value = getIntEnvVarValueOrDefault("QUEST_TEST_TRY_ALL_DEPLOYMENTS", 1);
+    return value;
+}
+
+
+
+/*
+ * PUBLIC QUEST ENV VARS
+ */
+
+int getDefaultNumGpuThreadsPerBlock() {
+
+    // when the env-var is not present, we MUST return the default assumed by the QuEST source code,
+    // which, at the time of writing, is a fixed 128 (rather than a hardware-specific value)
+    const int compileTimeDefaultTPB = 128;
+
+    // when the env-var is present, we consult that, just like QuEST (the function-local static is initialised only once, on the first call)
+    static int value = getIntEnvVarValueOrDefault("QUEST_NUM_GPU_THREADS_PER_BLOCK", compileTimeDefaultTPB);
     return value;
 }
diff --git a/tests/utils/config.hpp b/tests/utils/config.hpp
index a1ef142c5..80be56e01 100644
--- a/tests/utils/config.hpp
+++ b/tests/utils/config.hpp
@@ -33,7 +33,7 @@
 #if 0
 
     /// @envvardoc
-    const int TEST_NUM_QUBITS_IN_QUREG = 6;
+    const int QUEST_TEST_NUM_QUBITS_IN_QUREG = 6;
 
     /** @envvardoc
      * 
@@ -64,16 +64,16 @@
      * 
      * @author Tyson Jones
      */
-    const int TEST_MAX_NUM_QUBIT_PERMUTATIONS = 0;
+    const int QUEST_TEST_MAX_NUM_QUBIT_PERMUTATIONS = 0;
 
     /// @envvardoc
-    const int TEST_MAX_NUM_SUPEROP_TARGETS = 4;
+    const int QUEST_TEST_MAX_NUM_SUPEROP_TARGETS = 4;
 
     /// @envvardoc
-    const int TEST_ALL_DEPLOYMENTS = 1;
+    const int QUEST_TEST_TRY_ALL_DEPLOYMENTS = 1;
 
     /// @envvardoc
-    const int TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS = 10;
+    const int QUEST_TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS = 10;
 
 #endif
 
@@ -82,12 +82,16 @@
  * ACCESSING ENV-VARS 
  */
 
+// test env-vars
 int getNumQubitsInUnitTestedQuregs();
 int getMaxNumTestedQubitPermutations();
 int getMaxNumTestedSuperoperatorTargets();
 int getNumTestedMixedDeploymentRepetitions();
 bool getWhetherToTestAllDeployments();
 
+// quest env-vars
+int getDefaultNumGpuThreadsPerBlock();
+
 
 #endif // CONFIG_PP
 
diff --git a/tests/utils/random.cpp b/tests/utils/random.cpp
index 65d087518..5c2fe143c 100644
--- a/tests/utils/random.cpp
+++ b/tests/utils/random.cpp
@@ -44,10 +44,10 @@ void setRandomTestStateSeeds() {
     unsigned seed = cspnrg();
     
     // seed QuEST, which uses only the root node's seed
-    setSeeds(&seed, 1);
+    setQuESTSeeds(&seed, 1);
 
     // broadcast root node seed to all nodes
-    getSeeds(&seed);
+    getQuESTSeeds(&seed);
 
     // seed RNG
     RNG.seed(seed);
diff --git a/utils/scripts/compile.sh b/utils/scripts/compile.sh
index 609f85be3..7c7d66e87 100755
--- a/utils/scripts/compile.sh
+++ b/utils/scripts/compile.sh
@@ -13,19 +13,19 @@
 # USER SETTINGS
 
 # numerical precision (1, 2, 4)
-FLOAT_PRECISION=2
+QUEST_FLOAT_PRECISION=2
 
 # deployments to compile (0, 1)
-ENABLE_DISTRIBUTION=0       # MPI
-ENABLE_MULTITHREADING=0     # OpenMP
-ENABLE_CUDA=0               # NVIDIA GPU
-ENABLE_HIP=0                # AMD GPU
-ENABLE_CUQUANTUM=0          # NVIDIA cuStateVec
-ENABLE_NUMA=0               # NUMA awareness
+QUEST_ENABLE_MPI=0                # MPI (multiprocess)
+QUEST_ENABLE_OMP=0                # OpenMP (multithreading)
+QUEST_ENABLE_CUDA=0               # NVIDIA GPU
+QUEST_ENABLE_HIP=0                # AMD GPU
+QUEST_ENABLE_CUQUANTUM=0          # NVIDIA cuStateVec
+QUEST_ENABLE_NUMA=0               # NUMA awareness
 
 # other options (0, 1)
-ENABLE_DEPRECATED_API=0
-DISABLE_DEPRECATION_WARNINGS=0
+QUEST_ENABLE_DEPRECATED_API=0
+QUEST_DISABLE_DEPRECATION_WARNINGS=0
 
 # NVIDIA compute capability or AMD arch (e.g. 60 or gfx908)
 GPU_ARCH=90
@@ -43,8 +43,8 @@ LINKER=g++
 
 # whether to compile the below user source files (0),
 # or the unit tests (1), which when paired with above
-# ENABLE_DEPRECATED_API=1, will use the v3 tests (which
-# you should pair with DISABLE_DEPRECATION_WARNINGS=1)
+# QUEST_ENABLE_DEPRECATED_API=1, will use the v3 tests (which
+# you should pair with QUEST_DISABLE_DEPRECATION_WARNINGS=1)
 COMPILE_TESTS=0
 
 # name of the compiled test executable
@@ -72,7 +72,7 @@ USER_CXX_COMP_FLAGS='-std=c++14'
 # user linker flags
 USER_LINK_FLAGS='-lstdc++'
 
-# whether to compile cuQuantum (consulted only when ENABLE_CUQUANTUM=1)
+# whether to compile cuQuantum (consulted only when QUEST_ENABLE_CUQUANTUM=1)
 # in debug mode, which logs to below file with performance tips and errors
 CUQUANTUM_LOG=0
 CUQUANTUM_LOG_FN="./custatevec_log.txt"
@@ -249,7 +249,7 @@ WARNING_FLAGS='-Wall'
 CUDA_COMP_FLAGS="-x cu -arch=sm_${GPU_ARCH} -I${CUDA_LIB_DIR}/include"
 CUDA_LINK_FLAGS="-L${CUDA_LIB_DIR}/lib -L${CUDA_LIB_DIR}/lib64 -lcudart -lcuda"
 
-if [ $ENABLE_CUQUANTUM == 1 ]
+if [ $QUEST_ENABLE_CUQUANTUM == 1 ]
 then
     # extend GPU flags if cuQuantum enabled
     CUDA_COMP_FLAGS+=" -I${CUQUANTUM_LIB_DIR}/include"
@@ -293,7 +293,7 @@ else
     OMP_LINK_FLAGS+=' -fopenmp'
 fi
 
-if [ $ENABLE_NUMA == 1 ]
+if [ $QUEST_ENABLE_NUMA == 1 ]
 then
     OMP_LINK_FLAGS+=' -lnuma'
 fi
@@ -312,11 +312,11 @@ echo "deployment modes:"
 ALL_LINK_FLAGS="${USER_LINK_FLAGS}"
 
 # choose compiler and flags for CPU/OMP files
-if [ $ENABLE_MULTITHREADING == 1 ]
+if [ $QUEST_ENABLE_OMP == 1 ]
 then
     echo "${INDENT}(multithreading enabled)"
     echo "${INDENT}${INDENT}[compiling OpenMP]"
-    if [ $ENABLE_NUMA == 1 ]
+    if [ $QUEST_ENABLE_NUMA == 1 ]
     then
         echo "${INDENT}${INDENT}[compiling NUMA]"
     fi
@@ -329,7 +329,7 @@ else
 fi
 
 # choose compiler and flags for GPU files
-if [ $ENABLE_CUDA == 1 ]
+if [ $QUEST_ENABLE_CUDA == 1 ]
 then
     echo "${INDENT}(GPU-acceleration enabled)"
     echo "${INDENT}${INDENT}[compiling CUDA]"
@@ -337,7 +337,7 @@ then
     GPU_FILES_FLAGS=$CUDA_COMP_FLAGS
     ALL_LINK_FLAGS+=" ${CUDA_LINK_FLAGS}"
     GPU_WARNING_FLAGS="-Xcompiler ${WARNING_FLAGS}"
-elif [ $ENABLE_HIP == 1 ]
+elif [ $QUEST_ENABLE_HIP == 1 ]
 then
     echo "${INDENT}(GPU-acceleration enabled)"
     echo "${INDENT}${INDENT}[compiling HIP]"
@@ -353,7 +353,7 @@ else
 fi
 
 # merely report cuQuantum status
-if [ $ENABLE_CUQUANTUM == 1 ]
+if [ $QUEST_ENABLE_CUQUANTUM == 1 ]
 then
     echo "${INDENT}(cuQuantum enabled)"
     echo "${INDENT}${INDENT}[compiling cuStateVec]"
@@ -362,7 +362,7 @@ else
 fi
 
 # choose compiler and flags for communication files
-if [ $ENABLE_DISTRIBUTION == 1 ]
+if [ $QUEST_ENABLE_MPI == 1 ]
 then
     echo "${INDENT}(distribution enabled)"
     echo "${INDENT}${INDENT}[compiling MPI]"
@@ -390,15 +390,15 @@ then
 fi
 
 # display precision
-if [ $FLOAT_PRECISION == 1 ]; then
+if [ $QUEST_FLOAT_PRECISION == 1 ]; then
     echo "${INDENT}(single precision)"
-elif [ $FLOAT_PRECISION == 2 ]; then
+elif [ $QUEST_FLOAT_PRECISION == 2 ]; then
     echo "${INDENT}(double precision)"
-elif [ $FLOAT_PRECISION == 4 ]; then
+elif [ $QUEST_FLOAT_PRECISION == 4 ]; then
     echo "${INDENT}(quad precision)"
 else
     echo ""
-    echo "INVALID FLOAT_PRECISION (${FLOAT_PRECISION})"
+    echo "INVALID QUEST_FLOAT_PRECISION (${QUEST_FLOAT_PRECISION})"
     echo "Exiting..."
-    exit
+    exit 1  # non-zero status: a bare 'exit' returns the preceding echo's status (0), masking the failure from callers
 fi
@@ -420,14 +420,14 @@ then
 fi
 
 # test compiler
-if (( $COMPILE_TESTS == 1 && ENABLE_DEPRECATED_API == 0 ))
+if (( $COMPILE_TESTS == 1 && QUEST_ENABLE_DEPRECATED_API == 0 ))
 then
     echo "${INDENT}tests compiler and flags:"
     echo "${INDENT}${INDENT}${TESTS_COMPILER} ${TEST_COMP_FLAGS} ${WARNING_FLAGS}"
 fi
 
 # deprecated compiler
-if (( $COMPILE_TESTS == 1 && ENABLE_DEPRECATED_API == 1 ))
+if (( $COMPILE_TESTS == 1 && QUEST_ENABLE_DEPRECATED_API == 1 ))
 then
     echo "${INDENT}deprecated tests compiler and flags:"
     echo "${INDENT}${INDENT}${TESTS_COMPILER} ${TEST_DEPR_COMP_FLAGS} ${WARNING_FLAGS}"
@@ -501,15 +501,15 @@ echo "generating headers:"
 
 # write user-options as macros to config.h (and set version info to -1)
 sed \
-  -e "s|#cmakedefine FLOAT_PRECISION @FLOAT_PRECISION@|#define FLOAT_PRECISION ${FLOAT_PRECISION}|" \
-  -e "s|#cmakedefine01 INCLUDE_DEPRECATED_FUNCTIONS|#define INCLUDE_DEPRECATED_FUNCTIONS ${ENABLE_DEPRECATED_API}|" \
-  -e "s|#cmakedefine01 DISABLE_DEPRECATION_WARNINGS|#define DISABLE_DEPRECATION_WARNINGS ${DISABLE_DEPRECATION_WARNINGS}|" \
-  -e "s|#cmakedefine01 COMPILE_OPENMP|#define COMPILE_OPENMP ${ENABLE_MULTITHREADING}|" \
-  -e "s|#cmakedefine01 COMPILE_MPI|#define COMPILE_MPI ${ENABLE_DISTRIBUTION}|" \
-  -e "s|#cmakedefine01 COMPILE_CUDA|#define COMPILE_CUDA $(( ENABLE_CUDA || ENABLE_HIP ))|" \
-  -e "s|#cmakedefine01 COMPILE_CUQUANTUM|#define COMPILE_CUQUANTUM ${ENABLE_CUQUANTUM}|" \
-  -e "s|#cmakedefine01 COMPILE_HIP|#define COMPILE_HIP ${ENABLE_HIP}|" \
-  -e "s|#cmakedefine01 NUMA_AWARE|#define NUMA_AWARE ${ENABLE_NUMA}|" \
+  -e "s|#cmakedefine QUEST_FLOAT_PRECISION @QUEST_FLOAT_PRECISION@|#define QUEST_FLOAT_PRECISION ${QUEST_FLOAT_PRECISION}|" \
+  -e "s|#cmakedefine01 QUEST_INCLUDE_DEPRECATED_FUNCTIONS|#define QUEST_INCLUDE_DEPRECATED_FUNCTIONS ${QUEST_ENABLE_DEPRECATED_API}|" \
+  -e "s|#cmakedefine01 QUEST_DISABLE_DEPRECATION_WARNINGS|#define QUEST_DISABLE_DEPRECATION_WARNINGS ${QUEST_DISABLE_DEPRECATION_WARNINGS}|" \
+  -e "s|#cmakedefine01 QUEST_COMPILE_OMP|#define QUEST_COMPILE_OMP ${QUEST_ENABLE_OMP}|" \
+  -e "s|#cmakedefine01 QUEST_COMPILE_MPI|#define QUEST_COMPILE_MPI ${QUEST_ENABLE_MPI}|" \
+  -e "s|#cmakedefine01 QUEST_COMPILE_CUDA|#define QUEST_COMPILE_CUDA $(( QUEST_ENABLE_CUDA || QUEST_ENABLE_HIP ))|" \
+  -e "s|#cmakedefine01 QUEST_COMPILE_CUQUANTUM|#define QUEST_COMPILE_CUQUANTUM ${QUEST_ENABLE_CUQUANTUM}|" \
+  -e "s|#cmakedefine01 QUEST_COMPILE_HIP|#define QUEST_COMPILE_HIP ${QUEST_ENABLE_HIP}|" \
+  -e "s|#cmakedefine01 QUEST_ENABLE_NUMA|#define QUEST_ENABLE_NUMA ${QUEST_ENABLE_NUMA}|" \
   -e "s|@PROJECT_VERSION@|unknown (not populated by manual compilation)|" \
   -e "s|@PROJECT_VERSION_MAJOR@|-1|" \
   -e "s|@PROJECT_VERSION_MINOR@|-1|" \
@@ -555,7 +555,7 @@ fi
 
 # COMPILING TESTS
 
-if (( $COMPILE_TESTS == 1 && $ENABLE_DEPRECATED_API == 0 ))
+if (( $COMPILE_TESTS == 1 && $QUEST_ENABLE_DEPRECATED_API == 0 ))
 then
 
     echo "compiling unit test files:"
@@ -591,11 +591,11 @@ fi
 
 # COMPILING DEPRECATED TESTS
 
-if (( $COMPILE_TESTS == 1 && $ENABLE_DEPRECATED_API == 1 ))
+if (( $COMPILE_TESTS == 1 && $QUEST_ENABLE_DEPRECATED_API == 1 ))
 then
     echo "compiling deprecated test files:"
 
-    if (( $DISABLE_DEPRECATION_WARNINGS == 0 ))
+    if (( $QUEST_DISABLE_DEPRECATION_WARNINGS == 0 ))
     then
         echo "${INDENT}(beware deprecation warnings were not disabled)"
     fi
@@ -702,12 +702,12 @@ OBJECTS+=" $(printf " ${QUEST_OBJ_PREF}%s.o" "${MPI_FILES[@]}")"
 if (( $COMPILE_TESTS == 0 ))
 then
     OBJECTS+=" $(printf " ${USER_OBJ_PREF}%s.o" "${USER_FILES[@]}")"
-elif (( $COMPILE_TESTS == 1 && $ENABLE_DEPRECATED_API == 0 ))
+elif (( $COMPILE_TESTS == 1 && $QUEST_ENABLE_DEPRECATED_API == 0 ))
 then
     OBJECTS+=" $(printf " ${TEST_OBJ_PREF}%s.o" "${TEST_MAIN_FILES[@]}")"
     OBJECTS+=" $(printf " ${TEST_OBJ_PREF}%s.o" "${TEST_UTIL_FILES[@]}")"
     OBJECTS+=" $(printf " ${TEST_OBJ_PREF}%s.o" "${TEST_UNIT_FILES[@]}")"
-elif (( $COMPILE_TESTS == 1 && $ENABLE_DEPRECATED_API == 1 ))
+elif (( $COMPILE_TESTS == 1 && $QUEST_ENABLE_DEPRECATED_API == 1 ))
 then
     OBJECTS+=" $(printf " ${TEST_OBJ_PREF}%s.o" "${TEST_DEPR_FILES[@]}")"
     OBJECTS+=" $(printf " ${TEST_OBJ_PREF}%s.o" "${TEST_DEPR_MPI_FILES[@]}")"