From dc056cbf3f68f0d563557afa5c8d3be6db41bd65 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Sat, 30 May 2026 02:49:49 -0400 Subject: [PATCH] (invalid) define env_isDistributed() This was an attempt to avoid consulting comm_isInit() internally, which only indicates MPI is active, and not whether QuEST is allowed to use it! (QuEST itself may be launched non-distributed). It was already an ugly attempt because of the nuisance of not exposing env_isDistributed in a header... But it is in fact a totally inadequate solution due to a single function: gpu_areAnyNodesBoundToSameGpu()! This is the only function in all of QuEST which needs to perform an MPI communication before the validation within initQuESTEnv has been completed (indeed, it's invoked by validation), in scenarios where the validation is not currently failing. Grr! --- quest/src/api/channels.cpp | 15 ++++++- quest/src/api/environment.cpp | 21 ++++++++++ quest/src/api/matrices.cpp | 15 ++++++- quest/src/api/paulis.cpp | 15 ++++++- quest/src/api/qureg.cpp | 15 ++++++- quest/src/comm/comm_config.cpp | 72 ++++++++++++++++++++++++++++------ quest/src/core/errors.cpp | 10 +++++ quest/src/core/errors.hpp | 5 +++ quest/src/core/randomiser.cpp | 17 +++++++- quest/src/core/validation.cpp | 35 +++++++++-------- quest/src/gpu/gpu_config.cpp | 40 +++++++++++++++++++ 11 files changed, 224 insertions(+), 36 deletions(-) diff --git a/quest/src/api/channels.cpp b/quest/src/api/channels.cpp index d6e3ac4fb..7024e3754 100644 --- a/quest/src/api/channels.cpp +++ b/quest/src/api/channels.cpp @@ -27,6 +27,19 @@ using std::vector; +/* + * EXTERNAL FUNCTIONS + * + * which are regrettably extern'd, rather than included + * in a header, because they are defined within /api/, + * within which the headers are user-visible. Gross! + */ + + +extern bool env_isDistributed(); + + + /* * PRIVATE UTILITIES */ @@ -107,7 +120,7 @@ void freeAllMemoryIfAnyAllocsFailed(T& obj) { // determine whether any node experienced a failure bool anyFail = didAnyLocalAllocsFail(obj); - if (comm_isInit()) + if (env_isDistributed()) anyFail = comm_isTrueOnAllNodes(anyFail); // if so, free all memory before subsequent validation diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp index ab18ced91..cd924216f 100644 --- a/quest/src/api/environment.cpp +++ b/quest/src/api/environment.cpp @@ -66,6 +66,27 @@ static bool global_hasEnvBeenFinalized = false; +/* + * PUBLIC ENV PROPERTIES + * + * which are not exposed in the header, since the header is user-facing! + * For now, we grossly require callers to 'extern' these, blegh! + */ + + +bool env_isDistributed() { + + if (global_envPtr == nullptr) + error_envIsNullPtr(); + + if (global_envPtr->isDistributed && ! comm_isInit()) + error_envDistributedButCommIsNotInit(); + + return global_envPtr->isDistributed; +} + + + /* * PRIVATE QUESTENV INITIALISATION INNER FUNCTIONS */ diff --git a/quest/src/api/matrices.cpp b/quest/src/api/matrices.cpp index b17987eb4..4a5c4a4b6 100644 --- a/quest/src/api/matrices.cpp +++ b/quest/src/api/matrices.cpp @@ -37,6 +37,19 @@ using std::vector; +/* + * EXTERNAL FUNCTIONS + * + * which are regrettably extern'd, rather than included + * in a header, because they are defined within /api/, + * within which the headers are user-visible. Gross! + */ + + +extern bool env_isDistributed(); + + + /* * FIZED-SIZE MATRIX VECTOR GETTERS * @@ -165,7 +178,7 @@ void freeAllMemoryIfAnyAllocsFailed(T matr) { // ascertain whether any allocs failed on any node bool anyFail = didAnyLocalAllocsFail(matr); - if (comm_isInit()) + if (env_isDistributed()) anyFail = comm_isTrueOnAllNodes(anyFail); // if so, free all heap fields diff --git a/quest/src/api/paulis.cpp b/quest/src/api/paulis.cpp index 855a9cfd8..11b4d2e98 100644 --- a/quest/src/api/paulis.cpp +++ b/quest/src/api/paulis.cpp @@ -26,6 +26,19 @@ using std::vector; +/* + * EXTERNAL FUNCTIONS + * + * which are regrettably extern'd, rather than included + * in a header, because they are defined within /api/, + * within which the headers are user-visible. Gross! + */ + + +extern bool env_isDistributed(); + + + /* * PRIVATE UTILITIES */ @@ -38,7 +51,7 @@ bool didAnyAllocsFailOnAnyNode(PauliStrSum sum) { ! mem_isAllocated(sum.coeffs) || ! mem_isAllocated(sum.isApproxHermitian) ); - if (comm_isInit()) + if (env_isDistributed()) anyFail = comm_isTrueOnAllNodes(anyFail); return anyFail; diff --git a/quest/src/api/qureg.cpp b/quest/src/api/qureg.cpp index 034c96e5c..53c00bd1e 100644 --- a/quest/src/api/qureg.cpp +++ b/quest/src/api/qureg.cpp @@ -30,6 +30,19 @@ using std::vector; +/* + * EXTERNAL FUNCTIONS + * + * which are regrettably extern'd, rather than included + * in a header, because they are defined within /api/, + * within which the headers are user-visible. Gross! + */ + + +extern bool env_isDistributed(); + + + /* * INTERNALLY EXPOSED FUNCTION */ @@ -116,7 +129,7 @@ bool didAnyLocalAllocsFail(Qureg qureg) { bool didAnyAllocsFailOnAnyNode(Qureg qureg) { bool anyFail = didAnyLocalAllocsFail(qureg); - if (comm_isInit()) + if (env_isDistributed()) anyFail = comm_isTrueOnAllNodes(anyFail); return anyFail; diff --git a/quest/src/comm/comm_config.cpp b/quest/src/comm/comm_config.cpp index c69e72919..2f28ea246 100644 --- a/quest/src/comm/comm_config.cpp +++ b/quest/src/comm/comm_config.cpp @@ -6,8 +6,8 @@ * * Note that even when QUEST_COMPILE_MPI=1, the user may have * disabled distribution when creating the QuEST environment - * at runtime. Ergo we use comm_isInit() to determine whether - * functions should invoke the MPI API. + * at runtime. Ergo we use env_isDistributed() to determine + * whether functions should invoke the MPI API. * * @author Tyson Jones */ @@ -53,6 +53,28 @@ +/* + * EXTERNAL FUNCTIONS + * + * which are regrettably extern'd, rather than included + * in a header, because they are defined within /api/, + * within which the headers are user-visible. Gross! + */ + + +// NOTE I suspect this function is redundant; that we +// can actually just consult whether global_mpiComm +// is NULL to determine whether QuEST distribution is +// active, which may avoid the pitfalls of this function; +// chiefly, that it must not be called before QuESTEnv +// creation, and so must not be invoked during failed +// initQuESTEnv validation (which happens because the +// error-msg printing attempts to print on root only) + +extern bool env_isDistributed(); + + + /* * MPI ENVIRONMENT MANAGEMENT * @@ -154,22 +176,31 @@ void comm_end(bool userOwnsMpi) { int comm_getRank() { #if QUEST_COMPILE_MPI - // if distribution was not runtime enabled (or a validation error was - // triggered), every node (if many MPI processes were launched) - // believes it is the root rank + // We must return ROOT_RANK whenever !env_isDistributed(), but alas we + // cannot immediately call that, because this function can be triggered + // BEFORE QuESTEnv is successfully created; this happens when validation + // during initQuESTEnv failed, triggering print(), which calls this + // function to avoid non-root printing! We first check whether MPI itself + // was ever initialised (by us, or by an MPI-owning user); if no, we exit. if (!comm_isInit()) return ROOT_RANK; // Consult the (potentially sub-) communicator for rank; if it is still - // NULL, as can only validly happen during failed QuESTEnv init validation - // (which triggers root-only error printing and ergo this function), we - // fall back to every process believing it is root and so attempting to - // print. This safely avoids consulting a potentially bugged MPI communicator - // and losing the message. We once tried to fallback to MPI_COMM_WORLD here, + // NULL, as can only validly happen during failed QuESTEnv init validation, + // we fall back to every process believing it is root; all attempts to print. + // This safely avoids consulting a potentially bugged MPI communicator + // and losing the message. We COULD try to fallback to MPI_COMM_WORLD here, // to avoid duplicate output, but it is not worth the risk of msg loss! if (global_mpiComm == MPI_COMM_NULL) return ROOT_RANK; + // Finally, we must not query MPI if it is user-owned, and QuEST has been + // deployed non-distributed. This scenario actually triggers mpiComm==NULL + // above, but we make it very explicit here to highlight that this can + // occur when input validation has NOT been failed. + if (!env_isDistributed()) + return ROOT_RANK; + int rank; MPI_Comm_rank(global_mpiComm, &rank); return rank; @@ -193,12 +224,19 @@ bool comm_isRootNode() { int comm_getNumNodes() { #if QUEST_COMPILE_MPI - // if distribution was not runtime enabled (or a validation error was - // triggered), every node (if many MPI processes were launched) - // believes it is the one and only node + // if MPI is not initialised, either deliberately or because a validation + // error triggered during initialisation, then every node believes it is + // the one and only root node if (!comm_isInit()) return 1; + // if MPI is initialised, but QuEST is non-distributed, then the MPI is + // user-owned and must not influence our QuEST numNodes. We call this + // AFTER !comm_isInit() because it requires the QuESTEnv has been prior + // validly created, so failed validation must encounter above pathway first. + if (!env_isDistributed()) + return 1; + int numNodes; MPI_Comm_size(global_mpiComm, &numNodes); return numNodes; @@ -224,6 +262,14 @@ void comm_sync() { if (global_mpiComm == MPI_COMM_NULL) return; + // when MPI is user-owned (QuEST is not distributed), we never consult it! This + // is checked last because it requires QuESTEnv is validly created, though this + // function can be triggered during failed initQuESTEnv validation. Note too that + // this scenario is already covered by mpiComm == NULL, but we handle it here to + // highlight that this is a valid non-error-triggered scenario! + if (!env_isDistributed()) + return; + MPI_Barrier(global_mpiComm); #endif } diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp index 862136a9c..fd3ae78b7 100644 --- a/quest/src/core/errors.cpp +++ b/quest/src/core/errors.cpp @@ -94,6 +94,16 @@ void error_allocOfQuESTEnvFailed() { raiseInternalError("Attempted memory allocation for the newly created QuESTEnv unexpectedly failed."); } +void error_envIsNullPtr() { + + raiseInternalError("The private QuESTEnv instance of environment.cpp was unexpectedly nullptr, implying an internal function queried it before QuESTEnv initialisation."); +} + +void error_envDistributedButCommIsNotInit() { + + raiseInternalError("The QuESTEnv was believed distributed but the communicator was not initialised."); +} + /* diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp index 33cc0661d..9a5491049 100644 --- a/quest/src/core/errors.hpp +++ b/quest/src/core/errors.hpp @@ -49,6 +49,11 @@ void error_validationListUniquenessCheckExceededMaskSize(); void error_allocOfQuESTEnvFailed(); +void error_envIsNullPtr(); + +void error_envDistributedButCommIsNotInit(); + + /* diff --git a/quest/src/core/randomiser.cpp b/quest/src/core/randomiser.cpp index 65c6da4eb..05150882b 100644 --- a/quest/src/core/randomiser.cpp +++ b/quest/src/core/randomiser.cpp @@ -28,6 +28,19 @@ using std::vector; +/* + * EXTERNAL FUNCTIONS + * + * which are regrettably extern'd, rather than included + * in a header, because they are defined within /api/, + * within which the headers are user-visible. Gross! + */ + + +extern bool env_isDistributed(); + + + /* * RNG HYPERPARAMTERS */ @@ -66,14 +79,14 @@ void rand_setSeeds(vector seeds) { // all nodes learn root node's #seeds unsigned numRootSeeds = seeds.size(); - if (comm_isInit()) + if (env_isDistributed()) comm_broadcastUnsignedsFromRoot(&numRootSeeds, 1); // all nodes ensure they have space to receive root node's seeds seeds.resize(numRootSeeds); // all nodes receive root seeds - if (comm_isInit()) + if (env_isDistributed()) comm_broadcastUnsignedsFromRoot(seeds.data(), seeds.size()); // all nodes remember seeds (in case user wishes to later recall them) diff --git a/quest/src/core/validation.cpp b/quest/src/core/validation.cpp index e1df0af76..199891f6c 100644 --- a/quest/src/core/validation.cpp +++ b/quest/src/core/validation.cpp @@ -52,6 +52,18 @@ using std::vector; +/* + * EXTERNAL FUNCTIONS + * + * which are regrettably extern'd, rather than included + * in a header, because they are defined within /api/, + * within which the headers are user-visible. Gross! + */ + +extern bool env_isDistributed(); + + + /* * INVALID INPUT ERROR MESSAGES * which can contain variables with syntax ${VAR1} ${VAR2}, substituted at error-throw with @@ -119,9 +131,6 @@ namespace report { string QUEST_OWNED_MPI_WAS_PRE_INIT = "MPI was already initialised prior to QuESTEnv initialisation, but the user did not declare MPI ownership."; - string QUEST_IS_NON_DISTRIBUTED_BUT_MPI_WAS_INIT = - "QuESTEnv was initialised to be non-distributed but MPI was externally initialised - this is presently unsupported due to a (very minor) technical limitation. If you need this facility, please raise a Github issue!"; - /* * EXISTING QUESTENV @@ -1172,6 +1181,7 @@ void default_inputErrorHandler(const char* func, const char* msg) { // finalise MPI before error-exit to avoid scaring user with giant MPI error message; // we always "take ownership" of MPI here since we're about to kill the whole program + // (hence why we consult comm_isInit(), and not env_isDistributed() - everyone loses!) if (comm_isInit()) comm_end(/*userOwnsMpi=*/false); @@ -1355,7 +1365,7 @@ void assertAllNodesAgreeThat(bool valid, string msg, tokenSubs vars, const char* // when performing validation that may be non-uniform between nodes. For // example, mallocs may succeed on one node but fail on another due to // inhomogeneous loads. - if (comm_isInit()) + if (env_isDistributed()) valid = comm_isTrueOnAllNodes(valid); // prepare error message only if validation will fail @@ -1511,27 +1521,18 @@ void validate_mpiInitStatus(bool useDistrib, bool userOwnsMpi, const char* calle if (!userOwnsMpi) assertThat(!isMpiInit, report::QUEST_OWNED_MPI_WAS_PRE_INIT, caller); - // (B) If QuEST is instructed not to use distribution, we must demand the user is not - // using MPI, because we internally consult comm_isInit() to detect QuEST distribution - // in many functions, and that will give a false positive when the user inits MPI directly. - if (!useDistrib) - assertThat(!isMpiInit, report::QUEST_IS_NON_DISTRIBUTED_BUT_MPI_WAS_INIT, caller); - - // TODO: we can relax above, permitting the user to play with MPI directly while - // disabling it for QuEST, by replacing internal comm_isInit() with e.g. env_isDistributed() - - // (C) If QuEST will use MPI owned by the user, the user must have pre-initialised it + // (B) If QuEST will use MPI owned by the user, the user must have pre-initialised it if (useDistrib && userOwnsMpi) assertThat(isMpiInit, report::USER_OWNED_MPI_WAS_NOT_INIT, caller); // Confirmation that all 8 scenarios are handled: // useDistrib=0, userOwnsMpi=0, isMpiInit=0 (legal: nobody wants MPI) // (A) useDistrib=0, userOwnsMpi=0, isMpiInit=1 (illegal: user lied about ownership) - // useDistrib=0, userOwnsMpi=1, isMpiInit=0 (legal: user owns MPI but does nothing!) - // (B) useDistrib=0, userOwnsMpi=1, isMpiInit=1 (illegal: comm_isInit() limitation as above) + // useDistrib=0, userOwnsMpi=1, isMpiInit=0 (legal: user "owns" MPI but doesn't use it!) + // useDistrib=0, userOwnsMpi=1, isMpiInit=1 (legal: user owns MPI but doesn't give it to QuEST) // useDistrib=1, userOwnsMpi=0, isMpiInit=0 (legal: QuEST will init MPI) // (A) useDistrib=1, userOwnsMpi=0, isMpiInit=1 (illegal: user lied about ownership) - // (C) useDistrib=1, userOwnsMpi=1, isMpiInit=0 (illegal: user has reponsibility to pre-init) + // (B) useDistrib=1, userOwnsMpi=1, isMpiInit=0 (illegal: user has reponsibility to pre-init) // useDistrib=1, userOwnsMpi=1, isMpiInit=1 (legal: user fulfilled responsibility to pre-init) } diff --git a/quest/src/gpu/gpu_config.cpp b/quest/src/gpu/gpu_config.cpp index 5bf4b257f..af5dd491a 100644 --- a/quest/src/gpu/gpu_config.cpp +++ b/quest/src/gpu/gpu_config.cpp @@ -43,6 +43,19 @@ +/* + * EXTERNAL FUNCTIONS + * + * which are regrettably extern'd, rather than included + * in a header, because they are defined within /api/, + * within which the headers are user-visible. Gross! + */ + + +extern bool env_isDistributed(); + + + /* * CUDA ERROR HANDLING * @@ -395,9 +408,36 @@ bool gpu_areAnyNodesBoundToSameGpu() { #if QUEST_COMPILE_CUDA assert_gpuHasBeenBound(hasGpuBeenBound); + // if not even the user has triggered MPI, there's no issue if (!comm_isInit()) return false; + // if MPI is active, but QuEST isn't distributed, then we have + // no way to communicate the GPU IDs and so assume NO. Note that + // this check happens strictly AFTER comm_isInit(), because this + // ... + // WAIT + // ... + // RUH ROH + // ... + // THIS FUNCTION IS CALLED DURING QUEST ENV INITIALISATION! SO THE + // BELOW CALL WOULD ALWAYS BE PREMATURE AND TRIGGER AN INTERNAL ERROR + // (QuESTEnv not yet initialised)!! BUT WE CANNOT SIMPLY NOT CALL IT + // AND PROCEED, BECAUSE WE MUST NOT TOUCH MPI IF IT IS USER-OWNED + // AND QUEST IS NOT DISTRIBUTED (because we cannot even be sure that + // all processes are participating in initQuESTEnv()!) + // + // BAH! So env_isDistributed() is badly designed; we need a way to + // see that QuEST was never INTENDED to be distributed during its + // validation stage. So I guess we WILL need to commit comm_isActive() + // or similar, to disambiguate from comm_isInit (which should be + // renamed to an explicit comm_isMpiInit()) + // + // REEEEEEE! + + if (!env_isDistributed()) + return false; + // obtain bound GPU's UUID; a unique identifier 16-char identifier auto uuidStr = getBoundGpuUuid();