diff --git a/quest/src/api/channels.cpp b/quest/src/api/channels.cpp index d6e3ac4fb..7024e3754 100644 --- a/quest/src/api/channels.cpp +++ b/quest/src/api/channels.cpp @@ -27,6 +27,19 @@ using std::vector; +/* + * EXTERNAL FUNCTIONS + * + * which are regrettably extern'd, rather than included + * in a header, because they are defined within /api/, + * within which the headers are user-visible. Gross! + */ + + +extern bool env_isDistributed(); + + + /* * PRIVATE UTILITIES */ @@ -107,7 +120,7 @@ void freeAllMemoryIfAnyAllocsFailed(T& obj) { // determine whether any node experienced a failure bool anyFail = didAnyLocalAllocsFail(obj); - if (comm_isInit()) + if (env_isDistributed()) anyFail = comm_isTrueOnAllNodes(anyFail); // if so, free all memory before subsequent validation diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp index ab18ced91..cd924216f 100644 --- a/quest/src/api/environment.cpp +++ b/quest/src/api/environment.cpp @@ -66,6 +66,27 @@ static bool global_hasEnvBeenFinalized = false; +/* + * PUBLIC ENV PROPERTIES + * + * which are not exposed in the header, since the header is user-facing! + * For now, we grossly require callers to 'extern' these, blegh! + */ + + +bool env_isDistributed() { + + if (global_envPtr == nullptr) + error_envIsNullPtr(); + + if (global_envPtr->isDistributed && ! comm_isInit()) + error_envDistributedButCommIsNotInit(); + + return global_envPtr->isDistributed; +} + + + /* * PRIVATE QUESTENV INITIALISATION INNER FUNCTIONS */ diff --git a/quest/src/api/matrices.cpp b/quest/src/api/matrices.cpp index b17987eb4..4a5c4a4b6 100644 --- a/quest/src/api/matrices.cpp +++ b/quest/src/api/matrices.cpp @@ -37,6 +37,19 @@ using std::vector; +/* + * EXTERNAL FUNCTIONS + * + * which are regrettably extern'd, rather than included + * in a header, because they are defined within /api/, + * within which the headers are user-visible. Gross! + */ + + +extern bool env_isDistributed(); + + + /* * FIZED-SIZE MATRIX VECTOR GETTERS * @@ -165,7 +178,7 @@ void freeAllMemoryIfAnyAllocsFailed(T matr) { // ascertain whether any allocs failed on any node bool anyFail = didAnyLocalAllocsFail(matr); - if (comm_isInit()) + if (env_isDistributed()) anyFail = comm_isTrueOnAllNodes(anyFail); // if so, free all heap fields diff --git a/quest/src/api/paulis.cpp b/quest/src/api/paulis.cpp index 855a9cfd8..11b4d2e98 100644 --- a/quest/src/api/paulis.cpp +++ b/quest/src/api/paulis.cpp @@ -26,6 +26,19 @@ using std::vector; +/* + * EXTERNAL FUNCTIONS + * + * which are regrettably extern'd, rather than included + * in a header, because they are defined within /api/, + * within which the headers are user-visible. Gross! + */ + + +extern bool env_isDistributed(); + + + /* * PRIVATE UTILITIES */ @@ -38,7 +51,7 @@ bool didAnyAllocsFailOnAnyNode(PauliStrSum sum) { ! mem_isAllocated(sum.coeffs) || ! mem_isAllocated(sum.isApproxHermitian) ); - if (comm_isInit()) + if (env_isDistributed()) anyFail = comm_isTrueOnAllNodes(anyFail); return anyFail; diff --git a/quest/src/api/qureg.cpp b/quest/src/api/qureg.cpp index 034c96e5c..53c00bd1e 100644 --- a/quest/src/api/qureg.cpp +++ b/quest/src/api/qureg.cpp @@ -30,6 +30,19 @@ using std::vector; +/* + * EXTERNAL FUNCTIONS + * + * which are regrettably extern'd, rather than included + * in a header, because they are defined within /api/, + * within which the headers are user-visible. Gross! + */ + + +extern bool env_isDistributed(); + + + /* * INTERNALLY EXPOSED FUNCTION */ @@ -116,7 +129,7 @@ bool didAnyLocalAllocsFail(Qureg qureg) { bool didAnyAllocsFailOnAnyNode(Qureg qureg) { bool anyFail = didAnyLocalAllocsFail(qureg); - if (comm_isInit()) + if (env_isDistributed()) anyFail = comm_isTrueOnAllNodes(anyFail); return anyFail; diff --git a/quest/src/comm/comm_config.cpp b/quest/src/comm/comm_config.cpp index c69e72919..2f28ea246 100644 --- a/quest/src/comm/comm_config.cpp +++ b/quest/src/comm/comm_config.cpp @@ -6,8 +6,8 @@ * * Note that even when QUEST_COMPILE_MPI=1, the user may have * disabled distribution when creating the QuEST environment - * at runtime. Ergo we use comm_isInit() to determine whether - * functions should invoke the MPI API. + * at runtime. Ergo we use env_isDistributed() to determine + * whether functions should invoke the MPI API. * * @author Tyson Jones */ @@ -53,6 +53,28 @@ +/* + * EXTERNAL FUNCTIONS + * + * which are regrettably extern'd, rather than included + * in a header, because they are defined within /api/, + * within which the headers are user-visible. Gross! + */ + + +// NOTE I suspect this function is redundant; that we +// can actually just consult whether global_mpiComm +// is NULL to determine whether QuEST distribution is +// active, which may avoid the pitfalls of this function; +// chiefly, that it must not be called before QuESTEnv +// creation, and so must not be invoked during failed +// initQuESTEnv validation (which happens because the +// error-msg printing attempts to print on root only) + +extern bool env_isDistributed(); + + + /* * MPI ENVIRONMENT MANAGEMENT * @@ -154,22 +176,31 @@ void comm_end(bool userOwnsMpi) { int comm_getRank() { #if QUEST_COMPILE_MPI - // if distribution was not runtime enabled (or a validation error was - // triggered), every node (if many MPI processes were launched) - // believes it is the root rank + // We must return ROOT_RANK whenever !env_isDistributed(), but alas we + // cannot immediately call that, because this function can be triggered + // BEFORE QuESTEnv is successfully created; this happens when validation + // during initQuESTEnv failed, triggering print(), which calls this + // function to avoid non-root printing! We first check whether MPI itself + // was ever initialised (by us, or by an MPI-owning user); if no, we exit. if (!comm_isInit()) return ROOT_RANK; // Consult the (potentially sub-) communicator for rank; if it is still - // NULL, as can only validly happen during failed QuESTEnv init validation - // (which triggers root-only error printing and ergo this function), we - // fall back to every process believing it is root and so attempting to - // print. This safely avoids consulting a potentially bugged MPI communicator - // and losing the message. We once tried to fallback to MPI_COMM_WORLD here, + // NULL, as can only validly happen during failed QuESTEnv init validation, + // we fall back to every process believing it is root; all attempts to print. + // This safely avoids consulting a potentially bugged MPI communicator + // and losing the message. We COULD try to fallback to MPI_COMM_WORLD here, // to avoid duplicate output, but it is not worth the risk of msg loss! if (global_mpiComm == MPI_COMM_NULL) return ROOT_RANK; + // Finally, we must not query MPI if it is user-owned, and QuEST has been + // deployed non-distributed. This scenario actually triggers mpiComm==NULL + // above, but we make it very explicit here to highlight that this can + // occur when input validation has NOT been failed. + if (!env_isDistributed()) + return ROOT_RANK; + int rank; MPI_Comm_rank(global_mpiComm, &rank); return rank; @@ -193,12 +224,19 @@ bool comm_isRootNode() { int comm_getNumNodes() { #if QUEST_COMPILE_MPI - // if distribution was not runtime enabled (or a validation error was - // triggered), every node (if many MPI processes were launched) - // believes it is the one and only node + // if MPI is not initialised, either deliberately or because a validation + // error triggered during initialisation, then every node believes it is + // the one and only root node if (!comm_isInit()) return 1; + // if MPI is initialised, but QuEST is non-distributed, then the MPI is + // user-owned and must not influence our QuEST numNodes. We call this + // AFTER !comm_isInit() because it requires the QuESTEnv has been prior + // validly created, so failed validation must encounter above pathway first. + if (!env_isDistributed()) + return 1; + int numNodes; MPI_Comm_size(global_mpiComm, &numNodes); return numNodes; @@ -224,6 +262,14 @@ void comm_sync() { if (global_mpiComm == MPI_COMM_NULL) return; + // when MPI is user-owned (QuEST is not distributed), we never consult it! This + // is checked last because it requires QuESTEnv is validly created, though this + // function can be triggered during failed initQuESTEnv validation. Note too that + // this scenario is already covered by mpiComm == NULL, but we handle it here to + // highlight that this is a valid non-error-triggered scenario! + if (!env_isDistributed()) + return; + MPI_Barrier(global_mpiComm); #endif } diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp index 862136a9c..fd3ae78b7 100644 --- a/quest/src/core/errors.cpp +++ b/quest/src/core/errors.cpp @@ -94,6 +94,16 @@ void error_allocOfQuESTEnvFailed() { raiseInternalError("Attempted memory allocation for the newly created QuESTEnv unexpectedly failed."); } +void error_envIsNullPtr() { + + raiseInternalError("The private QuESTEnv instance of environment.cpp was unexpectedly nullptr, implying an internal function queried it before QuESTEnv initialisation."); +} + +void error_envDistributedButCommIsNotInit() { + + raiseInternalError("The QuESTEnv was believed distributed but the communicator was not initialised."); +} + /* diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp index 33cc0661d..9a5491049 100644 --- a/quest/src/core/errors.hpp +++ b/quest/src/core/errors.hpp @@ -49,6 +49,11 @@ void error_validationListUniquenessCheckExceededMaskSize(); void error_allocOfQuESTEnvFailed(); +void error_envIsNullPtr(); + +void error_envDistributedButCommIsNotInit(); + + /* diff --git a/quest/src/core/randomiser.cpp b/quest/src/core/randomiser.cpp index 65c6da4eb..05150882b 100644 --- a/quest/src/core/randomiser.cpp +++ b/quest/src/core/randomiser.cpp @@ -28,6 +28,19 @@ using std::vector; +/* + * EXTERNAL FUNCTIONS + * + * which are regrettably extern'd, rather than included + * in a header, because they are defined within /api/, + * within which the headers are user-visible. Gross! + */ + + +extern bool env_isDistributed(); + + + /* * RNG HYPERPARAMTERS */ @@ -66,14 +79,14 @@ void rand_setSeeds(vector seeds) { // all nodes learn root node's #seeds unsigned numRootSeeds = seeds.size(); - if (comm_isInit()) + if (env_isDistributed()) comm_broadcastUnsignedsFromRoot(&numRootSeeds, 1); // all nodes ensure they have space to receive root node's seeds seeds.resize(numRootSeeds); // all nodes receive root seeds - if (comm_isInit()) + if (env_isDistributed()) comm_broadcastUnsignedsFromRoot(seeds.data(), seeds.size()); // all nodes remember seeds (in case user wishes to later recall them) diff --git a/quest/src/core/validation.cpp b/quest/src/core/validation.cpp index e1df0af76..199891f6c 100644 --- a/quest/src/core/validation.cpp +++ b/quest/src/core/validation.cpp @@ -52,6 +52,18 @@ using std::vector; +/* + * EXTERNAL FUNCTIONS + * + * which are regrettably extern'd, rather than included + * in a header, because they are defined within /api/, + * within which the headers are user-visible. Gross! + */ + +extern bool env_isDistributed(); + + + /* * INVALID INPUT ERROR MESSAGES * which can contain variables with syntax ${VAR1} ${VAR2}, substituted at error-throw with @@ -119,9 +131,6 @@ namespace report { string QUEST_OWNED_MPI_WAS_PRE_INIT = "MPI was already initialised prior to QuESTEnv initialisation, but the user did not declare MPI ownership."; - string QUEST_IS_NON_DISTRIBUTED_BUT_MPI_WAS_INIT = - "QuESTEnv was initialised to be non-distributed but MPI was externally initialised - this is presently unsupported due to a (very minor) technical limitation. If you need this facility, please raise a Github issue!"; - /* * EXISTING QUESTENV @@ -1172,6 +1181,7 @@ void default_inputErrorHandler(const char* func, const char* msg) { // finalise MPI before error-exit to avoid scaring user with giant MPI error message; // we always "take ownership" of MPI here since we're about to kill the whole program + // (hence why we consult comm_isInit(), and not env_isDistributed() - everyone loses!) if (comm_isInit()) comm_end(/*userOwnsMpi=*/false); @@ -1355,7 +1365,7 @@ void assertAllNodesAgreeThat(bool valid, string msg, tokenSubs vars, const char* // when performing validation that may be non-uniform between nodes. For // example, mallocs may succeed on one node but fail on another due to // inhomogeneous loads. - if (comm_isInit()) + if (env_isDistributed()) valid = comm_isTrueOnAllNodes(valid); // prepare error message only if validation will fail @@ -1511,27 +1521,18 @@ void validate_mpiInitStatus(bool useDistrib, bool userOwnsMpi, const char* calle if (!userOwnsMpi) assertThat(!isMpiInit, report::QUEST_OWNED_MPI_WAS_PRE_INIT, caller); - // (B) If QuEST is instructed not to use distribution, we must demand the user is not - // using MPI, because we internally consult comm_isInit() to detect QuEST distribution - // in many functions, and that will give a false positive when the user inits MPI directly. - if (!useDistrib) - assertThat(!isMpiInit, report::QUEST_IS_NON_DISTRIBUTED_BUT_MPI_WAS_INIT, caller); - - // TODO: we can relax above, permitting the user to play with MPI directly while - // disabling it for QuEST, by replacing internal comm_isInit() with e.g. env_isDistributed() - - // (C) If QuEST will use MPI owned by the user, the user must have pre-initialised it + // (B) If QuEST will use MPI owned by the user, the user must have pre-initialised it if (useDistrib && userOwnsMpi) assertThat(isMpiInit, report::USER_OWNED_MPI_WAS_NOT_INIT, caller); // Confirmation that all 8 scenarios are handled: // useDistrib=0, userOwnsMpi=0, isMpiInit=0 (legal: nobody wants MPI) // (A) useDistrib=0, userOwnsMpi=0, isMpiInit=1 (illegal: user lied about ownership) - // useDistrib=0, userOwnsMpi=1, isMpiInit=0 (legal: user owns MPI but does nothing!) - // (B) useDistrib=0, userOwnsMpi=1, isMpiInit=1 (illegal: comm_isInit() limitation as above) + // useDistrib=0, userOwnsMpi=1, isMpiInit=0 (legal: user "owns" MPI but doesn't use it!) + // useDistrib=0, userOwnsMpi=1, isMpiInit=1 (legal: user owns MPI but doesn't give it to QuEST) // useDistrib=1, userOwnsMpi=0, isMpiInit=0 (legal: QuEST will init MPI) // (A) useDistrib=1, userOwnsMpi=0, isMpiInit=1 (illegal: user lied about ownership) - // (C) useDistrib=1, userOwnsMpi=1, isMpiInit=0 (illegal: user has reponsibility to pre-init) + // (B) useDistrib=1, userOwnsMpi=1, isMpiInit=0 (illegal: user has reponsibility to pre-init) // useDistrib=1, userOwnsMpi=1, isMpiInit=1 (legal: user fulfilled responsibility to pre-init) } diff --git a/quest/src/gpu/gpu_config.cpp b/quest/src/gpu/gpu_config.cpp index 5bf4b257f..af5dd491a 100644 --- a/quest/src/gpu/gpu_config.cpp +++ b/quest/src/gpu/gpu_config.cpp @@ -43,6 +43,19 @@ +/* + * EXTERNAL FUNCTIONS + * + * which are regrettably extern'd, rather than included + * in a header, because they are defined within /api/, + * within which the headers are user-visible. Gross! + */ + + +extern bool env_isDistributed(); + + + /* * CUDA ERROR HANDLING * @@ -395,9 +408,36 @@ bool gpu_areAnyNodesBoundToSameGpu() { #if QUEST_COMPILE_CUDA assert_gpuHasBeenBound(hasGpuBeenBound); + // if not even the user has triggered MPI, there's no issue if (!comm_isInit()) return false; + // if MPI is active, but QuEST isn't distributed, then we have + // no way to communicate the GPU IDs and so assume NO. Note that + // this check happens strictly AFTER comm_isInit(), because this + // ... + // WAIT + // ... + // RUH ROH + // ... + // THIS FUNCTION IS CALLED DURING QUEST ENV INITIALISATION! SO THE + // BELOW CALL WOULD ALWAYS BE PREMATURE AND TRIGGER AN INTERNAL ERROR + // (QuESTEnv not yet initialised)!! BUT WE CANNOT SIMPLY NOT CALL IT + // AND PROCEED, BECAUSE WE MUST NOT TOUCH MPI IF IT IS USER-OWNED + // AND QUEST IS NOT DISTRIBUTED (because we cannot even be sure that + // all processes are participating in initQuESTEnv()!) + // + // BAH! So env_isDistributed() is badly designed; we need a way to + // see that QuEST was never INTENDED to be distributed during its + // validation stage. So I guess we WILL need to commit comm_isActive() + // or similar, to disambiguate from comm_isInit (which should be + // renamed to an explicit comm_isMpiInit()) + // + // REEEEEEE! + + if (!env_isDistributed()) + return false; + // obtain bound GPU's UUID; a unique identifier 16-char identifier auto uuidStr = getBoundGpuUuid();