diff --git a/quest/include/environment.h b/quest/include/environment.h index a584192d7..c3d867671 100644 --- a/quest/include/environment.h +++ b/quest/include/environment.h @@ -64,12 +64,6 @@ void initQuESTEnv(); */ void initCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMultithread); -/** @notyetdoced - * Advanced initialiser which lets the user positively declare that they take responsibility for MPI. - * This means we assume they have called MPI_Init, and that they will call MPI_Finalize. - */ -void initCustomMpiQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuAccel, int useMultithread); - /// @notyetdoced void finalizeQuESTEnv(); diff --git a/quest/include/experimental.h b/quest/include/experimental.h new file mode 100644 index 000000000..2fabdc34f --- /dev/null +++ b/quest/include/experimental.h @@ -0,0 +1,75 @@ +/** @file + * Experimental functions which are liable to + * API breaks within QuEST minor version releases. + * Some optional functions require compiling this + * file against MPI, despite being outside of /comm/, + * and so require opt-in macros (QUEST_COMPILE_SUBCOMM) + * + * @author Oliver Brown + * @author Tyson Jones (formatting) + * + * @defgroup experimental Experimental + * @ingroup api + * @brief Experimental functions with tentative APIs + * @{ + */ + +#ifndef EXPERIMENTAL_H +#define EXPERIMENTAL_H + +#include "quest/include/config.h" + +#if QUEST_COMPILE_SUBCOMM && ! QUEST_COMPILE_MPI + #error "Macro QUEST_COMPILE_SUBCOMM was true, but QUEST_COMPILE_MPI was illegally false." +#endif + +#if QUEST_COMPILE_SUBCOMM + #include <mpi.h> +#endif + +// enable invocation by both C and C++ binaries +#ifdef __cplusplus +extern "C" { +#endif + + +/** @notyetdoced + * + * Advanced initialiser which lets the user positively declare that they take responsibility for MPI. + * This means we assume they have called MPI_Init, and that they will call MPI_Finalize. 
+ * + * @author Oliver Brown + */ +void initCustomMpiQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuAccel, int useMultithread); + + +#if QUEST_COMPILE_SUBCOMM + +/** @notyetdoced + * + * Advanced initialiser which allows the user to provide an MPI communicator for QuEST to use. + * Use of this initialiser implies userOwnsMpi = true, (exposed by initCustomMpiQuESTEnv) and + * therefore that they have already initialised MPI, and they will call MPI_Finalize at the + * appropriate time. + * + * The user-provided MPI communicator undergoes the same validation procedure as any that QuEST + * would use, and so must contain a power-of-2 number of processes. + * + * This function is only compiled and exposed when macro QUEST_COMPILE_SUBCOMM is 1, as is + * defined when providing CMake option QUEST_ENABLE_SUBCOMM during building. + * + * @author Oliver Brown + */ +void initCustomMpiCommQuESTEnv(MPI_Comm questComm, int useGpuAccel, int useMultithread); + +#endif // QUEST_COMPILE_SUBCOMM + + +// end de-mangler +#ifdef __cplusplus +} +#endif + +#endif // EXPERIMENTAL_H + +/** @} */ // (end file-wide doxygen defgroup) diff --git a/quest/include/quest.h b/quest/include/quest.h index 16f8e9b49..da1c778e2 100644 --- a/quest/include/quest.h +++ b/quest/include/quest.h @@ -38,6 +38,7 @@ #include "quest/include/debug.h" #include "quest/include/decoherence.h" #include "quest/include/environment.h" +#include "quest/include/experimental.h" #include "quest/include/trotterisation.h" #include "quest/include/initialisations.h" #include "quest/include/channels.h" @@ -45,7 +46,6 @@ #include "quest/include/operations.h" #include "quest/include/paulis.h" #include "quest/include/qureg.h" -#include "quest/include/subcommunicator.h" #include "quest/include/matrices.h" #include "quest/include/wrappers.h" diff --git a/quest/include/subcommunicator.h b/quest/include/subcommunicator.h deleted file mode 100644 index 8854404d6..000000000 --- a/quest/include/subcommunicator.h +++ /dev/null @@ -1,31 
+0,0 @@ -#ifndef SUBCOMMUNICATOR_H -#define SUBCOMMUNICATOR_H - -#include "quest/include/config.h" - -#if QUEST_COMPILE_MPI && QUEST_COMPILE_SUBCOMM - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/** @notyetdoced - * Advanced initialiser which allows the user to provide an MPI communicator for QuEST to use. - * Use of this initialiser implies userOwnsMpi = true, (exposed by initCustomMpiQuESTEnv) and - * therefore that they have already initialised MPI, and they will call MPI_Finalize at the - * appropriate time. - * - * The user-provided MPI communicator undergoes the same validation procedure as any that QuEST - * would use, and so must contain a power-of-2 number of processes. - */ -void initCustomMpiCommQuESTEnv(MPI_Comm questComm, int useGpuAccel, int useMultithread); - -#ifdef __cplusplus -} -#endif - -#endif - -#endif diff --git a/quest/src/api/CMakeLists.txt b/quest/src/api/CMakeLists.txt index 43b61df7d..7f90dcf17 100644 --- a/quest/src/api/CMakeLists.txt +++ b/quest/src/api/CMakeLists.txt @@ -5,6 +5,7 @@ target_sources(QuEST debug.cpp decoherence.cpp environment.cpp + experimental.cpp initialisations.cpp matrices.cpp modes.cpp @@ -12,7 +13,6 @@ target_sources(QuEST operations.cpp paulis.cpp qureg.cpp - subcommunicator.cpp trotterisation.cpp types.cpp ) diff --git a/quest/src/api/channels.cpp b/quest/src/api/channels.cpp index d6e3ac4fb..afdeea1d4 100644 --- a/quest/src/api/channels.cpp +++ b/quest/src/api/channels.cpp @@ -107,7 +107,7 @@ void freeAllMemoryIfAnyAllocsFailed(T& obj) { // determine whether any node experienced a failure bool anyFail = didAnyLocalAllocsFail(obj); - if (comm_isInit()) + if (comm_isActive()) anyFail = comm_isTrueOnAllNodes(anyFail); // if so, free all memory before subsequent validation diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp index ab18ced91..e7db211ff 100644 --- a/quest/src/api/environment.cpp +++ b/quest/src/api/environment.cpp @@ -412,10 +412,6 @@ void initCustomQuESTEnv(int 
useDistrib, int useGpuAccel, int useMultithread) { } -void initCustomMpiQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuAccel, int useMultithread) { - validateAndInitCustomQuESTEnv(useDistrib, userOwnsMpi, useGpuAccel, useMultithread, __func__); -} - void initQuESTEnv() { const bool userOwnsMpi = false; @@ -452,7 +448,7 @@ void finalizeQuESTEnv() { if (global_envPtr->isDistributed) { comm_sync(); - comm_end(global_envPtr->isMpiUserOwned); + comm_end(); } // free global env's heap memory and flag it as unallocated diff --git a/quest/src/api/experimental.cpp b/quest/src/api/experimental.cpp new file mode 100644 index 000000000..1ad6fdb42 --- /dev/null +++ b/quest/src/api/experimental.cpp @@ -0,0 +1,89 @@ +/** @file + * Experimental functions which are liable to + * API breaks within QuEST minor version releases. + * Some optional functions require compiling this + * file against MPI, despite being outside of /comm/, + * and so require opt-in macros (QUEST_COMPILE_SUBCOMM) + * + * @author Oliver Brown + */ + +#include "quest/include/config.h" +#include "quest/include/environment.h" + +#include "quest/src/core/validation.hpp" +#include "quest/src/comm/comm_config.hpp" + +#if QUEST_COMPILE_SUBCOMM && ! QUEST_COMPILE_MPI + #error "Macro QUEST_COMPILE_SUBCOMM was true, but QUEST_COMPILE_MPI was illegally false." 
+#endif + +#if QUEST_COMPILE_SUBCOMM + #include <mpi.h> +#endif + + + +/* + * EXTERNAL FUNCTIONS + * + * which we here regretfully 'extern' because we are either + * unsure which header should expose them, or because they + * contain deployment-specific types (like MPI_Comm) which + * we do not wish to expose within internal headers + */ + + +extern void validateAndInitCustomQuESTEnv( + int useDistrib, bool userOwnsMpi, int useGpuAccel, int useMultithread, const char* caller); + + +#if QUEST_COMPILE_SUBCOMM // hide MPI_Comm + extern bool comm_setMpiComm(MPI_Comm newComm, bool userOwnsMpi); +#endif + + + +/* + * API FUNCTIONS + */ + + +// enable invocation by both C and C++ binaries +extern "C" { + + +void initCustomMpiQuESTEnv(int useDistrib, bool userOwnsMpi, int useGpuAccel, int useMultithread) { + validateAndInitCustomQuESTEnv(useDistrib, userOwnsMpi, useGpuAccel, useMultithread, __func__); +} + + +#if QUEST_COMPILE_SUBCOMM // hide MPI_Comm + +void initCustomMpiCommQuESTEnv(MPI_Comm userQuestComm, int useGpuAccel, int useMultithread) { + + // useDistrib and userOwnsMpi are implied by the user of this initialiser + const int useDistrib = 1; + const bool userOwnsMpi = true; + + // pre-validate that we are able to set the MPI communicator + validate_mpiInitStatus(useDistrib, userOwnsMpi, __func__); + validate_mpiSubCommIsNonNull(userQuestComm != MPI_COMM_NULL, __func__); + + // avoid re-setting the MPI comm (to avoid an internal error), which happens + // if a user illegally re-calls this function, which will be subsequently + // caught by the validation in validateAndInitCustomQuESTEnv() below + if (!comm_isActive()) { + bool success = comm_setMpiComm(userQuestComm, userOwnsMpi); + validate_mpiSubCommSetSucceeded(success, __func__); + } + + // perform remaining validation (some is harmlessly repeated) and init QuEST env + validateAndInitCustomQuESTEnv(useDistrib, userOwnsMpi, useGpuAccel, useMultithread, __func__); +} + +#endif // QUEST_COMPILE_SUBCOMM + + +// end 
de-mangler +} diff --git a/quest/src/api/matrices.cpp b/quest/src/api/matrices.cpp index b17987eb4..de27a360c 100644 --- a/quest/src/api/matrices.cpp +++ b/quest/src/api/matrices.cpp @@ -165,7 +165,7 @@ void freeAllMemoryIfAnyAllocsFailed(T matr) { // ascertain whether any allocs failed on any node bool anyFail = didAnyLocalAllocsFail(matr); - if (comm_isInit()) + if (comm_isActive()) anyFail = comm_isTrueOnAllNodes(anyFail); // if so, free all heap fields diff --git a/quest/src/api/paulis.cpp b/quest/src/api/paulis.cpp index 855a9cfd8..7d367ed23 100644 --- a/quest/src/api/paulis.cpp +++ b/quest/src/api/paulis.cpp @@ -38,7 +38,7 @@ bool didAnyAllocsFailOnAnyNode(PauliStrSum sum) { ! mem_isAllocated(sum.coeffs) || ! mem_isAllocated(sum.isApproxHermitian) ); - if (comm_isInit()) + if (comm_isActive()) anyFail = comm_isTrueOnAllNodes(anyFail); return anyFail; diff --git a/quest/src/api/qureg.cpp b/quest/src/api/qureg.cpp index 034c96e5c..286785047 100644 --- a/quest/src/api/qureg.cpp +++ b/quest/src/api/qureg.cpp @@ -116,7 +116,7 @@ bool didAnyLocalAllocsFail(Qureg qureg) { bool didAnyAllocsFailOnAnyNode(Qureg qureg) { bool anyFail = didAnyLocalAllocsFail(qureg); - if (comm_isInit()) + if (comm_isActive()) anyFail = comm_isTrueOnAllNodes(anyFail); return anyFail; diff --git a/quest/src/api/subcommunicator.cpp b/quest/src/api/subcommunicator.cpp deleted file mode 100644 index 74c05293a..000000000 --- a/quest/src/api/subcommunicator.cpp +++ /dev/null @@ -1,49 +0,0 @@ -#include "quest/include/config.h" -#include "quest/include/environment.h" -#include "quest/include/subcommunicator.h" - -#include "quest/src/core/validation.hpp" -#include "quest/src/comm/comm_config.hpp" - -#if QUEST_COMPILE_MPI && QUEST_COMPILE_SUBCOMM - -#include - - -// TODO: -// We must resolve this communicator function which contains an MPI type -// and ergo should not be leaked outside comm_config.cpp. For now, we cheat! 
-extern bool comm_setMpiComm(MPI_Comm newComm); - - -// TODO: -// We must resolve this inner function of QuEST initialisation, but which is -// private to api/environment.cpp, and so cannot be exposed in the user-facing -// include/environment.hpp. Grr! For now, we here just cheekily extern it c: -extern void validateAndInitCustomQuESTEnv( - int useDistrib, bool userOwnsMpi, int useGpuAccel, int useMultithread, const char* caller); - - -void initCustomMpiCommQuESTEnv(MPI_Comm userQuestComm, int useGpuAccel, int useMultithread) { - - // useDistrib and userOwnsMpi are implied by the user of this initialiser - const int useDistrib = 1; - const bool userOwnsMpi = true; - - // pre-validate that we are able to set the MPI communicator - validate_mpiInitStatus(useDistrib, userOwnsMpi, __func__); - validate_mpiSubCommIsNonNull(userQuestComm != MPI_COMM_NULL, __func__); - - // avoid re-setting the MPI comm (to avoid an internal error), which happens - // if a user illegally re-calls this function, which will be subsequently - // caught by the validation in validateAndInitCustomQuESTEnv() below - if (!comm_isMpiCommSet()) { - bool success = comm_setMpiComm(userQuestComm); - validate_mpiSubCommSetSucceeded(success, __func__); - } - - // perform remaining validation (some is harmlessly repeated) and init QuEST env - validateAndInitCustomQuESTEnv(useDistrib, userOwnsMpi, useGpuAccel, useMultithread, __func__); -} - -#endif diff --git a/quest/src/comm/comm_config.cpp b/quest/src/comm/comm_config.cpp index c69e72919..8b7a72ff5 100644 --- a/quest/src/comm/comm_config.cpp +++ b/quest/src/comm/comm_config.cpp @@ -6,8 +6,11 @@ * * Note that even when QUEST_COMPILE_MPI=1, the user may have * disabled distribution when creating the QuEST environment - * at runtime. Ergo we use comm_isInit() to determine whether - * functions should invoke the MPI API. + * at runtime - even despite they themselves initialising and + * using MPI. So we must be careful about consulting MPI status! 
+ * Furthermore, all routines here will only ever consult/affect + * the QuEST communicator, never the entire MPI environment, + * the latter of which may contain non-participating processes. * * @author Tyson Jones */ @@ -20,8 +23,6 @@ #if QUEST_COMPILE_MPI #include <mpi.h> - - static MPI_Comm global_mpiComm = MPI_COMM_NULL; #endif @@ -53,10 +54,98 @@ +/* + * COMMUNICATOR MANAGEMENT + * + * QuEST will only ever use the overridable global_mpiComm communicator, + * so that superusers can dedicate external MPI processes to other tasks. + * Beware that it's valid for QuEST to be compiled with MPI, but have + * distribution runtime-disabled, while the user is themselves using + * (and ergo has initialised) MPI. In that scenario, we must not touch + * MPI, hence why comm_isActive() below is distinct from comm_isMpiInit(). + */ + + +// We must record whether the user owns MPI, so that we do not ever attempt +// to kill it when gracefully exiting, or due to a validation error +static bool global_isMpiUserOwned = false; + + +// Guarded since MPI_Comm cannot be exposed when not compiling MPI. This +// communicator is overridden from NULL either BEFORE or DURING comm_init() +#if QUEST_COMPILE_MPI + static MPI_Comm global_mpiComm = MPI_COMM_NULL; +#endif + + +bool comm_isActive() { +#if QUEST_COMPILE_MPI + + // comm_init(), or potentially comm_setMpiComm() before it, will only + // ever override mpiComm with non-NULL, indicating active comm. Note + // it's possible in principle for mpiComm to later return to NULL, via comm_end(), + // and for QuEST execution to continue (though not supported presently). 
+ // if comm_isActive() is true, then it is guaranteed MPI is initialised + return global_mpiComm != MPI_COMM_NULL; + + // note it is legal for QuEST distribution to be disabled (and ergo + // mpiComm never initialised) even when the user is themselves accessing + // MPI, hence this function is semantically distinct from comm_isMpiInit() +#else + + // QuEST communication is obviously never active if + // not even MPI is compiled; though this does not + // imply at all the user isn't themselves using MPI! + return false; + +#endif +} + + +// Hide MPI_Comm from signatures when MPI is not compiled. Beware that +// these are not exposed in comm_config.hpp; callers must 'extern' them! +#if QUEST_COMPILE_MPI + + +MPI_Comm comm_getMpiComm() { + + // illegal to call before communicator has been overridden + if (global_mpiComm == MPI_COMM_NULL) + error_commMpiCommIsNull(); + + return global_mpiComm; +} + + +bool comm_setMpiComm(MPI_Comm newComm, bool userOwnsMpi) { + + // illegal to re-set, or set to null + if (global_mpiComm != MPI_COMM_NULL) + error_commAlreadyHasSetMpiComm(); + if (newComm == MPI_COMM_NULL) + error_commNewMpiCommIsNull(); + + // detect bad communicator, and inform validation + auto status = MPI_Comm_dup(newComm, &global_mpiComm); + if (status != MPI_SUCCESS) + return false; + + // record ownership as soon as QuEST communication becomes active, so + // validation errors during env initialisation never kill user-owned MPI + global_isMpiUserOwned = userOwnsMpi; + return true; +} + + +#endif // QUEST_COMPILE_MPI + + + /* * MPI ENVIRONMENT MANAGEMENT * - * all of which is safely callable in non-distributed mode + * which queries MPI itself (as may be user-activated), rather + * than QuEST's (possibly more limited) MPI environment */ @@ -89,64 +178,93 @@ bool comm_isMpiGpuAware() { } -bool comm_isInit() { +bool comm_isMpiInit() { #if QUEST_COMPILE_MPI // safely callable before MPI initialisation, but NOT after comm_end() int isInit; MPI_Initialized(&isInit); + 
+ // when MPI is not initialised, it is guaranteed that QuEST's communicator + // is inactive, which we double check here so callers can be absolutely sure + if (!isInit && comm_isActive()) + error_commActiveButMpiNotInit(); + return (bool) isInit; #else // obviously MPI is never initialised if not even compiled return false; + #endif } + +/* + * QUEST COMMUNICATION MANAGEMENT + * + * which interacts only with QuEST's MPI environment, + * which may be smaller than the user-controlled MPI env + */ + + void comm_init(bool userOwnsMpi) { #if QUEST_COMPILE_MPI - // re-assert prior user-validations for robustness - if (userOwnsMpi && !comm_isInit()) + // re-assert prior user-validations for clarity + if (userOwnsMpi && !comm_isMpiInit()) error_commNotInit(); - if (!userOwnsMpi && comm_isInit()) + if (!userOwnsMpi && comm_isMpiInit()) error_commAlreadyInit(); // init MPI only when it's not the user's responsibility if (!userOwnsMpi) MPI_Init(NULL, NULL); - // choose communicator only when the user hasn't + // choose communicator only when the user hasn't already + // (via comm_setMpiComm, during custom env initialisation) if (global_mpiComm == MPI_COMM_NULL) - MPI_Comm_dup(MPI_COMM_WORLD, &global_mpiComm); + comm_setMpiComm(MPI_COMM_WORLD, userOwnsMpi); #endif } -void comm_end(bool userOwnsMpi) { +void comm_end() { #if QUEST_COMPILE_MPI - // gracefully permit comm_end() before comm_init(), as input validation can trigger - if (!comm_isInit()) - return; - - // gracefully handle when the communicator is still NULL, because comm_end() may be - // triggered by "bad MPI init" validation, during which, the communicator may not yet - // have been set. We choose NOT to divert to MPI_COMM_WORLD, which is likely just to - // stall at MPI_Barrier, and instead let the user's communicator live on; then crash! - if (global_mpiComm == MPI_COMM_NULL) + // If QuEST isn't using distribution, regardless of whether the user is using MPI, + // then we gracefully exit. 
We do NOT attempt to end MPI on the user's behalf (as we + // may be tempted to do during validation failure to avoid their MPI-crash), because + // it's possible/legal that not all processes are participating in this comm_end() + // call, in which case MPI_Finalize() could just cause a hang. + if (!comm_isActive()) return; + + // Syncing is not strictly necessary, but it ensures that finalizeQuESTEnv() never + // completes on one process while another process is still performing simulation + // (though that'd be weird), and so may avoid a silly user benchmarking pitfall MPI_Barrier(global_mpiComm); MPI_Comm_free(&global_mpiComm); - // QuEST must finalise MPI if the user does not own it - if (!userOwnsMpi) + // Do NOT close MPI if the user owns it; they may still wish to use it after QuEST! + if (!global_isMpiUserOwned) MPI_Finalize(); + // Presently, comm_end() is only ever called during QuESTEnv destruction (either + // deliberately, or because of failed validation during QuESTEnv initialisation). + // This means any comm_*() call hereafter is invalid/illegal and will be prevented + // by validation. However, we can imagine a future where distribution gets runtime + // disabled while QuEST execution continues (e.g. initQuESTEnv automatically + // disabled distribution), and so we must indicate that communication is no longer + // active by overwriting comm to NULL. BEWARE that this is "hacky"; we have + // updated mpiComm here without MPI_Comm_dup(), but that's fine, because hereafter + // MPI will never be used again (illegal to re-init both MPI, and QuEST!) 
+ global_mpiComm = MPI_COMM_NULL; + global_isMpiUserOwned = false; + #endif } @@ -155,21 +273,13 @@ int comm_getRank() { #if QUEST_COMPILE_MPI // if distribution was not runtime enabled (or a validation error was - // triggered), every node (if many MPI processes were launched) - // believes it is the root rank - if (!comm_isInit()) - return ROOT_RANK; - - // Consult the (potentially sub-) communicator for rank; if it is still - // NULL, as can only validly happen during failed QuESTEnv init validation - // (which triggers root-only error printing and ergo this function), we - // fall back to every process believing it is root and so attempting to - // print. This safely avoids consulting a potentially bugged MPI communicator - // and losing the message. We once tried to fallback to MPI_COMM_WORLD here, - // to avoid duplicate output, but it is not worth the risk of msg loss! - if (global_mpiComm == MPI_COMM_NULL) + // triggered during distributed initialisation), every process believes + // it is the root rank; this may lead to unavoidable error msg spam! + if (!comm_isActive()) return ROOT_RANK; + // obtain the process rank within the QuEST communicator, which can + // differ from the global MPI process rank when users own MPI int rank; MPI_Comm_rank(global_mpiComm, &rank); return rank; @@ -178,6 +288,7 @@ int comm_getRank() { // if MPI isn't compiled, we're definitely non-distributed; return main rank return ROOT_RANK; + #endif } @@ -194,19 +305,25 @@ int comm_getNumNodes() { #if QUEST_COMPILE_MPI // if distribution was not runtime enabled (or a validation error was - // triggered), every node (if many MPI processes were launched) - // believes it is the one and only node - if (!comm_isInit()) + // triggered during distributed initialisation), every process is told + // it is the one and only node; this may lead to error msg spam, but + // appears unavoidable! 
+ if (!comm_isActive()) return 1; + // obtain the number of processes within the QuEST communicator, which + // can be smaller than global MPI process count when users own MPI int numNodes; MPI_Comm_size(global_mpiComm, &numNodes); return numNodes; #else - // if MPI isn't compiled, we're definitely non-distributed; return single node + // if MPI isn't compiled, QuEST is definitely non-distributed and + // each process only knows itself (though users may own MPI and + // actually have many processes; that's none of our business!) return 1; + #endif } @@ -214,62 +331,13 @@ int comm_getNumNodes() { void comm_sync() { #if QUEST_COMPILE_MPI - // gracefully handle when not distributed, needed by e.g. pre-MPI-setup validation - if (!comm_isInit()) - return; - - // gracefully handle when the communicator is still NULL, because comm_sync() is - // triggered by "bad MPI init" validation (during the error message printing) - // during which, the communicator may not yet have been overriden - if (global_mpiComm == MPI_COMM_NULL) + // gracefully handle when not distributed, needed by e.g. pre-MPI-setup validation + if (!comm_isActive()) return; MPI_Barrier(global_mpiComm); -#endif -} - - -/* - * MPI COMMUNICATOR MANAGEMENT - * - * some of which requires exposing MPI_Comm in external-facing signatures. - * In lieu of leaking these into comm_config.hpp, callers must extern them. 
- */ - -bool comm_isMpiCommSet() { -#if QUEST_COMPILE_MPI - - // once comm_init() or comm_setMpiComm() overwrite - // the communicator, is can never return to NULL - return (global_mpiComm != MPI_COMM_NULL); -# else - return false; #endif -} - -#if QUEST_COMPILE_MPI - -MPI_Comm comm_getMpiComm() { - - if (global_mpiComm == MPI_COMM_NULL) - error_commMpiCommIsNull(); - return global_mpiComm; + // do nothing at all when MPI is not compiled (user owned MPI processes go unsynced) } - -bool comm_setMpiComm(MPI_Comm newComm) { - - // this is called prior to QuEST initialisation, - // and merely seeks to overwrite global_mpiComm - - if (global_mpiComm != MPI_COMM_NULL) - error_commAlreadyHasSetMpiComm(); - if (newComm == MPI_COMM_NULL) - error_commMpiCommIsNull(); - - auto status = MPI_Comm_dup(newComm, &global_mpiComm); - return status == MPI_SUCCESS; -} - -#endif // QUEST_COMPILE_MPI diff --git a/quest/src/comm/comm_config.hpp b/quest/src/comm/comm_config.hpp index 8441dbc23..826ebdf1c 100644 --- a/quest/src/comm/comm_config.hpp +++ b/quest/src/comm/comm_config.hpp @@ -12,25 +12,26 @@ constexpr int ROOT_RANK = 0; +// queries of MPI's global/general status (when visible) bool comm_isMpiCompiled(); bool comm_isMpiSubCommCompiled(); bool comm_isMpiGpuAware(); +bool comm_isMpiInit(); +// control of QuEST's (possibly more limited) MPI env +bool comm_isActive(); void comm_init(bool userOwnsMpi); -void comm_end(bool userOwnsMpi); +void comm_end(); void comm_sync(); +// queries of QuEST's (possibly more limited) MPI env int comm_getRank(); int comm_getNumNodes(); - -bool comm_isInit(); bool comm_isRootNode(); bool comm_isRootNode(int rank); -bool comm_isMpiCommSet(); - // Signatures containing MPI types which callers must extern: // extern MPI_Comm comm_getMpiComm() -// extern bool comm_setMpiComm(MPI_Comm newComm) +// extern bool comm_setMpiComm(MPI_Comm newComm, bool userOwnsMpi) #endif // COMM_CONFIG_HPP diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp 
index 862136a9c..63e44e71e 100644 --- a/quest/src/core/errors.cpp +++ b/quest/src/core/errors.cpp @@ -188,7 +188,17 @@ void error_commAlreadyHasSetMpiComm() { void error_commMpiCommIsNull() { - raiseInternalError("The MPI communicator was queried (or set) but was unexpectedly MPI_COMM_NULL (or set to be)."); + raiseInternalError("The MPI communicator was queried but was unexpectedly MPI_COMM_NULL."); +} + +void error_commNewMpiCommIsNull() { + + raiseInternalError("The MPI communicator was attemptedly set to MPI_COMM_NULL, which validation should have prior caught."); +} + +void error_commActiveButMpiNotInit() { + + raiseInternalError("QuEST believed communication was active, but MPI_Initialized reported MPI was not initialised."); } void assert_commBoundsAreValid(Qureg qureg, qindex sendInd, qindex recvInd, qindex numAmps) { diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp index 33cc0661d..33cc182c7 100644 --- a/quest/src/core/errors.hpp +++ b/quest/src/core/errors.hpp @@ -95,6 +95,10 @@ void error_commAlreadyHasSetMpiComm(); void error_commMpiCommIsNull(); +void error_commNewMpiCommIsNull(); + +void error_commActiveButMpiNotInit(); + void assert_commBoundsAreValid(Qureg qureg, qindex sendInd, qindex recvInd, qindex numAmps); void assert_commPayloadIsPowerOf2(qindex numAmps); diff --git a/quest/src/core/randomiser.cpp b/quest/src/core/randomiser.cpp index 65c6da4eb..7b35a29fc 100644 --- a/quest/src/core/randomiser.cpp +++ b/quest/src/core/randomiser.cpp @@ -66,14 +66,14 @@ void rand_setSeeds(vector seeds) { // all nodes learn root node's #seeds unsigned numRootSeeds = seeds.size(); - if (comm_isInit()) + if (comm_isActive()) comm_broadcastUnsignedsFromRoot(&numRootSeeds, 1); // all nodes ensure they have space to receive root node's seeds seeds.resize(numRootSeeds); // all nodes receive root seeds - if (comm_isInit()) + if (comm_isActive()) comm_broadcastUnsignedsFromRoot(seeds.data(), seeds.size()); // all nodes remember seeds (in case user wishes to 
later recall them) diff --git a/quest/src/core/validation.cpp b/quest/src/core/validation.cpp index e1df0af76..0f9cecc97 100644 --- a/quest/src/core/validation.cpp +++ b/quest/src/core/validation.cpp @@ -1167,13 +1167,14 @@ void default_inputErrorHandler(const char* func, const char* msg) { + "Exiting...\n"); // force a synch because otherwise non-main nodes may exit before print, and MPI - // will then attempt to instantly abort all nodes, losing the error message. + // will then attempt to instantly abort all nodes, losing the error message comm_sync(); - // finalise MPI before error-exit to avoid scaring user with giant MPI error message; - // we always "take ownership" of MPI here since we're about to kill the whole program - if (comm_isInit()) - comm_end(/*userOwnsMpi=*/false); + // finalise QuEST-owned MPI before error-exit to avoid scaring user with giant MPI crash + // message. note user-owned MPI is NOT killed because it's possible only SOME processes + // reach here, and attempting to sync/kill them would result in an MPI hang/crash anyway + if (comm_isActive()) + comm_end(); // keeps user-owned MPI alive // simply exit, interrupting any other process (potentially leaking) exit(EXIT_FAILURE); @@ -1355,7 +1356,7 @@ void assertAllNodesAgreeThat(bool valid, string msg, tokenSubs vars, const char* // when performing validation that may be non-uniform between nodes. For // example, mallocs may succeed on one node but fail on another due to // inhomogeneous loads. 
- if (comm_isInit()) + if (comm_isActive()) valid = comm_isTrueOnAllNodes(valid); // prepare error message only if validation will fail @@ -1499,28 +1500,21 @@ void validate_gpuIsCuQuantumCompatible(const char* caller) { void validate_mpiInitStatus(bool useDistrib, bool userOwnsMpi, const char* caller) { - if (!global_isValidationEnabled) - return; - // Validation prior to this function confirms init(Custom*)QuESTEnv is only ever called // once, but we must additionally confirm the user has interacted with MPI legally - bool isMpiInit = comm_isInit(); + if (!global_isValidationEnabled) + return; + + // We consult whether MPI itself has been initialised, NOT whether QuEST is using it + bool isMpiInit = comm_isMpiInit(); - // (A) If the user does not declare ownership of MPI, they are forbidden to initialise it + // (A) If the user does not declare ownership of MPI, they are forbidden to initialise it, + // even when they are not distributing QuEST (i.e. useDistrib=0), just for clarity! if (!userOwnsMpi) assertThat(!isMpiInit, report::QUEST_OWNED_MPI_WAS_PRE_INIT, caller); - // (B) If QuEST is instructed not to use distribution, we must demand the user is not - // using MPI, because we internally consult comm_isInit() to detect QuEST distribution - // in many functions, and that will give a false positive when the user inits MPI directly. - if (!useDistrib) - assertThat(!isMpiInit, report::QUEST_IS_NON_DISTRIBUTED_BUT_MPI_WAS_INIT, caller); - - // TODO: we can relax above, permitting the user to play with MPI directly while - // disabling it for QuEST, by replacing internal comm_isInit() with e.g. 
env_isDistributed() - - // (C) If QuEST will use MPI owned by the user, the user must have pre-initialised it + // (B) If QuEST will use MPI owned by the user, the user must have pre-initialised it if (useDistrib && userOwnsMpi) assertThat(isMpiInit, report::USER_OWNED_MPI_WAS_NOT_INIT, caller); @@ -1528,10 +1522,10 @@ void validate_mpiInitStatus(bool useDistrib, bool userOwnsMpi, const char* calle // useDistrib=0, userOwnsMpi=0, isMpiInit=0 (legal: nobody wants MPI) // (A) useDistrib=0, userOwnsMpi=0, isMpiInit=1 (illegal: user lied about ownership) // useDistrib=0, userOwnsMpi=1, isMpiInit=0 (legal: user owns MPI but does nothing!) - // (B) useDistrib=0, userOwnsMpi=1, isMpiInit=1 (illegal: comm_isInit() limitation as above) + // useDistrib=0, userOwnsMpi=1, isMpiInit=1 (legal: user owns MPI, QuEST won't use it) // useDistrib=1, userOwnsMpi=0, isMpiInit=0 (legal: QuEST will init MPI) // (A) useDistrib=1, userOwnsMpi=0, isMpiInit=1 (illegal: user lied about ownership) - // (C) useDistrib=1, userOwnsMpi=1, isMpiInit=0 (illegal: user has reponsibility to pre-init) + // (B) useDistrib=1, userOwnsMpi=1, isMpiInit=0 (illegal: user has responsibility to pre-init) // useDistrib=1, userOwnsMpi=1, isMpiInit=1 (legal: user fulfilled responsibility to pre-init) } diff --git a/quest/src/gpu/gpu_config.cpp b/quest/src/gpu/gpu_config.cpp index 5bf4b257f..4e03217e5 100644 --- a/quest/src/gpu/gpu_config.cpp +++ b/quest/src/gpu/gpu_config.cpp @@ -395,7 +395,7 @@ bool gpu_areAnyNodesBoundToSameGpu() { #if QUEST_COMPILE_CUDA assert_gpuHasBeenBound(hasGpuBeenBound); - if (!comm_isInit()) + if (!comm_isActive()) return false; // obtain bound GPU's UUID; a unique identifier 16-char identifier