diff --git a/.gitignore b/.gitignore index 7155425..20f1484 100644 --- a/.gitignore +++ b/.gitignore @@ -30,6 +30,7 @@ config/ltmain.sh config/ltoptions.m4 config/ltversion.m4 config/lt~obsolete.m4 +config/compile config/config.guess config/config.sub config/depcomp @@ -88,6 +89,7 @@ src/to_wrap.C src/tomod.py src/dysect/DysectAPI/.deps/ src/dysect/DysectAPI/.libs/ +src/dysect/DysectAPI/Aggregates/.deps/ src/dysect/libDysectAPI/src/.deps/ src/dysect/libDysectAPI/src/.libs/ src/dysect/libDysectAPI/src/expr-parser.cc diff --git a/config/x_ac_arch.m4 b/config/x_ac_arch.m4 index 262375c..b3a82d8 100644 --- a/config/x_ac_arch.m4 +++ b/config/x_ac_arch.m4 @@ -8,4 +8,12 @@ AC_DEFUN([X_AC_ARCH], [ [CXXFLAGS="$CXXFLAGS"] AM_CONDITIONAL([ENABLE_BGL], true) ) + +AC_ARG_WITH(cray-xt, + [AS_HELP_STRING([--with-cray-xt], + [Add the flags to run on a Cray XT system] + )], + [AC_DEFINE([CRAYXT], [], [Compilation for CrayXT systems])] + [CXXFLAGS="$CXXFLAGS"] + ) ]) diff --git a/config/x_ac_boost.m4 b/config/x_ac_boost.m4 index 84a0c6f..e64b1db 100644 --- a/config/x_ac_boost.m4 +++ b/config/x_ac_boost.m4 @@ -21,13 +21,13 @@ AC_DEFUN([X_AC_BOOST], [ else AC_SUBST(LIBBOOSTDIR, [$with_boost_path/lib]) fi - AC_SUBST(LIBBOOST,["-lboost_date_time -lboost_thread -lboost_filesystem -lboost_program_options -lboost_regex -lboost_system -lboost_system -lboost_wave"]) + AC_SUBST(LIBBOOST,["-lboost_thread"]) AC_DEFINE(HAVE_BOOST_TO,1,[Define 1 if a compatible boost package is found]) boost_found="yes" elif test -f "$with_boost_path"/include/boost-1_37/boost/algorithm/string.hpp ; then AC_SUBST(BOOST_INCLUDE, -I$with_boost_path/include/boost-1_37) AC_SUBST(LIBBOOSTDIR, [$with_boost_path/lib]) - AC_SUBST(LIBBOOST,["-lboost_date_time -lboost_thread -lboost_filesystem -lboost_program_options -lboost_regex -lboost_system -lboost_system -lboost_wave"]) + AC_SUBST(LIBBOOST,["-lboost_date_time -lboost_thread -lboost_filesystem"]) AC_DEFINE(HAVE_BOOST_TO,1,[Define 1 if a compatible boost package is found]) boost_found="yes $LIBBOOSTDIR" else @@ -43,7 +43,7 @@ AC_DEFUN([X_AC_BOOST], [ else AC_SUBST(LIBBOOSTDIR, [$boost_dflt_dir/lib]) fi - AC_SUBST(LIBBOOST,["-lboost_date_time -lboost_thread -lboost_filesystem -lboost_program_options -lboost_regex -lboost_system -lboost_system -lboost_wave"]) + AC_SUBST(LIBBOOST,["-lboost_date_time -lboost_thread -lboost_filesystem"]) boost_found="yes $LIBBOOSTDIR" else boost_found="no" diff --git a/config/x_ac_mrnet.m4 b/config/x_ac_mrnet.m4 index d51bb83..84f5aaf 100644 --- a/config/x_ac_mrnet.m4 +++ b/config/x_ac_mrnet.m4 @@ -13,6 +13,16 @@ AC_DEFUN([X_AC_MRNET], [ MRNETPREFIX="" ] ) + AC_ARG_WITH(extra-mrnet-rpath, + [AS_HELP_STRING([--with-extra-mrnet-rpath=path], + [Add additional rpath fro mrnet] + )], + [RPATH_FLAGS="$RPATH_FLAGS -Wl,-rpath=${withval}"], + [] + ) + AC_MSG_CHECKING([setting the RPATH_FLAGS for mrnet]) + AC_MSG_RESULT([$RPATH_FLAGS]) + mrn_incs=`ls -d $MRNETPREFIX/lib/*/include` for mrn_inc in $mrn_incs do @@ -66,7 +76,7 @@ AC_DEFUN([X_AC_MRNET], [ AC_MSG_ERROR([libmrnet is required. Specify libmrnet prefix with --with-mrnet]) fi AC_LANG_POP(C++) - AC_PATH_PROG([MRNETCOMMNODEBIN], [mrnet_commnode], [no], [$MRNETPREFIX/bin$PATH_SEPARATOR$PATH]) + AC_PATH_PROG([MRNETCOMMNODEBIN], [mrnet_commnode], [no], [$MRNETPREFIX/bin$PATH_SEPARATOR$MRNETPREFIX/libexec$PATH_SEPARATOR$PATH]) if test $MRNETCOMMNODEBIN = no; then AC_MSG_ERROR([the mrnet_commnode executable is required. Specify mrnet prefix with --with-mrnet]) fi diff --git a/config/x_ac_python.m4 b/config/x_ac_python.m4 index fddd9b6..cdcd9b9 100644 --- a/config/x_ac_python.m4 +++ b/config/x_ac_python.m4 @@ -38,17 +38,13 @@ AC_DEFUN([X_AC_PYTHON], [ AC_MSG_CHECKING([Python version]) python_version=`$PYTHON -c "import distutils.sysconfig; \ print(distutils.sysconfig.get_python_version());"` - python_minor_version=`echo $python_version | sed 's/.*\.\(.*\)/\1/'` - if test $python_version '>' 2.99 ; then - if test $python_minor_version -lt 8 ; then - m="m" - python_version=$python_version$m - else - python_version=$python_version - fi - else + if test $python_version '<' 3.0 ; then AM_CONDITIONAL([ENABLE_PYTHON2], true) fi + if test $python_version '<' 3.9 ; then + m="m" + python_version=$python_version$m + fi AC_MSG_RESULT($python_version) - AM_COND_IF([ENABLE_GDB], [BELIBS="-lpython$python_version $BELIBS"], []) + BELIBS="-lpython$python_version $BELIBS" ]) diff --git a/configure.ac b/configure.ac index 1640741..7bff425 100644 --- a/configure.ac +++ b/configure.ac @@ -27,16 +27,15 @@ X_AC_TEMPORALORDERINGAPI X_AC_PYTHON AM_PATH_PYTHON X_AC_GUI -X_AC_BOOST X_AC_ARCH +X_AC_BOOST X_AC_DEBUGLIBS X_AC_GRAPHLIB -X_AC_LAUNCHMON -X_AC_RM_COMM X_AC_MRNET X_AC_FGFS X_AC_CALLPATH + # Checks for header files. AC_HEADER_STDC AC_CHECK_HEADERS([arpa/inet.h fcntl.h stdlib.h sys/time.h unistd.h]) @@ -68,6 +67,7 @@ AC_ARG_ENABLE(statbench, AM_CONDITIONAL([ENABLE_STATBENCH], [test "$WITH_STATBENCH" = yes]) + AC_ARG_WITH(procspernode, [AS_HELP_STRING([--with-procspernode=num],[Set the maximum number of communication processes per node equal to num, defaults to 1 if not specified])], [AC_DEFINE_UNQUOTED([STAT_PROCS_PER_NODE], [$withval], [The max number of CPs per node])], @@ -80,6 +80,34 @@ AC_ARG_WITH(usagelog, [CXXFLAGS="$CXXFLAGS"] ) +AM_CONDITIONAL([WITH_CTI], false) + +AC_ARG_WITH(cti, + [AS_HELP_STRING([--with-cti], + [Location of the common tools interface package] + ) + ], + [CXXFLAGS="$CXXFLAGS -I${withval}/include" + LDFLAGS="$LDFLAGS -L${withval}/lib" + AM_CONDITIONAL([WITH_CTI], true) + AC_DEFINE([USE_CTI], [], [Use CTI for ALPS and PALS support]) + WITH_CTI=yes + ], + [CXXFLAGS="$CXXFLAGS" + WITH_CTI=no] +) + + +if test "$WITH_CTI" = "yes" +then + PKG_PROG_PKG_CONFIG + PKG_CHECK_MODULES([CRAY_CTI_FE],[common_tools_fe],[],[AC_MSG_ERROR([libcommontools_fe.so not found.])]) + PKG_CHECK_MODULES([CRAY_CTI_BE],[common_tools_be],[],[AC_MSG_ERROR([libcommontools_be.so not found.])]) +else + X_AC_LAUNCHMON + X_AC_RM_COMM +fi + AC_ARG_WITH(alias-suffix, [AS_HELP_STRING([--with-alias-suffix=suffix],[drop suffix from the hostname alias])], [CXXFLAGS="$CXXFLAGS -DSTAT_ALIAS_SUFFIX=\\\"${withval}\\\""], @@ -114,9 +142,6 @@ AC_SUBST(BELIBS) AC_SUBST(MWLIBS) AC_SUBST(MRNETCOMMNODEBIN) AC_SUBST(DOTBINDIR) -AC_SUBST(LAUNCHMONBIN) -AC_SUBST(NEWLAUNCHMONBIN) -AC_SUBST(LAUNCHMONPREFIX) AC_SUBST(GRAPHLIBPREFIX) AC_SUBST(STACKWALKERPREFIX) AC_SUBST(DEPCOREPREFIX) diff --git a/scripts/STAT.in b/scripts/STAT.in index 8acc019..5b481f9 100644 --- a/scripts/STAT.in +++ b/scripts/STAT.in @@ -82,5 +82,14 @@ if test -n "$PYTHONPATH" ; then else export PYTHONPATH=@pythondir@:@pyexecdir@:@PYTHONPATH@ fi +if test -n "$STAT_INSTALL_DIR"; then + if test -z "$STAT_DAEMON_PATH" ; then + export STAT_DAEMON_PATH=$prefix/bin/STATD + fi + if test -z "$STAT_FILTER_PATH" ; then + export STAT_FILTER_PATH=$prefix/lib/STAT_FilterDefinitions.so + fi + export PATH=$PATH:$prefix/bin +fi exec @STATPYTHON@ @pythondir@/STATmain.py $@ diff --git a/scripts/STATGUI.in b/scripts/STATGUI.in index e44c3b4..7836c9f 100644 --- a/scripts/STATGUI.in +++ b/scripts/STATGUI.in @@ -73,13 +73,21 @@ fi if test -z "$LMON_FE_ENGINE_TIMEOUT" ; then export LMON_FE_ENGINE_TIMEOUT=600 fi - export PATH=$PATH:@DOTBINDIR@ if test -n "$PYTHONPATH" ; then export PYTHONPATH=@pythondir@:@pyexecdir@:@PYTHONPATH@:$PYTHONPATH else export PYTHONPATH=@pythondir@:@pyexecdir@:@PYTHONPATH@ fi +if test -n "$STAT_INSTALL_DIR"; then + if test -z "$STAT_DAEMON_PATH" ; then + export STAT_DAEMON_PATH=$prefix/bin/STATD + fi + if test -z "$STAT_FILTER_PATH" ; then + export STAT_FILTER_PATH=$prefix/lib/STAT_FilterDefinitions.so + fi + export PATH=$PATH:$prefix/bin +fi if test -n @GSETTINGS_SCHEMA_DIR@ ; then export GSETTINGS_SCHEMA_DIR=@GSETTINGS_SCHEMA_DIR@ diff --git a/scripts/stat-cl.in b/scripts/stat-cl.in index 5b79027..a6f4f1e 100644 --- a/scripts/stat-cl.in +++ b/scripts/stat-cl.in @@ -80,4 +80,14 @@ if test -n "$PYTHONPATH" ; then else export PYTHONPATH=@pythondir@:@pyexecdir@:@PYTHONPATH@ fi +if test -n "$STAT_INSTALL_DIR"; then + if test -z "$STAT_DAEMON_PATH" ; then + export STAT_DAEMON_PATH=$prefix/bin/STATD + fi + if test -z "$STAT_FILTER_PATH" ; then + export STAT_FILTER_PATH=$prefix/lib/STAT_FilterDefinitions.so + fi + export PATH=$PATH:$prefix/bin +fi + exec @STATPYTHON@ @pythondir@/STATmain.py cl $@ diff --git a/scripts/xdot.py b/scripts/xdot.py index 4360d2c..ad53385 100755 --- a/scripts/xdot.py +++ b/scripts/xdot.py @@ -1507,6 +1507,8 @@ def __init__(self): self.drag_action = NullAction(self) self.presstime = None self.highlight = None + self.pressx = None + self.pressy = None def set_filter(self, filter): self.filter = filter diff --git a/src/Makefile.am b/src/Makefile.am index 6a58063..887c04a 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -45,10 +45,16 @@ STATD_SOURCES = STATD.C STAT.h STAT_BackEnd.h STAT_GraphRoutines.h STATbin_LDADD = libstatfe.la @FELIBS@ STATD_LDADD = libstatbe.la @BELIBS@ - -stat_be_la_sources_base = STAT_BackEnd.C STAT_BackEnd.h STAT_lmonBackEnd.C STAT_lmonBackEnd.h STAT.h STAT_timer.h STAT_timer.C STAT_CircularLogs.C -stat_fe_la_sources_base = STAT_FrontEnd.C STAT_lmonFrontEnd.C STAT_FrontEnd.h STAT_lmonFrontEnd.h STAT.h STAT_timer.h STAT_timer.C - +stat_be_la_sources_base = STAT_BackEnd.C STAT_BackEnd.h STAT.h STAT_timer.h STAT_timer.C STAT_CircularLogs.C +stat_fe_la_sources_base = STAT_FrontEnd.C STAT_FrontEnd.h STAT.h STAT_timer.h STAT_timer.C + +if WITH_CTI +stat_be_la_sources_base += STAT_ctiBackEnd.C STAT_ctiBackEnd.h +stat_fe_la_sources_base += STAT_ctiFrontEnd.h STAT_ctiFrontEnd.C +else +stat_be_la_sources_base += STAT_lmonBackEnd.C STAT_lmonBackEnd.h +stat_fe_la_sources_base += STAT_lmonFrontEnd.h STAT_lmonFrontEnd.C +endif libstatbe_la_SOURCES = $(stat_be_la_sources_base) $(stat_graphlib_sources) $(stat_be_la_sources_fgfs) libstatfe_la_SOURCES = $(stat_fe_la_sources_base) $(stat_graphlib_sources) diff --git a/src/STAT.h b/src/STAT.h index 2396a91..1a5af29 100644 --- a/src/STAT.h +++ b/src/STAT.h @@ -119,6 +119,7 @@ typedef enum { STAT_TERMINATE_ERROR, STAT_FILE_ERROR, STAT_LMON_ERROR, + STAT_CTI_ERROR, STAT_ARG_ERROR, STAT_VERSION_ERROR, STAT_NOT_LAUNCHED_ERROR, @@ -192,6 +193,9 @@ typedef enum { case STAT_LMON_ERROR: \ fprintf(outFp, "STAT_LMON_ERROR"); \ break; \ + case STAT_CTI_ERROR: \ + fprintf(outFp, "STAT_CTI_ERROR"); \ + break; \ case STAT_ARG_ERROR: \ fprintf(outFp, "STAT_ARG_ERROR"); \ break; \ diff --git a/src/STAT.i b/src/STAT.i index 4762fef..e01335a 100644 --- a/src/STAT.i +++ b/src/STAT.i @@ -142,6 +142,7 @@ typedef enum { STAT_TERMINATE_ERROR, STAT_FILE_ERROR, STAT_LMON_ERROR, + STAT_CTI_ERROR, STAT_ARG_ERROR, STAT_VERSION_ERROR, STAT_NOT_LAUNCHED_ERROR, diff --git a/src/STATBenchD.C b/src/STATBenchD.C index d283502..ee6e90f 100644 --- a/src/STATBenchD.C +++ b/src/STATBenchD.C @@ -36,8 +36,12 @@ int main(int argc, char **argv) char logOutDir[BUFSIZE]; STAT_BackEnd *statBackEnd; StatError_t statError; +#ifdef USE_CTI + StatDaemonLaunch_t launchType = STATD_CTI_LAUNCH; +#else StatDaemonLaunch_t launchType = STATD_LMON_LAUNCH; - +#endif + struct option longOptions[] = { {"mrnetprintf", no_argument, 0, 'm'}, diff --git a/src/STATD.C b/src/STATD.C index ec53b3b..068df55 100644 --- a/src/STATD.C +++ b/src/STATD.C @@ -35,7 +35,11 @@ int main(int argc, char **argv) char logOutDir[BUFSIZE]; string invocationString; vector serialProcesses; +#ifdef USE_CTI + StatDaemonLaunch_t launchType = STATD_CTI_LAUNCH; +#else StatDaemonLaunch_t launchType = STATD_LMON_LAUNCH; +#endif StatError_t statError; STAT_BackEnd *statBackEnd; diff --git a/src/STAT_BackEnd.C b/src/STAT_BackEnd.C index 8cb71b9..a7ef70c 100644 --- a/src/STAT_BackEnd.C +++ b/src/STAT_BackEnd.C @@ -50,6 +50,7 @@ STAT_BackEnd::STAT_BackEnd(StatDaemonLaunch_t launchType) : { gStatOutFp = NULL; proctabSize_ = 0; + maxRank_ = 0; processMapNonNull_ = 0; logType_ = 0; parentHostName_ = NULL; @@ -208,7 +209,7 @@ StatError_t STAT_BackEnd::update3dNodesAndEdges() { if (edges3d_.find(edgesIter->first) == edges3d_.end()) { - edge = initializeBitVectorEdge(proctabSize_); + edge = initializeBitVectorEdge(maxRank_ + 1); if (edge == NULL) { printMsg(STAT_ALLOCATE_ERROR, __FILE__, __LINE__, "Failed to initialize edge\n"); @@ -275,7 +276,7 @@ StatError_t STAT_BackEnd::update2dEdge(int src, int dst, StatBitVectorEdge_t *ed if (edges2d_.find(dst) == edges2d_.end()) { - newEdge = initializeBitVectorEdge(proctabSize_); + newEdge = initializeBitVectorEdge(maxRank_ + 1); if (newEdge == NULL) { printMsg(STAT_ALLOCATE_ERROR, __FILE__, __LINE__, "Failed to initialize newEdge\n"); @@ -395,6 +396,26 @@ StatError_t STAT_BackEnd::generateGraphs(graphlib_graph_p *prefixTree2d, graphli int index; map::iterator nodeAttrIter; nodeAttr.attr_values = (void **)calloc(1, gNumNodeAttrs * sizeof(void *)); + std::string nodeLabel; + + // Generate the node label from available frame information + { auto nodeAttrs = nodeIdToAttrs_[nodesIter->first]; + if (!nodeAttrs["source"].empty() && !nodeAttrs["line"].empty()) { + nodeLabel = nodeAttrs["source"] + nodeAttrs["line"]; + } else if (!nodeAttrs["function"].empty()) { + nodeLabel = nodeAttrs["function"]; + } else if (!nodeAttrs["module"].empty() && !nodeAttrs["offset"].empty()) { + nodeLabel = nodeAttrs["module"] + nodeAttrs["offset"]; + } else if (!nodeAttrs["pc"].empty()) { + nodeLabel = nodeAttrs["pc"]; + } + } + + if (!nodeLabel.empty()) { + // Copied internally when graphlib_addNode invokes statCopyNode + nodeAttr.label = (void*)nodeLabel.c_str(); + } + if (nodeAttr.attr_values == NULL) { printMsg(STAT_ALLOCATE_ERROR, __FILE__, __LINE__, "%s: Error callocing %d nodeAttr.attr_values\n", strerror(errno), gNumNodeAttrs); @@ -419,11 +440,16 @@ StatError_t STAT_BackEnd::generateGraphs(graphlib_graph_p *prefixTree2d, graphli return STAT_GRAPHLIB_ERROR; } statFreeNodeAttrs(nodeAttr.attr_values, *currentGraph); + nodeAttr.label = (char*)""; } for (edgesIter = (*edges).begin(); edgesIter != (*edges).end(); edgesIter++) { int index; edgeAttr.attr_values = (void **)calloc(1, gNumEdgeAttrs * sizeof(void *)); + + // Edge label will be generated using statEdgeToText + edgeAttr.label = edgesIter->second.second; + if (edgeAttr.attr_values == NULL) { printMsg(STAT_ALLOCATE_ERROR, __FILE__, __LINE__, "%s: Error callocing %d edgeAttr.attr_values\n", strerror(errno), gNumEdgeAttrs); @@ -455,6 +481,7 @@ StatError_t STAT_BackEnd::generateGraphs(graphlib_graph_p *prefixTree2d, graphli continue; } edgeAttr.attr_values[index] = statCopyEdgeAttr(edgeAttrIter->first.c_str(), edgeAttrIter->second); + edgeAttrIter++; } @@ -466,6 +493,7 @@ StatError_t STAT_BackEnd::generateGraphs(graphlib_graph_p *prefixTree2d, graphli return STAT_GRAPHLIB_ERROR; } statFreeEdgeAttrs(edgeAttr.attr_values, *currentGraph); + edgeAttr.label = NULL; } } @@ -599,7 +627,7 @@ void STAT_BackEnd::onCrash(int sig, siginfo_t *, void *context) extern int gStatGraphRoutinesTotalWidth; char outFile[BUFSIZE]; - gStatGraphRoutinesTotalWidth = statBitVectorLength(proctabSize_); + gStatGraphRoutinesTotalWidth = statBitVectorLength(maxRank_ + 1); printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "Exporting 2D graph to dot\n"); graphlibError = graphlib_colorGraphByLeadingEdgeLabel(prefixTree2d); @@ -1015,7 +1043,7 @@ StatError_t STAT_BackEnd::mainLoop() edge = edges2d_[nodeId].second; else { - edge = initializeBitVectorEdge(proctabSize_); + edge = initializeBitVectorEdge(maxRank_ + 1); if (edge == NULL) { printMsg(STAT_ALLOCATE_ERROR, __FILE__, __LINE__, "Failed to initialize edge\n"); @@ -1251,7 +1279,7 @@ StatError_t STAT_BackEnd::mainLoop() if (ackTag == PROT_SEND_NODE_IN_EDGE_RESP || ackTag == PROT_SEND_LAST_TRACE_RESP || ackTag == PROT_SEND_TRACES_RESP) { printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "Sending serialized contents to FE with tag %d, length %d\n", ackTag, byteArrayLen); - bitVectorLength = statBitVectorLength(proctabSize_); + bitVectorLength = statBitVectorLength(maxRank_ + 1); if (stream->send(ackTag, "%Ac %d %d %ud", byteArray, byteArrayLen, bitVectorLength, myRank_, sampleType_) == -1) { printMsg(STAT_MRNET_ERROR, __FILE__, __LINE__, "stream::send(%d) failure\n", ackTag); @@ -1517,9 +1545,10 @@ StatError_t STAT_BackEnd::attach() if (proc != NULL) processMapNonNull_++; } + for (i = 0, processMapIter = processMap_.begin(); processMapIter != processMap_.end(); i++, processMapIter++) { - procsToRanks_.insert(make_pair(processMapIter->second, i)); + procsToRanks_.insert(make_pair(processMapIter->second, processMapIter->first)); #if defined(GROUP_OPS) int mpirank = processMapIter->first; @@ -1945,7 +1974,6 @@ StatError_t STAT_BackEnd::getVariable(const Frame &frame, char *variableName, ch return STAT_OK; } - StatError_t STAT_BackEnd::sampleStackTraces(unsigned int nTraces, unsigned int traceFrequency, unsigned int nRetries, unsigned int retryFrequency, char *variableSpecification) { int j; @@ -1976,7 +2004,7 @@ StatError_t STAT_BackEnd::sampleStackTraces(unsigned int nTraces, unsigned int t } } - printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "Gathering and merging %d traces from each task\n", nTraces); + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "Gathering and merging %d traces from all tasks of size: %d\n", nTraces, proctabSize_); #ifdef STAT_GDB_BE if (usingGdb_ == true) @@ -2034,7 +2062,7 @@ StatError_t STAT_BackEnd::sampleStackTraces(unsigned int nTraces, unsigned int t for (j = 0; j < proctabSize_; j++) { /* Set edge label */ - edge = initializeBitVectorEdge(proctabSize_); + edge = initializeBitVectorEdge(maxRank_ + 1); if (edge == NULL) { printMsg(STAT_ALLOCATE_ERROR, __FILE__, __LINE__, "Failed to initialize edge\n"); @@ -2043,6 +2071,7 @@ StatError_t STAT_BackEnd::sampleStackTraces(unsigned int nTraces, unsigned int t return STAT_ALLOCATE_ERROR; } edge->bitVector[j / STAT_BITVECTOR_BITS] |= STAT_GRAPH_BIT(j % STAT_BITVECTOR_BITS); + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "Gathering %d task traces having MPIRANK: %d\n", j, proctab_[j].mpirank); pArgs = Py_BuildValue("(i)", proctab_[j].pid); if (!pArgs) @@ -2115,7 +2144,7 @@ StatError_t STAT_BackEnd::sampleStackTraces(unsigned int nTraces, unsigned int t currentFrameString = currentFrame; map nodeAttrs; startPos = 0; - endPos = currentFrameString.find("@"); + endPos = currentFrameString.find_last_of("@"); name += currentFrameString.substr(startPos, endPos - startPos); nodeAttrs["function"] = currentFrameString.substr(startPos, endPos - startPos); if (sampleType_ & STAT_SAMPLE_LINE) @@ -2201,7 +2230,7 @@ StatError_t STAT_BackEnd::sampleStackTraces(unsigned int nTraces, unsigned int t currentFrameString = currentFrame; map nodeAttrs; startPos = 0; - endPos = currentFrameString.find("@"); + endPos = currentFrameString.find_last_of("@"); name += currentFrameString.substr(startPos, endPos - startPos); nodeAttrs["function"] = currentFrameString.substr(startPos, endPos - startPos); if (sampleType_ & STAT_SAMPLE_LINE) @@ -2279,7 +2308,17 @@ StatError_t STAT_BackEnd::sampleStackTraces(unsigned int nTraces, unsigned int t { for (processMapIter = processMap_.begin(), j = 0; processMapIter != processMap_.end(); processMapIter++, j++) { - statError = getStackTrace(processMapIter->second, j, nRetries, retryFrequency); + // Find rank for process + auto procsToRanksIter = procsToRanks_.find(processMapIter->second); + if (procsToRanksIter == procsToRanks_.end()) { + printMsg(STAT_STACKWALKER_ERROR, __FILE__, __LINE__, "Failed fo find walker in procsToRanks_ map\n"); + return STAT_STACKWALKER_ERROR; + } + + // Get stack trace for process with given rank + statError = getStackTrace(processMapIter->second, procsToRanksIter->second, + nRetries, retryFrequency); + if (statError != STAT_OK) { printMsg(statError, __FILE__, __LINE__, "Error getting graph %d of %d\n", i + 1, nTraces); @@ -2478,10 +2517,10 @@ StatError_t STAT_BackEnd::getStackTrace(Walker *proc, int rank, unsigned int nRe OpenMPStackWalker *ompWalker = NULL; #endif - printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "Gathering trace from task rank %d of %d\n", rank, proctabSize_); + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "Gathering trace from task rank %d of %d\n", rank, maxRank_); /* Set edge label */ - edge = initializeBitVectorEdge(proctabSize_); + edge = initializeBitVectorEdge(maxRank_ + 1); if (edge == NULL) { printMsg(STAT_ALLOCATE_ERROR, __FILE__, __LINE__, "Failed to initialize edge\n"); @@ -2853,7 +2892,7 @@ StatError_t STAT_BackEnd::addFrameToGraph(CallTree *stackwalkerGraph, graphlib_n { for (myRanksIter = myRanks.begin(); myRanksIter != myRanks.end(); myRanksIter++) { - edge = initializeBitVectorEdge(proctabSize_); + edge = initializeBitVectorEdge(maxRank_ + 1); if (edge == NULL) { printMsg(STAT_ALLOCATE_ERROR, __FILE__, __LINE__, "Failed to initialize edge\n"); @@ -2889,7 +2928,6 @@ StatError_t STAT_BackEnd::addFrameToGraph(CallTree *stackwalkerGraph, graphlib_n if (nodes2d_.find(graphlibNode) != nodes2d_.end()) nodes2d_.erase(graphlibNode); } - return STAT_OK; } @@ -3120,7 +3158,7 @@ StatError_t STAT_BackEnd::getStackTraceFromAll(unsigned int nRetries, unsigned i nodes2d_[newChildId] = msg; nodeIdToAttrs_[newChildId]["function"] = msg; - StatBitVectorEdge_t *edge = initializeBitVectorEdge(proctabSize_); + StatBitVectorEdge_t *edge = initializeBitVectorEdge(maxRank_ + 1); if (edge == NULL) { printMsg(STAT_ALLOCATE_ERROR, __FILE__, __LINE__, "Failed to initialize edge\n"); @@ -4133,6 +4171,7 @@ StatError_t STAT_BackEnd::statBenchCreateTraces(unsigned int maxDepth, int nTask proctab_[j].executable_name = NULL; proctab_[j].host_name = NULL; proctab_[j].mpirank = proctab_[0].mpirank + j; + maxRank_ = std::max(maxRank_, proctab_[j].mpirank); } init++; } @@ -4175,7 +4214,7 @@ StatError_t STAT_BackEnd::statBenchCreateTrace(unsigned int maxDepth, unsigned i string path; StatBitVectorEdge_t *edge = NULL; - edge = initializeBitVectorEdge(proctabSize_); + edge = initializeBitVectorEdge(maxRank_ + 1); if (edge == NULL) { printMsg(STAT_ALLOCATE_ERROR, __FILE__, __LINE__, "Failed to initialize edge\n"); diff --git a/src/STAT_BackEnd.h b/src/STAT_BackEnd.h index fcb1cf9..4a9b30d 100644 --- a/src/STAT_BackEnd.h +++ b/src/STAT_BackEnd.h @@ -87,6 +87,7 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND //! An enum type to determine who launched the daemon typedef enum { STATD_LMON_LAUNCH = 0, + STATD_CTI_LAUNCH, STATD_SERIAL_LAUNCH, STATD_MRNET_LAUNCH, } StatDaemonLaunch_t; @@ -583,6 +584,7 @@ class STAT_BackEnd /****************/ int proctabSize_; /*!< the size of the process table */ + int maxRank_; /*!< the largest rank on this node */ int processMapNonNull_; /*!< the number of active processes */ unsigned int logType_; /*!< the logging level */ unsigned int nDaemonsPerNode_; /*!< the number of daemons per node */ diff --git a/src/STAT_FilterDefinitions.C b/src/STAT_FilterDefinitions.C index 1f121ac..a427b6d 100644 --- a/src/STAT_FilterDefinitions.C +++ b/src/STAT_FilterDefinitions.C @@ -59,7 +59,11 @@ const char *statMerge_format_string = "%Ac %d %d %ud"; const char *STAT_checkVersion_format_string = "%d %d %d %d %d"; //! The MRNet format string for the STAT filter initialization -const char *filterInit_format_string = "%uc %s %d"; +const char *filterInit_format_string = "%uc %s %d %d"; + +//! Whether the ranks sent are global, or local to node +//! If local, needs offset applied. +bool gHaveGlobalRanks = false; //! Global variable for file pointer FILE *gStatOutFp = NULL; @@ -191,7 +195,7 @@ void filterInit(vector &inputPackets, { char *logDir; char fileName[BUFSIZE], hostName[BUFSIZE]; - int intRet, mrnetOutputLevel; + int intRet, mrnetOutputLevel, haveGlobalRanks; unsigned int i; graphlib_error_t graphlibError; @@ -210,8 +214,9 @@ void filterInit(vector &inputPackets, { for (i = 0; i < inputPackets.size(); i++) { - if (inputPackets[i]->unpack("%uc %s %d", &gLogging, &logDir, &mrnetOutputLevel) == -1) + if (inputPackets[i]->unpack("%uc %s %d %d", &gLogging, &logDir, &mrnetOutputLevel, &haveGlobalRanks) == -1) cpPrintMsg(STAT_MRNET_ERROR, __FILE__, __LINE__, "failed to unpack packet\n"); + gHaveGlobalRanks = (haveGlobalRanks > 0); if (topology.get_Network()->is_LocalNodeInternal()) { if (gLogging & STAT_LOG_CP) @@ -360,7 +365,7 @@ void statMerge(vector &inputPackets, { currentPacket = inputPackets[childrenOrderIter->second]; edgeLabelWidths[i] = (*currentPacket)[1]->get_int32_t(); - totalWidth += edgeLabelWidths[i]; + totalWidth = std::max(totalWidth, edgeLabelWidths[i]); } if (tag == PROT_SEND_NODE_IN_EDGE_RESP) @@ -388,7 +393,11 @@ void statMerge(vector &inputPackets, /* Deserialize edge in packet element [0] */ byteArray = (char *)((*currentPacket)[0]->get_array(&type, &byteArrayLen)); gStatGraphRoutinesTotalWidth = totalWidth; - gStatGraphRoutinesEdgeLabelWidths = edgeLabelWidths; + if (!gHaveGlobalRanks) { + gStatGraphRoutinesEdgeLabelWidths = edgeLabelWidths; + } else { + gStatGraphRoutinesEdgeLabelWidths = nullptr; + } gStatGraphRoutinesCurrentIndex = rank; statFilterDeserializeEdge((void **)&edge, byteArray, byteArrayLen); statMergeEdge(retEdge, edge); @@ -409,6 +418,7 @@ void statMerge(vector &inputPackets, cpPrintMsg(STAT_GRAPHLIB_ERROR, __FILE__, __LINE__,"%s: Failed to malloc sOutputByteArray\n", strerror(errno)); return; } + statSerializeEdge(sOutputByteArray, retEdge); statFreeEdge(retEdge); } /* if (tag == PROT_SEND_NODE_IN_EDGE_RESP) */ @@ -433,7 +443,11 @@ void statMerge(vector &inputPackets, else { gStatGraphRoutinesTotalWidth = totalWidth; - gStatGraphRoutinesEdgeLabelWidths = edgeLabelWidths; + if (!gHaveGlobalRanks) { + gStatGraphRoutinesEdgeLabelWidths = edgeLabelWidths; + } else { + gStatGraphRoutinesEdgeLabelWidths = nullptr; + } gStatGraphRoutinesCurrentIndex = rank; graphlibError = graphlib_deserializeBasicGraph(¤tGraph, gStatMergeFunctions, byteArray, byteArrayLen); } @@ -444,6 +458,7 @@ void statMerge(vector &inputPackets, } graphlibError = graphlib_mergeGraphs(returnGraph, currentGraph); + if (GRL_IS_FATALERROR(graphlibError)) { cpPrintMsg(STAT_GRAPHLIB_ERROR, __FILE__, __LINE__, "Failed to merge graph %d\n", rank); diff --git a/src/STAT_FrontEnd.C b/src/STAT_FrontEnd.C index 0b6efbf..deb85a6 100644 --- a/src/STAT_FrontEnd.C +++ b/src/STAT_FrontEnd.C @@ -207,15 +207,9 @@ STAT_FrontEnd::STAT_FrontEnd() statInitializeMergeFunctions(); /* Get the FE hostname */ - string temp; - intRet = XPlat::NetUtils::GetLocalHostName(temp); - if (intRet == 0) - snprintf(hostname_, BUFSIZE, "%s", temp.c_str()); - else - { - intRet = gethostname(hostname_, BUFSIZE); - if (intRet != 0) - printMsg(STAT_WARNING, __FILE__, __LINE__, "gethostname failed with error code %d\n", intRet); + intRet = gethostname(hostname_, BUFSIZE); + if (intRet != 0) { + printMsg(STAT_WARNING, __FILE__, __LINE__, "gethostname failed with error code %d\n", intRet); } /* Initialize variables */ @@ -234,7 +228,7 @@ STAT_FrontEnd::STAT_FrontEnd() launcherArgc_ = 1; topologySize_ = 0; logging_ = 0; - jobId_ = NULL; + jobId_ = 0; launcherArgv_ = NULL; applExe_ = NULL; remoteNode_ = NULL; @@ -755,7 +749,7 @@ StatError_t STAT_FrontEnd::connectMrnetTree(bool blocking) sConnectAttempt = sConnectAttempt + 1; if (daemonsHaveExited()) { - printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, "LMON detected the daemons have exited\n"); + printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, daemonExitedMsg()); return STAT_DAEMON_ERROR; } if (sConnectAttempt < sConnectTimeout * 100) @@ -773,7 +767,7 @@ StatError_t STAT_FrontEnd::connectMrnetTree(bool blocking) { if (daemonsHaveExited()) { - printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, "LMON detected the daemons have exited\n"); + printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, daemonExitedMsg()); return STAT_DAEMON_ERROR; } if (gNumCallbacks == nApplNodes_ * nDaemonsPerNode_) @@ -862,7 +856,8 @@ StatError_t STAT_FrontEnd::setupConnectedMrnetTree() } /* Send an initial message using the broadcast stream */ - if (broadcastStream_->send(PROT_SEND_BROADCAST_STREAM, "%uc %s %d", logging_, logOutDir_, mrnetOutputLevel_) == -1) + if (broadcastStream_->send(PROT_SEND_BROADCAST_STREAM, "%uc %s %d %d", logging_, logOutDir_, mrnetOutputLevel_, + (haveGlobalRanks()) ? 1 : 0) == -1) { printMsg(STAT_MRNET_ERROR, __FILE__, __LINE__, "failed to send on broadcast stream\n"); return STAT_MRNET_ERROR; @@ -1369,7 +1364,7 @@ StatError_t STAT_FrontEnd::receiveAck(bool blocking) { if (daemonsHaveExited()) { - printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, "LMON detected the daemons have exited\n"); + printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, daemonExitedMsg()); isPendingAck_ = false; return STAT_DAEMON_ERROR; } @@ -1392,7 +1387,7 @@ StatError_t STAT_FrontEnd::receiveAck(bool blocking) { if (daemonsHaveExited()) { - printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, "LMON detected the daemons have exited\n"); + printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, daemonExitedMsg()); isPendingAck_ = false; return STAT_DAEMON_ERROR; } @@ -2195,7 +2190,7 @@ StatError_t STAT_FrontEnd::checkVersion() { if (daemonsHaveExited()) { - printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, "LMON detected the daemons have exited\n"); + printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, daemonExitedMsg()); return STAT_DAEMON_ERROR; } usleep(1000); @@ -2265,12 +2260,12 @@ StatError_t STAT_FrontEnd::attachApplication(bool blocking) } if (isKilled()) { - printMsg(STAT_APPLICATION_EXITED, __FILE__, __LINE__, "LMON detected the application has exited\n"); + printMsg(STAT_APPLICATION_EXITED, __FILE__, __LINE__, appExitedMsg()); return STAT_APPLICATION_EXITED; } if (daemonsHaveExited()) { - printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, "LMON detected the daemons have exited\n"); + printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, daemonExitedMsg()); return STAT_DAEMON_ERROR; } @@ -2343,12 +2338,12 @@ StatError_t STAT_FrontEnd::pause(bool blocking) } if (isKilled()) { - printMsg(STAT_APPLICATION_EXITED, __FILE__, __LINE__, "LMON detected the application has exited\n"); + printMsg(STAT_APPLICATION_EXITED, __FILE__, __LINE__, appExitedMsg()); return STAT_APPLICATION_EXITED; } if (daemonsHaveExited()) { - printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, "LMON detected the daemons have exited\n"); + printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, daemonExitedMsg()); return STAT_DAEMON_ERROR; } @@ -2419,12 +2414,12 @@ StatError_t STAT_FrontEnd::resume(bool blocking) } if (isKilled()) { - printMsg(STAT_APPLICATION_EXITED, __FILE__, __LINE__, "LMON detected the application has exited\n"); + printMsg(STAT_APPLICATION_EXITED, __FILE__, __LINE__, appExitedMsg()); return STAT_APPLICATION_EXITED; } if (daemonsHaveExited()) { - printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, "LMON detected the daemons have exited\n"); + printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, daemonExitedMsg()); return STAT_DAEMON_ERROR; } @@ -2494,12 +2489,12 @@ StatError_t STAT_FrontEnd::sampleStackTraces(unsigned int sampleType, unsigned i } if (isKilled()) { - printMsg(STAT_APPLICATION_EXITED, __FILE__, __LINE__, "LMON detected the application has exited\n"); + printMsg(STAT_APPLICATION_EXITED, __FILE__, __LINE__, appExitedMsg()); return STAT_APPLICATION_EXITED; } if (daemonsHaveExited()) { - printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, "LMON detected the daemons have exited\n"); + printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, daemonExitedMsg()); return STAT_DAEMON_ERROR; } @@ -2600,7 +2595,7 @@ StatError_t STAT_FrontEnd::gatherImpl(StatProt_t type, bool blocking) } if (daemonsHaveExited()) { - printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, "LMON detected the daemons have exited\n"); + printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, daemonExitedMsg()); return STAT_DAEMON_ERROR; } @@ -2665,7 +2660,7 @@ StatError_t STAT_FrontEnd::receiveStackTraces(bool blocking) { if (daemonsHaveExited()) { - printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, "LMON detected the daemons have exited\n"); + printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, daemonExitedMsg()); return STAT_DAEMON_ERROR; } if (blocking == true) @@ -2756,10 +2751,15 @@ StatError_t STAT_FrontEnd::receiveStackTraces(bool blocking) for (ranksIter = remapRanksList_.begin(); ranksIter != remapRanksList_.end(); ranksIter++) { /* Fill edge labels for this daemon */ - hostRanks = mrnetRankToMpiRanksMap_[*ranksIter]; - gStatGraphRoutinesRanksList = hostRanks->list; - gStatGraphRoutinesRanksListLength = hostRanks->count; - gStatGraphRoutinesCurrentIndex = offset; + if (!haveGlobalRanks()) { + hostRanks = mrnetRankToMpiRanksMap_[*ranksIter]; + gStatGraphRoutinesRanksList = hostRanks->list; + gStatGraphRoutinesRanksListLength = hostRanks->count; + gStatGraphRoutinesCurrentIndex = offset; + } else { + hostRanks = nullptr; + gStatGraphRoutinesRanksList = nullptr; + } graphlibError = graphlib_mergeGraphs(sortedStackTraces, stackTraces); if (GRL_IS_FATALERROR(graphlibError)) { @@ -2768,7 +2768,9 @@ StatError_t STAT_FrontEnd::receiveStackTraces(bool blocking) } /* update offset, round up to the nearest bit vector count*/ - offset += statBitVectorLength(hostRanks->count); + if (hostRanks != nullptr) { + offset += statBitVectorLength(hostRanks->count); + } } gEndTime.setTime(); @@ -3023,12 +3025,12 @@ char *STAT_FrontEnd::getNodeInEdge(int nodeId) } if (isKilled()) { - printMsg(STAT_APPLICATION_EXITED, __FILE__, __LINE__, "LMON detected the application has exited\n"); + printMsg(STAT_APPLICATION_EXITED, __FILE__, __LINE__, appExitedMsg()); return NULL; } if (daemonsHaveExited()) { - printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, "LMON detected the daemons have exited\n"); + printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, daemonExitedMsg()); return NULL; } @@ -3065,7 +3067,7 @@ char *STAT_FrontEnd::getNodeInEdge(int nodeId) { if (daemonsHaveExited()) { - printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, "LMON detected the daemons have exited\n"); + printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, daemonExitedMsg()); return NULL; } usleep(1000); @@ -3108,15 +3110,19 @@ char *STAT_FrontEnd::getNodeInEdge(int nodeId) /* Fill edge label on a per daemon basis */ printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "Filling in edges\n"); offset = 0; - for (ranksIter = remapRanksList_.begin(); ranksIter != remapRanksList_.end(); ranksIter++) - { - /* Fill edge labels for this daemon */ - hostRanks = mrnetRankToMpiRanksMap_[*ranksIter]; - gStatGraphRoutinesRanksList = hostRanks->list; - gStatGraphRoutinesRanksListLength = hostRanks->count; - gStatGraphRoutinesCurrentIndex = offset; + if (!haveGlobalRanks()) { + for (ranksIter = remapRanksList_.begin(); ranksIter != remapRanksList_.end(); ranksIter++) + { + /* Fill edge labels for this daemon */ + hostRanks = mrnetRankToMpiRanksMap_[*ranksIter]; + gStatGraphRoutinesRanksList = hostRanks->list; + gStatGraphRoutinesRanksListLength = hostRanks->count; + gStatGraphRoutinesCurrentIndex = offset; + statMergeEdgeOrdered(orderedEdge, unorderedEdge); + offset += statBitVectorLength(hostRanks->count); + } + } else { statMergeEdgeOrdered(orderedEdge, unorderedEdge); - offset += statBitVectorLength(hostRanks->count); } statFreeEdge((void *)unorderedEdge); } @@ -3311,14 +3317,14 @@ StatError_t STAT_FrontEnd::detachApplication(int *stopList, int stopListSize, bo if (isKilled()) { #ifndef DYSECTAPI - printMsg(STAT_APPLICATION_EXITED, __FILE__, __LINE__, "LMON detected the application has exited\n"); + printMsg(STAT_APPLICATION_EXITED, __FILE__, __LINE__, appExitedMsg()); return STAT_APPLICATION_EXITED; #endif } if (daemonsHaveExited()) { #ifndef DYSECTAPI - printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, "LMON detected the daemons have exited\n"); + printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, daemonExitedMsg()); #endif return STAT_DAEMON_ERROR; } @@ -3481,12 +3487,12 @@ StatError_t STAT_FrontEnd::terminateApplication(bool blocking) } if (isKilled()) { - printMsg(STAT_APPLICATION_EXITED, __FILE__, __LINE__, "LMON detected the application has exited\n"); + printMsg(STAT_APPLICATION_EXITED, __FILE__, __LINE__, appExitedMsg()); return STAT_APPLICATION_EXITED; } if (daemonsHaveExited()) { - printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, "LMON detected the daemons have exited\n"); + printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, daemonExitedMsg()); return STAT_DAEMON_ERROR; } @@ -4370,12 +4376,12 @@ StatError_t STAT_FrontEnd::statBenchCreateStackTraces(unsigned int maxDepth, uns { if (isKilled()) { - printMsg(STAT_APPLICATION_EXITED, __FILE__, __LINE__, "LMON detected the application has exited\n"); + printMsg(STAT_APPLICATION_EXITED, __FILE__, __LINE__, appExitedMsg()); return STAT_APPLICATION_EXITED; } if (daemonsHaveExited()) { - printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, "LMON detected the daemons have exited\n"); + printMsg(STAT_DAEMON_ERROR, __FILE__, __LINE__, daemonExitedMsg()); return STAT_DAEMON_ERROR; } usleep(1000); diff --git a/src/STAT_FrontEnd.h b/src/STAT_FrontEnd.h index 3f984ef..28236d7 100644 --- a/src/STAT_FrontEnd.h +++ b/src/STAT_FrontEnd.h @@ -107,7 +107,7 @@ typedef struct _remap_node struct _remap_node **children; } RemapNode_t; -//! The statPack function registered to LMON to send data to the daemons +//! The statPack function registered to send data to the daemons /*! \param data - the input data \param[out] buf - the output buffer @@ -933,6 +933,10 @@ class STAT_FrontEnd StatError_t serveFileRequest(const char *receiveFileName); #endif + virtual bool haveGlobalRanks() = 0; + + virtual const char* daemonExitedMsg() = 0; + virtual const char* appExitedMsg() = 0; /****************/ /* Private data */ diff --git a/src/STAT_GraphRoutines.C b/src/STAT_GraphRoutines.C index 525b23f..2dfc4f0 100644 --- a/src/STAT_GraphRoutines.C +++ b/src/STAT_GraphRoutines.C @@ -41,7 +41,7 @@ int gNumEdgeAttrs; int gStatGraphRoutinesTotalWidth; //! the input list of bit vector widths -int *gStatGraphRoutinesEdgeLabelWidths; +int *gStatGraphRoutinesEdgeLabelWidths = NULL; //! the current index into the bit vector int gStatGraphRoutinesCurrentIndex; @@ -666,8 +666,11 @@ void statFilterDeserializeEdge(void **edge, const char *buf, unsigned int bufLen } offset = 0; - for (i = 0; i < gStatGraphRoutinesCurrentIndex; i++) - offset += gStatGraphRoutinesEdgeLabelWidths[i]; + if (gStatGraphRoutinesEdgeLabelWidths != nullptr) { + for (i = 0; i < gStatGraphRoutinesCurrentIndex; i++) { + offset += gStatGraphRoutinesEdgeLabelWidths[i]; + } + } memcpy((void *)&(e->bitVector[offset]), ptr, STAT_BITVECTOR_BYTES * currentEdgeLength); *edge = (void *)e; @@ -702,18 +705,34 @@ int bitVectorContains(StatBitVector_t *vec, int val) void *statMergeEdgeOrdered(void *edge1, const void *edge2) { - int i, bit, byte; + int i, n, bit, byte; StatBitVectorEdge_t *e1 = (StatBitVectorEdge_t *)edge1, *e2 = (StatBitVectorEdge_t *)edge2; if (edge1 == NULL || edge2 == NULL) return NULL; - for (i = 0; i < gStatGraphRoutinesRanksListLength; i++) - { - if (bitVectorContains(e2->bitVector, gStatGraphRoutinesCurrentIndex * STAT_BITVECTOR_BITS + i) == 1) + + // Use offset rank list if provided + if (gStatGraphRoutinesRanksList != nullptr) { + for (i = 0, n = e2->length * STAT_BITVECTOR_BITS; i < n; ++i) { - byte = gStatGraphRoutinesRanksList[i] / STAT_BITVECTOR_BITS; - bit = gStatGraphRoutinesRanksList[i] % STAT_BITVECTOR_BITS; - e1->bitVector[byte] |= STAT_GRAPH_BIT(bit); + if (bitVectorContains(e2->bitVector, i)) + { + // gStatGraphRoutineRanksList maps edge2's subset of ranks + // to the full set referenced by edge1 + byte = gStatGraphRoutinesRanksList[i] / STAT_BITVECTOR_BITS; + bit = gStatGraphRoutinesRanksList[i] % STAT_BITVECTOR_BITS; + if (byte < e1->length) + e1->bitVector[byte] |= STAT_GRAPH_BIT(bit); + else + fprintf(stderr, "cannot merge edge bit set\n"); + } + } + + // Regular bitset merge + } else { + auto len = std::min(e1->length, e2->length); + for (i = 0; i < len; i++) { + e1->bitVector[i] |= e2->bitVector[i]; } } return edge1; diff --git a/src/STAT_ctiBackEnd.C b/src/STAT_ctiBackEnd.C new file mode 100644 index 0000000..f1cf72d --- /dev/null +++ b/src/STAT_ctiBackEnd.C @@ -0,0 +1,297 @@ +#include "STAT_ctiBackEnd.h" +#include "common_tools_be.h" +#include +#include + +STAT_ctiBackEnd::STAT_ctiBackEnd(StatDaemonLaunch_t launchType) + : STAT_BackEnd(launchType) +{ + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "Create CTI Backend.\n"); +} + +StatError_t STAT_ctiBackEnd::init(int *argc, char ***argv) +{ + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "Initializing stat-cti backend\n"); + return STAT_BackEnd::init(argc, argv); +} + +StatError_t STAT_ctiBackEnd::finalize() +{ + return STAT_OK; +} + +// initialize cti +StatError_t STAT_ctiBackEnd::initLauncher() +{ + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "Getting app processes.\n"); + + // get the proc table + cti_pidList_t* pids = cti_be_findAppPids(); + if (!pids) + { + printMsg(STAT_CTI_ERROR, __FILE__, __LINE__, "CTI failed to get processes.\n"); + return STAT_CTI_ERROR; + } + + int n = pids->numPids; + if (n <= 0) { + printMsg(STAT_CTI_ERROR, __FILE__, __LINE__, "CTI found no procesess.\n"); + cti_be_destroyPidList(pids); + return STAT_CTI_ERROR; + } + + proctab_ = (StatBackEndProcInfo_t*) malloc(n * sizeof(StatBackEndProcInfo_t)); + for (int i=0; ipids[i].pid; + + // Get the executable for the process + char exe[PATH_MAX+1]; + std::string procpath = "/proc/" + std::to_string(pid) + "/exe"; + ssize_t plen = readlink(procpath.c_str(), exe, PATH_MAX+1); + if (plen > 0 && plen <= PATH_MAX) { + exe[plen] = '\0'; + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "got executable %s from %s\n", + exe, procpath.c_str()); + } else { + printMsg(STAT_CTI_ERROR, __FILE__, __LINE__, "could not get application executable\n"); + cti_be_destroyPidList(pids); + return STAT_CTI_ERROR; + } + + proctab_[i].executable_name = strdup(exe); + proctab_[i].host_name = nullptr; + proctab_[i].pid = pid; + proctab_[i].mpirank = pids->pids[i].rank; + + maxRank_ = std::max(maxRank_, pids->pids[i].rank); + } + proctabSize_ = n; + + myRank_ = cti_be_getNodeFirstPE(); + + cti_be_destroyPidList(pids); + + return STAT_OK; +} + +#ifdef STAT_GDB_BE +StatError_t STAT_ctiBackEnd::initGdb() +{ + PyObject *pName; + const char *moduleName = "stat_cuda_gdb"; + + Py_Initialize(); +#if PY_MAJOR_VERSION >= 3 + pName = PyUnicode_FromString(moduleName); +#else + pName = PyString_FromString(moduleName); +#endif + if (pName == NULL) + { + fprintf(errOutFp_, "Cannot convert argument\n"); + return STAT_SYSTEM_ERROR; + } + + char* fileDir = cti_be_getFileDir(); + if (!fileDir) { + fprintf(errOutFp_, "Cannot get CTI file directory\n"); + return STAT_SYSTEM_ERROR; + } + + std::ostringstream os; + os << "import sys\n"; + os << "sys.path.append('" << fileDir << "')\n"; + + if (PyRun_SimpleString(os.str().c_str())) { + fprintf(errOutFp_, "Setting python path fails\n"); + } + + gdbModule_ = PyImport_Import(pName); + Py_DECREF(pName); + if (gdbModule_ == NULL) + { + fprintf(errOutFp_, "Failed to import Python module %s\n", moduleName); + PyErr_Print(); + return STAT_SYSTEM_ERROR; + } + usingGdb_ = true; + + std::string newFunctionName = "new_gdb_instance"; + auto newFunc = PyObject_GetAttrString(gdbModule_, newFunctionName.c_str()); + if (!newFunc || !PyCallable_Check(newFunc)) { + if (PyErr_Occurred()) + PyErr_Print(); + } + + return STAT_OK; +} +#endif + + +// Open the connection file \a filename and return the input stream. +// The connection file is generated by the front end and broadcast via the +// CTI manifest, so we we may need to wait for it to show up. +static std::ifstream waitForConnectionFile(const std::string connectionFile) +{ + std::ifstream connections; + static const int timeout = 60; // seconds + for (int attempt = 0; attempt < 100 * timeout; ++attempt) { + connections.open(connectionFile.c_str()); + if (connections) { + return connections; + } + usleep(10000); + } + return connections; +} + +// Setting up the MRNet + cti tool takes a few steps: +// 1- launch the back-end processes (gives us the nodes involved) +// 2- then create the front and communication nodes of the MRNet tree (giving us +// the set of parent nodes) +// 3- gather the parent connection information an broadcast via the cti manifest +// 4- each back-end node reads the connection file to find it's MRNet parent +// 5- with which it can create the back-end MRNet node +// When this function is called, steps 1 though 3 are already done. This call performs steps +// 4 and 5. +StatError_t STAT_ctiBackEnd::connect(int argc, char **argv) +{ + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "connecting backend\n"); + if (argc != 0) { + printMsg(STAT_SYSTEM_ERROR, __FILE__, __LINE__, "not expecting arguments\n"); + return STAT_SYSTEM_ERROR; + } + + // get the hostname + char* hostnamePtr = cti_be_getNodeHostname(); + if (!hostnamePtr) { + printMsg(STAT_SYSTEM_ERROR, __FILE__, __LINE__, "could not get hostname\n"); + return STAT_SYSTEM_ERROR; + } + + std::string hostname(hostnamePtr); + free(hostnamePtr); + + // Get the parent information so that we can connect this back end node to the mrnet tree. + char* fileDir = cti_be_getFileDir(); + if (!fileDir) { + printMsg(STAT_CTI_ERROR, __FILE__, __LINE__, "CTI failed to file dir.\n"); + return STAT_CTI_ERROR; + } + + std::string connectionFile = std::string(fileDir) + "/daemoninfo.txt"; + + free(fileDir); + fileDir = nullptr; + + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "trying to read file %s\n", connectionFile.c_str()); + std::ifstream connections = waitForConnectionFile(connectionFile); + + if (!connections) { + printMsg(STAT_SYSTEM_ERROR, __FILE__, __LINE__, "could not open connection file %s\n", + connectionFile.c_str()); + return STAT_SYSTEM_ERROR; + } else { + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "successfully opened connection file %s\n", + connectionFile.c_str()); + } + + + // find the hostname in the connections files + + // The file starts with back-end hostnames : as hostname rank parentIndex + int numHosts = 0; + if (connections >> numHosts) { + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "got %d hosts\n", numHosts); + } else { + printMsg(STAT_SYSTEM_ERROR, __FILE__, __LINE__, "reading number of hosts failed\n"); + } + + if (!connections || numHosts <= 0) { + printMsg(STAT_SYSTEM_ERROR, __FILE__, __LINE__, "could not get find parent connection info\n"); + return STAT_SYSTEM_ERROR; + } + + int parentIdx = -1, mrnRank = -1; + for (int hostIdx=0; connections && hostIdx < numHosts; ++hostIdx) { + std::string host; int rank, pidx; + connections >> host >> rank >> pidx; + if (parentIdx < 0 && host == hostname) { + parentIdx = pidx; + mrnRank = rank; + } + } + + if (parentIdx < 0) { + printMsg(STAT_SYSTEM_ERROR, __FILE__, __LINE__, "could not find host %s in host list\n", + hostname.c_str()); + return STAT_SYSTEM_ERROR; + } + + // Then the connection info for the parents as: hostname port rank + int numParents = 0; + connections >> numParents; + std::string parentHostname; int parentPort = -1, parentRank = -1; + std::string phost; int pport, prank; + for (int idx=0; connections && idx> phost >> pport >> prank)) + break; + + if (idx == parentIdx) { + parentHostname = phost; + parentPort = pport; + parentRank = prank; + break; + } + } + + if (parentHostname.empty()) { + printMsg(STAT_SYSTEM_ERROR, __FILE__, __LINE__, "could not find parent info\n"); + return STAT_SYSTEM_ERROR; + } + + // create the back-end MRNet node + std::array sbeArgs = { "", + parentHostname, + std::to_string(parentPort), + std::to_string(parentRank), + hostname, + std::to_string(mrnRank) }; + char* beArgs[6]; + for (int i=0; i<6; ++i) { + beArgs[i] = const_cast(sbeArgs[i].c_str()); + } + + for (int i=0; i<6; ++i) { + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "mrnet be arg[%d] = %s\n", i, beArgs[i]); + } + + network_ = MRN::Network::CreateNetworkBE(6, beArgs); + + if (!network_) { + printMsg(STAT_SYSTEM_ERROR, __FILE__, __LINE__, "back end network creation failed\n"); + return STAT_SYSTEM_ERROR; + } + + // And Bob's your uncle. + connected_ = true; + + return STAT_OK; +} + +/****************** + * STATBench Code * + ******************/ + +StatError_t STAT_ctiBackEnd::statBenchConnectInfoDump() +{ + printMsg(STAT_CTI_ERROR, __FILE__, __LINE__, "Statbench info dump not supported in CTI implementatioin.\n"); + return STAT_SYSTEM_ERROR; +} + +#ifdef USE_CTI +STAT_BackEnd* STAT_BackEnd::make(StatDaemonLaunch_t launchType) +{ + return new STAT_ctiBackEnd(launchType); +} +#endif diff --git a/src/STAT_ctiBackEnd.h b/src/STAT_ctiBackEnd.h new file mode 100644 index 0000000..f9113ea --- /dev/null +++ b/src/STAT_ctiBackEnd.h @@ -0,0 +1,31 @@ +#ifndef __STAT_CTIBACKEND_H +#define __STAT_CTIBACKEND_H + +#include "STAT_BackEnd.h" + +// backend implementation class that uses cti to launch tool daemons +class STAT_ctiBackEnd : public STAT_BackEnd +{ +public: + + STAT_ctiBackEnd(StatDaemonLaunch_t launchType); + + virtual StatError_t init(int *argc, char ***argv); + virtual StatError_t finalize(); + + //! Initialize and set up the cti + /* + \return STAT_OK on success + */ + virtual StatError_t initLauncher(); + +#ifdef STAT_GDB_BE + virtual StatError_t initGdb(); +#endif + + virtual StatError_t connect(int argc = 0, char **argv = NULL); + virtual StatError_t statBenchConnectInfoDump(); +}; + + +#endif diff --git a/src/STAT_ctiFrontEnd.C b/src/STAT_ctiFrontEnd.C new file mode 100644 index 0000000..c3bd456 --- /dev/null +++ b/src/STAT_ctiFrontEnd.C @@ -0,0 +1,642 @@ +#include "STAT_ctiFrontEnd.h" +#include + +#define ctiError() (printMsg(STAT_SYSTEM_ERROR, __FILE__, __LINE__, "CTI Error: %s\n", cti_error_str()),STAT_SYSTEM_ERROR) + +STAT_ctiFrontEnd::STAT_ctiFrontEnd() : appId_(0), session_(0), hosts_(nullptr), + tasksPerPE_(1) +{ + std::string temp; + if (XPlat::NetUtils::GetLocalHostName(temp) == 0) + { + snprintf(hostname_, BUFSIZE, "%s", temp.c_str()); + } + + // Ensure CTI can initialize + if (auto hostname = cti_getHostname()) { + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "CTI initialized with hostname %s\n", hostname); + free(hostname); + } else { + ctiError(); + exit(STAT_SYSTEM_ERROR); + } +} + +STAT_ctiFrontEnd::~STAT_ctiFrontEnd() +{ + if (hosts_) cti_destroyHostsList(hosts_); + + if (appId_ != 0 && cti_appIsValid(appId_)) + { + cti_deregisterApp(appId_); + } +} + +StatError_t STAT_ctiFrontEnd::setupForSerialAttach() +{ + printMsg(STAT_ARG_ERROR, __FILE__, __LINE__, "Serial launch is not supported in CTI\n"); + return STAT_ARG_ERROR; +} + +StatError_t STAT_ctiFrontEnd::attach() +{ + // register the app with CTI + void* vops = nullptr; + auto wlm = cti_open_ops(&vops); + + if (!vops) { + return ctiError(); + } + + switch (wlm) { + case CTI_WLM_SLURM: + { + auto ops = static_cast(vops); + cti_srunProc_t *srunInfo = ops->getJobInfo(launcherPid_); + if (!srunInfo) + return ctiError(); + + appId_ = ops->registerJobStep(srunInfo->jobid, srunInfo->stepid); + free(srunInfo); + + if (!appId_) + return ctiError(); + + break; + } + case CTI_WLM_SSH: + { + auto ops = static_cast(vops); + appId_ = ops->registerLauncherPid((pid_t)launcherPid_); + if (!appId_) { + return ctiError(); + } + break; + } + case CTI_WLM_ALPS: + { + auto ops = static_cast(vops); + uint64_t apid = ops->getApid((pid_t)launcherPid_); + if (apid == 0) { + return ctiError(); + } + appId_ = ops->registerApid(apid); + if (appId_ == 0) + { + return ctiError(); + } + break; + } + + case CTI_WLM_PALS: + { + auto ops = static_cast(vops); + + char* apid = ops->getApid((pid_t)launcherPid_); + if (!apid) { + return ctiError(); + } + + appId_ = ops->registerApid(apid); + + free(apid); + if (!appId_) { + return ctiError(); + } + break; + } + + case CTI_WLM_FLUX: + { + auto ops = static_cast(vops); + + char* jobid = ops->getJobid((pid_t)launcherPid_); + if (!jobid) { + return ctiError(); + } + + appId_ = ops->registerJob(jobid); + + free(jobid); + if (!appId_) { + return ctiError(); + } + break; + } + + default: + printMsg(STAT_SYSTEM_ERROR, __FILE__, __LINE__, "Unsupported Cray WLM!\n"); + return STAT_SYSTEM_ERROR; + } + return STAT_OK; +} + +StatError_t STAT_ctiFrontEnd::launch() +{ + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "Launching application with CTI\n"); + + // launcherArgv_ is null terminated. But the stat documentation tells you to include + // the launcher in the argument list, so I'll ignore the first one. + if (launcherArgc_ < 3) { + printMsg(STAT_ARG_ERROR, __FILE__, __LINE__, "No application given for launch\n"); + return STAT_ARG_ERROR; + } + + const char* env[] = { nullptr }; + appId_ = cti_launchAppBarrier(launcherArgv_+1, -1, -1, nullptr, nullptr, env); + //appId_ = cti_launchApp(launcherArgv_+1, -1, -1, nullptr, nullptr, env); + if (!appId_) { + return ctiError(); + } + + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "Application launched successfully\n"); + return STAT_OK; +} + +StatError_t STAT_ctiFrontEnd::postAttachApplication() +{ + if (applicationOption_ == STAT_LAUNCH && appId_) { + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "Releasing app barrier\n"); + if (cti_releaseAppBarrier(appId_)) + return ctiError(); + //printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "No Releasing app barrier\n"); + } + return STAT_FrontEnd::postAttachApplication(); +} + + +// Launch a back-end process on every node where the application is running. We don't +// yet have an MRNet network, so the one link we'll have available is the back-end nodes +// can access files the front end ships in the CTI manifest. +StatError_t STAT_ctiFrontEnd::launchDaemons() +{ + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "Launching daemons with CTI\n"); + + if (!toolDaemonExe_) { + printMsg(STAT_ARG_ERROR, __FILE__, __LINE__, "Tool daemon path not set\n"); + return STAT_ARG_ERROR; + } + + if (applicationOption_ == STAT_SERIAL_ATTACH || applicationOption_ == STAT_SERIAL_GDB_ATTACH) { + printMsg(STAT_ARG_ERROR, __FILE__, __LINE__, "Serial launch is not supported in CTI\n"); + return STAT_ARG_ERROR; + } + + // initialize cti app id + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "Attaching to job\n"); + + StatError_t statError = STAT_OK; + if (applicationOption_ == STAT_ATTACH || applicationOption_ == STAT_GDB_ATTACH) { + statError = attach(); + if (statError != STAT_OK) { + return statError; + } + + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "Attached to job\n"); + + } else if (applicationOption_ == STAT_LAUNCH) { + statError = launch(); + if (statError != STAT_OK) { + return statError; + } + + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "Launched the job\n"); + } else { + printMsg(STAT_ARG_ERROR, __FILE__, __LINE__, "Launch option %d it not supported by CTI\n", + applicationOption_); + return STAT_ARG_ERROR; + } + + + /* Increase the max proc and max fd limits for MRNet threads */ + +#if (defined(HAVE_GETRLIMIT) && defined(HAVE_SETRLIMIT)) + statError = increaseSysLimits(); + if (statError != STAT_OK) + printMsg(statError, __FILE__, __LINE__, "Failed to increase limits... attempting to run with current configuration\n"); +#endif + + int daemonArgc = 1; + char **daemonArgv = nullptr; + + if (applicationOption_ == STAT_GDB_ATTACH || applicationOption_ == STAT_SERIAL_GDB_ATTACH) { + const char* pythonPath = getenv("PYTHONPATH"); + if (!pythonPath) + pythonPath = ":"; + + const char* gdbCommand = getenv("STAT_GDB"); + if (!gdbCommand) + gdbCommand = "gdb"; + + daemonArgc += 4; + daemonArgv = (char **)realloc(daemonArgv, daemonArgc * sizeof(char *)); + daemonArgv[daemonArgc - 5] = strdup("-P"); + daemonArgv[daemonArgc - 4] = strdup(pythonPath); + daemonArgv[daemonArgc - 3] = strdup("-G"); + daemonArgv[daemonArgc - 2] = strdup(gdbCommand); + daemonArgv[daemonArgc - 1] = NULL; + } + + daemonArgc += 2; + daemonArgv = (char **)realloc(daemonArgv, daemonArgc * sizeof(char *)); + daemonArgv[daemonArgc - 3] = strdup("-d"); + char tempString[BUFSIZE]; + snprintf(tempString, BUFSIZE, "%d", nDaemonsPerNode_); + daemonArgv[daemonArgc - 2] = strdup(tempString); + daemonArgv[daemonArgc - 1] = NULL; + + statError = addDaemonLogArgs(daemonArgc, daemonArgv); + if (statError != STAT_OK) { + printMsg(statError, __FILE__, __LINE__, "Failed to add daemon logging args\n"); + return statError; + } + + for (int i = 0; i < daemonArgc; i++) + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "daemonArgv[%d] = %s\n", i, daemonArgv[i]); + + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "Create CTI session\n"); + + session_ = cti_createSession(appId_); + if (!session_) + return ctiError(); + + cti_manifest_id_t manifest = cti_createManifest(session_); + if (!manifest) + return ctiError(); + + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "Adding binary %s to manifest\n", toolDaemonExe_); + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "LD_LIBRARY_PATH=%s\n", getenv("LD_LIBRARY_PATH")); + + if (cti_addManifestBinary(manifest, toolDaemonExe_)) + return ctiError(); + + if (cti_addManifestLibrary(manifest, filterPath_)) + return ctiError(); + + if (applicationOption_ == STAT_GDB_ATTACH || applicationOption_ == STAT_SERIAL_GDB_ATTACH) { + char* cpFilterPath = strdup(filterPath_); + char* libDir = dirname(cpFilterPath); + std::string cudaLib; + std::string gdbLib; + + const char* pythonPath = getenv("PYTHONPATH"); + if (!pythonPath) + return ctiError(); + + std::string python_path = pythonPath; + if (python_path.find("python3.6") != std::string::npos) { + cudaLib = std::string(libDir) + "/python3.6/site-packages/cuda_gdb.py"; + gdbLib = std::string(libDir) + "/python3.6/site-packages/gdb.py"; + } else if (python_path.find("python3.9") != std::string::npos) { + cudaLib = std::string(libDir) + "/python3.9/site-packages/cuda_gdb.py"; + gdbLib = std::string(libDir) + "/python3.9/site-packages/gdb.py"; + } + free(cpFilterPath); + + if (cti_addManifestFile(manifest, cudaLib.c_str())) + return ctiError(); + if (cti_addManifestFile(manifest, gdbLib.c_str())) + return ctiError(); + } + + char* cpExe = strdup(toolDaemonExe_); + char* toolExe = basename(cpExe); + + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "Launching tool %s.\n", toolExe); + + if (cti_execToolDaemon(manifest, toolExe, daemonArgv, nullptr)) { + free(cpExe); + return ctiError(); + } + + free(cpExe); + + isLaunched_ = true; + + statError = getProcInfo(); + if (statError != STAT_OK) { + return statError; + } + + if (strcmp(outDir_, "NULL") == 0 || strcmp(filePrefix_, "NULL") == 0) { + statError = createOutputDir(); + if (statError != STAT_OK) { + printMsg(statError, __FILE__, __LINE__, "Failed to create output directory\n"); + return statError; + } + } + + return STAT_OK; +} + +// Assemble the back-end process information which will include the rank to node mapping and +// and a the label based on the name of the executable(s). +StatError_t STAT_ctiFrontEnd::getProcInfo() +{ + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "Getting proc info and applExe from CTI.\n"); + + nApplProcs_ = cti_getNumAppPEs(appId_); + if (!nApplProcs_) + return ctiError(); + + hosts_ = cti_getAppHostsPlacement(appId_); + if (!hosts_) + return ctiError(); + + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "Done.\n"); + + nApplNodes_ = hosts_->numHosts; + procToHost_.reserve(nApplNodes_); + + // Note: the CTI implementation is not currently trying to track the real MPI rank. It's just + // assigning a rank value based based on the order of the backend nodes. We could add a little + // protocol to fix up the bookkeeping after the backends are connected to MRNet, but I'm not + // sure it's really worth it. + int totNumPEs = 0; + for (int i=0, totNumPEs=0; ihosts[i].hostname, hosts_->hosts[i].numPes); + + totNumPEs += hosts_->hosts[i].numPes; + procToHost_.push_back(totNumPEs); + } + + std::string applName; + if (cti_binaryList_t* binList = cti_getAppBinaryList(appId_)) { + for (char** binIt = binList->binaries; *binIt; ++binIt) { + char* bin = *binIt; + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "binary = %s\n", bin); + + if (!applName.empty()) + applName += "_"; + applName += basename(bin); + } + + cti_destroyBinaryList(binList); + } + + if (applName.empty()) { + printMsg(STAT_CTI_ERROR, __FILE__, __LINE__, "did not get application name\n"); + applName = "unknown"; + } + + applExe_ = strdup(applName.c_str()); + + return STAT_OK; +} + +StatError_t STAT_ctiFrontEnd::sendDaemonInfo() +{ + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "sending daemon info\n"); + + // the backend processes can't connect to mrnet without the parent information, so + // we need to ship enough information via the cti manifest to infer the connectivity + if (leafInfo_.leafCps.empty()) { + printMsg(STAT_SYSTEM_ERROR, __FILE__, __LINE__, "MRNet tree was not created\n"); + return STAT_SYSTEM_ERROR; + } + + std::string connectFileDir = std::string(outDir_) + "/" + filePrefix_ + ".daemons"; + if (mkdir(connectFileDir.c_str(), S_IRWXU)) { + printMsg(STAT_SYSTEM_ERROR, __FILE__, __LINE__, "could not create directory %s\n", + connectFileDir.c_str()); + return STAT_SYSTEM_ERROR; + } + + std::string connectFile = connectFileDir + "/daemoninfo.txt"; + std::ofstream str(connectFile.c_str()); + if (!str) { + printMsg(STAT_SYSTEM_ERROR, __FILE__, __LINE__, "could not create file %s\n", + connectFile.c_str()); + return STAT_SYSTEM_ERROR; + } + + + int numParents = leafInfo_.leafCps.size(); + int numHosts = hosts_->numHosts; + + // print out the hosts to parent index info + str << numHosts << "\n"; + for (int i=0; ihosts[i]; + int rank = i + topologySize_; + int parentIdx = (numParents * i) / numHosts; + str << host.hostname << " " << rank << " " << parentIdx << "\n"; + } + + + // mrnet parent nodes + str << leafInfo_.leafCps.size() << "\n"; + + for ( auto node : leafInfo_.leafCps) { + str << node->get_HostName() << " " << node->get_Port() << " " << node->get_Rank() << "\n"; + } + + if (!str) { + printMsg(STAT_SYSTEM_ERROR, __FILE__, __LINE__, "writing daemon info file %s failed\n", + connectFile.c_str()); + return STAT_SYSTEM_ERROR; + } + str.close(); + + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "sending manifest with file %s\n", + connectFile.c_str()); + + // and ship the file. + cti_manifest_id_t manifest = cti_createManifest(session_); + if (!manifest) { + return ctiError(); + } + + if (cti_addManifestFile(manifest, connectFile.c_str())) { + return ctiError(); + } + + if (cti_sendManifest(manifest)) { + return ctiError(); + } + + return STAT_OK; +} + +StatError_t STAT_ctiFrontEnd::createMRNetNetwork(const char* topologyFileName) +{ + cti_manifest_id_t manifest = cti_createManifest(session_); + + std::map attrs; + attrs["CRAY_CTI_APPID"] = std::to_string(appId_); + attrs["CRAY_CTI_MID"] = std::to_string(manifest); + + // the filter should have already been sent in the manifest + network_ = MRN::Network::CreateNetworkFE(topologyFileName, NULL, NULL, &attrs); + return STAT_OK; +} + + +void STAT_ctiFrontEnd::detachFromLauncher(const char* errMsg) +{ + if (session_) { + if (cti_destroySession(session_)) { + printMsg(STAT_SYSTEM_ERROR, __FILE__, __LINE__, "Detach failed %s\n", errMsg); + } else { + session_ = 0; + } + } +} + +void STAT_ctiFrontEnd::shutDown() { + + if (network_ != NULL && isConnected_ == true) + shutdownMrnetTree(); + + detachFromLauncher("CTI failed to detach from launcher...\n"); + isLaunched_ = false; +} + +bool STAT_ctiFrontEnd::daemonsHaveExited() { + return !cti_appIsValid(appId_); +} +bool STAT_ctiFrontEnd::isKilled() { + return !cti_appIsValid(appId_); +} + +int STAT_ctiFrontEnd::getNumProcs() { + return nApplProcs_; +} +const char* STAT_ctiFrontEnd::getHostnameForProc(int procIdx) { + if (!hosts_) { + printMsg(STAT_SYSTEM_ERROR, __FILE__, __LINE__, "proc table is not initialized\n"); + return "invalid process"; + } + + // for STATBench, + procIdx /= tasksPerPE_; + + if (procIdx < 0 || procIdx >= nApplProcs_) { + printMsg(STAT_SYSTEM_ERROR, __FILE__, __LINE__, "invalid procIdx in getHostnameForProc\n"); + return "invalid process"; + } + + auto hostIdx = std::upper_bound(procToHost_.begin(), procToHost_.end(), procIdx) - + procToHost_.begin(); + + return hosts_->hosts[hostIdx].hostname; +}; + +int STAT_ctiFrontEnd::getMpiRankForProc(int procIdx) +{ + static int cnt = 0; + if (!cnt++) { + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "Using pseudo mpi rank for CTI\n"); + } + return procIdx; +} + +StatError_t STAT_ctiFrontEnd::dumpProctab() +{ + char fileName[BUFSIZE]; + snprintf(fileName, BUFSIZE, "%s/%s.ptab", outDir_, filePrefix_); + + FILE* file = fopen(fileName, "w"); + if (!file) { + printMsg(STAT_FILE_ERROR, __FILE__, __LINE__, "%s: fopen failed to create ptab file %s\n", strerror(errno), fileName); + return STAT_FILE_ERROR; + } + + if (!hosts_) { + fprintf(file, "host names are unavailable\n"); + } else { + int totPEs = 0; + for (int i=0, n=hosts_->numHosts; ihosts[i].numPes; + fprintf(file, "%d - %d : %s" , totPEs, totPEs+pes-1, hosts_->hosts[i].hostname); + totPEs += pes; + } + } + + fclose(file); + return STAT_OK; +} + +bool STAT_ctiFrontEnd::checkNodeAccess(const char *node) +{ + /* MRNet CPs launched through alps */ + return true; +} + +StatError_t STAT_ctiFrontEnd::addSerialProcess(const char *pidString) +{ + printMsg(STAT_ARG_ERROR, __FILE__, __LINE__, "Serial launch is not supported in CTI\n"); + return STAT_ARG_ERROR; +} +StatError_t STAT_ctiFrontEnd::addDaemonSerialProcArgs(int& deamonArgc, char ** &deamonArgv) +{ + printMsg(STAT_ARG_ERROR, __FILE__, __LINE__, "Serial launch is not supported in CTI\n"); + return STAT_ARG_ERROR; +} + +StatError_t STAT_ctiFrontEnd::setAppNodeList() +{ + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "Generating application node list\n"); + applicationNodeMultiSet_.clear(); + + if (!hosts_) { + printMsg(STAT_SYSTEM_ERROR, __FILE__, __LINE__, "host table is not available\n"); + return STAT_SYSTEM_ERROR; + } + + int numHosts = hosts_->numHosts; + for (int i=0; ihosts[i].hostname); + } + + nApplNodes_ = numHosts; + + return STAT_OK; +} + +StatError_t STAT_ctiFrontEnd::STATBench_setAppNodeList() +{ + printMsg(STAT_LOG_MESSAGE, __FILE__, __LINE__, "Generating application node list\n"); + applicationNodeMultiSet_.clear(); + + if (!hosts_) { + printMsg(STAT_SYSTEM_ERROR, __FILE__, __LINE__, "host table is not available\n"); + return STAT_SYSTEM_ERROR; + } + + for (int i=0, n=hosts_->numHosts; ihosts[i].hostname; + for (int j=0, numPEs=hosts_->hosts[i].numPes; j procToHost_; + int tasksPerPE_; +}; + + + +#endif diff --git a/src/STAT_lmonFrontEnd.C b/src/STAT_lmonFrontEnd.C index 551cede..3b19c89 100644 --- a/src/STAT_lmonFrontEnd.C +++ b/src/STAT_lmonFrontEnd.C @@ -735,6 +735,15 @@ int STAT_lmonFrontEnd::getMpiRankForProc(int procIdx) { return proctab_[procIdx].mpirank; } +const char* STAT_lmonFrontEnd::daemonExitedMsg() +{ + return "LMON detected the daemons have exited\n"; +} + +const char* STAT_lmonFrontEnd::appExitedMsg() +{ + return "LMON detected the daemons have exited\n"; +} #ifndef USE_CTI STAT_FrontEnd *STAT_FrontEnd::make() diff --git a/src/STAT_lmonFrontEnd.h b/src/STAT_lmonFrontEnd.h index b108747..81e39d5 100644 --- a/src/STAT_lmonFrontEnd.h +++ b/src/STAT_lmonFrontEnd.h @@ -48,6 +48,9 @@ class STAT_lmonFrontEnd : public STAT_FrontEnd virtual StatError_t setAppNodeList(); virtual StatError_t STATBench_setAppNodeList(); virtual StatError_t STATBench_resetProctab(unsigned int nTasks); + virtual bool haveGlobalRanks() { return false; } + virtual const char* daemonExitedMsg(); + virtual const char* appExitedMsg(); private: //! validate the apid with CTI