diff --git a/libkineto/CMakeLists.txt b/libkineto/CMakeLists.txt index add1a1e..6862f96 100644 --- a/libkineto/CMakeLists.txt +++ b/libkineto/CMakeLists.txt @@ -30,6 +30,7 @@ set(KINETO_LIBRARY_TYPE "default" CACHE STRING set_property(CACHE KINETO_LIBRARY_TYPE PROPERTY STRINGS default shared) option(KINETO_BUILD_TESTS "Build kineto unit tests" ON) +set(LIBKINETO_DIR "${CMAKE_CURRENT_SOURCE_DIR}") set(LIBKINETO_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src") set(LIBKINETO_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/include") set(LIBKINETO_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) @@ -181,8 +182,10 @@ elseif(KINETO_LIBRARY_TYPE STREQUAL "static") $) elseif(KINETO_LIBRARY_TYPE STREQUAL "shared") add_library(kineto SHARED - $) + $ + $) set_property(TARGET kineto_base PROPERTY POSITION_INDEPENDENT_CODE ON) + set_property(TARGET kineto_api PROPERTY POSITION_INDEPENDENT_CODE ON) set_target_properties(kineto PROPERTIES CXX_VISIBILITY_PRESET hidden) else() @@ -217,4 +220,5 @@ install(EXPORT kinetoLibraryConfig DESTINATION share/cmake/kineto if(KINETO_BUILD_TESTS) add_subdirectory(test) + add_subdirectory("${LIBKINETO_THIRDPARTY_DIR}/googletest") endif() diff --git a/libkineto/include/ActivityProfilerInterface.h b/libkineto/include/ActivityProfilerInterface.h index db173d7..b6bc5d9 100644 --- a/libkineto/include/ActivityProfilerInterface.h +++ b/libkineto/include/ActivityProfilerInterface.h @@ -59,6 +59,9 @@ class ActivityProfilerInterface { const std::set& activityTypes, const std::string& configStr = "") {} + // Toggle GPU tracing as a trace is running to omit certain parts of a graph + virtual void toggleCollectionDynamic(const bool enable) {} + // Start recording, potentially reusing any buffers allocated since // prepareTrace was called. virtual void startTrace() {} diff --git a/libkineto/include/ActivityType.h b/libkineto/include/ActivityType.h index 8433ba4..ae629af 100644 --- a/libkineto/include/ActivityType.h +++ b/libkineto/include/ActivityType.h @@ -10,6 +10,7 @@ #include #include +#include namespace libkineto { @@ -24,23 +25,30 @@ enum class ActivityType { GPU_MEMSET, CONCURRENT_KERNEL, // on-device kernels EXTERNAL_CORRELATION, - MUSA_RUNTIME, // host side musa runtime events - MUSA_DRIVER, // host side musa driver events + CUDA_RUNTIME, // host side cuda runtime events + CUDA_DRIVER, // host side cuda driver events CPU_INSTANT_EVENT, // host side point-like events PYTHON_FUNCTION, - OVERHEAD, // MUPTI induced overhead events sampled from its overhead API. + OVERHEAD, // CUPTI induced overhead events sampled from its overhead API. + MTIA_RUNTIME, // host side MTIA runtime events + MTIA_CCP_EVENTS, // MTIA ondevice CCP events + CUDA_SYNC, // synchronization events between runtime and kernels // Optional Activity types - MUSA_SYNC, // synchronization events between runtime and kernels GLOW_RUNTIME, // host side glow runtime events - MTIA_RUNTIME, // host side MTIA runtime events - MUSA_PROFILER_RANGE, // MUPTI Profiler range for performance metrics - MTIA_CCP_EVENTS, // MTIA ondevice CCP events + CUDA_PROFILER_RANGE, // MUPTI Profiler range for performance metrics HPU_OP, // HPU host side runtime event XPU_RUNTIME, // host side xpu runtime events + COLLECTIVE_COMM, // collective communication + MTIA_WORKLOADD, // MTIA workloadd events + + // PRIVATEUSE1 Activity types are used for custom backends. + // The corresponding device type is `DeviceType::PrivateUse1` in PyTorch. + PRIVATEUSE1_RUNTIME, // host side privateUse1 runtime events + PRIVATEUSE1_DRIVER, // host side privateUse1 driver events ENUM_COUNT, // This is to add buffer and not used for any profiling logic. Add your new type before it. - OPTIONAL_ACTIVITY_TYPE_START = MUSA_SYNC, + OPTIONAL_ACTIVITY_TYPE_START = GLOW_RUNTIME, }; const char* toString(ActivityType t); diff --git a/libkineto/include/Config.h b/libkineto/include/Config.h index e18cad5..d57f889 100644 --- a/libkineto/include/Config.h +++ b/libkineto/include/Config.h @@ -493,4 +493,8 @@ class Config : public AbstractConfig { constexpr char kUseDaemonEnvVar[] = "KINETO_USE_DAEMON"; -} // namespace KINETO_NAMESPACE +#if __linux__ +bool isDaemonEnvVarSet(); +#endif + +} // namespace libkineto diff --git a/libkineto/include/GenericTraceActivity.h b/libkineto/include/GenericTraceActivity.h index 4735d7d..7639a54 100644 --- a/libkineto/include/GenericTraceActivity.h +++ b/libkineto/include/GenericTraceActivity.h @@ -30,7 +30,7 @@ constexpr unsigned int kLinkAsyncCpuGpu = 2; class GenericTraceActivity : public ITraceActivity { public: GenericTraceActivity() - : activityType(ActivityType::ENUM_COUNT), traceSpan_(NULL) {} + : activityType(ActivityType::ENUM_COUNT), traceSpan_(nullptr) {} GenericTraceActivity( const TraceSpan& trace, ActivityType type, const std::string& name) diff --git a/libkineto/include/IActivityProfiler.h b/libkineto/include/IActivityProfiler.h index 84b647a..c78394e 100644 --- a/libkineto/include/IActivityProfiler.h +++ b/libkineto/include/IActivityProfiler.h @@ -41,12 +41,19 @@ enum class TraceStatus { }; /* DeviceInfo: - * Can be used to specify process name, PID and device label + * Can be used to specify process name, sort order, PID and device label. + * The sort order is determined by the sortIndex field to handle ordering of + * processes and gpu rows in the trace viewer. */ struct DeviceInfo { - DeviceInfo(int64_t id, const std::string& name, const std::string& label) - : id(id), name(name), label(label) {} + DeviceInfo( + int64_t id, + int64_t sortIndex, + const std::string& name, + const std::string& label) + : id(id), sortIndex(sortIndex), name(name), label(label) {} int64_t id; // process id + int64_t sortIndex; // position in trace view const std::string name; // process name const std::string label; // device label }; @@ -66,6 +73,10 @@ struct ResourceInfo { int64_t deviceId; // id of device which owns this resource (specified in DeviceInfo.id) const std::string name; // resource name }; + +using getLinkedActivityCallback = + std::function; + /* IActivityProfilerSession: * an opaque object that can be used by a high level profiler to * start/stop and return trace events. @@ -91,6 +102,12 @@ class IActivityProfilerSession { // processes trace activities using logger virtual void processTrace(ActivityLogger& logger) = 0; + virtual void processTrace(ActivityLogger& logger, + getLinkedActivityCallback /*getLinkedActivity*/, + int64_t /*startTime*/, int64_t /*endTime*/) { + processTrace(logger); + } + // returns device info used in this trace, could be nullptr virtual std::unique_ptr getDeviceInfo() = 0; @@ -103,6 +120,12 @@ class IActivityProfilerSession { // XXX define trace formats // virtual save(string name, TraceFormat format) + virtual void pushCorrelationId(uint64_t /*id*/) {} + virtual void popCorrelationId() {} + + virtual void pushUserCorrelationId(uint64_t /*id*/) {} + virtual void popUserCorrelationId() {} + protected: TraceStatus status_ = TraceStatus::READY; }; diff --git a/libkineto/include/ThreadUtil.h b/libkineto/include/ThreadUtil.h index 2c4a2fb..12b37da 100644 --- a/libkineto/include/ThreadUtil.h +++ b/libkineto/include/ThreadUtil.h @@ -28,7 +28,3 @@ std::string processName(int32_t pid); std::vector> pidCommandPairsOfAncestors(); } // namespace libkineto - -#ifdef HAS_ROCTRACER -using namespace libkineto; -#endif diff --git a/libkineto/src/output_base.h b/libkineto/include/output_base.h similarity index 79% rename from libkineto/src/output_base.h rename to libkineto/include/output_base.h index 2438fe4..9997f8d 100644 --- a/libkineto/src/output_base.h +++ b/libkineto/include/output_base.h @@ -14,19 +14,26 @@ #include #include -#include "ActivityBuffers.h" +// TODO(T90238193) +// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude +#include "IActivityProfiler.h" #include "GenericTraceActivity.h" #include "ThreadUtil.h" #include "TraceSpan.h" namespace KINETO_NAMESPACE { - class Config; + struct ActivityBuffers; } namespace libkineto { using namespace KINETO_NAMESPACE; +// Used by sortIndex to put GPU tracks at the bottom +// of the trace timelines. The largest valid CPU PID is 4,194,304, +// so 5000000 is enough to guarantee that GPU tracks are sorted after CPU. +constexpr int64_t kExceedMaxPid = 5000000; + class ActivityLogger { public: @@ -60,7 +67,7 @@ class ActivityLogger { } virtual void finalizeTrace( - const KINETO_NAMESPACE::Config& config, + const Config& config, std::unique_ptr buffers, int64_t endTime, std::unordered_map>& metadata) = 0; @@ -69,4 +76,4 @@ class ActivityLogger { ActivityLogger() = default; }; -} // namespace KINETO_NAMESPACE +} // namespace libkineto diff --git a/libkineto/include/time_since_epoch.h b/libkineto/include/time_since_epoch.h index 55f9cfb..f22617b 100644 --- a/libkineto/include/time_since_epoch.h +++ b/libkineto/include/time_since_epoch.h @@ -15,7 +15,7 @@ namespace libkineto { template inline int64_t timeSinceEpoch( const std::chrono::time_point& t) { - return std::chrono::duration_cast( + return std::chrono::duration_cast( t.time_since_epoch()) .count(); } diff --git a/libkineto/libkineto_defs.bzl b/libkineto/libkineto_defs.bzl index aaf1431..e4844fc 100644 --- a/libkineto/libkineto_defs.bzl +++ b/libkineto/libkineto_defs.bzl @@ -74,6 +74,7 @@ def get_libkineto_public_headers(): "include/ThreadUtil.h", "include/libkineto.h", "include/time_since_epoch.h", + "include/output_base.h", ] # kineto code should be updated to not have to diff --git a/libkineto/sample_programs/README.md b/libkineto/sample_programs/README.md index e6ea437..1a3645c 100644 --- a/libkineto/sample_programs/README.md +++ b/libkineto/sample_programs/README.md @@ -1,10 +1,10 @@ # How to Run Sample Programs -To run `kineto_playground.cpp` in the `sample_programs` folder, you can use the following steps: (Note: scripts below are hard-coded to a specific set of sample programs, you can modify them to work with a different program. TODO: make these scripts more flexible.) +To run `kineto_mupti_profiler.cpp` in the `sample_programs` folder, you can use the following steps: (Note: scripts below are hard-coded to a specific set of sample programs, you can modify them to work with a different program. TODO: make these scripts more flexible.) -1. `./build-cu.sh` - - this generates `kplay-cu.o` +1. `./build-mu.sh` + - this generates `kplay-mu.o` 2. `./build.sh` - this generates binary called `main` 3. Run `./main` - - runs your code defined in `kineto_playground.cpp` + - runs your code defined in `kineto_mupti_profiler.cpp` diff --git a/libkineto/sample_programs/build.sh b/libkineto/sample_programs/build.sh index bd662c7..e7d137a 100755 --- a/libkineto/sample_programs/build.sh +++ b/libkineto/sample_programs/build.sh @@ -12,7 +12,8 @@ g++ \ kineto_mupti_profiler.cpp \ -o main \ kplay_mu.o \ - /usr/local/lib/libkineto.a \ + /home/pytorch/torch/lib/libkineto.a \ + -I../include \ -I/usr/local/musa/include \ -I../third_party/fmt/include \ -I/usr/local/include/kineto \ diff --git a/libkineto/sample_programs/kineto_mupti_profiler.cpp b/libkineto/sample_programs/kineto_mupti_profiler.cpp index 8fde80b..c5d9f49 100644 --- a/libkineto/sample_programs/kineto_mupti_profiler.cpp +++ b/libkineto/sample_programs/kineto_mupti_profiler.cpp @@ -20,7 +20,7 @@ using namespace kineto; -static const std::string kFileName = "/tmp/kineto_playground_trace.json"; +static const std::string kFileName = "./kineto_playground_trace.json"; int main() { warmup(); @@ -34,18 +34,22 @@ int main() { libkineto::ActivityType::GPU_MEMSET, libkineto::ActivityType::CONCURRENT_KERNEL, libkineto::ActivityType::EXTERNAL_CORRELATION, - libkineto::ActivityType::MUSA_RUNTIME, - libkineto::ActivityType::MUSA_DRIVER, + libkineto::ActivityType::CUDA_RUNTIME, + libkineto::ActivityType::CUDA_DRIVER, libkineto::ActivityType::CPU_INSTANT_EVENT, libkineto::ActivityType::PYTHON_FUNCTION, libkineto::ActivityType::OVERHEAD, - libkineto::ActivityType::MUSA_SYNC, - libkineto::ActivityType::GLOW_RUNTIME, libkineto::ActivityType::MTIA_RUNTIME, - libkineto::ActivityType::MUSA_PROFILER_RANGE, libkineto::ActivityType::MTIA_CCP_EVENTS, + libkineto::ActivityType::CUDA_SYNC, + libkineto::ActivityType::GLOW_RUNTIME, + libkineto::ActivityType::CUDA_PROFILER_RANGE, libkineto::ActivityType::HPU_OP, libkineto::ActivityType::XPU_RUNTIME, + libkineto::ActivityType::COLLECTIVE_COMM, + libkineto::ActivityType::MTIA_WORKLOADD, + libkineto::ActivityType::PRIVATEUSE1_RUNTIME, + libkineto::ActivityType::PRIVATEUSE1_DRIVER, }; libkineto_init(false, true); diff --git a/libkineto/sample_programs/kineto_playground.cpp b/libkineto/sample_programs/kineto_playground.cpp index 368925d..7fa9409 100644 --- a/libkineto/sample_programs/kineto_playground.cpp +++ b/libkineto/sample_programs/kineto_playground.cpp @@ -18,7 +18,7 @@ using namespace kineto; -static const std::string kFileName = "/tmp/kineto_playground_trace.json"; +static const std::string kFileName = "./kineto_playground_trace.json"; int main() { warmup(); diff --git a/libkineto/src/ActivityProfilerController.cpp b/libkineto/src/ActivityProfilerController.cpp index 3089683..619e3e3 100644 --- a/libkineto/src/ActivityProfilerController.cpp +++ b/libkineto/src/ActivityProfilerController.cpp @@ -16,9 +16,6 @@ #include "ActivityTrace.h" #include "MuptiActivityApi.h" -#ifdef HAS_ROCTRACER -#include "RoctracerActivityApi.h" -#endif #include "ThreadUtil.h" #include "output_json.h" @@ -45,14 +42,11 @@ void ActivityProfilerController::setLoggerCollectorFactory( ActivityProfilerController::ActivityProfilerController( ConfigLoader& configLoader, bool cpuOnly) : configLoader_(configLoader) { -#ifdef HAS_ROCTRACER - profiler_ = std::make_unique( - RoctracerActivityApi::singleton(), cpuOnly); -#else + // Initialize ChromeTraceBaseTime first of all. profiler_ = std::make_unique( MuptiActivityApi::singleton(), cpuOnly); -#endif configLoader_.addHandler(ConfigLoader::ConfigKind::ActivityProfiler, this); + ChromeTraceBaseTime::singleton().init(); #if !USE_GOOGLE_LOG if (loggerCollectorFactory()) { @@ -363,6 +357,10 @@ void ActivityProfilerController::prepareTrace(const Config& config) { profiler_->setSyncProfilingRunning(true); } +void ActivityProfilerController::toggleCollectionDynamic(const bool enable) { + profiler_->toggleCollectionDynamic(enable); +} + void ActivityProfilerController::startTrace() { if (profiler_->isOnDemandProfilingRunning()) { LOG(WARNING) << "Ignored startTrace request - on-demand profiler busy"; diff --git a/libkineto/src/ActivityProfilerController.h b/libkineto/src/ActivityProfilerController.h index ede7454..16ea841 100644 --- a/libkineto/src/ActivityProfilerController.h +++ b/libkineto/src/ActivityProfilerController.h @@ -62,6 +62,7 @@ class ActivityProfilerController : public ConfigLoader::ConfigHandler { // These API are used for Synchronous Tracing. void prepareTrace(const Config& config); + void toggleCollectionDynamic(const bool enable); void startTrace(); void step(); std::unique_ptr stopTrace(); diff --git a/libkineto/src/ActivityProfilerProxy.cpp b/libkineto/src/ActivityProfilerProxy.cpp index 27f12f9..54a3f71 100644 --- a/libkineto/src/ActivityProfilerProxy.cpp +++ b/libkineto/src/ActivityProfilerProxy.cpp @@ -13,9 +13,6 @@ #include "MuptiActivityApi.h" #include "Logger.h" #include -#ifdef HAS_ROCTRACER -#include "RoctracerActivityApi.h" -#endif namespace KINETO_NAMESPACE { @@ -82,6 +79,10 @@ void ActivityProfilerProxy::prepareTrace( controller_->prepareTrace(config); } +void ActivityProfilerProxy::toggleCollectionDynamic(const bool enable) { + controller_->toggleCollectionDynamic(enable); +} + void ActivityProfilerProxy::startTrace() { controller_->startTrace(); } @@ -102,20 +103,11 @@ bool ActivityProfilerProxy::isActive() { void ActivityProfilerProxy::pushCorrelationId(uint64_t id) { MuptiActivityApi::pushCorrelationID(id, MuptiActivityApi::CorrelationFlowType::Default); -#ifdef HAS_ROCTRACER - // FIXME: bad design here - RoctracerActivityApi::pushCorrelationID(id, - RoctracerActivityApi::CorrelationFlowType::Default); -#endif } void ActivityProfilerProxy::popCorrelationId() { MuptiActivityApi::popCorrelationID( MuptiActivityApi::CorrelationFlowType::Default); -#ifdef HAS_ROCTRACER - RoctracerActivityApi::popCorrelationID( - RoctracerActivityApi::CorrelationFlowType::Default); -#endif } void ActivityProfilerProxy::pushUserCorrelationId(uint64_t id) { diff --git a/libkineto/src/ActivityProfilerProxy.h b/libkineto/src/ActivityProfilerProxy.h index e114b7c..69b5428 100644 --- a/libkineto/src/ActivityProfilerProxy.h +++ b/libkineto/src/ActivityProfilerProxy.h @@ -54,6 +54,8 @@ class ActivityProfilerProxy : public ActivityProfilerInterface { const std::set& activityTypes, const std::string& configStr = "") override; + void toggleCollectionDynamic(const bool enable) override; + void startTrace() override; void step() override; std::unique_ptr stopTrace() override; diff --git a/libkineto/src/ActivityType.cpp b/libkineto/src/ActivityType.cpp index 2bcbd6c..066429a 100644 --- a/libkineto/src/ActivityType.cpp +++ b/libkineto/src/ActivityType.cpp @@ -25,18 +25,22 @@ static constexpr std::array map{{ {"gpu_memset", ActivityType::GPU_MEMSET}, {"kernel", ActivityType::CONCURRENT_KERNEL}, {"external_correlation", ActivityType::EXTERNAL_CORRELATION}, - {"musa_runtime", ActivityType::MUSA_RUNTIME}, - {"musa_driver", ActivityType::MUSA_DRIVER}, + {"cuda_runtime", ActivityType::CUDA_RUNTIME}, + {"cuda_driver", ActivityType::CUDA_DRIVER}, {"cpu_instant_event", ActivityType::CPU_INSTANT_EVENT}, {"python_function", ActivityType::PYTHON_FUNCTION}, {"overhead", ActivityType::OVERHEAD}, - {"musa_sync", ActivityType::MUSA_SYNC}, - {"glow_runtime", ActivityType::GLOW_RUNTIME}, {"mtia_runtime", ActivityType::MTIA_RUNTIME}, - {"musa_profiler_range", ActivityType::MUSA_PROFILER_RANGE}, {"mtia_ccp_events", ActivityType::MTIA_CCP_EVENTS}, + {"cuda_sync", ActivityType::CUDA_SYNC}, + {"glow_runtime", ActivityType::GLOW_RUNTIME}, + {"musa_profiler_range", ActivityType::CUDA_PROFILER_RANGE}, {"hpu_op", ActivityType::HPU_OP}, {"xpu_runtime", ActivityType::XPU_RUNTIME}, + {"collective_comm", ActivityType::COLLECTIVE_COMM}, + {"mtia_workloadd", ActivityType::MTIA_WORKLOADD}, + {"privateuse1_runtime", ActivityType::PRIVATEUSE1_RUNTIME}, + {"privateuse1_driver", ActivityType::PRIVATEUSE1_DRIVER}, {"ENUM_COUNT", ActivityType::ENUM_COUNT} }}; diff --git a/libkineto/src/ApproximateClock.h b/libkineto/src/ApproximateClock.h new file mode 100644 index 0000000..991085e --- /dev/null +++ b/libkineto/src/ApproximateClock.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* The contents of this file was borrowed c10/util/ApproximateClock.h */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace libkineto { + + +#if defined(__i386__) || defined(__x86_64__) || defined(__amd64__) +#define KINETO_RDTSC +#if defined(_MSC_VER) +#include +#elif defined(__CUDACC__) || defined(__HIPCC__) +#undef KINETO_RDTSC +#elif defined(__clang__) +// `__rdtsc` is available by default. +// NB: This has to be first, because Clang will also define `__GNUC__` +#elif defined(__GNUC__) +#include +#else +#undef KINETO_RDTSC +#endif +#endif + + +#if defined(_MSC_VER) && !defined(__clang__) +#define KINETO_UNUSED __pragma(warning(suppress : 4100 4101)) +#else +#define KINETO_UNUSED __attribute__((__unused__)) +#endif //_MSC_VER + +using time_t = int64_t; +using steady_clock_t = std::conditional_t< + std::chrono::high_resolution_clock::is_steady, + std::chrono::high_resolution_clock, + std::chrono::steady_clock>; + +inline time_t getTime(bool allow_monotonic = false) { +#if defined(_WIN32) || defined(__MACH__) + return std::chrono::duration_cast( + steady_clock_t::now().time_since_epoch()) + .count(); +#else + // clock_gettime is *much* faster than std::chrono implementation on Linux + struct timespec t {}; + auto mode = CLOCK_REALTIME; + if (allow_monotonic) { + mode = CLOCK_MONOTONIC; + } + clock_gettime(mode, &t); + return static_cast(t.tv_sec) * 1000000000 + + static_cast(t.tv_nsec); +#endif +} + +// We often do not need to capture true wall times. If a fast mechanism such +// as TSC is available we can use that instead and convert back to epoch time +// during post processing. This greatly reduce the clock's contribution to +// profiling. +// http://btorpey.github.io/blog/2014/02/18/clock-sources-in-linux/ +// https://quick-bench.com/q/r8opkkGZSJMu9wM_XTbDouq-0Io +// TODO: We should use +// `https://github.com/google/benchmark/blob/main/src/cycleclock.h` +inline auto getApproximateTime() { +#if defined(KINETO_RDTSC) + return static_cast(__rdtsc()); +#else + return getTime(); +#endif +} + +using approx_time_t = decltype(getApproximateTime()); +static_assert( + std::is_same_v || + std::is_same_v, + "Expected either int64_t (`getTime`) or uint64_t (some TSC reads)."); + +std::function& get_time_converter(); +} // namespace libkineto diff --git a/libkineto/src/Config.cpp b/libkineto/src/Config.cpp index 216aa6f..378a7d1 100644 --- a/libkineto/src/Config.cpp +++ b/libkineto/src/Config.cpp @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -133,6 +134,10 @@ constexpr char kProfileStartIterationKey[] = "PROFILE_START_ITERATION"; constexpr char kProfileStartIterationRoundUpKey[] = "PROFILE_START_ITERATION_ROUNDUP"; +constexpr char kRequestTraceID[] = "REQUEST_TRACE_ID"; +constexpr char kRequestGroupTraceID[] = + "REQUEST_GROUP_TRACE_ID"; + // Enable on-demand trigger via kill -USR2 // When triggered in this way, /tmp/libkineto.conf will be used as config. constexpr char kEnableSigUsr2Key[] = "ENABLE_SIGUSR2"; @@ -241,6 +246,16 @@ Config::Config() #endif } +#if __linux__ +bool isDaemonEnvVarSet() { + static bool rc = [] { + void* ptr = getenv(kUseDaemonEnvVar); + return ptr != nullptr; + }(); + return rc; +} +#endif + std::shared_ptr Config::getStaticObjectsLifetimeHandle() { return configFactories(); } @@ -384,6 +399,10 @@ bool Config::handleOption(const std::string& name, std::string& val) { activitiesWarmupIterations_ = toInt32(val); } else if (!name.compare(kActivitiesDisplayMusaSyncWaitEvents)) { activitiesMusaSyncWaitEvents_ = toBool(val); + } else if (!name.compare(kRequestTraceID)) { + requestTraceID_ = val; + } else if (!name.compare(kRequestGroupTraceID)) { + requestGroupTraceID_ = val; } // TODO: Deprecate Client Interface diff --git a/libkineto/src/DaemonConfigLoader.cpp b/libkineto/src/DaemonConfigLoader.cpp index 2b56b0d..6011230 100644 --- a/libkineto/src/DaemonConfigLoader.cpp +++ b/libkineto/src/DaemonConfigLoader.cpp @@ -18,9 +18,11 @@ namespace KINETO_NAMESPACE { // TODO : implications of this singleton being thread safe on forks? -IpcFabricConfigClient* getConfigClient() { - static auto client = new IpcFabricConfigClient(); - return client; +IpcFabricConfigClient* DaemonConfigLoader::getConfigClient() { + if (!configClient){ + configClient = std::make_unique(); + } + return configClient.get(); } std::string DaemonConfigLoader::readBaseConfig() { diff --git a/libkineto/src/DaemonConfigLoader.h b/libkineto/src/DaemonConfigLoader.h index 91d040d..bcd2480 100644 --- a/libkineto/src/DaemonConfigLoader.h +++ b/libkineto/src/DaemonConfigLoader.h @@ -14,6 +14,9 @@ #if !USE_GOOGLE_LOG #include #endif // !USE_GOOGLE_LOG +#ifdef __linux__ +#include "IpcFabricConfigClient.h" +#endif // __linux__ namespace KINETO_NAMESPACE { @@ -53,7 +56,11 @@ class DaemonConfigLoader : public IDaemonConfigLoader { void setCommunicationFabric(bool enabled) override; + IpcFabricConfigClient* getConfigClient(); + static void registerFactory(); +private: + std::unique_ptr configClient; }; #endif // __linux__ diff --git a/libkineto/src/EventProfiler.cpp b/libkineto/src/EventProfiler.cpp index 625e302..d7beb9b 100644 --- a/libkineto/src/EventProfiler.cpp +++ b/libkineto/src/EventProfiler.cpp @@ -10,6 +10,7 @@ #include #include +#include #include #include #include diff --git a/libkineto/src/MuptiActivity.cpp b/libkineto/src/MuptiActivity.cpp index ab9713a..a9ac29c 100644 --- a/libkineto/src/MuptiActivity.cpp +++ b/libkineto/src/MuptiActivity.cpp @@ -31,6 +31,10 @@ inline ActivityType GpuActivity::type() const { return ActivityType::CONCURRENT_KERNEL; } +inline bool isWaitEventSync(MUpti_ActivitySynchronizationType type) { + return (type == MUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_WAIT_EVENT); +} + inline bool isEventSync(MUpti_ActivitySynchronizationType type) { return ( type == MUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_EVENT_SYNCHRONIZE || @@ -65,7 +69,7 @@ inline int64_t MusaSyncActivity::resourceId() const { // set to MUPTI_SYNCHRONIZATION_INVALID_VALUE (-1) // converting to an integer will automatically wrap the number to -1 // in the trace. - return int32_t(raw().streamId); + return static_cast(raw().streamId); } inline void MusaSyncActivity::log(ActivityLogger& logger) const { @@ -81,7 +85,7 @@ inline const std::string MusaSyncActivity::metadataJson() const { "device": {}, "context": {})JSON", syncTypeString(sync.type), isEventSync(raw().type) ? eventSyncInfo(raw(), srcStream_, srcCorrId_) : "", - sync.streamId, sync.correlationId, + static_cast(sync.streamId), sync.correlationId, deviceId(), sync.contextId); // clang-format on return ""; @@ -101,6 +105,9 @@ constexpr int64_t us(int64_t timestamp) { template<> inline const std::string GpuActivity::metadataJson() const { const MUpti_ActivityKernel6& kernel = raw(); + float blocksPerSmVal = blocksPerSm(kernel); + float warpsPerSmVal = warpsPerSm(kernel); + // clang-format off return fmt::format(R"JSON( "queued": {}, "device": {}, "context": {}, @@ -116,8 +123,8 @@ inline const std::string GpuActivity::metadataJson() cons kernel.streamId, kernel.correlationId, kernel.registersPerThread, kernel.staticSharedMemory + kernel.dynamicSharedMemory, - blocksPerSm(kernel), - warpsPerSm(kernel), + std::isinf(blocksPerSmVal) ? "\"inf\"" : std::to_string(blocksPerSmVal), + std::isinf(warpsPerSmVal) ? "\"inf\"" : std::to_string(warpsPerSmVal), kernel.gridX, kernel.gridY, kernel.gridZ, kernel.blockX, kernel.blockY, kernel.blockZ, (int) (0.5 + kernelOccupancy(kernel) * 100.0)); @@ -248,7 +255,12 @@ inline bool RuntimeActivity::flowStart() const { activity_.cbid == MUPTI_RUNTIME_TRACE_CBID_musaDeviceSynchronize_v3020 || activity_.cbid == MUPTI_RUNTIME_TRACE_CBID_musaStreamWaitEvent_v3020; -#if defined(MUPTI_API_VERSION) && MUPTI_API_VERSION >= 18 +/* Todo: why here set 14 + - when porting mupti, define MUPTI_API_VERSION=14 in musa runtine + - when porting kineto, the macro definition here is used directly, did not do test and adapt to cover each details. + - setting to 14, but do not test on S4000 Kuae 1.3. +*/ +#if defined(MUPTI_API_VERSION) && MUPTI_API_VERSION >= 14 should_correlate |= activity_.cbid == MUPTI_RUNTIME_TRACE_CBID_musaLaunchKernelExC_v11060; #endif @@ -261,10 +273,14 @@ inline const std::string RuntimeActivity::metadataJson() const { activity_.cbid, activity_.correlationId); } -inline bool DriverActivity::flowStart() const { +inline bool isKernelLaunchApi(const MUpti_ActivityAPI& activity_) { return activity_.cbid == MUPTI_DRIVER_TRACE_CBID_muLaunchKernel; } +inline bool DriverActivity::flowStart() const { + return isKernelLaunchApi(activity_); +} + inline const std::string DriverActivity::metadataJson() const { return fmt::format(R"JSON( "cbid": {}, "correlation": {})JSON", @@ -273,8 +289,13 @@ inline const std::string DriverActivity::metadataJson() const { inline const std::string DriverActivity::name() const { // currently only muLaunchKernel is expected - assert(activity_.cbid == MUPTI_DRIVER_TRACE_CBID_muLaunchKernel); - return "muLaunchKernel"; + assert(isKernelLaunchApi(activity_)); + // not yet implementing full name matching + if (activity_.cbid == MUPTI_DRIVER_TRACE_CBID_muLaunchKernel) { + return "muLaunchKernel"; + } else { + return "Unknown"; // should not reach here + } } template diff --git a/libkineto/src/MuptiActivity.h b/libkineto/src/MuptiActivity.h index ff6e801..7f9486f 100644 --- a/libkineto/src/MuptiActivity.h +++ b/libkineto/src/MuptiActivity.h @@ -38,10 +38,10 @@ struct MuptiActivity : public ITraceActivity { explicit MuptiActivity(const T* activity, const ITraceActivity* linked) : activity_(*activity), linked_(linked) {} int64_t timestamp() const override { - return nsToUs(unixEpochTimestamp(activity_.start)); + return (unixEpochTimestamp(activity_.start)); } int64_t duration() const override { - return nsToUs(activity_.end - activity_.start); + return (activity_.end - activity_.start); } // TODO(T107507796): Deprecate ITraceActivity int64_t correlationId() const override {return 0;} @@ -67,7 +67,7 @@ struct RuntimeActivity : public MuptiActivity { int64_t correlationId() const override {return activity_.correlationId;} int64_t deviceId() const override {return processId();} int64_t resourceId() const override {return threadId_;} - ActivityType type() const override {return ActivityType::MUSA_RUNTIME;} + ActivityType type() const override {return ActivityType::PRIVATEUSE1_RUNTIME;} bool flowStart() const override; const std::string name() const override {return runtimeCbidName(activity_.cbid);} void log(ActivityLogger& logger) const override; @@ -87,7 +87,7 @@ struct DriverActivity : public MuptiActivity { int64_t correlationId() const override {return activity_.correlationId;} int64_t deviceId() const override {return processId();} int64_t resourceId() const override {return threadId_;} - ActivityType type() const override {return ActivityType::MUSA_DRIVER;} + ActivityType type() const override {return ActivityType::PRIVATEUSE1_DRIVER;} bool flowStart() const override; const std::string name() const override; void log(ActivityLogger& logger) const override; @@ -106,10 +106,10 @@ struct OverheadActivity : public MuptiActivity { : MuptiActivity(activity, linked), threadId_(threadId) {} int64_t timestamp() const override { - return nsToUs(unixEpochTimestamp(activity_.start)); + return (unixEpochTimestamp(activity_.start)); } int64_t duration() const override { - return nsToUs(activity_.end - activity_.start); + return (activity_.end - activity_.start); } // TODO: Update this with PID ordering int64_t deviceId() const override {return -1;} @@ -137,7 +137,7 @@ struct MusaSyncActivity : public MuptiActivity { int64_t correlationId() const override {return raw().correlationId;} int64_t deviceId() const override; int64_t resourceId() const override; - ActivityType type() const override {return ActivityType::MUSA_SYNC;} + ActivityType type() const override {return ActivityType::CUDA_SYNC;} bool flowStart() const override {return false;} const std::string name() const override; void log(ActivityLogger& logger) const override; diff --git a/libkineto/src/MuptiActivityApi.cpp b/libkineto/src/MuptiActivityApi.cpp index eed5925..247852d 100644 --- a/libkineto/src/MuptiActivityApi.cpp +++ b/libkineto/src/MuptiActivityApi.cpp @@ -13,7 +13,6 @@ #include #include -#include "mupti_call.h" #include "Logger.h" #include "Config.h" #include "MusaUtil.h" @@ -328,13 +327,13 @@ void MuptiActivityApi::enableMuptiActivities( MUPTI_CALL(muptiActivityEnable(MUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION)); externalCorrelationEnabled_ = true; } - if (activity == ActivityType::MUSA_SYNC) { + if (activity == ActivityType::CUDA_SYNC) { MUPTI_CALL(muptiActivityEnable(MUPTI_ACTIVITY_KIND_SYNCHRONIZATION)); } - if (activity == ActivityType::MUSA_RUNTIME) { + if (activity == ActivityType::PRIVATEUSE1_RUNTIME) { MUPTI_CALL(muptiActivityEnable(MUPTI_ACTIVITY_KIND_RUNTIME)); } - if (activity == ActivityType::MUSA_DRIVER) { + if (activity == ActivityType::PRIVATEUSE1_DRIVER) { MUPTI_CALL(muptiActivityEnable(MUPTI_ACTIVITY_KIND_DRIVER)); } if (activity == ActivityType::OVERHEAD) { @@ -365,13 +364,13 @@ void MuptiActivityApi::disableMuptiActivities( if (activity == ActivityType::EXTERNAL_CORRELATION) { MUPTI_CALL(muptiActivityDisable(MUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION)); } - if (activity == ActivityType::MUSA_SYNC) { + if (activity == ActivityType::CUDA_SYNC) { MUPTI_CALL(muptiActivityDisable(MUPTI_ACTIVITY_KIND_SYNCHRONIZATION)); } - if (activity == ActivityType::MUSA_RUNTIME) { + if (activity == ActivityType::PRIVATEUSE1_RUNTIME) { MUPTI_CALL(muptiActivityDisable(MUPTI_ACTIVITY_KIND_RUNTIME)); } - if (activity == ActivityType::MUSA_DRIVER) { + if (activity == ActivityType::PRIVATEUSE1_DRIVER) { MUPTI_CALL(muptiActivityDisable(MUPTI_ACTIVITY_KIND_DRIVER)); } if (activity == ActivityType::OVERHEAD) { diff --git a/libkineto/src/MuptiActivityProfiler.cpp b/libkineto/src/MuptiActivityProfiler.cpp index c2b1aaa..cdc8c41 100644 --- a/libkineto/src/MuptiActivityProfiler.cpp +++ b/libkineto/src/MuptiActivityProfiler.cpp @@ -5,12 +5,13 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ - +#include "ApproximateClock.h" #include "MuptiActivityProfiler.h" #include #include #include +#include #include #include #include @@ -23,10 +24,6 @@ #ifdef HAS_MUPTI #include -// TODO(T90238193) -// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude -#include "musa_call.h" -#include "mupti_call.h" #include "MusaUtil.h" #endif @@ -36,16 +33,16 @@ #include "MuptiActivity.h" #include "MuptiActivity.cpp" #include "MuptiActivityApi.h" +#include "MuptiActivityPlatform.h" #endif // HAS_MUPTI -#ifdef HAS_ROCTRACER -#include "RoctracerActivityApi.h" -#endif #include "output_base.h" +#include "ActivityBuffers.h" #include "Logger.h" #include "ThreadUtil.h" using namespace std::chrono; +using namespace libkineto; using std::string; struct CtxEventPair { @@ -92,17 +89,21 @@ std::unordered_map& ctxToDeviceId() { namespace KINETO_NAMESPACE { -const int FLUSH_INTERVAL_MILLISECONDS = 10; -std::atomic running(true); -std::unique_ptr taskThread; +// Sets the timestamp converter. If nothing is set then the converter just +// returns the input. For this reason, until we add profiler impl of passing in +// TSC converter we just need to guard the callback itself +std::function& get_time_converter() { + static std::function _time_converter = + [](approx_time_t t) { return t; }; + return _time_converter; +} -void periodicTask(int intervalMilliseconds){ - while (running) - { - std::this_thread::sleep_for(milliseconds(intervalMilliseconds)); - MUPTI_CALL(muptiActivityFlushAll(0)); - } +#ifdef HAS_MUPTI +bool& use_mupti_tsc() { + static bool use_mupti_tsc = true; + return use_mupti_tsc; } +#endif ConfigDerivedState::ConfigDerivedState(const Config& config) { profileActivityTypes_ = config.selectedActivityTypes(); @@ -175,12 +176,12 @@ std::ostream& operator<<(std::ostream& oss, const MuptiActivityProfiler::ErrorCo << ", Blocklisted runtime = " << ecs.blocklisted_runtime_events << ", Invalid ext correlations = " << ecs.invalid_external_correlation_events << ", CPU GPU out-of-order = " << ecs.gpu_and_cpu_op_out_of_order -#if defined(HAS_MUPTI) || defined(HAS_ROCTRACER) +#if defined(HAS_MUPTI) << ", Unexpected MUSA events = " << ecs.unexepected_musa_events << ", MUPTI stopped early? = " << ecs.mupti_stopped_early; #else ; -#endif // HAS_MUPTI || HAS_ROCTRACER +#endif // HAS_MUPTI return oss; } @@ -203,15 +204,9 @@ void MuptiActivityProfiler::transferCpuTrace( traceBuffers_->cpu.push_back(std::move(cpuTrace)); } -#ifdef HAS_ROCTRACER -MuptiActivityProfiler::MuptiActivityProfiler( - RoctracerActivityApi& mupti, - bool cpuOnly) -#else MuptiActivityProfiler::MuptiActivityProfiler( MuptiActivityApi& mupti, bool cpuOnly) -#endif : mupti_(mupti), flushOverhead_{0, 0}, setupOverhead_{0, 0}, @@ -227,16 +222,21 @@ MuptiActivityProfiler::MuptiActivityProfiler( #ifdef HAS_MUPTI void MuptiActivityProfiler::logMusaVersions() { - // check Nvidia versions - uint32_t muptiVersion; - int musaRuntimeVersion, musaDriverVersion; - + // check MUSA versions + uint32_t muptiVersion = 0; + int musaRuntimeVersion = 0, musaDriverVersion = 0; MUPTI_CALL(muptiGetVersion(&muptiVersion)); MUSA_CALL(musaRuntimeGetVersion(&musaRuntimeVersion)); MUSA_CALL(musaDriverGetVersion(&musaDriverVersion)); LOG(INFO) << "MUSA versions. MUPTI: " << muptiVersion << "; Runtime: " << musaRuntimeVersion << "; Driver: " << musaDriverVersion; + LOGGER_OBSERVER_ADD_METADATA( + "mupti_version", std::to_string(muptiVersion)); + LOGGER_OBSERVER_ADD_METADATA( + "musa_runtime_version", std::to_string(musaRuntimeVersion)); + LOGGER_OBSERVER_ADD_METADATA( + "musa_driver_version", std::to_string(musaDriverVersion)); } #endif @@ -245,6 +245,8 @@ void MuptiActivityProfiler::processTraceInternal(ActivityLogger& logger) { VLOG(0) << "Profile time range: " << captureWindowStartTime_ << " - " << captureWindowEndTime_; logger.handleTraceStart(metadata_); + setCpuActivityPresent(false); + setGpuActivityPresent(false); for (auto& cpu_trace : traceBuffers_->cpu) { string trace_name = cpu_trace->span.name; VLOG(0) << "Processing CPU buffer for " << trace_name << " (" @@ -271,7 +273,7 @@ void MuptiActivityProfiler::processTraceInternal(ActivityLogger& logger) { this, std::placeholders::_1, &logger)); - + logDeferredEvents(); LOG(INFO) << "Processed " << count_and_size.first << " GPU records (" << count_and_size.second << " bytes)"; LOGGER_OBSERVER_ADD_EVENT_COUNT(count_and_size.first); @@ -284,20 +286,14 @@ void MuptiActivityProfiler::processTraceInternal(ActivityLogger& logger) { LOGGER_OBSERVER_ADD_METADATA( "ResourceOverhead", std::to_string(resourceOverheadCount_)); } + if (!gpuActivityPresent()){ + LOG(WARNING) << "GPU trace is empty!"; + } } #endif // HAS_MUPTI -#ifdef HAS_ROCTRACER - if (!cpuOnly_) { - VLOG(0) << "Retrieving GPU activity buffers"; - const int count = mupti_.processActivities( - logger, - std::bind(&MuptiActivityProfiler::cpuActivity, this, std::placeholders::_1), - captureWindowStartTime_, - captureWindowEndTime_); - LOG(INFO) << "Processed " << count << " GPU records"; - LOGGER_OBSERVER_ADD_EVENT_COUNT(count); + if (!traceNonEmpty()) { + LOG(WARNING) << "No Valid Trace Events (CPU/GPU) found. Outputting empty trace."; } -#endif // HAS_ROCTRACER for (const auto& session : sessions_) { LOG(INFO) << "Processing child profiler trace"; @@ -325,12 +321,15 @@ void MuptiActivityProfiler::processCpuTrace( LOG(WARNING) << "CPU trace is empty!"; return; } + setCpuActivityPresent(true); CpuGpuSpanPair& span_pair = recordTraceSpan(cpuTrace.span, cpuTrace.gpuOpCount); TraceSpan& cpu_span = span_pair.first; for (auto const& act : cpuTrace.activities) { - LOG(VERBOSE) << "CPU OP correlation id: "<< act->correlationId() << " name: " << act->activityName; + LOG(VERBOSE) << "CPU OP correlation id: " << act->correlationId() + << " name: " << act->activityName; + VLOG(2) << act->correlationId() << ": OP " << act->activityName; if (derivedConfig_->profileActivityTypes().count(act->type())) { static_assert( std::is_same< @@ -351,7 +350,9 @@ void MuptiActivityProfiler::processCpuTrace( #ifdef HAS_MUPTI inline void MuptiActivityProfiler::handleCorrelationActivity( const MUpti_ActivityExternalCorrelation* correlation) { - LOG(VERBOSE) << "handleCorrelationActivity kind: " << correlation->externalKind << " correlation id: " << correlation->correlationId << " external id: " << correlation->externalId; + LOG(VERBOSE) << "handleCorrelationActivity kind: " << correlation->externalKind + << " correlation id: " << correlation->correlationId + << " external id: " << correlation->externalId; if (correlation->externalKind == MUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM0) { cpuCorrelationMap_[correlation->correlationId] = correlation->externalId; } else if ( @@ -457,6 +458,9 @@ void MuptiActivityProfiler::handleRuntimeActivity( LOG(VERBOSE) << "Runtime OP correlation id=" << activity->correlationId << " cbid=" << activity->cbid << " tid=" << activity->threadId; + VLOG(2) << activity->correlationId + << ": MUPTI_ACTIVITY_KIND_RUNTIME, cbid=" << activity->cbid + << " tid=" << activity->threadId; int32_t tid = activity->threadId; const auto& it = resourceInfo_.find({processId(), tid}); if (it != resourceInfo_.end()) { @@ -471,19 +475,23 @@ void MuptiActivityProfiler::handleRuntimeActivity( return; } runtime_activity.log(*logger); + setGpuActivityPresent(true); } void MuptiActivityProfiler::handleDriverActivity( const MUpti_ActivityAPI* activity, ActivityLogger* logger) { // we only want to collect muLaunchKernel events, for triton kernel launches - if (activity->cbid != MUPTI_DRIVER_TRACE_CBID_muLaunchKernel) { + if (!isKernelLaunchApi(*activity)) { // XXX should we count other driver events? return; } LOG(VERBOSE) << "Driver OP correlation id=" << activity->correlationId << " cbid=" << activity->cbid << " tid=" << activity->threadId; + VLOG(2) << activity->correlationId + << ": MUPTI_ACTIVITY_KIND_DRIVER, cbid=" << activity->cbid + << " tid=" << activity->threadId; int32_t tid = activity->threadId; const auto& it = resourceInfo_.find({processId(), tid}); @@ -499,6 +507,7 @@ void MuptiActivityProfiler::handleDriverActivity( return; } runtime_activity.log(*logger); + setGpuActivityPresent(true); } void MuptiActivityProfiler::handleOverheadActivity( @@ -517,6 +526,7 @@ void MuptiActivityProfiler::handleOverheadActivity( return; } overhead_activity.log(*logger); + setGpuActivityPresent(true); } @@ -553,7 +563,9 @@ void MuptiActivityProfiler::handleMusaSyncActivity( << " eventId=" << activity->musaEventId << " contextId=" << activity->contextId; - if (!config_->activitiesMusaSyncWaitEvents() && isEventSync(activity->type)) { + if (!config_->activitiesMusaSyncWaitEvents() && + isWaitEventSync(activity->type)) { + printf("in handleMusaSyncActivity, activitiesMusaSyncWaitEvents\n"); return; } @@ -588,15 +600,58 @@ void MuptiActivityProfiler::handleMusaSyncActivity( } } - const auto& musa_sync_activity = traceBuffers_->addActivityWrapper( - MusaSyncActivity(activity, linked, src_stream, src_corrid)); + // Marshal the logging to a functor so we can defer it if needed. + auto log_event = [=](){ + const ITraceActivity* linked = + linkedActivity(activity->correlationId, cpuCorrelationMap_); + const auto& musa_sync_activity = traceBuffers_->addActivityWrapper( + MusaSyncActivity(activity, linked, src_stream, src_corrid)); - if (outOfRange(musa_sync_activity)) { - return; + if (outOfRange(musa_sync_activity)) { + return; + } + + if (int32_t(activity->streamId) != -1) { + recordStream(device_id, activity->streamId, ""); + } else { + recordDevice(device_id); + } + VLOG(2) << "Logging sync event device = " << device_id + << " stream = " << activity->streamId + << " sync type = " << syncTypeString(activity->type); + musa_sync_activity.log(*logger); + setGpuActivityPresent(true); + }; + + if (isWaitEventSync(activity->type)) { + // Defer logging wait event syncs till the end so we only + // log these events if a stream has some GPU kernels on it. + DeferredLogEntry entry{ + .device = device_id, + .stream = activity->streamId, + .logMe = log_event, + }; + logQueue_.push_back(entry); + } else { + log_event(); } - musa_sync_activity.log(*logger); } +void MuptiActivityProfiler::logDeferredEvents() { + // Stream Wait Events tend to be noisy, only pass these events if + // there was some GPU kernel/memcopy/memset observed on it in the trace window. + for (const auto& entry: logQueue_) { + if (seenDeviceStreams_.find({entry.device, entry.stream}) == + seenDeviceStreams_.end()) { + VLOG(2) << "Skipping Event Sync as no kernels have run yet on stream = " + << entry.stream; + } else { + entry.logMe(); + } + } +} +#endif // HAS_MUPTI + inline void MuptiActivityProfiler::updateGpuNetSpan( const ITraceActivity& gpuOp) { if (!gpuOp.linkedActivity()) { @@ -632,7 +687,7 @@ void MuptiActivityProfiler::checkTimestampOrder(const ITraceActivity* act1) { // If we have a runtime activity in the map, it should mean that we // have a GPU activity passed in, and vice versa. const ITraceActivity* act2 = it->second; - if (act2->type() == ActivityType::MUSA_RUNTIME) { + if (act2->type() == ActivityType::PRIVATEUSE1_RUNTIME) { // Buffer is out-of-order. // Swap so that runtime activity is first for the comparison below. std::swap(act1, act2); @@ -647,6 +702,20 @@ void MuptiActivityProfiler::checkTimestampOrder(const ITraceActivity* act1) { } } +const ITraceActivity* MuptiActivityProfiler::linkedActivity( + int32_t correlationId, + const std::unordered_map& correlationMap) { + const auto& it = correlationMap.find(correlationId); + if (it != correlationMap.end()) { + const auto& it2 = activityMap_.find(it->second); + if (it2 != activityMap_.end()) { + return it2->second; + } + } + return nullptr; +} + + inline void MuptiActivityProfiler::handleGpuActivity( const ITraceActivity& act, ActivityLogger* logger) { @@ -660,6 +729,7 @@ inline void MuptiActivityProfiler::handleGpuActivity( seenDeviceStreams_.insert({act.deviceId(), act.resourceId()}); act.log(*logger); + setGpuActivityPresent(true); updateGpuNetSpan(act); if (derivedConfig_->profileActivityTypes().count( ActivityType::GPU_USER_ANNOTATION)) { @@ -674,19 +744,7 @@ inline void MuptiActivityProfiler::handleGpuActivity( } } -const ITraceActivity* MuptiActivityProfiler::linkedActivity( - int32_t correlationId, - const std::unordered_map& correlationMap) { - const auto& it = correlationMap.find(correlationId); - if (it != correlationMap.end()) { - const auto& it2 = activityMap_.find(it->second); - if (it2 != activityMap_.end()) { - return it2->second; - } - } - return nullptr; -} - +#ifdef HAS_MUPTI template inline void MuptiActivityProfiler::handleGpuActivity( const T* act, @@ -698,7 +756,6 @@ inline void MuptiActivityProfiler::handleGpuActivity( handleGpuActivity(gpu_activity, logger); } - template inline void updateCtxToDeviceId(const T* act) { if (ctxToDeviceId().count(act->contextId) == 0) { @@ -777,16 +834,21 @@ void MuptiActivityProfiler::configureChildProfilers() { derivedConfig_->profileStartTime().time_since_epoch()) .count(); for (auto& profiler : profilers_) { - LOG(INFO) << "Evaluating whether to run child profiler = " << profiler->name(); + LOG(INFO) << "[Profiler = " << profiler->name() << "] " + << "Evaluating whether to run child profiler." ; auto session = profiler->configure( start_time_ms, derivedConfig_->profileDuration().count(), derivedConfig_->profileActivityTypes(), *config_); if (session) { - LOG(INFO) << "Running child profiler " << profiler->name() << " for " + LOG(INFO) << "[Profiler = " << profiler->name() << "] " + << "Running child profiler " << profiler->name() << " for " << derivedConfig_->profileDuration().count() << " ms"; sessions_.push_back(std::move(session)); + } else { + LOG(INFO) << "[Profiler = " << profiler->name() << "] " + << "Not running child profiler."; } } } @@ -816,7 +878,6 @@ void MuptiActivityProfiler::configure( // Check if now is a valid time to start. if (!derivedConfig_->canStart(now)) { - // Added by qiaoning @20240912 should update on-demand running status? MAYBE. return; } @@ -842,7 +903,7 @@ void MuptiActivityProfiler::configure( LOGGER_OBSERVER_SET_GROUP_TRACE_ID(config_->requestGroupTraceID()); } -#if defined(HAS_MUPTI) || defined(HAS_ROCTRACER) +#if defined(HAS_MUPTI) if (!cpuOnly_) { // Enabling MUPTI activity tracing incurs a larger perf hit at first, // presumably because structures are allocated and initialized, callbacks @@ -866,7 +927,7 @@ void MuptiActivityProfiler::configure( setupOverhead_, duration_cast(t2 - timestamp).count()); } } -#endif // HAS_MUPTI || HAS_ROCTRACER +#endif // HAS_MUPTI if (profilers_.size() > 0) { configureChildProfilers(); @@ -903,10 +964,18 @@ void MuptiActivityProfiler::configure( currentRunloopState_ = RunloopState::Warmup; } +void MuptiActivityProfiler::toggleCollectionDynamic(const bool enable) { +#ifdef HAS_MUPTI + if (enable) { + mupti_.enableMuptiActivities(derivedConfig_->profileActivityTypes()); + } else { + mupti_.disableMuptiActivities(derivedConfig_->profileActivityTypes()); + } +#endif +} + void MuptiActivityProfiler::startTraceInternal( const time_point& now) { - running = true; - taskThread = std::make_unique(periodicTask, FLUSH_INTERVAL_MILLISECONDS); captureWindowStartTime_ = libkineto::timeSinceEpoch(now); VLOG(0) << "Warmup -> CollectTrace"; for (auto& session : sessions_) { @@ -919,22 +988,7 @@ void MuptiActivityProfiler::startTraceInternal( void MuptiActivityProfiler::stopTraceInternal( const time_point& now) { captureWindowEndTime_ = libkineto::timeSinceEpoch(now); - bool enable_mt_timer_gpu_events = (getenv("MT_TIMER_GPU_EVENTS") != nullptr); - int captureWindowLen = 60; // set window len as 60s to avoid too much replicated data - if(enable_mt_timer_gpu_events) { - if (getenv("MT_TIMER_CAPTURE_WINDOW_LEN")) { - try { - captureWindowLen = std::stoi(getenv("MT_TIMER_CAPTURE_WINDOW_LEN")); - } catch (const std::invalid_argument& e) { - LOG(ERROR) << "Invalid value for MT_TIMER_CAPTURE_WINDOW_LEN. Using default 60s.\n"; - } - } - if (captureWindowLen > 0) { - captureWindowEndTime_ = captureWindowStartTime_ + 1000000*captureWindowLen; - } - LOG(INFO) << "MT_TIMER_GPU_EVENTS enabled, capture window length: " << captureWindowEndTime_ - captureWindowStartTime_ << "us"; - } -#if defined(HAS_MUPTI) || defined(HAS_ROCTRACER) +#if defined(HAS_MUPTI) if (!cpuOnly_) { time_point timestamp; if (VLOG_IS_ON(1)) { @@ -944,14 +998,14 @@ void MuptiActivityProfiler::stopTraceInternal( mupti_.disableMuptiActivities(derivedConfig_->profileActivityTypes()); #else mupti_.disableActivities(derivedConfig_->profileActivityTypes()); -#endif +#endif // HAS_MUPTI if (VLOG_IS_ON(1)) { auto t2 = system_clock::now(); addOverheadSample( setupOverhead_, duration_cast(t2 - timestamp).count()); } } -#endif // HAS_MUPTI || HAS_ROCTRACER +#endif // HAS_MUPTI if (currentRunloopState_ == RunloopState::CollectTrace) { VLOG(0) << "CollectTrace -> ProcessTrace"; @@ -965,11 +1019,6 @@ void MuptiActivityProfiler::stopTraceInternal( session->stop(); } currentRunloopState_ = RunloopState::ProcessTrace; - - if(taskThread && taskThread->joinable()){ - running = false; - taskThread->join(); - } } void MuptiActivityProfiler::resetInternal() { @@ -996,7 +1045,7 @@ const time_point MuptiActivityProfiler::performRunLoopStep( case RunloopState::Warmup: VLOG(1) << "State: Warmup"; warmup_done = derivedConfig_->isWarmupDone(now, currentIter); -#if defined(HAS_MUPTI) || defined(HAS_ROCTRACER) +#if defined(HAS_MUPTI) // Flushing can take a while so avoid doing it close to the start time if (!cpuOnly_ && currentIter < 0 && (derivedConfig_->isProfilingByIteration() || @@ -1012,7 +1061,7 @@ const time_point MuptiActivityProfiler::performRunLoopStep( VLOG(0) << "Warmup -> WaitForRequest"; break; } -#endif // HAS_MUPTI || HAS_ROCTRACER +#endif // HAS_MUPTI if (warmup_done) { UST_LOGGER_MARK_COMPLETED(kWarmUpStage); @@ -1044,9 +1093,9 @@ const time_point MuptiActivityProfiler::performRunLoopStep( collection_done = derivedConfig_->isCollectionDone(now, currentIter); if (collection_done -#if defined(HAS_MUPTI) || defined(HAS_ROCTRACER) +#if defined(HAS_MUPTI) || mupti_.stopCollection -#endif // HAS_MUPTI || HAS_ROCTRACER +#endif // HAS_MUPTI ){ // Update runloop state first to prevent further updates to shared state LOG(INFO) << "Tracing complete."; @@ -1056,9 +1105,9 @@ const time_point MuptiActivityProfiler::performRunLoopStep( libkineto::api().client()->stop(); } -#if defined(HAS_MUPTI) || defined(HAS_ROCTRACER) +#if defined(HAS_MUPTI) ecs_.mupti_stopped_early = mupti_.stopCollection; -#endif // HAS_MUPTI || HAS_ROCTRACER +#endif // HAS_MUPTI std::lock_guard guard(mutex_); stopTraceInternal(now); @@ -1088,7 +1137,6 @@ const time_point MuptiActivityProfiler::performRunLoopStep( processTraceInternal(*logger_); UST_LOGGER_MARK_COMPLETED(kPostProcessingStage); resetInternal(); - onDemandProfilingRunning_ = false; VLOG(0) << "ProcessTrace -> WaitForRequest"; break; } @@ -1110,13 +1158,17 @@ void MuptiActivityProfiler::finalizeTrace(const Config& config, ActivityLogger& string process_name = processName(pid); if (!process_name.empty()) { logger.handleDeviceInfo( - {pid, process_name, "CPU"}, captureWindowStartTime_); + {pid, pid, process_name, "CPU"}, captureWindowStartTime_); if (!cpuOnly_) { - // GPU events use device id as pid (0-7). - constexpr int kMaxGpuCount = 8; - for (int gpu = 0; gpu < kMaxGpuCount; gpu++) { + // Usually, GPU events use device id as pid (0-7). + // In some cases, CPU sockets are numbered starting from 0. + // In the worst case, 8 CPU sockets + 8 GPUs, so the max GPU ID is 15. + constexpr int kMaxGpuID = 15; + // sortIndex is gpu + kExceedMaxPid to put GPU tracks at the bottom + // of the trace timelines. + for (int gpu = 0; gpu <= kMaxGpuID; gpu++) { logger.handleDeviceInfo( - {gpu, process_name, fmt::format("GPU {}", gpu)}, + {gpu, gpu + kExceedMaxPid, process_name, fmt::format("GPU {}", gpu)}, captureWindowStartTime_); } } @@ -1149,17 +1201,25 @@ void MuptiActivityProfiler::finalizeTrace(const Config& config, ActivityLogger& } } +#ifdef HAS_MUPTI // Overhead info overheadInfo_.push_back(ActivityLogger::OverheadInfo("MUPTI Overhead")); for(const auto& info : overheadInfo_) { logger.handleOverheadInfo(info, captureWindowStartTime_); } +#endif // HAS_MUPTI gpuUserEventMap_.logEvents(&logger); for (auto& session : sessions_){ auto trace_buffer = session->getTraceBuffer(); - traceBuffers_->cpu.push_back(std::move(trace_buffer)); + if (trace_buffer) { + // Set child start time to profiling start time if not set + if (trace_buffer->span.startTime == 0) { + trace_buffer->span.startTime = captureWindowStartTime_; + } + traceBuffers_->cpu.push_back(std::move(trace_buffer)); + } } // Logger Metadata contains a map of LOGs collected in Kineto @@ -1185,13 +1245,53 @@ MuptiActivityProfiler::getLoggerMetadata() { return loggerMD; } +void MuptiActivityProfiler::pushCorrelationId(uint64_t id) { +#ifdef HAS_MUPTI + MuptiActivityApi::pushCorrelationID(id, + MuptiActivityApi::CorrelationFlowType::Default); +#endif // HAS_MUPTI + for (auto& session : sessions_) { + session->pushCorrelationId(id); + } +} + +void MuptiActivityProfiler::popCorrelationId() { +#ifdef HAS_MUPTI + MuptiActivityApi::popCorrelationID( + MuptiActivityApi::CorrelationFlowType::Default); +#endif // HAS_MUPTI + for (auto& session : sessions_) { + session->popCorrelationId(); + } +} + +void MuptiActivityProfiler::pushUserCorrelationId(uint64_t id) { +#ifdef HAS_MUPTI + MuptiActivityApi::pushCorrelationID(id, + MuptiActivityApi::CorrelationFlowType::User); +#endif // HAS_MUPTI + for (auto& session : sessions_) { + session->pushUserCorrelationId(id); + } +} + +void MuptiActivityProfiler::popUserCorrelationId() { +#ifdef HAS_MUPTI + MuptiActivityApi::popCorrelationID( + MuptiActivityApi::CorrelationFlowType::User); +#endif // HAS_MUPTI + for (auto& session : sessions_) { + session->popUserCorrelationId(); + } +} + void MuptiActivityProfiler::resetTraceData() { -#if defined(HAS_MUPTI) || defined(HAS_ROCTRACER) +#if defined(HAS_MUPTI) if (!cpuOnly_) { mupti_.clearActivities(); mupti_.teardownContext(); } -#endif // HAS_MUPTI || HAS_ROCTRACER +#endif // HAS_MUPTI activityMap_.clear(); cpuCorrelationMap_.clear(); correlatedMusaActivities_.clear(); @@ -1199,6 +1299,7 @@ void MuptiActivityProfiler::resetTraceData() { traceSpans_.clear(); clientActivityTraceMap_.clear(); seenDeviceStreams_.clear(); + logQueue_.clear(); traceBuffers_ = nullptr; metadata_.clear(); sessions_.clear(); diff --git a/libkineto/src/MuptiActivityProfiler.h b/libkineto/src/MuptiActivityProfiler.h index 9b51743..809c7d6 100644 --- a/libkineto/src/MuptiActivityProfiler.h +++ b/libkineto/src/MuptiActivityProfiler.h @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -176,6 +177,22 @@ class MuptiActivityProfiler { logger_ = logger; } + inline void setCpuActivityPresent(bool val){ + cpuActivityPresent_ = val; + } + + inline void setGpuActivityPresent(bool val){ + gpuActivityPresent_ = val; + } + + inline bool gpuActivityPresent(){ + return gpuActivityPresent_; + } + + inline bool traceNonEmpty(){ + return cpuActivityPresent_ || gpuActivityPresent_; + } + // Synchronous control API void startTrace( const std::chrono::time_point& now) { @@ -204,6 +221,9 @@ class MuptiActivityProfiler { const Config& config, const std::chrono::time_point& now); + // Toggle GPU tracing during a profile instance + void toggleCollectionDynamic(const bool enable); + // Registered with client API to pass CPU trace events over void transferCpuTrace( std::unique_ptr cpuTrace); @@ -248,12 +268,26 @@ class MuptiActivityProfiler { std::unordered_map> getLoggerMetadata(); + void pushCorrelationId(uint64_t id); + void popCorrelationId(); + + void pushUserCorrelationId(uint64_t id); + void popUserCorrelationId(); + protected: using CpuGpuSpanPair = std::pair; static const CpuGpuSpanPair& defaultTraceSpan(); private: + // Deferred logging of MUSA-event synchronization + struct DeferredLogEntry { + uint32_t device; + uint32_t stream; + std::function logMe; + }; + + std::deque logQueue_; // Map of gpu activities to user defined events class GpuUserEventMap { @@ -384,6 +418,7 @@ class MuptiActivityProfiler { ActivityLogger* logger); template void handleGpuActivity(const T* act, ActivityLogger* logger); + void logDeferredEvents(); #endif // HAS_MUPTI void resetTraceData(); @@ -412,11 +447,7 @@ class MuptiActivityProfiler { ActivityLogger* logger_; // Calls to MUPTI is encapsulated behind this interface -#ifdef HAS_ROCTRACER - RoctracerActivityApi& mupti_; // Design failure here -#else MuptiActivityApi& mupti_; -#endif enum class RunloopState { WaitForRequest, @@ -450,6 +481,8 @@ class MuptiActivityProfiler { profilerOverhead setupOverhead_; bool cpuOnly_{false}; + bool cpuActivityPresent_{false}; + bool gpuActivityPresent_{false}; // *************************************************************************** // Below state is shared with external threads. @@ -501,10 +534,10 @@ class MuptiActivityProfiler { int32_t out_of_range_events = 0; int32_t gpu_and_cpu_op_out_of_order = 0; int32_t blocklisted_runtime_events = 0; -#if defined(HAS_MUPTI) || defined(HAS_ROCTRACER) +#if defined(HAS_MUPTI) int32_t unexepected_musa_events = 0; bool mupti_stopped_early = false; -#endif // HAS_MUPTI || HAS_ROCTRACER +#endif // HAS_MUPTI }; friend std::ostream& operator<<(std::ostream& oss, const ErrorCounts& ecs); diff --git a/libkineto/src/MuptiCallbackApi.cpp b/libkineto/src/MuptiCallbackApi.cpp index e8b10b1..cd33ef4 100644 --- a/libkineto/src/MuptiCallbackApi.cpp +++ b/libkineto/src/MuptiCallbackApi.cpp @@ -18,7 +18,7 @@ #include #ifdef HAS_MUPTI -#include "mupti_call.h" +#include "MusaUtil.h" #endif #include "Logger.h" @@ -109,7 +109,7 @@ void MuptiCallbackApi::__callback_switchboard( // TODO: MUPTI muptiFinalize is not yet implemented // MUPTI_CALL(muptiFinalize()); initSuccess_ = false; - subscriber_ = 0; + subscriber_ = nullptr; MuptiActivityApi::singleton().teardownMupti_ = 0; MuptiActivityApi::singleton().finalizeCond_.notify_all(); return; diff --git a/libkineto/src/MuptiCallbackApi.h b/libkineto/src/MuptiCallbackApi.h index d33bd13..d74a348 100644 --- a/libkineto/src/MuptiCallbackApi.h +++ b/libkineto/src/MuptiCallbackApi.h @@ -145,7 +145,7 @@ class MuptiCallbackApi { #ifdef HAS_MUPTI MUptiResult lastMuptiStatus_; - MUpti_SubscriberHandle subscriber_ {0}; + MUpti_SubscriberHandle subscriber_ {nullptr}; #endif }; diff --git a/libkineto/src/MuptiCallbackApiMock.h b/libkineto/src/MuptiCallbackApiMock.h index 86daa4d..95747cc 100644 --- a/libkineto/src/MuptiCallbackApiMock.h +++ b/libkineto/src/MuptiCallbackApiMock.h @@ -17,6 +17,10 @@ enum MUpti_CallbackDomain { }; enum MUpti_CallbackId { MUPTI_RUNTIME_TRACE_CBID_musaLaunchKernel_v7000, + +#if defined(MUSA_VERSION) && (MUSA_VERSION >= 11080) + MUPTI_RUNTIME_TRACE_CBID_musaLaunchKernelExC_v11060, +#endif MUPTI_CBID_RESOURCE_CONTEXT_CREATED, MUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING, }; diff --git a/libkineto/src/MuptiEventApi.cpp b/libkineto/src/MuptiEventApi.cpp index 283e1bc..7a79253 100644 --- a/libkineto/src/MuptiEventApi.cpp +++ b/libkineto/src/MuptiEventApi.cpp @@ -10,8 +10,8 @@ #include +#include "MusaUtil.h" #include "Logger.h" -#include "mupti_call.h" using std::vector; diff --git a/libkineto/src/MuptiMetricApi.cpp b/libkineto/src/MuptiMetricApi.cpp index 314929e..d67bcb8 100644 --- a/libkineto/src/MuptiMetricApi.cpp +++ b/libkineto/src/MuptiMetricApi.cpp @@ -10,8 +10,8 @@ #include +#include "MusaUtil.h" #include "Logger.h" -#include "mupti_call.h" using std::vector; diff --git a/libkineto/src/MuptiRangeProfiler.cpp b/libkineto/src/MuptiRangeProfiler.cpp index 4a7c6e4..b8741df 100644 --- a/libkineto/src/MuptiRangeProfiler.cpp +++ b/libkineto/src/MuptiRangeProfiler.cpp @@ -23,7 +23,7 @@ namespace KINETO_NAMESPACE { -const ActivityType kProfActivityType = ActivityType::MUSA_PROFILER_RANGE; +const ActivityType kProfActivityType = ActivityType::CUDA_PROFILER_RANGE; const std::set kSupportedActivities{kProfActivityType}; const std::string kProfilerName{"MuptiRangeProfiler"}; diff --git a/libkineto/src/MuptiRangeProfilerApi.cpp b/libkineto/src/MuptiRangeProfilerApi.cpp index be07bf1..8db419b 100644 --- a/libkineto/src/MuptiRangeProfilerApi.cpp +++ b/libkineto/src/MuptiRangeProfilerApi.cpp @@ -8,17 +8,15 @@ #include #include +#include "ILoggerObserver.h" #ifdef HAS_MUPTI #include +#include "MusaUtil.h" // #include #endif // HAS_MUPTI #include #include -#ifdef HAS_MUPTI -#include "mupti_call.h" -#endif - #include "time_since_epoch.h" #include "Logger.h" #include "Demangle.h" @@ -30,7 +28,6 @@ #if HAS_MUPTI_RANGE_PROFILER #include // #include -#include "mupti_call.h" #endif // HAS_MUPTI_RANGE_PROFILER namespace KINETO_NAMESPACE { @@ -143,15 +140,6 @@ void __trackMusaCtx(MUcontext ctx, uint32_t device_id, MUpti_CallbackId cbid) { VLOG(0) << "MUPTI Profiler observed MUSA Context created = " << ctx << " device id = " << device_id; active_devices.insert(device_id); - // TODO Revisit -#if 0 - if constexpr (kSetCounterAvail) { - if (active_devices.size() == 1) { - MuptiRBProfilerSession::setCounterAvailabilityImage( - getCounterAvailiability(ctx)); - } - } -#endif ctx_to_dev[ctx] = device_id; } else if (cbid == MUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING) { @@ -439,6 +427,8 @@ void MuptiRBProfilerSession::startInternal( LOG(WARNING) << "Failed to start MUPTI range profiler"; initSuccess_ = false; return; + } else { + LOG(INFO) << "Successfully started MUPTI range profiler"; } // Set counter configuration @@ -451,6 +441,7 @@ void MuptiRBProfilerSession::startInternal( setConfigParams.passIndex = 0; setConfigParams.minNestingLevel = 1; setConfigParams.numNestingLevels = numNestingLevels_; + setConfigParams.targetNestingLevel = setConfigParams.minNestingLevel; status = MUPTI_CALL(muptiProfilerSetConfig(&setConfigParams)); if (status != MUPTI_SUCCESS) { diff --git a/libkineto/src/MuptiRangeProfilerConfig.cpp b/libkineto/src/MuptiRangeProfilerConfig.cpp index afc3ffb..6e128eb 100644 --- a/libkineto/src/MuptiRangeProfilerConfig.cpp +++ b/libkineto/src/MuptiRangeProfilerConfig.cpp @@ -13,6 +13,7 @@ #include #include +#include #include diff --git a/libkineto/src/MusaDeviceProperties.cpp b/libkineto/src/MusaDeviceProperties.cpp index cb0e81b..569d309 100644 --- a/libkineto/src/MusaDeviceProperties.cpp +++ b/libkineto/src/MusaDeviceProperties.cpp @@ -9,31 +9,41 @@ #include "MusaDeviceProperties.h" #include +#include #include +#ifdef HAS_MUPTI #include #include +#endif #include "Logger.h" namespace KINETO_NAMESPACE { -static const std::vector createDeviceProps() { - std::vector props; +#ifdef HAS_MUPTI +#define gpuDeviceProp musaDeviceProp +#define gpuError_t musaError_t +#define gpuSuccess musaSuccess +#define gpuGetDeviceCount musaGetDeviceCount +#define gpuGetDeviceProperties musaGetDeviceProperties + +static const std::vector createDeviceProps() { + std::vector props; int device_count; - musaError_t error_id = musaGetDeviceCount(&device_count); + gpuError_t error_id = gpuGetDeviceCount(&device_count); // Return empty vector if error. - if (error_id != musaSuccess) { - LOG(ERROR) << "musaGetDeviceCount failed with code " << error_id; + if (error_id != gpuSuccess) { + LOG(ERROR) << "gpuGetDeviceCount failed with code " << error_id; return {}; } VLOG(0) << "Device count is " << device_count; for (size_t i = 0; i < device_count; ++i) { - musaDeviceProp prop; - error_id = musaGetDeviceProperties(&prop, i); + gpuDeviceProp prop; + error_id = gpuGetDeviceProperties(&prop, i); // Return empty vector if any device property fail to get. - if (error_id != musaSuccess) { - LOG(ERROR) << "musaGetDeviceProperties failed with " << error_id; + if (error_id != gpuSuccess) { + LOG(ERROR) << "gpuGetDeviceProperties failed with " << error_id; return {}; } // TODO: temp workaround, see https://github.mthreads.com/mthreads/kineto/issues/3 @@ -44,28 +54,32 @@ static const std::vector createDeviceProps() { return props; } -static const std::vector& deviceProps() { - static const std::vector props = createDeviceProps(); +static const std::vector& deviceProps() { + static const std::vector props = createDeviceProps(); return props; } static const std::string createDevicePropertiesJson( - size_t id, const musaDeviceProp& props) { + size_t id, const gpuDeviceProp& props) { + std::string gpuSpecific = ""; + gpuSpecific = fmt::format(R"JSON( + , "regsPerMultiprocessor": {}, "sharedMemPerBlockOptin": {}, "sharedMemPerMultiprocessor": {})JSON", + props.regsPerMultiprocessor, props.sharedMemPerBlockOptin, props.sharedMemPerMultiprocessor); + return fmt::format(R"JSON( {{ "id": {}, "name": "{}", "totalGlobalMem": {}, "computeMajor": {}, "computeMinor": {}, "maxThreadsPerBlock": {}, "maxThreadsPerMultiprocessor": {}, - "regsPerBlock": {}, "regsPerMultiprocessor": {}, "warpSize": {}, - "sharedMemPerBlock": {}, "sharedMemPerMultiprocessor": {}, - "numSms": {}, "sharedMemPerBlockOptin": {} + "regsPerBlock": {}, "warpSize": {}, + "sharedMemPerBlock": {}, "numSms": {}{} }})JSON", id, props.name, props.totalGlobalMem, props.major, props.minor, props.maxThreadsPerBlock, props.maxThreadsPerMultiProcessor, - props.regsPerBlock, props.regsPerMultiprocessor, props.warpSize, - props.sharedMemPerBlock, props.sharedMemPerMultiprocessor, - props.multiProcessorCount, props.sharedMemPerBlockOptin); + props.regsPerBlock, props.warpSize, + props.sharedMemPerBlock, props.multiProcessorCount, + gpuSpecific); } static const std::string createDevicePropertiesJson() { @@ -83,11 +97,22 @@ const std::string& devicePropertiesJson() { } int smCount(uint32_t deviceId) { - const std::vector &props = deviceProps(); + const std::vector &props = deviceProps(); return deviceId >= props.size() ? 0 : props[deviceId].multiProcessorCount; } +#else +const std::string& devicePropertiesJson() { + static std::string devicePropsJson = ""; + return devicePropsJson; +} + +int smCount(uint32_t deviceId) { + return 0; +} +#endif // HAS_MUPTI +#ifdef HAS_MUPTI float blocksPerSm(const MUpti_ActivityKernel6& kernel) { return (kernel.gridX * kernel.gridY * kernel.gridZ) / (float) smCount(kernel.deviceId); @@ -159,5 +184,6 @@ float kernelOccupancy( } return occupancy; } +#endif // HAS_MUPTI } // namespace KINETO_NAMESPACE diff --git a/libkineto/src/MusaDeviceProperties.h b/libkineto/src/MusaDeviceProperties.h index 0835e8c..86149f5 100644 --- a/libkineto/src/MusaDeviceProperties.h +++ b/libkineto/src/MusaDeviceProperties.h @@ -11,11 +11,15 @@ #include #include +#ifdef HAS_MUPTI #include +#endif namespace KINETO_NAMESPACE { int smCount(uint32_t deviceId); + +#ifdef HAS_MUPTI float blocksPerSm(const MUpti_ActivityKernel6& kernel); float warpsPerSm(const MUpti_ActivityKernel6& kernel); @@ -33,5 +37,6 @@ float kernelOccupancy( // Return compute properties for each device as a json string const std::string& devicePropertiesJson(); +#endif } // namespace KINETO_NAMESPACE diff --git a/libkineto/src/MusaUtil.cpp b/libkineto/src/MusaUtil.cpp index 574c5e3..d5a567e 100644 --- a/libkineto/src/MusaUtil.cpp +++ b/libkineto/src/MusaUtil.cpp @@ -17,23 +17,25 @@ namespace KINETO_NAMESPACE { -bool gpuAvailable = false; +bool musaGpuAvailable = false; -bool isGpuAvailable() { +bool isMUSAGpuAvailable() { #ifdef HAS_MUPTI static std::once_flag once; std::call_once(once, [] { - // determine GPU availability on the system + // determine MUSA GPU availability on the system musaError_t error; int deviceCount; error = musaGetDeviceCount(&deviceCount); - gpuAvailable = (error == musaSuccess && deviceCount > 0); + musaGpuAvailable = (error == musaSuccess && deviceCount > 0); }); - - return gpuAvailable; -#else - return false; #endif + return musaGpuAvailable; +} + +bool isGpuAvailable() { + bool musa = isMUSAGpuAvailable(); + return musa; } } // namespace KINETO_NAMESPACE diff --git a/libkineto/src/MusaUtil.h b/libkineto/src/MusaUtil.h index ff362f2..3f5bdbd 100644 --- a/libkineto/src/MusaUtil.h +++ b/libkineto/src/MusaUtil.h @@ -8,8 +8,52 @@ #pragma once +#include + +#ifdef HAS_MUPTI +#include +#include +#include + +#define MUSA_CALL(call) \ + [&]() -> musaError_t { \ + musaError_t _status_ = call; \ + if (_status_ != musaSuccess) { \ + const char* _errstr_ = musaGetErrorString(_status_); \ + LOG(WARNING) << fmt::format( \ + "function {} failed with error {} ({})", \ + #call, \ + _errstr_, \ + (int)_status_); \ + } \ + return _status_; \ + }() + +#define MUPTI_CALL(call) \ + [&]() -> MUptiResult { \ + MUptiResult _status_ = call; \ + if (_status_ != MUPTI_SUCCESS) { \ + const char* _errstr_ = nullptr; \ + muptiGetResultString(_status_, &_errstr_); \ + LOG(WARNING) << fmt::format( \ + "function {} failed with error {} ({})", \ + #call, \ + _errstr_, \ + (int)_status_); \ + } \ + return _status_; \ + }() + +#else +#define MUPTI_CALL(call) call +#endif // HAS_MUPTI + +#define MUPTI_CALL_NOWARN(call) call + namespace KINETO_NAMESPACE { +bool isMUSAGpuAvailable(); + bool isGpuAvailable(); } // namespace KINETO_NAMESPACE diff --git a/libkineto/src/RoctracerActivityApi.cpp b/libkineto/src/RoctracerActivityApi.cpp deleted file mode 100644 index 8dc3f82..0000000 --- a/libkineto/src/RoctracerActivityApi.cpp +++ /dev/null @@ -1,401 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include -#include - -#include "RoctracerActivityApi.h" -#include "RoctracerLogger.h" -#include "Demangle.h" -#include "output_base.h" -#include "ThreadUtil.h" - -typedef uint64_t timestamp_t; - -static timestamp_t timespec_to_ns(const timespec& time) { - return ((timestamp_t)time.tv_sec * 1000000000) + time.tv_nsec; - } - -using namespace std::chrono; - -// Local copy of hip op types. These are public (and stable) in later rocm releases -typedef enum { - HIP_OP_COPY_KIND_UNKNOWN_ = 0, - HIP_OP_COPY_KIND_DEVICE_TO_HOST_ = 0x11F3, - HIP_OP_COPY_KIND_HOST_TO_DEVICE_ = 0x11F4, - HIP_OP_COPY_KIND_DEVICE_TO_DEVICE_ = 0x11F5, - HIP_OP_COPY_KIND_DEVICE_TO_HOST_2D_ = 0x1201, - HIP_OP_COPY_KIND_HOST_TO_DEVICE_2D_ = 0x1202, - HIP_OP_COPY_KIND_DEVICE_TO_DEVICE_2D_ = 0x1203, - HIP_OP_COPY_KIND_FILL_BUFFER_ = 0x1207 -} hip_op_copy_kind_t_; - -typedef enum { - HIP_OP_DISPATCH_KIND_UNKNOWN_ = 0, - HIP_OP_DISPATCH_KIND_KERNEL_ = 0x11F0 -} hip_op_dispatch_kind_t_; - -typedef enum { - HIP_OP_BARRIER_KIND_UNKNOWN_ = 0 -} hip_op_barrier_kind_t_; -// end hip op defines - -namespace KINETO_NAMESPACE { - -constexpr size_t kBufSize(2 * 1024 * 1024); - -RoctracerActivityApi& RoctracerActivityApi::singleton() { - static RoctracerActivityApi instance; - return instance; -} - -RoctracerActivityApi::RoctracerActivityApi() -: d(&RoctracerLogger::singleton()) { -} - -RoctracerActivityApi::~RoctracerActivityApi() { - disableActivities(std::set()); -} - -void RoctracerActivityApi::pushCorrelationID(int id, CorrelationFlowType type) { -#ifdef HAS_ROCTRACER - if (!singleton().d->externalCorrelationEnabled_) { - return; - } - singleton().d->pushCorrelationID(id, static_cast(type)); -#endif -} - -void RoctracerActivityApi::popCorrelationID(CorrelationFlowType type) { -#ifdef HAS_ROCTRACER - if (!singleton().d->externalCorrelationEnabled_) { - return; - } - singleton().d->popCorrelationID(static_cast(type)); -#endif -} - -void RoctracerActivityApi::setMaxBufferSize(int size) { - // FIXME: implement? - //maxGpuBufferCount_ = 1 + size / kBufSize; -} - -inline bool inRange(int64_t start, int64_t end, int64_t stamp) { - return ((stamp > start) && (stamp < end)); -} - -inline bool RoctracerActivityApi::isLogged(libkineto::ActivityType atype) { - return activityMaskSnapshot_ & (1 << static_cast(atype)); -} - -int RoctracerActivityApi::processActivities( - ActivityLogger& logger, std::function linkedActivity, - int64_t startTime, int64_t endTime) { - // Find offset to map from monotonic clock to system clock. - // This will break time-ordering of events but is status quo. - - timespec t0, t1, t00; - clock_gettime(CLOCK_REALTIME, &t0); - clock_gettime(CLOCK_MONOTONIC, &t1); - clock_gettime(CLOCK_REALTIME, &t00); - - const timestamp_t toffset = (timespec_to_ns(t0) >> 1) + (timespec_to_ns(t00) >> 1) - timespec_to_ns(t1); - // Our stored timestamps (from roctracer and generated) are in CLOCK_MONOTONIC domain (in ns). - // Convert the guards rather than each timestamp. - startTime = (startTime * 1000) - toffset; - endTime = (endTime * 1000) - toffset; - - int count = 0; - - auto &externalCorrelations = d->externalCorrelations_[RoctracerLogger::CorrelationDomain::Domain0]; - - // Basic Api calls - - if (isLogged(ActivityType::MUSA_RUNTIME)) { - - for (auto &item : d->rows_) { - if (!inRange(startTime, endTime, item.begin)) - continue; - GenericTraceActivity a; - a.startTime = (item.begin + toffset) / 1000; - a.endTime = (item.end + toffset) / 1000; - a.id = item.id; - a.device = item.pid; - a.resource = item.tid; - a.activityType = ActivityType::MUSA_RUNTIME; - a.activityName = std::string(roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, item.cid, 0)); - a.flow.id = item.id; - a.flow.type = kLinkAsyncCpuGpu; - a.flow.start = true; - - auto it = externalCorrelations.find(a.id); - a.linked = linkedActivity(it == externalCorrelations.end() ? 0 : it->second); - - logger.handleGenericActivity(a); - ++count; - } - - // Malloc/Free calls - for (auto &item : d->mallocRows_) { - if (!inRange(startTime, endTime, item.begin)) - continue; - GenericTraceActivity a; - a.startTime = (item.begin + toffset) / 1000; - a.endTime = (item.end + toffset) / 1000; - a.id = item.id; - a.device = item.pid; - a.resource = item.tid; - a.activityType = ActivityType::MUSA_RUNTIME; - a.activityName = std::string(roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, item.cid, 0)); - a.flow.id = item.id; - a.flow.type = kLinkAsyncCpuGpu; - a.flow.start = true; - - auto it = externalCorrelations.find(a.id); - a.linked = linkedActivity(it == externalCorrelations.end() ? 0 : it->second); - - a.addMetadataQuoted("ptr", fmt::format("{}", item.ptr)); - if (item.cid == HIP_API_ID_hipMalloc) { - a.addMetadata("size", item.size); - } - - logger.handleGenericActivity(a); - ++count; - } - - // HipMemcpy calls - for (auto &item : d->copyRows_) { - if (!inRange(startTime, endTime, item.begin)) - continue; - GenericTraceActivity a; - a.startTime = (item.begin + toffset) / 1000; - a.endTime = (item.end + toffset) / 1000; - a.id = item.id; - a.device = item.pid; - a.resource = item.tid; - a.activityType = ActivityType::MUSA_RUNTIME; - a.activityName = std::string(roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, item.cid, 0)); - a.flow.id = item.id; - a.flow.type = kLinkAsyncCpuGpu; - a.flow.start = true; - - auto it = externalCorrelations.find(a.id); - a.linked = linkedActivity(it == externalCorrelations.end() ? 0 : it->second); - - a.addMetadataQuoted("src", fmt::format("{}", item.src)); - a.addMetadataQuoted("dst", fmt::format("{}", item.dst)); - a.addMetadata("size", item.size); - a.addMetadata("kind", fmt::format("{}", fmt::underlying(item.kind))); - if ((item.cid == HIP_API_ID_hipMemcpyAsync) || (item.cid == HIP_API_ID_hipMemcpyWithStream)) { - a.addMetadataQuoted("stream", fmt::format("{}", reinterpret_cast(item.stream))); - } - - logger.handleGenericActivity(a); - ++count; - } - - // Kernel Launch Api calls - - for (auto &item : d->kernelRows_) { - if (!inRange(startTime, endTime, item.begin)) - continue; - GenericTraceActivity a; - a.startTime = (item.begin + toffset) / 1000; - a.endTime = (item.end + toffset) / 1000; - a.id = item.id; - a.device = item.pid; - a.resource = item.tid; - a.activityType = ActivityType::MUSA_RUNTIME; - a.activityName = std::string(roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, item.cid, 0)); - a.flow.id = item.id; - a.flow.type = kLinkAsyncCpuGpu; - a.flow.start = true; - - auto it = externalCorrelations.find(a.id); - a.linked = linkedActivity(it == externalCorrelations.end() ? 0 : it->second); - - // TODO: Use lowercase kernel, once legacy tools update. - if (item.functionAddr != nullptr) { - a.addMetadataQuoted( - "Kernel", demangle(hipKernelNameRefByPtr(item.functionAddr, item.stream))); - } - else if (item.function != nullptr) { - a.addMetadataQuoted( - "Kernel", demangle(hipKernelNameRef(item.function))); - } - a.addMetadata("grid dim", fmt::format("[{}, {}, {}]", item.gridX, item.gridY, item.gridZ)); - a.addMetadata("block dim", fmt::format("[{}, {}, {}]", item.workgroupX, item.workgroupY, item.workgroupZ)); - a.addMetadata("shared size", item.groupSegmentSize); - a.addMetadataQuoted("stream", fmt::format("{}", reinterpret_cast(item.stream))); - - // Stash launches to tie to the async ops - kernelLaunches_[a.id] = a; - - // Stash kernel names to tie to the async ops - std::string name; - if (item.functionAddr != nullptr) { - name = demangle(hipKernelNameRefByPtr(item.functionAddr, item.stream)); - } - else if (item.function != nullptr) { - name = demangle(hipKernelNameRef(item.function)); - } - if (!name.empty()) { - uint32_t string_id = reverseStrings_[name]; - if (string_id == 0) { - string_id = nextStringId_++; - reverseStrings_[name] = string_id; - strings_[string_id] = name; - } - kernelNames_[item.id] = string_id; - } - - logger.handleGenericActivity(a); - ++count; - } - } // isLogged(ActivityType::MUSA_RUNTIME) - - // Async Ops - - for (auto& buffer : *d->gpuTraceBuffers_) { - const roctracer_record_t* record = (const roctracer_record_t*)(buffer.data_); - const roctracer_record_t* end_record = (const roctracer_record_t*)(buffer.data_ + buffer.validSize_); - GenericTraceActivity a; - - while (record < end_record) { - if (record->domain == ACTIVITY_DOMAIN_HIP_API) { - const char *name = roctracer_op_string(record->domain, record->op, record->kind); - a.device = record->process_id; - a.resource = record->thread_id; - - a.startTime = (record->begin_ns + toffset) / 1000; - a.endTime = (record->end_ns + toffset) / 1000; - a.id = record->correlation_id; - - a.activityType = ActivityType::MUSA_RUNTIME; - a.activityName = std::string(name); - a.flow.id = record->correlation_id; - a.flow.type = kLinkAsyncCpuGpu; - a.flow.start = true; - - auto it = externalCorrelations.find(a.id); - a.linked = linkedActivity(it == externalCorrelations.end() ? 0 : it->second); - - if (inRange(startTime, endTime, record->begin_ns)) { - logger.handleGenericActivity(a); - ++count; - } - } - else if (record->domain == ACTIVITY_DOMAIN_HCC_OPS) { - // Overlay launch metadata for kernels - auto kit = kernelLaunches_.find(record->correlation_id); - if (kit != kernelLaunches_.end()) { - a = (*kit).second; - } - - const char *name = roctracer_op_string(record->domain, record->op, record->kind); - a.device = record->device_id; - a.resource = record->queue_id; - - a.startTime = (record->begin_ns + toffset) / 1000; - a.endTime = (record->end_ns + toffset) / 1000; - a.id = record->correlation_id; - - a.activityType = ActivityType::CONCURRENT_KERNEL; - a.activityName = std::string(name); - a.flow.id = record->correlation_id; - a.flow.type = kLinkAsyncCpuGpu; - a.flow.start = false; - - auto eit = externalCorrelations.find(a.id); - a.linked = linkedActivity(eit == externalCorrelations.end() ? 0 : eit->second); - - auto it = kernelNames_.find(record->correlation_id); - if (it != kernelNames_.end()) { - a.activityName = strings_[it->second]; - } - - bool filtered = false; - - switch (record->kind) { - case HIP_OP_COPY_KIND_DEVICE_TO_HOST_: - case HIP_OP_COPY_KIND_HOST_TO_DEVICE_: - case HIP_OP_COPY_KIND_DEVICE_TO_DEVICE_: - case HIP_OP_COPY_KIND_DEVICE_TO_HOST_2D_: - case HIP_OP_COPY_KIND_HOST_TO_DEVICE_2D_: - case HIP_OP_COPY_KIND_DEVICE_TO_DEVICE_2D_: - if (!isLogged(ActivityType::GPU_MEMCPY)) - filtered = true; - a.activityType = ActivityType::GPU_MEMCPY; - break; - case HIP_OP_COPY_KIND_FILL_BUFFER_: - if (!isLogged(ActivityType::GPU_MEMSET)) - filtered = true; - a.activityType = ActivityType::GPU_MEMSET; - break; - case HIP_OP_DISPATCH_KIND_KERNEL_: - default: - if (!isLogged(ActivityType::CONCURRENT_KERNEL)) - filtered = true; - if (record->op == HIP_OP_ID_BARRIER) // Don't record barriers/markers - filtered = true; - a.activityType = ActivityType::CONCURRENT_KERNEL; - break; - } - - if (!filtered && inRange(startTime, endTime, record->begin_ns)) { - logger.handleGenericActivity(a); - ++count; - } - } - - roctracer_next_record(record, &record); - } - } - return count; -} - -void RoctracerActivityApi::clearActivities() { - d->clearLogs(); - kernelLaunches_.clear(); - kernelNames_.clear(); -} - - -void RoctracerActivityApi::enableActivities( - const std::set& selected_activities) { -#ifdef HAS_ROCTRACER - d->startLogging(); - - for (const auto& activity : selected_activities) { - activityMask_ |= (1 << static_cast(activity)); - if (activity == ActivityType::EXTERNAL_CORRELATION) { - d->externalCorrelationEnabled_ = true; - } - } -#endif -} - -void RoctracerActivityApi::disableActivities( - const std::set& selected_activities) { -#ifdef HAS_ROCTRACER - d->stopLogging(); - - activityMaskSnapshot_ = activityMask_; - - for (const auto& activity : selected_activities) { - activityMask_ &= ~(1 << static_cast(activity)); - if (activity == ActivityType::EXTERNAL_CORRELATION) { - d->externalCorrelationEnabled_ = false; - } - } -#endif -} - -} // namespace KINETO_NAMESPACE diff --git a/libkineto/src/RoctracerActivityApi.h b/libkineto/src/RoctracerActivityApi.h deleted file mode 100644 index a758c8b..0000000 --- a/libkineto/src/RoctracerActivityApi.h +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include -#include -#include -#include - -#ifdef HAS_ROCTRACER -#include -#endif - -#include "ActivityType.h" -#include "GenericTraceActivity.h" - -class RoctracerLogger; -class roctracerRow; - -namespace KINETO_NAMESPACE { - -using namespace libkineto; - -class RoctracerActivityApi { - public: - enum CorrelationFlowType { - Default, - User - }; - - RoctracerActivityApi(); - RoctracerActivityApi(const RoctracerActivityApi&) = delete; - RoctracerActivityApi& operator=(const RoctracerActivityApi&) = delete; - - virtual ~RoctracerActivityApi(); - - static RoctracerActivityApi& singleton(); - - static void pushCorrelationID(int id, CorrelationFlowType type); - static void popCorrelationID(CorrelationFlowType type); - - void enableActivities( - const std::set& selected_activities); - void disableActivities( - const std::set& selected_activities); - void clearActivities(); - void teardownContext() {} - - int processActivities(ActivityLogger& logger, - std::function linkedActivity, - int64_t startTime, int64_t endTime); - - void setMaxBufferSize(int size); - - std::atomic_bool stopCollection{false}; - - private: - bool registered_{false}; - - //Name cache - uint32_t nextStringId_{2}; - std::map strings_; - std::map reverseStrings_; - std::map kernelNames_; - - std::map kernelLaunches_; - - // Enabled Activity Filters - uint32_t activityMask_{0}; - uint32_t activityMaskSnapshot_{0}; - bool isLogged(libkineto::ActivityType atype); - - RoctracerLogger *d; -}; - -} // namespace KINETO_NAMESPACE diff --git a/libkineto/src/RoctracerActivityBuffer.h b/libkineto/src/RoctracerActivityBuffer.h deleted file mode 100644 index 2f0e680..0000000 --- a/libkineto/src/RoctracerActivityBuffer.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include -#include -#include - -namespace KINETO_NAMESPACE { - -class RoctracerActivityBuffer { - public: - // data must be allocated using malloc. - // Ownership is transferred to this object. - RoctracerActivityBuffer(uint8_t* data, size_t validSize) - : data(data), validSize(validSize) {} - - ~RoctracerActivityBuffer() { - free(data); - } - - // Allocated by malloc - uint8_t* data{nullptr}; - - // Number of bytes used - size_t validSize; -}; - -} // namespace KINETO_NAMESPACE diff --git a/libkineto/src/RoctracerLogger.cpp b/libkineto/src/RoctracerLogger.cpp deleted file mode 100644 index b2f7807..0000000 --- a/libkineto/src/RoctracerLogger.cpp +++ /dev/null @@ -1,364 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include "RoctracerLogger.h" - -#include -#include -#include - -#include "ThreadUtil.h" - -typedef uint64_t timestamp_t; - -static timestamp_t timespec_to_ns(const timespec& time) { - return ((timestamp_t)time.tv_sec * 1000000000) + time.tv_nsec; - } - -using namespace std::chrono; - -constexpr size_t kBufSize(2 * 1024 * 1024); - -RoctracerLogger& RoctracerLogger::singleton() { - static RoctracerLogger instance; - return instance; -} - -RoctracerLogger::RoctracerLogger() { - gpuTraceBuffers_ = std::make_unique>(); -} - -RoctracerLogger::~RoctracerLogger() { - stopLogging(); - endTracing(); -} - -namespace { - thread_local std::deque t_externalIds[RoctracerLogger::CorrelationDomain::size]; -} - -void RoctracerLogger::pushCorrelationID(uint64_t id, CorrelationDomain type) { - if (!singleton().externalCorrelationEnabled_) { - return; - } - t_externalIds[type].push_back(id); -} - -void RoctracerLogger::popCorrelationID(CorrelationDomain type) { - if (!singleton().externalCorrelationEnabled_) { - return; - } - t_externalIds[type].pop_back(); -} - -void RoctracerLogger::clearLogs() { - rows_.clear(); - kernelRows_.clear(); - copyRows_.clear(); - mallocRows_.clear(); - gpuTraceBuffers_->clear(); - for (int i = 0; i < CorrelationDomain::size; ++i) { - externalCorrelations_[i].clear(); - } -} - -void RoctracerLogger::api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg) -{ - RoctracerLogger *dis = &singleton(); - - if (domain == ACTIVITY_DOMAIN_HIP_API && dis->loggedIds_.contains(cid)) { - const hip_api_data_t* data = (const hip_api_data_t*)(callback_data); - - // Pack callbacks into row structures - - thread_local timespec timestamp; - - if (data->phase == ACTIVITY_API_PHASE_ENTER) { - clock_gettime(CLOCK_MONOTONIC, ×tamp); // record proper clock - } - else { // (data->phase == ACTIVITY_API_PHASE_EXIT) - timespec endTime; - timespec startTime { timestamp }; - clock_gettime(CLOCK_MONOTONIC, &endTime); // record proper clock - - switch (cid) { - case HIP_API_ID_hipLaunchKernel: - case HIP_API_ID_hipExtLaunchKernel: - case HIP_API_ID_hipLaunchCooperativeKernel: // Should work here - { - auto &args = data->args.hipLaunchKernel; - dis->kernelRows_.emplace_back(data->correlation_id, - domain, - cid, - processId(), - systemThreadId(), - timespec_to_ns(startTime), - timespec_to_ns(endTime), - args.function_address, - nullptr, - args.numBlocks.x, - args.numBlocks.y, - args.numBlocks.z, - args.dimBlocks.x, - args.dimBlocks.y, - args.dimBlocks.z, - args.sharedMemBytes, - args.stream - ); - } - break; - case HIP_API_ID_hipHccModuleLaunchKernel: - case HIP_API_ID_hipModuleLaunchKernel: - case HIP_API_ID_hipExtModuleLaunchKernel: - { - auto &args = data->args.hipModuleLaunchKernel; - dis->kernelRows_.emplace_back(data->correlation_id, - domain, - cid, - processId(), - systemThreadId(), - timespec_to_ns(startTime), - timespec_to_ns(endTime), - nullptr, - args.f, - args.gridDimX, - args.gridDimY, - args.gridDimZ, - args.blockDimX, - args.blockDimY, - args.blockDimZ, - args.sharedMemBytes, - args.stream - ); - } - break; - case HIP_API_ID_hipLaunchCooperativeKernelMultiDevice: - case HIP_API_ID_hipExtLaunchMultiKernelMultiDevice: -#if 0 - { - auto &args = data->args.hipLaunchCooperativeKernelMultiDevice.launchParamsList__val; - dis->kernelRows_.emplace_back(data->correlation_id, - domain, - cid, - processId(), - systemThreadId(), - timespec_to_ns(startTime), - timespec_to_ns(endTime), - args.function_address, - nullptr, - args.numBlocks.x, - args.numBlocks.y, - args.numBlocks.z, - args.dimBlocks.x, - args.dimBlocks.y, - args.dimBlocks.z, - args.sharedMemBytes, - args.stream - ); - } -#endif - break; - case HIP_API_ID_hipMalloc: - dis->mallocRows_.emplace_back(data->correlation_id, - domain, - cid, - processId(), - systemThreadId(), - timespec_to_ns(startTime), - timespec_to_ns(endTime), - data->args.hipMalloc.ptr__val, - data->args.hipMalloc.size - ); - break; - case HIP_API_ID_hipFree: - dis->mallocRows_.emplace_back(data->correlation_id, - domain, - cid, - processId(), - systemThreadId(), - timespec_to_ns(startTime), - timespec_to_ns(endTime), - data->args.hipFree.ptr, - 0 - ); - break; - case HIP_API_ID_hipMemcpy: - { - auto &args = data->args.hipMemcpy; - dis->copyRows_.emplace_back(data->correlation_id, - domain, - cid, - processId(), - systemThreadId(), - timespec_to_ns(startTime), - timespec_to_ns(endTime), - args.src, - args.dst, - args.sizeBytes, - args.kind, - static_cast(0) // use placeholder? - ); - } - break; - case HIP_API_ID_hipMemcpyAsync: - case HIP_API_ID_hipMemcpyWithStream: - { - auto &args = data->args.hipMemcpyAsync; - dis->copyRows_.emplace_back(data->correlation_id, - domain, - cid, - processId(), - systemThreadId(), - timespec_to_ns(startTime), - timespec_to_ns(endTime), - args.src, - args.dst, - args.sizeBytes, - args.kind, - args.stream - ); - } - break; - default: - dis->rows_.emplace_back(data->correlation_id, - domain, - cid, - processId(), - systemThreadId(), - timespec_to_ns(startTime), - timespec_to_ns(endTime) - ); - break; - } // switch - // External correlation - for (int it = CorrelationDomain::begin; it < CorrelationDomain::end; ++it) { - if (t_externalIds[it].size() > 0) { - dis->externalCorrelations_[it][data->correlation_id] = t_externalIds[it].back(); - } - } - } // phase exit - } -} - -void RoctracerLogger::activity_callback(const char* begin, const char* end, void* arg) -{ - size_t size = end - begin; - uint8_t *buffer = (uint8_t*) malloc(size); - auto &gpuTraceBuffers = singleton().gpuTraceBuffers_; - memcpy(buffer, begin, size); - gpuTraceBuffers->emplace_back(buffer, size); -} - -void RoctracerLogger::startLogging() { - if (!registered_) { - roctracer_set_properties(ACTIVITY_DOMAIN_HIP_API, nullptr); // Magic encantation - - // Set some api calls to ignore - loggedIds_.setInvertMode(true); // Omit the specified api - loggedIds_.add("hipGetDevice"); - loggedIds_.add("hipSetDevice"); - loggedIds_.add("hipGetLastError"); - loggedIds_.add("__hipPushCallConfiguration"); - loggedIds_.add("__hipPopCallConfiguration"); - loggedIds_.add("hipCtxSetCurrent"); - loggedIds_.add("hipEventRecord"); - loggedIds_.add("hipEventQuery"); - loggedIds_.add("hipGetDeviceProperties"); - loggedIds_.add("hipPeekAtLastError"); - loggedIds_.add("hipModuleGetFunction"); - loggedIds_.add("hipEventCreateWithFlags"); - loggedIds_.add("hipGetDeviceCount"); - loggedIds_.add("hipDevicePrimaryCtxGetState"); - - // Enable API callbacks - if (loggedIds_.invertMode() == true) { - // exclusion list - enable entire domain and turn off things in list - roctracer_enable_domain_callback(ACTIVITY_DOMAIN_HIP_API, api_callback, nullptr); - const std::unordered_map &filter = loggedIds_.filterList(); - for (auto it = filter.begin(); it != filter.end(); ++it) { - roctracer_disable_op_callback(ACTIVITY_DOMAIN_HIP_API, it->first); - } - } - else { - // inclusion list - only enable things in the list - const std::unordered_map &filter = loggedIds_.filterList(); - roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HIP_API); - for (auto it = filter.begin(); it != filter.end(); ++it) { - roctracer_enable_op_callback(ACTIVITY_DOMAIN_HIP_API, it->first, api_callback, nullptr); - } - } - //roctracer_enable_domain_callback(ACTIVITY_DOMAIN_ROCTX, api_callback, nullptr); - - // Allocate default tracing pool - roctracer_properties_t properties; - memset(&properties, 0, sizeof(roctracer_properties_t)); - properties.buffer_size = 0x1000; - roctracer_open_pool(&properties); - - // Enable async op collection - roctracer_properties_t hcc_cb_properties; - memset(&hcc_cb_properties, 0, sizeof(roctracer_properties_t)); - hcc_cb_properties.buffer_size = 0x4000; - hcc_cb_properties.buffer_callback_fun = activity_callback; - roctracer_open_pool_expl(&hcc_cb_properties, &hccPool_); - roctracer_enable_domain_activity_expl(ACTIVITY_DOMAIN_HCC_OPS, hccPool_); - - registered_ = true; - } - - externalCorrelationEnabled_ = true; - roctracer_start(); -} - -void RoctracerLogger::stopLogging() { - roctracer_stop(); - roctracer_flush_activity_expl(hccPool_); -} - -void RoctracerLogger::endTracing() { - if (registered_ == true) { - roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HIP_API); - //roctracer_disable_domain_callback(ACTIVITY_DOMAIN_ROCTX); - - roctracer_disable_domain_activity(ACTIVITY_DOMAIN_HCC_OPS); - roctracer_close_pool_expl(hccPool_); - hccPool_ = nullptr; - } -} - - -ApiIdList::ApiIdList() -: invert_(true) -{ -} - -void ApiIdList::add(const std::string &apiName) -{ - uint32_t cid = 0; - if (roctracer_op_code(ACTIVITY_DOMAIN_HIP_API, apiName.c_str(), &cid, nullptr) == ROCTRACER_STATUS_SUCCESS) { - filter_[cid] = 1; - } -} -void ApiIdList::remove(const std::string &apiName) -{ - uint32_t cid = 0; - if (roctracer_op_code(ACTIVITY_DOMAIN_HIP_API, apiName.c_str(), &cid, nullptr) == ROCTRACER_STATUS_SUCCESS) { - filter_.erase(cid); - } -} - -bool ApiIdList::loadUserPrefs() -{ - // placeholder - return false; -} -bool ApiIdList::contains(uint32_t apiId) -{ - return (filter_.find(apiId) != filter_.end()) ? !invert_ : invert_; // XOR -} - diff --git a/libkineto/src/RoctracerLogger.h b/libkineto/src/RoctracerLogger.h deleted file mode 100644 index b1ab448..0000000 --- a/libkineto/src/RoctracerLogger.h +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - - -namespace onnxruntime{ -namespace profiling { -class RocmProfiler; -} -} - -namespace libkineto { -class RoctracerActivityApi; -} - -class RoctracerActivityBuffer { -public: - // data must be allocated using malloc. - // Ownership is transferred to this object. - RoctracerActivityBuffer(uint8_t* data, size_t validSize) - : data_(data), validSize_(validSize) {} - - ~RoctracerActivityBuffer() { - free(data_); - } - - // Allocated by malloc - uint8_t* data_{nullptr}; - - // Number of bytes used - size_t validSize_; -}; - - -class ApiIdList -{ -public: - ApiIdList(); - bool invertMode() { return invert_; } - void setInvertMode(bool invert) { invert_ = invert; } - void add(const std::string &apiName); - void remove(const std::string &apiName); - bool loadUserPrefs(); - bool contains(uint32_t apiId); - const std::unordered_map &filterList() { return filter_; } - -private: - std::unordered_map filter_; - bool invert_; -}; - -struct roctracerRow { - roctracerRow(uint64_t id, uint32_t domain, uint32_t cid, uint32_t pid - , uint32_t tid, uint64_t begin, uint64_t end) - : id(id), domain(domain), cid(cid), pid(pid), tid(tid), begin(begin), end(end) {} - uint64_t id; // correlation_id - uint32_t domain; - uint32_t cid; - uint32_t pid; - uint32_t tid; - uint64_t begin; - uint64_t end; -}; - -struct kernelRow : public roctracerRow { - kernelRow(uint64_t id, uint32_t domain, uint32_t cid, uint32_t pid - , uint32_t tid, uint64_t begin, uint64_t end - , const void *faddr, hipFunction_t function - , unsigned int gx, unsigned int gy, unsigned int gz - , unsigned int wx, unsigned int wy, unsigned int wz - , size_t gss, hipStream_t stream) - : roctracerRow(id, domain, cid, pid, tid, begin, end), functionAddr(faddr) - , function(function), gridX(gx), gridY(gy), gridZ(gz) - , workgroupX(wx), workgroupY(wy), workgroupZ(wz), groupSegmentSize(gss) - , stream(stream) {} - const void* functionAddr; - hipFunction_t function; - unsigned int gridX; - unsigned int gridY; - unsigned int gridZ; - unsigned int workgroupX; - unsigned int workgroupY; - unsigned int workgroupZ; - size_t groupSegmentSize; - hipStream_t stream; -}; - -struct copyRow : public roctracerRow { - copyRow(uint64_t id, uint32_t domain, uint32_t cid, uint32_t pid - , uint32_t tid, uint64_t begin, uint64_t end - , const void* src, const void *dst, size_t size, hipMemcpyKind kind - , hipStream_t stream) - : roctracerRow(id, domain, cid, pid, tid, begin, end) - , src(src), dst(dst), size(size), kind(kind), stream(stream) {} - const void *src; - const void *dst; - size_t size; - hipMemcpyKind kind; - hipStream_t stream; -}; - -struct mallocRow : public roctracerRow { - mallocRow(uint64_t id, uint32_t domain, uint32_t cid, uint32_t pid - , uint32_t tid, uint64_t begin, uint64_t end - , const void* ptr, size_t size) - : roctracerRow(id, domain, cid, pid, tid, begin, end) - , ptr(ptr), size(size) {} - const void *ptr; - size_t size; -}; - - -class RoctracerLogger { - public: - enum CorrelationDomain { - begin, - Default = begin, - Domain0 = begin, - Domain1, - end, - size = end - }; - - RoctracerLogger(); - RoctracerLogger(const RoctracerLogger&) = delete; - RoctracerLogger& operator=(const RoctracerLogger&) = delete; - - virtual ~RoctracerLogger(); - - static RoctracerLogger& singleton(); - - static void pushCorrelationID(uint64_t id, CorrelationDomain type); - static void popCorrelationID(CorrelationDomain type); - - void startLogging(); - void stopLogging(); - void clearLogs(); - - private: - bool registered_{false}; - void endTracing(); - - roctracer_pool_t *hccPool_{NULL}; - static void api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg); - static void activity_callback(const char* begin, const char* end, void* arg); - - ApiIdList loggedIds_; - - // Api callback data - std::deque rows_; - std::deque kernelRows_; - std::deque copyRows_; - std::deque mallocRows_; - std::map externalCorrelations_[CorrelationDomain::size]; // tracer -> ext - - std::unique_ptr> gpuTraceBuffers_; - bool externalCorrelationEnabled_{true}; - - friend class onnxruntime::profiling::RocmProfiler; - friend class libkineto::RoctracerActivityApi; -}; diff --git a/libkineto/src/ThreadUtil.cpp b/libkineto/src/ThreadUtil.cpp index 64d2089..fa52085 100644 --- a/libkineto/src/ThreadUtil.cpp +++ b/libkineto/src/ThreadUtil.cpp @@ -8,12 +8,12 @@ #include "ThreadUtil.h" -#ifndef _MSC_VER +#ifndef _WIN32 #include #include #include #include -#else // _MSC_VER +#else // _WIN32 #include #include #define WIN32_LEAN_AND_MEAN @@ -21,7 +21,7 @@ #include #include #undef ERROR -#endif // _MSC_VER +#endif // _WIN32 #ifdef __ANDROID__ #include @@ -41,7 +41,7 @@ thread_local int32_t _sysTid = 0; int32_t processId() { if (!_pid) { -#ifndef _MSC_VER +#ifndef _WIN32 _pid = (int32_t)getpid(); #else _pid = (int32_t)GetCurrentProcessId(); @@ -54,7 +54,7 @@ int32_t systemThreadId() { if (!_sysTid) { #ifdef __APPLE__ _sysTid = (int32_t)syscall(SYS_thread_selfid); -#elif defined _MSC_VER +#elif defined _WIN32 _sysTid = (int32_t)GetCurrentThreadId(); #else _sysTid = (int32_t)syscall(SYS_gettid); @@ -69,7 +69,7 @@ int32_t threadId() { uint64_t tid; pthread_threadid_np(nullptr, &tid); _tid = tid; -#elif defined _MSC_VER +#elif defined _WIN32 _tid = (int32_t)GetCurrentThreadId(); #else pthread_t pth = pthread_self(); @@ -88,9 +88,9 @@ static constexpr const char* basename(const char* s, int off = 0) { ? s : s[off] == '/' ? basename(&s[off + 1]) : basename(s, off + 1); } -#if defined(_MSC_VER) +#if defined(_WIN32) void *getKernel32Func(const char* procName) { - return GetProcAddress(GetModuleHandleA("KERNEL32.DLL"), procName); + return reinterpret_cast(GetProcAddress(GetModuleHandleA("KERNEL32.DLL"), procName)); } #endif } @@ -98,7 +98,7 @@ void *getKernel32Func(const char* procName) { bool setThreadName(const std::string& name) { #ifdef __APPLE__ return 0 == pthread_setname_np(name.c_str()); -#elif defined _MSC_VER +#elif defined _WIN32 // Per https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-setthreaddescription // Use runtime linking to set thread description static auto _SetThreadDescription = reinterpret_cast(getKernel32Func("SetThreadDescription")); @@ -115,7 +115,7 @@ bool setThreadName(const std::string& name) { } std::string getThreadName() { -#ifndef _MSC_VER +#ifndef _WIN32 char buf[kMaxThreadNameLength] = ""; if ( #ifndef __ANDROID__ @@ -127,7 +127,7 @@ std::string getThreadName() { return "Unknown"; } return buf; -#else // _MSC_VER +#else // _WIN32 static auto _GetThreadDescription = reinterpret_cast(getKernel32Func("GetThreadDescription")); if (!_GetThreadDescription) { return "Unknown"; diff --git a/libkineto/src/init.cpp b/libkineto/src/init.cpp index 34ffa6e..5c6af07 100644 --- a/libkineto/src/init.cpp +++ b/libkineto/src/init.cpp @@ -13,23 +13,40 @@ // @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude #include "ActivityProfilerProxy.h" #include "Config.h" +#include "ConfigLoader.h" #include "DaemonConfigLoader.h" +#include "MusaUtil.h" +#include "ThreadUtil.h" #ifdef HAS_MUPTI #include "MuptiCallbackApi.h" #include "MuptiActivityApi.h" #include "MuptiRangeProfiler.h" #include "EventProfilerController.h" #endif -#include "mupti_call.h" #include "libkineto.h" #include "Logger.h" namespace KINETO_NAMESPACE { -#ifdef HAS_MUPTI +#if __linux__ || defined(HAS_MUPTI) static bool initialized = false; -static std::mutex initMutex; + +static void initProfilersCPU() { + if (!initialized) { + libkineto::api().initProfilerIfRegistered(); + initialized = true; + VLOG(0) << "libkineto profilers activated"; + } +} + +#endif // __linux__ || defined(HAS_MUPTI) + +#ifdef HAS_MUPTI +static std::mutex& initEventMutex() { + static std::mutex initMutex_; + return initMutex_; +} bool enableEventProfiler() { if (getenv("KINETO_ENABLE_EVENT_PROFILER") != nullptr) { @@ -44,24 +61,20 @@ static void initProfilers( MUpti_CallbackId /*cbid*/, const MUpti_CallbackData* cbInfo) { VLOG(0) << "MUSA Context created"; - std::lock_guard lock(initMutex); - - if (!initialized) { - libkineto::api().initProfilerIfRegistered(); - initialized = true; - VLOG(0) << "libkineto profilers activated"; - } + initProfilersCPU(); if (!enableEventProfiler()) { VLOG(0) << "Kineto EventProfiler disabled, skipping start"; return; } else { + std::lock_guard lock(initEventMutex()); MUpti_ResourceData* d = (MUpti_ResourceData*)cbInfo; MUcontext ctx = d->context; ConfigLoader& config_loader = libkineto::api().configLoader(); config_loader.initBaseConfig(); auto config = config_loader.getConfigCopy(); if (config->eventProfilerEnabled()) { + // This function needs to be called under lock. EventProfilerController::start(ctx, config_loader); LOG(INFO) << "Kineto EventProfiler started"; } @@ -86,14 +99,12 @@ static void stopProfiler( MUpti_CallbackId /*cbid*/, const MUpti_CallbackData* cbInfo) { VLOG(0) << "MUSA Context destroyed"; - std::lock_guard lock(initMutex); - - if (enableEventProfiler()) { - MUpti_ResourceData* d = (MUpti_ResourceData*)cbInfo; - MUcontext ctx = d->context; - EventProfilerController::stopIfEnabled(ctx); - LOG(INFO) << "Kineto EventProfiler stopped"; - } + std::lock_guard lock(initEventMutex()); + MUpti_ResourceData* d = (MUpti_ResourceData*)cbInfo; + MUcontext ctx = d->context; + // This function needs to be called under lock. + EventProfilerController::stopIfEnabled(ctx); + LOG(INFO) << "Kineto EventProfiler stopped"; } static std::unique_ptr rangeProfilerInit; @@ -111,14 +122,14 @@ void libkineto_init(bool cpuOnly, bool logOnError) { const char* logLevelEnv = getenv("KINETO_LOG_LEVEL"); if (logLevelEnv) { // atoi returns 0 on error, so that's what we want - default to VERBOSE - static_assert (static_cast(VERBOSE) == 0, ""); + static_assert(static_cast(VERBOSE) == 0, ""); SET_LOG_SEVERITY_LEVEL(atoi(logLevelEnv)); } // Factory to connect to open source daemon if present #if __linux__ - if (getenv(kUseDaemonEnvVar) != nullptr) { - LOG(INFO) << "Registering daemon config loader"; + if (libkineto::isDaemonEnvVarSet()) { + LOG(INFO) << "Registering daemon config loader, cpuOnly = " << cpuOnly; DaemonConfigLoader::registerFactory(); } #endif @@ -133,18 +144,28 @@ void libkineto_init(bool cpuOnly, bool logOnError) { bool status = false; bool initRangeProfiler = true; - if (cbapi->initSuccess()){ + if (cbapi->initSuccess()) { const MUpti_CallbackDomain domain = MUPTI_CB_DOMAIN_RESOURCE; status = cbapi->registerCallback( domain, MuptiCallbackApi::RESOURCE_CONTEXT_CREATED, initProfilers); - status = status && cbapi->registerCallback( - domain, MuptiCallbackApi::RESOURCE_CONTEXT_DESTROYED, stopProfiler); - if (status) { status = cbapi->enableCallback( domain, MuptiCallbackApi::RESOURCE_CONTEXT_CREATED); - status = status && cbapi->enableCallback( - domain, MuptiCallbackApi::RESOURCE_CONTEXT_DESTROYED); + } + + // Register stopProfiler callback only for event profiler. + // This callback is not required for activities tracing. + if (enableEventProfiler()) { + if (status) { + status = cbapi->registerCallback( + domain, + MuptiCallbackApi::RESOURCE_CONTEXT_DESTROYED, + stopProfiler); + } + if (status) { + status = cbapi->enableCallback( + domain, MuptiCallbackApi::RESOURCE_CONTEXT_DESTROYED); + } } } @@ -175,10 +196,20 @@ void libkineto_init(bool cpuOnly, bool logOnError) { libkineto::api().registerProfiler( std::make_unique(cpuOnly, config_loader)); +#if __linux__ + // When MUSA/GPU is used the profiler initialization happens on the + // creation of the first MUSA stream (see initProfilers()). + // This section bootstraps the profiler and its connection to a profiling + // daemon in the CPU only case. + if (cpuOnly && getenv(kUseDaemonEnvVar) != nullptr) { + initProfilersCPU(); + libkineto::api().configLoader().initBaseConfig(); + } +#endif } // The musa driver calls this function if the MUSA_INJECTION64_PATH environment -// variable is set +// variable is set. Should be skipped if unset or MUSA_INJECTION64_PATH=none. int InitializeInjection(void) { LOG(INFO) << "Injection mode: Initializing libkineto"; libkineto_init(false /*cpuOnly*/, true /*logOnError*/); diff --git a/libkineto/src/mupti_call.h b/libkineto/src/mupti_call.h deleted file mode 100644 index 9fd0c3c..0000000 --- a/libkineto/src/mupti_call.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#ifdef HAS_MUPTI - -#include - -#define MUPTI_CALL(call) \ - [&]() -> MUptiResult { \ - MUptiResult _status_ = call; \ - if (_status_ != MUPTI_SUCCESS) { \ - const char* _errstr_ = nullptr; \ - muptiGetResultString(_status_, &_errstr_); \ - LOG(WARNING) << fmt::format( \ - "function {} failed with error {} ({})", \ - #call, \ - _errstr_, \ - (int)_status_); \ - } \ - return _status_; \ - }() - -#define MUPTI_CALL_NOWARN(call) call - -#else - -#define MUPTI_CALL(call) call -#define MUPTI_CALL_NOWARN(call) call - -#endif // HAS_MUPTI diff --git a/libkineto/src/mupti_strings.cpp b/libkineto/src/mupti_strings.cpp index 1628965..4116130 100644 --- a/libkineto/src/mupti_strings.cpp +++ b/libkineto/src/mupti_strings.cpp @@ -515,25 +515,25 @@ static const char* runtimeCbidNames[] = { "musaDeviceGetGraphMemAttribute", "musaDeviceSetGraphMemAttribute", // comment not implemented musa apis - /* "musaGraphNodeSetEnabled", */ - /* "musaGraphNodeGetEnabled", */ - /* "musaArrayGetMemoryRequirements", */ - /* "musaMipmappedArrayGetMemoryRequirements", */ + // "musaGraphNodeSetEnabled", + // "musaGraphNodeGetEnabled", + // "musaArrayGetMemoryRequirements", + // "musaMipmappedArrayGetMemoryRequirements", "musaLaunchKernelExC", "musaLaunchKernelExC_ptsz", - /* "musaOccupancyMaxPotentialClusterSize", */ - /* "musaOccupancyMaxActiveClusters", */ - /* "musaCreateTextureObject_v2", */ - /* "musaGetTextureObjectTextureDesc_v2", */ - /* "musaGraphInstantiateWithParams", */ - /* "musaGraphInstantiateWithParams_ptsz", */ - /* "musaGraphExecGetFlags", */ - /* "musa439", */ - /* "musaGetDeviceProperties_v2", */ - /* "musaStreamGetId", */ - /* "musaStreamGetId_ptsz", */ - /* "musaGraphInstantiate", */ - /* "musa444", */ + // "musaOccupancyMaxPotentialClusterSize", + // "musaOccupancyMaxActiveClusters", + // "musaCreateTextureObject_v2", + // "musaGetTextureObjectTextureDesc_v2", + // "musaGraphInstantiateWithParams", + // "musaGraphInstantiateWithParams_ptsz", + // "musaGraphExecGetFlags", + // "musa439", + // "musaGetDeviceProperties_v2", + // "musaStreamGetId", + // "musaStreamGetId_ptsz", + // "musaGraphInstantiate", + // "musa444", "SIZE", "FORCE_INT" }; diff --git a/libkineto/src/musa_call.h b/libkineto/src/musa_call.h deleted file mode 100644 index 2925eef..0000000 --- a/libkineto/src/musa_call.h +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#ifdef HAS_MUPTI - -#include - -#define MUSA_CALL(call) \ - [&]() -> musaError_t { \ - musaError_t _status_ = call; \ - if (_status_ != musaSuccess) { \ - const char* _errstr_ = musaGetErrorString(_status_); \ - LOG(WARNING) << fmt::format( \ - "function {} failed with error {} ({})", \ - #call, \ - _errstr_, \ - (int)_status_); \ - } \ - return _status_; \ - }() - -#endif // HAS_MUPTI diff --git a/libkineto/src/output_json.cpp b/libkineto/src/output_json.cpp index b795d16..4736cfa 100644 --- a/libkineto/src/output_json.cpp +++ b/libkineto/src/output_json.cpp @@ -32,12 +32,19 @@ static constexpr char kFlowEnd = 'f'; // TODO: share the same string across c10d, profiler and libkineto static constexpr const char* kParamCommsCallName = "record_param_comms"; // Collective function metadata populated from CPU op to GPU kernel +static constexpr const char* kCollectiveName = "Collective name"; static constexpr const char* kDtype = "dtype"; -static constexpr const char* kInMsgSize = "In msg size"; -static constexpr const char* kOutMsgSize = "Out msg size"; +static constexpr const char* kInMsgNelems = "In msg nelems"; +static constexpr const char* kOutMsgNelems = "Out msg nelems"; static constexpr const char* kGroupSize = "Group size"; static constexpr const char* kInSplit = "In split size"; static constexpr const char* kOutSplit = "Out split size"; +static constexpr const char* kProcessGroupName = "Process Group Name"; +static constexpr const char* kProcessGroupDesc = "Process Group Description"; +static constexpr const char* kGroupRanks = "Process Group Ranks"; +static constexpr const char* kRank = "Rank"; +static constexpr const char* kP2pSrc = "Src Rank"; +static constexpr const char* kP2pDst = "Dst Rank"; #ifdef __linux__ static constexpr char kDefaultLogFileFmt[] = @@ -46,17 +53,44 @@ static constexpr char kDefaultLogFileFmt[] = static constexpr char kDefaultLogFileFmt[] = "libkineto_activities_{}.json"; #endif -std::string& ChromeTraceLogger::sanitizeStrForJSON(std::string& value) { -// Replace all backslashes with forward slash because Windows paths causing JSONDecodeError. +ChromeTraceBaseTime& ChromeTraceBaseTime::singleton() { + static ChromeTraceBaseTime instance; + return instance; +} + +// The 'ts' field written into the json file has 19 significant digits, +// while a double can only represent 15-16 digits. By using relative time, +// other applications can accurately read the 'ts' field as a double. +// Use the program loading time as the baseline time. +inline int64_t transToRelativeTime(int64_t time) { + // Sometimes after converting to relative time, it can be a few nanoseconds negative. + // Since Chrome trace and json processing will throw a parser error, guard this. + int64_t res = time - ChromeTraceBaseTime::singleton().get(); + if (res < 0) { + return 0; + } + return res; +} + +void ChromeTraceLogger::sanitizeStrForJSON(std::string& value) { + // Replace all backslashes with forward slash because Windows paths causing JSONDecodeError. std::replace(value.begin(), value.end(), '\\', '/'); - return value; + // Remove all new line characters + value.erase(std::remove(value.begin(), value.end(), '\n'), value.end()); } void ChromeTraceLogger::metadataToJSON( const std::unordered_map& metadata) { - for (const auto& kv : metadata) { + for (auto [k, v]: metadata) { + std::string sanitizedValue = v; + // There is a seperate mechanism for recording distributedInfo in on-demand + // so add a guard to prevent "double counting" in auto-trace. + if (k == "distributedInfo") { + distInfo_.distInfo_present_ = true; + } + sanitizeStrForJSON(sanitizedValue); traceOf_ << fmt::format(R"JSON( - "{}": {},)JSON", kv.first, kv.second); + "{}": {},)JSON", k, sanitizedValue); } } @@ -66,11 +100,9 @@ void ChromeTraceLogger::handleTraceStart( {{ "schemaVersion": {},)JSON", kSchemaVersion); -#ifdef HAS_MUPTI traceOf_ << fmt::format(R"JSON( "deviceProperties": [{} ],)JSON", devicePropertiesJson()); -#endif metadataToJSON(metadata); traceOf_ << R"JSON( @@ -107,31 +139,32 @@ void ChromeTraceLogger::handleDeviceInfo( // M is for metadata // process_name needs a pid and a name arg // clang-format off + time = transToRelativeTime(time); traceOf_ << fmt::format(R"JSON( {{ - "name": "process_name", "ph": "M", "ts": {}, "pid": {}, "tid": 0, + "name": "process_name", "ph": "M", "ts": {}.{:03}, "pid": {}, "tid": 0, "args": {{ "name": "{}" }} }}, {{ - "name": "process_labels", "ph": "M", "ts": {}, "pid": {}, "tid": 0, + "name": "process_labels", "ph": "M", "ts": {}.{:03}, "pid": {}, "tid": 0, "args": {{ "labels": "{}" }} }}, {{ - "name": "process_sort_index", "ph": "M", "ts": {}, "pid": {}, "tid": 0, + "name": "process_sort_index", "ph": "M", "ts": {}.{:03}, "pid": {}, "tid": 0, "args": {{ "sort_index": {} }} }},)JSON", - time, info.id, + time/1000, time%1000, info.id, info.name, - time, info.id, + time/1000, time%1000, info.id, info.label, - time, info.id, - info.id < 8 ? info.id + 0x1000000ll : info.id); + time/1000, time%1000, info.id, + info.sortIndex); // clang-format on } @@ -145,22 +178,23 @@ void ChromeTraceLogger::handleResourceInfo( // M is for metadata // thread_name needs a pid and a name arg // clang-format off + time = transToRelativeTime(time); traceOf_ << fmt::format(R"JSON( {{ - "name": "thread_name", "ph": "M", "ts": {}, "pid": {}, "tid": {}, + "name": "thread_name", "ph": "M", "ts": {}.{:03}, "pid": {}, "tid": {}, "args": {{ "name": "{}" }} }}, {{ - "name": "thread_sort_index", "ph": "M", "ts": {}, "pid": {}, "tid": {}, + "name": "thread_sort_index", "ph": "M", "ts": {}.{:03}, "pid": {}, "tid": {}, "args": {{ "sort_index": {} }} }},)JSON", - time, info.deviceId, info.id, + time/1000, time%1000, info.deviceId, info.id, info.name, - time, info.deviceId, info.id, + time/1000, time%1000, info.deviceId, info.id, info.sortIndex); // clang-format on } @@ -175,22 +209,23 @@ void ChromeTraceLogger::handleOverheadInfo( // TOOD: reserve pid = -1 for overhead but we need to rethink how to scale this for // other metadata // clang-format off + time = transToRelativeTime(time); traceOf_ << fmt::format(R"JSON( {{ - "name": "process_name", "ph": "M", "ts": {}, "pid": -1, "tid": 0, + "name": "process_name", "ph": "M", "ts": {}.{:03}, "pid": -1, "tid": 0, "args": {{ "name": "{}" }} }}, {{ - "name": "process_sort_index", "ph": "M", "ts": {}, "pid": -1, "tid": 0, + "name": "process_sort_index", "ph": "M", "ts": {}.{:03}, "pid": -1, "tid": 0, "args": {{ "sort_index": {} }} }},)JSON", - time, + time/1000, time%1000, info.name, - time, + time/1000, time%1000, 0x100000All); // clang-format on } @@ -200,10 +235,15 @@ void ChromeTraceLogger::handleTraceSpan(const TraceSpan& span) { return; } + uint64_t start = transToRelativeTime(span.startTime); + + // If endTime is 0 and start time is non-zero, dur can overflow. Add + // a guard to prevent this. + uint64_t dur = (span.endTime == 0) ? 0 : span.endTime - span.startTime; // clang-format off traceOf_ << fmt::format(R"JSON( {{ - "ph": "X", "cat": "Trace", "ts": {}, "dur": {}, + "ph": "X", "cat": "Trace", "ts": {}.{:03}, "dur": {}.{:03}, "pid": "Spans", "tid": "{}", "name": "{}{} ({})", "args": {{ @@ -211,17 +251,17 @@ void ChromeTraceLogger::handleTraceSpan(const TraceSpan& span) { }} }}, {{ - "name": "process_sort_index", "ph": "M", "ts": {}, + "name": "process_sort_index", "ph": "M", "ts": {}.{:03}, "pid": "Spans", "tid": 0, "args": {{ "sort_index": {} }} }},)JSON", - span.startTime, span.endTime - span.startTime, + start/1000, start%1000, dur/1000, dur%1000, span.name, span.prefix, span.name, span.iteration, span.opCount, - span.startTime, + start/1000, start%1000, // Large sort index to appear at the bottom 0x20000000ll); // clang-format on @@ -235,13 +275,15 @@ void ChromeTraceLogger::addIterationMarker(const TraceSpan& span) { } // clang-format off + uint64_t start = transToRelativeTime(span.startTime); + traceOf_ << fmt::format(R"JSON( {{ "name": "Iteration Start: {}", "ph": "i", "s": "g", - "pid": "Traces", "tid": "Trace {}", "ts": {} + "pid": "Traces", "tid": "Trace {}", "ts": {}.{:03} }},)JSON", span.name, - span.name, span.startTime); + span.name, start/1000, start%1000); // clang-format on } @@ -251,17 +293,18 @@ void ChromeTraceLogger::handleGenericInstantEvent( return; } + uint64_t ts = transToRelativeTime(op.timestamp()); traceOf_ << fmt::format(R"JSON( {{ "ph": "i", "cat": "{}", "s": "t", "name": "{}", "pid": {}, "tid": {}, - "ts": {}, + "ts": {}.{:03}, "args": {{ {} }} }},)JSON", toString(op.type()), op.name(), op.deviceId(), op.resourceId(), - op.timestamp(), op.metadataJson()); + ts/1000, ts%1000, op.metadataJson()); } void ChromeTraceLogger::handleActivity( @@ -289,22 +332,18 @@ void ChromeTraceLogger::handleActivity( // The GPU user annotations start at the same time as the // first associated GPU op. Since they appear later // in the trace file, this causes a visualization issue in Chrome. - // Make it start one us earlier. - ts--; - duration++; // Still need it to end at the orginal point + // Make it start one ns earlier and end 2 ns later. + ts-=1; + duration+=2; // Still need it to end at the original point rounded up. } std::string arg_values = ""; if (op.correlationId() != 0) { - arg_values.append(fmt::format("\"External id\": {}", op.correlationId())); + arg_values.append(fmt::format("\"External id\": {}", + op.linkedActivity() ? op.linkedActivity()->correlationId() : op.correlationId())); } - if (op.linkedActivity()) { - if (!arg_values.empty()) { - arg_values.append(", "); - } - arg_values.append(fmt::format("\"Linked id\": {}", op.linkedActivity()->correlationId())); - } - const std::string op_metadata = op.metadataJson(); + std::string op_metadata = op.metadataJson(); + sanitizeStrForJSON(op_metadata); if (op_metadata.find_first_not_of(" \t\n") != std::string::npos) { if (!arg_values.empty()) { arg_values.append(","); @@ -317,20 +356,24 @@ void ChromeTraceLogger::handleActivity( op.linkedActivity()->name() == kParamCommsCallName) { const auto* collectiveRecord = op.linkedActivity(); // Get the value out of the collective record - const auto& inMsgSize = collectiveRecord->getMetadataValue(kInMsgSize); - const auto& outMsgSize = collectiveRecord->getMetadataValue(kOutMsgSize); + const auto& collectiveName = + collectiveRecord->getMetadataValue(kCollectiveName); + const auto& inMsgSize = collectiveRecord->getMetadataValue(kInMsgNelems); + const auto& outMsgSize = collectiveRecord->getMetadataValue(kOutMsgNelems); const auto& groupSize = collectiveRecord->getMetadataValue(kGroupSize); const auto& dtype = collectiveRecord->getMetadataValue(kDtype); - if (!inMsgSize.empty() && !outMsgSize.empty() && !groupSize.empty() && - !dtype.empty()) { + if (!collectiveName.empty() && !inMsgSize.empty() && !outMsgSize.empty() && + !groupSize.empty() && !dtype.empty()) { if (!arg_values.empty()) { arg_values.append(","); } arg_values.append(fmt::format( - "\"{}\": {}, \"{}\": {}, \"{}\": {}, \"{}\": {}", - kInMsgSize, + " \"{}\": {}, \"{}\": {}, \"{}\": {}, \"{}\": {}, \"{}\": {}", + kCollectiveName, + collectiveName, + kInMsgNelems, inMsgSize, - kOutMsgSize, + kOutMsgNelems, outMsgSize, kGroupSize, groupSize, @@ -345,12 +388,64 @@ void ChromeTraceLogger::handleActivity( arg_values.append(","); } arg_values.append(fmt::format( - "\"{}\": {}, \"{}\": {}", + " \"{}\": {}, \"{}\": {}", kInSplit, inSplitSize, kOutSplit, outSplitSize)); } + const auto& processGroupName = + collectiveRecord->getMetadataValue(kProcessGroupName); + if (!processGroupName.empty()) { + if (!arg_values.empty()) { + arg_values.append(","); + } + arg_values.append( + fmt::format(" \"{}\": {}", kProcessGroupName, processGroupName)); + } + const auto& processGroupDesc = + collectiveRecord->getMetadataValue(kProcessGroupDesc); + if (!processGroupName.empty()) { + if (!arg_values.empty()) { + arg_values.append(","); + } + arg_values.append( + fmt::format(" \"{}\": {}", kProcessGroupDesc, processGroupDesc)); + } + const auto& groupRanks = collectiveRecord->getMetadataValue(kGroupRanks); + if (!groupRanks.empty()) { + if (!arg_values.empty()) { + arg_values.append(","); + } + arg_values.append(fmt::format(" \"{}\": {}", kGroupRanks, groupRanks)); + } + const auto& dstRank = collectiveRecord->getMetadataValue(kP2pDst); + const auto& srcRank = collectiveRecord->getMetadataValue(kP2pSrc); + if (!dstRank.empty()) { + arg_values.append(fmt::format(", \"{}\": {}", kP2pDst, dstRank)); + } + if (!srcRank.empty()) { + arg_values.append(fmt::format(", \"{}\": {}", kP2pSrc, srcRank)); + } + + + if (distInfo_.backend=="" && processGroupDesc=="\"default_pg\"") { + distInfo_.backend = "nccl"; + distInfo_.rank = collectiveRecord->getMetadataValue(kRank); + distInfo_.world_size = groupSize; + // Not sure if we want to have output.json depend on nccl at compilation so + // set nccl_version to "unknown" for now until we can determine if we can pass + // it at runtime or use ifdefs. Should not be necessary to enable HTA + distInfo_.nccl_version = "unknown"; + } + auto pg_config = pgConfig(); + pg_config.pg_name = processGroupName; + pg_config.pg_desc = processGroupDesc; + pg_config.backend_config = "musa:mccl"; + pg_config.pg_size = groupSize; + pg_config.ranks = groupRanks; + pgMap.insert({processGroupName, pg_config}); + } std::string args = ""; @@ -365,15 +460,17 @@ void ChromeTraceLogger::handleActivity( int resource = op.resourceId(); // TODO: Remove this once legacy tools are updated. std::string op_name = op.name() == "kernel" ? "Kernel" : op.name(); + sanitizeStrForJSON(op_name); // clang-format off + ts = transToRelativeTime(ts); traceOf_ << fmt::format(R"JSON( {{ "ph": "X", "cat": "{}", "name": "{}", "pid": {}, "tid": {}, - "ts": {}, "dur": {}{} + "ts": {}.{:03}, "dur": {}.{:03}{} }},)JSON", - toString(op.type()), sanitizeStrForJSON(op_name), device, resource, - ts, duration, args); + toString(op.type()), op_name, device, resource, + ts/1000, ts %1000, duration/1000, duration %1000, args); // clang-format on if (op.flowId() > 0) { handleGenericLink(op); @@ -423,12 +520,13 @@ void ChromeTraceLogger::handleLink( // Flow start automatically sets binding point to enclosing slice. const auto binding = (type == kFlowEnd) ? ", \"bp\": \"e\"" : ""; // clang-format off + uint64_t ts = transToRelativeTime(e.timestamp()); traceOf_ << fmt::format(R"JSON( {{ - "ph": "{}", "id": {}, "pid": {}, "tid": {}, "ts": {}, + "ph": "{}", "id": {}, "pid": {}, "tid": {}, "ts": {}.{:03}, "cat": "{}", "name": "{}"{} }},)JSON", - type, id, e.deviceId(), e.resourceId(), e.timestamp(), name, name, binding); + type, id, e.deviceId(), e.resourceId(), ts/1000, ts%1000, name, name, binding); // clang-format on } @@ -437,20 +535,48 @@ void ChromeTraceLogger::finalizeTrace( std::unique_ptr /*unused*/, int64_t endTime, std::unordered_map>& metadata) { + finalizeTrace(endTime, metadata); +} + +void ChromeTraceLogger::addOnDemandDistMetadata() { + if (distInfo_.backend == "") { + return; + } + traceOf_ << fmt::format(R"JSON( + "distributedInfo": {{"backend": "{}", "rank": {}, "world_size": {}, "pg_count": {}, "pg_config": [)JSON", + distInfo_.backend, distInfo_.rank, distInfo_.world_size, std::to_string(pgMap.size())); + + for (const auto& element : pgMap) { + traceOf_ << fmt::format(R"JSON({{"pg_name": {}, "pg_desc": {}, "backend_config": "{}", "pg_size": {}, "ranks": {}}},)JSON", + element.second.pg_name, element.second.pg_desc, element.second.backend_config, element.second.pg_size, element.second.ranks); + } + traceOf_.seekp(-1, std::ios_base::end); + traceOf_ << fmt::format(R"JSON(], "nccl_version": "{}"}},)JSON", distInfo_.nccl_version); + distInfo_.distInfo_present_ = true; +} + +void ChromeTraceLogger::finalizeTrace( + int64_t endTime, + std::unordered_map>& metadata) { if (!traceOf_) { LOG(ERROR) << "Failed to write to log file!"; return; } + sanitizeStrForJSON(fileName_); LOG(INFO) << "Chrome Trace written to " << fileName_; // clang-format off + endTime = transToRelativeTime(endTime); traceOf_ << fmt::format(R"JSON( {{ "name": "Record Window End", "ph": "i", "s": "g", - "pid": "", "tid": "", "ts": {} + "pid": "", "tid": "", "ts": {}.{:03} }} ],)JSON", - endTime); + endTime/1000, endTime %1000); + if (!distInfo_.distInfo_present_) { + addOnDemandDistMetadata(); + } #if !USE_GOOGLE_LOG std::unordered_map PreparedMetadata; for (const auto& kv : metadata) { @@ -462,7 +588,8 @@ void ChromeTraceLogger::finalizeTrace( // "WARNING": ["Warning 1", "Warning 2", "Warning 3"], // ... int mdv_count = kv.second.size(); - for (const auto& v : kv.second) { + for (auto v : kv.second) { + sanitizeStrForJSON(v); value.append("\"" + v + "\""); if(mdv_count > 1) { value.append(","); @@ -470,16 +597,19 @@ void ChromeTraceLogger::finalizeTrace( } } value.append("]"); - PreparedMetadata[kv.first] = sanitizeStrForJSON(value); + PreparedMetadata[kv.first] = value; } } metadataToJSON(PreparedMetadata); #endif // !USE_GOOGLE_LOG // Putting this here because the last entry MUST not end with a comma. + traceOf_ << fmt::format(R"JSON( - "traceName": "{}" -}})JSON", sanitizeStrForJSON(fileName_)); + "traceName": "{}", + "displayTimeUnit": "ms", + "baseTimeNanoseconds": {} +}})JSON", fileName_, ChromeTraceBaseTime::singleton().get()); // clang-format on traceOf_.close(); diff --git a/libkineto/src/output_json.h b/libkineto/src/output_json.h index 21cbb63..9fe9bcf 100644 --- a/libkineto/src/output_json.h +++ b/libkineto/src/output_json.h @@ -8,14 +8,19 @@ #pragma once +#include #include #include #include +#include +#include #include #include #include "GenericTraceActivity.h" #include "output_base.h" +#include "ActivityBuffers.h" +#include "time_since_epoch.h" namespace KINETO_NAMESPACE { // Previous declaration of TraceSpan is struct. Must match the same here. @@ -26,6 +31,28 @@ namespace KINETO_NAMESPACE { class Config; + +struct pgConfig { + pgConfig() = default; + std::string pg_name{""}; + std::string pg_desc{""}; + std::string backend_config{""}; + std::string pg_size{""}; + std::string ranks{""}; + +}; + +struct DistributedInfo { + DistributedInfo() = default; + + std::string backend{""}; + std::string rank{""}; + std::string world_size{""}; + std::string pg_count{""}; + std::string nccl_version{""}; + bool distInfo_present_{false}; +}; + class ChromeTraceLogger : public libkineto::ActivityLogger { public: explicit ChromeTraceLogger(const std::string& traceFileName); @@ -58,6 +85,11 @@ class ChromeTraceLogger : public libkineto::ActivityLogger { return fileName_; } + protected: + void finalizeTrace( + int64_t endTime, + std::unordered_map>& metadata); + private: // Create a flow event (arrow) @@ -78,11 +110,55 @@ class ChromeTraceLogger : public libkineto::ActivityLogger { void metadataToJSON( const std::unordered_map& metadata); - std::string& sanitizeStrForJSON(std::string& value); + void sanitizeStrForJSON(std::string& value); + + void addOnDemandDistMetadata(); std::string fileName_; std::string tempFileName_; std::ofstream traceOf_; + DistributedInfo distInfo_ = DistributedInfo(); + // Map of all observed process groups to their configs in trace. Key is pg_name, + // value is pgConfig that will be used to populate pg_config in + // distributedInfo of trace + std::unordered_map pgMap = {}; +}; + +//std::chrono header start +#ifdef _GLIBCXX_USE_C99_STDINT_TR1 +# define _KINETO_GLIBCXX_CHRONO_INT64_T int64_t +#elif defined __INT64_TYPE__ +# define _KINETO_GLIBCXX_CHRONO_INT64_T __INT64_TYPE__ +#else +# define _KINETO_GLIBCXX_CHRONO_INT64_T long long +#endif +// std::chrono header end + +// There are tools like Chrome Trace Viewer that uses double to represent +// each element in the timeline. Double has a 53 bit mantissa to support +// up to 2^53 significant digits (up to 9007199254740992). This holds at the +// nanosecond level, about 3 months and 12 days. So, let's round base time to +// 3 months intervals, so we can still collect traces across ranks relative +// to each other. +// A month is 2629746, so 3 months is 7889238. +using _trimonths = std::chrono::duration< + _KINETO_GLIBCXX_CHRONO_INT64_T, std::ratio<7889238>>; +#undef _GLIBCXX_CHRONO_INT64_T + +class ChromeTraceBaseTime { + public: + ChromeTraceBaseTime() = default; + static ChromeTraceBaseTime& singleton(); + void init() { + get(); + } + int64_t get() { + // Make all timestamps relative to 3 month intervals. + static int64_t base_time = libkineto::timeSinceEpoch( + std::chrono::time_point( + std::chrono::floor<_trimonths>(std::chrono::system_clock::now()))); + return base_time; + } }; } // namespace KINETO_NAMESPACE diff --git a/libkineto/third_party/fmt b/libkineto/third_party/fmt index 90932dd..4986b4c 160000 --- a/libkineto/third_party/fmt +++ b/libkineto/third_party/fmt @@ -1 +1 @@ -Subproject commit 90932dd20d86adc7fd3ff830c33509859af69632 +Subproject commit 4986b4c0efd0fad653463e867d068c12f42e9e0a diff --git a/libkineto/third_party/googletest b/libkineto/third_party/googletest index a7f443b..0953a17 160000 --- a/libkineto/third_party/googletest +++ b/libkineto/third_party/googletest @@ -1 +1 @@ -Subproject commit a7f443b80b105f940225332ed3c31f2790092f47 +Subproject commit 0953a17a4281fc26831da647ad3fcd5e21e6473b