diff --git a/.gitignore b/.gitignore index 8d39e29..d0cca3d 100644 --- a/.gitignore +++ b/.gitignore @@ -3,11 +3,10 @@ *.code-workspace # --- Agent --- -build/ .cache/ compile_commands.json sources/agent/agent.conf .clang-format # --- Server --- -sources/server/generated/ \ No newline at end of file +sources/server/generated/ diff --git a/.helix/languages.toml b/.helix/languages.toml new file mode 100644 index 0000000..6666436 --- /dev/null +++ b/.helix/languages.toml @@ -0,0 +1,18 @@ +[[language]] +name = "c" +scope = "source.c" +file-types = [] + +[[language]] +name = "cpp" +scope = "source.cpp" +file-types = ["cc", "hh", "c++", "cpp", "hpp", "h", "ipp", "tpp", "cxx", "hxx", "ixx", "txx", "ino", "C", "H", "cu", "cuh"] +language-servers = ["clangd"] + +[language-server.clangd] +command = "clangd" +args = [ + "--header-insertion=never", + "--query-driver=/usr/bin/g++,/usr/bin/c++", + "--compile-commands-dir=build" +] diff --git a/sources/agent/src/collectors/rapl_collector.cc b/sources/agent/src/collectors/rapl_collector.cc new file mode 100644 index 0000000..6616659 --- /dev/null +++ b/sources/agent/src/collectors/rapl_collector.cc @@ -0,0 +1,94 @@ +#include "rapl_collector.h" + +#include +#include + +#include +#include +#include +#include + +namespace volta { +namespace agent { +namespace collectors { + /* Opens the per-CPU MSR files, decodes the power, energy and time unit fields of the POWER_UNIT register into 0.5^n scale factors, and stores the first package energy reading as the baseline that Collect() subtracts from. NOTE(review): ReadMSR throws MSR_Read_Exception when OpenMSR() opened no descriptor, and nothing in this constructor catches it. */ +RaplCollector::RaplCollector() { + OpenMSR(); + uint64_t readout = ReadMSR(0, MSR_RAPL::POWER_UNIT); + power_units_ = pow(0.5, (double)(readout & 0xf)); + energy_units_ = pow(0.5, (double)((readout >> 8) & 0x1f)); + time_units_ = pow(0.5, (double)((readout >> 16) & 0xf)); + readout = ReadMSR(0, MSR_RAPL::PKG::ENERGY_STATUS); + last_value = energy_units_ * readout; +} + /* Reads the package ENERGY_STATUS register through descriptor 0, scales it by energy_units_ and returns one metric named cpu_energy_usage_total holding the increase since the previous reading. Returns an empty vector when ReadMSR throws MSR_Read_Exception. */ +std::vector RaplCollector::Collect() { + uint64_t readout; + + try { + readout = ReadMSR(0, MSR_RAPL::PKG::ENERGY_STATUS); + } catch (const MSR_Read_Exception &e) { + return {}; + } + + double value = energy_units_ * readout; + + Metric m; + m.name = "cpu_energy_usage_total"; + m.value = value - 
last_value; /* NOTE(review): a failed pread makes ReadMSR return 0, which yields a negative difference here. The difference presumably also turns negative if the raw counter wraps between two calls - confirm against the register documentation. */ + m.timestamp = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + last_value = value; + return {m}; +} + /* Reads sizeof(uint64_t) bytes at file offset offset from MSR_files_[core]. Throws MSR_Read_Exception when core is not a valid index into MSR_files_; when pread fails or returns fewer bytes, the function returns 0 instead of throwing. */ +uint64_t RaplCollector::ReadMSR(uint8_t core, uint32_t offset) { + uint64_t data; + if (core + 1 > MSR_files_.size()) { + throw MSR_Read_Exception(); + } + // c-like read for thread safety + if (pread(MSR_files_[core], &data, sizeof data, offset) != sizeof data) { + return {}; + } + + return data; +} + /* Resets MSR_files_ and fills it with read-only descriptors for /dev/cpu/<N>/msr, visiting the all-digit directory names under /dev/cpu in ascending numeric order. Throws MSR_Open_Exception when /dev/cpu does not exist. NOTE(review): an msr file that fails to open is skipped silently, so an index into MSR_files_ is not guaranteed to equal the CPU number; directory_iterator is constructed without the error_code, so it can still throw. */ +void RaplCollector::OpenMSR() { + const std::filesystem::path cpu_base = "/dev/cpu"; + MSR_files_ = std::vector(); + std::error_code ec; + + if (!std::filesystem::exists(cpu_base, ec)) { + throw MSR_Open_Exception(); + } + std::vector> cpu_entries; + for (const auto &entry : std::filesystem::directory_iterator(cpu_base)) { + if (!entry.is_directory()) continue; + const auto &dirname = entry.path().filename().string(); + if (!std::ranges::all_of(dirname, ::isdigit)) continue; + cpu_entries.emplace_back(std::stoi(dirname), entry.path()); + } + + std::ranges::sort(cpu_entries); + + for (const auto &[id, path] : cpu_entries) { + int fd = open((path / "msr").c_str(), O_RDONLY); + if (fd >= 0) { + MSR_files_.push_back(fd); + } + } +} + /* Closes a single descriptor. */ +void RaplCollector::CloseMSR(int fd) { close(fd); } + /* Closes every descriptor held in MSR_files_. */ +RaplCollector::~RaplCollector() { + for (auto file : MSR_files_) { + CloseMSR(file); + } +}; +} // namespace collectors +} // namespace agent +} // namespace volta diff --git a/sources/agent/src/collectors/rapl_collector.h b/sources/agent/src/collectors/rapl_collector.h new file mode 100644 index 0000000..4696516 --- /dev/null +++ b/sources/agent/src/collectors/rapl_collector.h @@ -0,0 +1,74 @@ +#ifndef VOLTA_AGENT_SRC_COLLECTORS_RAPL_COLLECTOR_H_ +#define VOLTA_AGENT_SRC_COLLECTORS_RAPL_COLLECTOR_H_ +#include "collectors/collector.h" + +namespace volta { +namespace agent { +namespace collectors { + /* Collector that publishes CPU package energy usage read from /dev/cpu/<N>/msr. Copying is deleted; the destructor closes the descriptors stored in MSR_files_. */ +class RaplCollector : public Collector { + public: + RaplCollector(); + // ~RaplCollector() override; + RaplCollector(const 
RaplCollector&) = delete; + RaplCollector& operator=(const RaplCollector&) = delete; + std::vector Collect() override; + ~RaplCollector(); + + private: + uint64_t ReadMSR(uint8_t core, uint32_t offset); + void OpenMSR(); + void CloseMSR(int fd); + bool initialized_ = false; /* NOTE(review): initialized_ is never read or written in rapl_collector.cc. */ + double power_units_, energy_units_, time_units_; + std::vector MSR_files_; + double last_value; + /* NOTE(review): both classes derive from std::exception privately (the default for class), so a handler for const std::exception& outside RaplCollector does not match them. */ + class MSR_Read_Exception : std::exception {}; + class MSR_Open_Exception : std::exception {}; + /* Register offsets handed to ReadMSR(), grouped in nested structs. */ + struct MSR_RAPL { + static constexpr uint32_t POWER_UNIT = 0x606; + struct Units { /* NOTE(review): none of these constants is used by rapl_collector.cc, which hard-codes the shifts and masks. The constructor takes the time unit from (readout >> 16) & 0xf, i.e. bits 16-19, while TIME_UNIT_MASK = 0xF000 covers bits 12-15 - likely meant to be 0xF0000; confirm. */ + static constexpr uint32_t POWER_UNIT_OFFSET = 0; + static constexpr uint32_t POWER_UNIT_MASK = 0x0F; + static constexpr uint32_t ENERGY_UNIT_OFFSET = 0x08; + static constexpr uint32_t ENERGY_UNIT_MASK = 0x1F00; + static constexpr uint32_t TIME_UNIT_OFFSET = 0x10; + static constexpr uint32_t TIME_UNIT_MASK = 0xF000; + }; + + struct PKG { + static constexpr uint32_t POWER_LIMIT = 0x610; + static constexpr uint32_t ENERGY_STATUS = 0x611; + static constexpr uint32_t PERF_STATUS = 0x613; + static constexpr uint32_t POWER_INFO = 0x614; + }; + + struct PP0 { + static constexpr uint32_t POWER_LIMIT = 0x638; + static constexpr uint32_t ENERGY_STATUS = 0x639; + static constexpr uint32_t POLICY = 0x63A; + static constexpr uint32_t PERF_STATUS = 0x63B; + }; + + struct PP1 { + static constexpr uint32_t POWER_LIMIT = 0x640; + static constexpr uint32_t ENERGY_STATUS = 0x641; + static constexpr uint32_t POLICY = 0x642; + }; + + struct DRAM { + static constexpr uint32_t POWER_LIMIT = 0x618; + static constexpr uint32_t ENERGY_STATUS = 0x619; + static constexpr uint32_t PERF_STATUS = 0x61B; + static constexpr uint32_t POWER_INFO = 0x61C; + }; + }; +}; + +} // namespace collectors +} // namespace agent +} // namespace volta +#endif diff --git a/sources/agent/src/config/config_loader.cc b/sources/agent/src/config/config_loader.cc index 8a81107..16cf662 100644 --- a/sources/agent/src/config/config_loader.cc +++ 
b/sources/agent/src/config/config_loader.cc @@ -56,6 +56,11 @@ Config ConfigLoader::LoadDefaultConfig() { proc_stat_config.metrics["cpu_usage_percent"] = true; config.collectors[CollectorNames::kProcStat] = proc_stat_config; + CollectorConfig rapl_collector; + rapl_collector.enabled = true; + rapl_collector.metrics = {{"cpu_energy_usage_total", true}}; + config.collectors[CollectorNames::kRapl] = rapl_collector; + return config; } diff --git a/sources/agent/src/main.cc b/sources/agent/src/main.cc index 09b6424..caece51 100644 --- a/sources/agent/src/main.cc +++ b/sources/agent/src/main.cc @@ -9,6 +9,7 @@ #include "collectors/nvml_collector.h" #include "collectors/proc_stat_collector.h" #include "collectors/ram_collector.h" +#include "collectors/rapl_collector.h" #include "config/config.h" #include "config/config_loader.h" #include "platform/platform_detector.h" @@ -28,10 +29,9 @@ int main() { active_collectors.push_back( std::make_unique()); - active_collectors.push_back(std::make_unique()); - - for (const auto& gpu : hw.gpus) { + active_collectors.push_back(std::make_unique()); + for (const auto &gpu : hw.gpus) { if (gpu.vendor == platform::GpuVendor::NVIDIA) { auto nvml = std::make_unique(); if (nvml->Init()) { @@ -43,8 +43,8 @@ int main() { Scheduler scheduler(config, std::move(active_collectors)); scheduler.Run(); - } catch (const std::exception& e) { - // std::cerr << "CRITICAL ERROR: " << e.what() << std::endl; + } catch (const std::exception &e) { + std::cerr << "CRITICAL ERROR: " << e.what() << std::endl; return 1; } return 0;