From 9d142b4a465b2bc83efa6b17156352781ceec27a Mon Sep 17 00:00:00 2001 From: flesher0813 <1208954694@qq.com> Date: Thu, 11 Dec 2025 11:24:31 +0800 Subject: [PATCH 1/9] [Feat] Support using same monitor instance and regist python stats --- docs/source/developer-guide/add_metrics.md | 113 +++++++++++++ docs/source/index.md | 1 + ucm/integration/vllm/conn_stats.py | 20 +++ ucm/{shared/metrics => }/observability.py | 10 +- ucm/shared/metrics/CMakeLists.txt | 33 +++- .../metrics/cc/api/stats_monitor_api.cc | 64 ++++++++ .../conn_stats.h => api/stats_monitor_api.h} | 62 ++----- .../metrics/cc/{ => domain}/stats/istats.h | 1 + .../metrics/cc/{ => domain}/stats_monitor.cc | 42 ++++- .../metrics/cc/{ => domain}/stats_monitor.h | 4 +- .../metrics/cc/{ => domain}/stats_registry.cc | 0 .../metrics/cc/{ => domain}/stats_registry.h | 4 +- ucm/shared/metrics/cc/stats/conn_stats.cc | 83 ---------- ucm/shared/metrics/cpy/metrics.py.cc | 57 ++++++- ucm/shared/test/CMakeLists.txt | 4 +- ucm/shared/test/case/metrics/monitor_test.cc | 153 ++++++++++++++++++ .../example/metrics/monitor_stats_example.py} | 22 +-- 17 files changed, 496 insertions(+), 177 deletions(-) create mode 100644 docs/source/developer-guide/add_metrics.md create mode 100644 ucm/integration/vllm/conn_stats.py rename ucm/{shared/metrics => }/observability.py (97%) create mode 100644 ucm/shared/metrics/cc/api/stats_monitor_api.cc rename ucm/shared/metrics/cc/{stats/conn_stats.h => api/stats_monitor_api.h} (52%) rename ucm/shared/metrics/cc/{ => domain}/stats/istats.h (98%) rename ucm/shared/metrics/cc/{ => domain}/stats_monitor.cc (68%) rename ucm/shared/metrics/cc/{ => domain}/stats_monitor.h (95%) rename ucm/shared/metrics/cc/{ => domain}/stats_registry.cc (100%) rename ucm/shared/metrics/cc/{ => domain}/stats_registry.h (92%) delete mode 100644 ucm/shared/metrics/cc/stats/conn_stats.cc create mode 100644 ucm/shared/test/case/metrics/monitor_test.cc rename ucm/shared/{metrics/test/test.py => test/example/metrics/monitor_stats_example.py} (82%) diff --git a/docs/source/developer-guide/add_metrics.md b/docs/source/developer-guide/add_metrics.md new file mode 100644 index 000000000..39561966b --- /dev/null +++ b/docs/source/developer-guide/add_metrics.md @@ -0,0 +1,113 @@ +# Custom Metrics +UCM supports custom metrics with bidirectional updates from both Python and C++ runtimes. The unified monitoring interface provides the ability to mutate stats across language boundaries through a shared metrics registry. + +## Architecture Overview +The metrics consists of these components below: +- **metrics** : Central stats registry that manages all metric lifecycle operations (registration, creation, updates, queries) +- **observability.py** : Prometheus integration layer that handles metric exposition and multiprocess collection +- **metrics_config.yaml** : Declarative configuration that defines which custom metrics to register and their properties + +## Getting Started +### Define Metrics in YAML +Prometheus provides three fundamental metric types: Counter, Gauge, and Histogram. UCM implements corresponding wrappers for each type. The method for adding new metrics is as follows; please refer to the [example YAML](https://github.com/ModelEngine-Group/unified-cache-management/blob/develop/examples/metrics/metrics_configs.yaml) for more detailed information. +```yaml +log_interval: 5 # Interval in seconds for logging metrics + +prometheus: + multiproc_dir: "/vllm-workspace" # Directory for Prometheus multiprocess mode + + metric_prefix: "ucm:" + + # Enable/disable metrics by category + enabled_metrics: + counters: true + gauges: true + histograms: true + + # Counter metrics configuration + counters: + - name: "received_requests" + documentation: "Total number of requests sent to ucm" + + # Gauge metrics configuration + gauges: + - name: "lookup_hit_rate" + documentation: "Hit rate of ucm lookup requests" + multiprocess_mode: "livemostrecent" + + # Histogram metrics configuration + histograms: + - name: "load_requests_num" + documentation: "Number of requests loaded from ucm" + buckets: [1, 5, 10, 20, 50, 100, 200, 500, 1000] +``` + +### Use Monitor APIs to Update Stats +The monitor provides a unified interface for metric operations. Note that the workflow requires registering a stats class before creating an instance. +:::::{tab-set} +:sync-group: install + +::::{tab-item} Python side interfaces +:selected: +:sync: py +**Lifecycle Methods** +- `register_istats(name, py::object)`: Register a new stats class implementation. +- `create_stats(name)`: Create and initialize a registered stats object. + +**Operation Methods** +- `update_stats(name, dict)`: Update specific fields of a specific stats object. +- `get_stats(name)`: Retrieve current values of a specific stats object. +- `get_stats_and_clear(name)`: Retrieve and reset a specific stats object. +- `get_all_stats_and_clear()`: Retrieve and reset all stats objects. +- `reset_stats(name)`: Reset a specific stats object to initial state. +- `reset_all()`: Reset all stats registered in monitor. + +**Example:** Using built-in ConnStats +```python +from ucm.integration.vllm.conn_stats import ConnStats +from ucm.shared.metrics import ucmmonitor + +conn_stats = ConnStats(name="ConnStats") +ucmmonitor.register_stats("ConnStats", conn_stats) # Register stats +ucmmonitor.create_stats("ConnStats") # Create a stats obj + +# Update stats +ucmmonitor.update_stats( + "ConnStats", + {"interval_lookup_hit_rates": external_hit_blocks / len(ucm_block_ids)}, +) + +``` +See more detailed example in [test case](https://github.com/ModelEngine-Group/unified-cache-management/tree/develop/ucm/shared/test/example). + +:::: + +::::{tab-item} C++ side interfaces +:sync: cc +**Lifecycle Methods** +- `RegistStats(std::string name, Creator creator)`: Register a new stats class implementation. +- `CreateStats(const std::string& name)`: Create and initialize a registered stats object. + +**Operation Methods** +- `UpdateStats(const std::string& name, const std::unordered_map& params)`: Update specific fields of a specific stats object. +- `ResetStats(const std::string& name)`: Retrieve current values of a specific stats object. +- `ResetAllStats()`: Retrieve and reset a specific stats object. +- `GetStats(const std::string& name)`: Retrieve and reset all stats objects. +- `GetStatsAndClear(const std::string& name)`: Reset a specific stats object to initial state. +- `GetAllStatsAndClear()`: Reset all stats registered in monitor. + +**Example:** Implementing custom stats in C++ +UCM supports custom metrics by following steps: +- Step 1: linking the static library monitor_static + ```c++ + target_link_libraries(xxxstore PUBLIC storeinfra monitor_static) + ``` +- Step 2: Inheriting from the IStats class to implement custom stats classes +- Step 3: Register stats class to monitor +- Step 4: Create stats object in monitor +- Step 5: Update or get stats info using operation methods + +See more detailed example in [test case](https://github.com/ModelEngine-Group/unified-cache-management/tree/develop/ucm/shared/test/case). + +:::: +::::: \ No newline at end of file diff --git a/docs/source/index.md b/docs/source/index.md index b494115bb..d7b441431 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -63,6 +63,7 @@ user-guide/metrics/metrics :caption: Developer Guide :maxdepth: 1 developer-guide/contribute +developer-guide/add_metrics ::: :::{toctree} diff --git a/ucm/integration/vllm/conn_stats.py b/ucm/integration/vllm/conn_stats.py new file mode 100644 index 000000000..94f868f2c --- /dev/null +++ b/ucm/integration/vllm/conn_stats.py @@ -0,0 +1,20 @@ +from ucm.shared.metrics import ucmmonitor + + +class ConnStats: + def __init__(self, name: str = "PyStats1"): + self._name = name + self._data = {} + + def Name(self) -> str: + return self._name + + def Update(self, params): + for k, v in params.items(): + self._data.setdefault(k, []).append(v) + + def Reset(self): + self._data.clear() + + def Data(self): + return self._data.copy() diff --git a/ucm/shared/metrics/observability.py b/ucm/observability.py similarity index 97% rename from ucm/shared/metrics/observability.py rename to ucm/observability.py index fb33400ce..796460322 100644 --- a/ucm/shared/metrics/observability.py +++ b/ucm/observability.py @@ -195,7 +195,7 @@ def log_prometheus(self, stats: Any): try: metric_mapped = self.metric_mappings[stat_name] if metric_mapped is None: - logger.warning(f"Stat {stat_name} not initialized.") + logger.debug(f"Stat {stat_name} not initialized.") continue metric_obj = getattr(self, metric_mapped["attr"], None) metric_type = metric_mapped["type"] @@ -213,8 +213,8 @@ def log_prometheus(self, stats: Any): else: value = [] self._log_histogram(metric_obj, value) - except Exception as e: - logger.warning(f"Failed to log metric {stat_name}: {e}") + except Exception: + logger.debug(f"Failed to log metric {stat_name}") @staticmethod def _metadata_to_labels(metadata: UCMEngineMetadata): @@ -267,8 +267,6 @@ def __init__(self, model_name: str, rank: int, config_path: str = ""): # Load configuration config = self._load_config(config_path) self.log_interval = config.get("log_interval", 10) - - self.monitor = ucmmonitor.StatsMonitor.get_instance() self.prometheus_logger = PrometheusLogger.GetOrCreate(self.metadata, config) self.is_running = True @@ -296,7 +294,7 @@ def _load_config(self, config_path: str) -> Dict[str, Any]: def log_worker(self): while self.is_running: # Use UCMStatsMonitor.get_states_and_clear() from external import - stats = self.monitor.get_stats_and_clear("ConnStats") + stats = ucmmonitor.get_all_stats_and_clear().data self.prometheus_logger.log_prometheus(stats) time.sleep(self.log_interval) diff --git a/ucm/shared/metrics/CMakeLists.txt b/ucm/shared/metrics/CMakeLists.txt index 585e6a547..37dc054a8 100644 --- a/ucm/shared/metrics/CMakeLists.txt +++ b/ucm/shared/metrics/CMakeLists.txt @@ -1,17 +1,34 @@ -file(GLOB_RECURSE CORE_SRCS CONFIGURE_DEPENDS - "${CMAKE_CURRENT_SOURCE_DIR}/cc/stats/*.cc" - "${CMAKE_CURRENT_SOURCE_DIR}/cc/*.cc") -add_library(monitor_static STATIC ${CORE_SRCS}) +file(GLOB_RECURSE DOMAIN_SRCS + "${CMAKE_CURRENT_SOURCE_DIR}/cc/domain/*.cc" + "${CMAKE_CURRENT_SOURCE_DIR}/cc/domain/stats/*.cc" +) + +file(GLOB_RECURSE API_SRCS + "${CMAKE_CURRENT_SOURCE_DIR}/cc/api/*.cc" +) + +add_library(monitor_static STATIC + ${DOMAIN_SRCS} + ${API_SRCS} +) + set_property(TARGET monitor_static PROPERTY POSITION_INDEPENDENT_CODE ON) + target_include_directories(monitor_static PUBLIC - $ - $) + $ + $ + $ +) + set_target_properties(monitor_static PROPERTIES OUTPUT_NAME monitor) file(GLOB_RECURSE BINDINGS_SRCS CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/cpy/*.cc") pybind11_add_module(ucmmonitor ${BINDINGS_SRCS}) -target_link_libraries(ucmmonitor PRIVATE -Wl,--whole-archive monitor_static -Wl,--no-whole-archive) -target_include_directories(ucmmonitor PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/cc) +target_link_libraries(ucmmonitor PRIVATE monitor_static) + +target_include_directories(ucmmonitor PRIVATE + "${CMAKE_CURRENT_SOURCE_DIR}/cc/api" +) file(RELATIVE_PATH INSTALL_REL_PATH ${CMAKE_SOURCE_DIR} diff --git a/ucm/shared/metrics/cc/api/stats_monitor_api.cc b/ucm/shared/metrics/cc/api/stats_monitor_api.cc new file mode 100644 index 000000000..88845ebdd --- /dev/null +++ b/ucm/shared/metrics/cc/api/stats_monitor_api.cc @@ -0,0 +1,64 @@ +/** + * MIT License + * + * Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * */ +#include "stats_monitor_api.h" +namespace UC::Metrics { + +void RegistStats(std::string name, Creator creator) +{ + StatsRegistry::GetInstance().RegisterStats(name, creator); +} + +void CreateStats(const std::string& name) { StatsMonitor::GetInstance().CreateStats(name); } + +void UpdateStats(const std::string& name, const std::unordered_map& params) +{ + StatsMonitor::GetInstance().UpdateStats(name, params); +} + +void ResetStats(const std::string& name) { StatsMonitor::GetInstance().ResetStats(name); } + +void ResetAllStats() { StatsMonitor::GetInstance().ResetAllStats(); } + +StatsResult GetStats(const std::string& name) +{ + StatsResult result; + result.data = StatsMonitor::GetInstance().GetStats(name); + return result; +} + +StatsResult GetStatsAndClear(const std::string& name) +{ + StatsResult result; + result.data = StatsMonitor::GetInstance().GetStatsAndClear(name); + return result; +} + +StatsResult GetAllStatsAndClear() +{ + StatsResult result; + result.data = StatsMonitor::GetInstance().GetAllStatsAndClear(); + return result; +} + +} // namespace UC::Metrics \ No newline at end of file diff --git a/ucm/shared/metrics/cc/stats/conn_stats.h b/ucm/shared/metrics/cc/api/stats_monitor_api.h similarity index 52% rename from ucm/shared/metrics/cc/stats/conn_stats.h rename to ucm/shared/metrics/cc/api/stats_monitor_api.h index e8cc94559..7f53c0ae3 100644 --- a/ucm/shared/metrics/cc/stats/conn_stats.h +++ b/ucm/shared/metrics/cc/api/stats_monitor_api.h @@ -21,58 +21,26 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ -#ifndef UNIFIEDCACHE_CONNSTATS_H -#define UNIFIEDCACHE_CONNSTATS_H - -#include -#include +#ifndef UNIFIEDCACHE_MONITOR_API_H +#define UNIFIEDCACHE_MONITOR_API_H #include #include -#include -#include "istats.h" -#include "stats_registry.h" +#include "stats_monitor.h" namespace UC::Metrics { - -enum class Key : uint8_t { - interval_lookup_hit_rates = 0, - save_requests_num, - save_blocks_num, - save_duration, - save_speed, - load_requests_num, - load_blocks_num, - load_duration, - load_speed, - COUNT +struct StatsResult { + StatsResult() = default; + std::unordered_map> data; }; -class ConnStats : public IStats { -public: - ConnStats(); - ~ConnStats() = default; - - std::string Name() const override; - void Reset() override; - void Update(const std::unordered_map& params) override; - std::unordered_map> Data() override; - -private: - static constexpr std::size_t N = static_cast(Key::COUNT); - std::array, N> data_; - - static Key KeyFromString(const std::string& k); - void EmplaceBack(Key id, double value); -}; - -struct Registrar { - Registrar() - { - StatsRegistry::RegisterStats( - "ConnStats", []() -> std::unique_ptr { return std::make_unique(); }); - } -}; +void RegistStats(std::string name, Creator creator); +void CreateStats(const std::string& name); +void UpdateStats(const std::string& name, const std::unordered_map& params); +void ResetStats(const std::string& name); +void ResetAllStats(); +StatsResult GetStats(const std::string& name); +StatsResult GetStatsAndClear(const std::string& name); +StatsResult GetAllStatsAndClear(); } // namespace UC::Metrics - -#endif // UNIFIEDCACHE_CONNSTATS_H \ No newline at end of file +#endif \ No newline at end of file diff --git a/ucm/shared/metrics/cc/stats/istats.h b/ucm/shared/metrics/cc/domain/stats/istats.h similarity index 98% rename from ucm/shared/metrics/cc/stats/istats.h rename to ucm/shared/metrics/cc/domain/stats/istats.h index 6e8de7b32..5e058f631 100644 --- a/ucm/shared/metrics/cc/stats/istats.h +++ b/ucm/shared/metrics/cc/domain/stats/istats.h @@ -24,6 +24,7 @@ #ifndef UNIFIEDCACHE_ISTATS_H #define UNIFIEDCACHE_ISTATS_H +#include #include #include #include diff --git a/ucm/shared/metrics/cc/stats_monitor.cc b/ucm/shared/metrics/cc/domain/stats_monitor.cc similarity index 68% rename from ucm/shared/metrics/cc/stats_monitor.cc rename to ucm/shared/metrics/cc/domain/stats_monitor.cc index 2d3d80266..348789050 100644 --- a/ucm/shared/metrics/cc/stats_monitor.cc +++ b/ucm/shared/metrics/cc/domain/stats_monitor.cc @@ -24,8 +24,6 @@ #include "stats_monitor.h" #include #include -#include "stats/istats.h" -#include "stats_registry.h" namespace UC::Metrics { @@ -47,36 +45,64 @@ void StatsMonitor::CreateStats(const std::string& name) std::unordered_map> StatsMonitor::GetStats(const std::string& name) { std::lock_guard lock(mutex_); - return stats_map_[name]->Data(); + auto it = stats_map_.find(name); + if (it == stats_map_.end() || !it->second) { return {}; } + return it->second->Data(); } void StatsMonitor::ResetStats(const std::string& name) { std::lock_guard lock(mutex_); - stats_map_[name]->Reset(); + auto it = stats_map_.find(name); + if (it == stats_map_.end() || !it->second) { return; } + it->second->Reset(); } std::unordered_map> StatsMonitor::GetStatsAndClear(const std::string& name) { std::lock_guard lock(mutex_); - auto result = stats_map_[name]->Data(); - stats_map_[name]->Reset(); + auto it = stats_map_.find(name); + if (it == stats_map_.end() || !it->second) { return {}; } + auto result = it->second->Data(); + it->second->Reset(); return result; } +std::unordered_map> StatsMonitor::GetAllStatsAndClear() +{ + std::lock_guard lock(mutex_); + std::unordered_map> all_stats; + + for (const auto& [name, stats_ptr] : stats_map_) { + if (stats_ptr) { + auto data = stats_ptr->Data(); + all_stats.insert(data.begin(), data.end()); + stats_ptr->Reset(); + } + } + return all_stats; +} + void StatsMonitor::UpdateStats(const std::string& name, const std::unordered_map& params) { std::lock_guard lock(mutex_); auto it = stats_map_.find(name); - if (it != stats_map_.end()) { it->second->Update(params); } + if (it == stats_map_.end() || !it->second) { return; } + try { + it->second->Update(params); + } catch (...) { + return; + } } void StatsMonitor::ResetAllStats() { std::lock_guard lock(mutex_); - for (auto& [n, ptr] : stats_map_) { ptr->Reset(); } + for (const auto& [name, stats_ptr] : stats_map_) { + if (stats_ptr) { stats_ptr->Reset(); } + } } } // namespace UC::Metrics \ No newline at end of file diff --git a/ucm/shared/metrics/cc/stats_monitor.h b/ucm/shared/metrics/cc/domain/stats_monitor.h similarity index 95% rename from ucm/shared/metrics/cc/stats_monitor.h rename to ucm/shared/metrics/cc/domain/stats_monitor.h index 1545d4b5c..50ecdacc0 100644 --- a/ucm/shared/metrics/cc/stats_monitor.h +++ b/ucm/shared/metrics/cc/domain/stats_monitor.h @@ -30,6 +30,7 @@ #include #include #include "stats/istats.h" +#include "stats_registry.h" namespace UC::Metrics { @@ -51,6 +52,8 @@ class StatsMonitor { std::unordered_map> GetStatsAndClear(const std::string& name); + std::unordered_map> GetAllStatsAndClear(); + void UpdateStats(const std::string& name, const std::unordered_map& params); @@ -64,7 +67,6 @@ class StatsMonitor { StatsMonitor(const StatsMonitor&) = delete; StatsMonitor& operator=(const StatsMonitor&) = delete; }; - } // namespace UC::Metrics #endif // UNIFIEDCACHE_MONITOR_H \ No newline at end of file diff --git a/ucm/shared/metrics/cc/stats_registry.cc b/ucm/shared/metrics/cc/domain/stats_registry.cc similarity index 100% rename from ucm/shared/metrics/cc/stats_registry.cc rename to ucm/shared/metrics/cc/domain/stats_registry.cc diff --git a/ucm/shared/metrics/cc/stats_registry.h b/ucm/shared/metrics/cc/domain/stats_registry.h similarity index 92% rename from ucm/shared/metrics/cc/stats_registry.h rename to ucm/shared/metrics/cc/domain/stats_registry.h index c22b6617c..3b7a89609 100644 --- a/ucm/shared/metrics/cc/stats_registry.h +++ b/ucm/shared/metrics/cc/domain/stats_registry.h @@ -31,7 +31,7 @@ namespace UC::Metrics { -using Creator = std::unique_ptr (*)(); +using Creator = std::function()>; class StatsRegistry { public: @@ -48,6 +48,8 @@ class StatsRegistry { ~StatsRegistry() = default; StatsRegistry(const StatsRegistry&) = delete; StatsRegistry& operator=(const StatsRegistry&) = delete; + StatsRegistry(StatsRegistry&&) = delete; + StatsRegistry& operator=(StatsRegistry&&) = delete; std::mutex mutex_; std::unordered_map registry_; diff --git a/ucm/shared/metrics/cc/stats/conn_stats.cc b/ucm/shared/metrics/cc/stats/conn_stats.cc deleted file mode 100644 index edf18ac2e..000000000 --- a/ucm/shared/metrics/cc/stats/conn_stats.cc +++ /dev/null @@ -1,83 +0,0 @@ -/** - * MIT License - * - * Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * */ -#include "conn_stats.h" - -namespace UC::Metrics { - -ConnStats::ConnStats() = default; - -std::string ConnStats::Name() const { return "ConnStats"; } - -void ConnStats::Reset() -{ - for (auto& v : data_) v.clear(); -} - -void ConnStats::Update(const std::unordered_map& params) -{ - for (const auto& [k, v] : params) { - Key id = KeyFromString(k); - if (id == Key::COUNT) continue; - EmplaceBack(id, v); - } -} - -std::unordered_map> ConnStats::Data() -{ - std::unordered_map> result; - result["save_requests_num"] = data_[static_cast(Key::save_requests_num)]; - result["save_blocks_num"] = data_[static_cast(Key::save_blocks_num)]; - result["save_duration"] = data_[static_cast(Key::save_duration)]; - result["save_speed"] = data_[static_cast(Key::save_speed)]; - result["load_requests_num"] = data_[static_cast(Key::load_requests_num)]; - result["load_blocks_num"] = data_[static_cast(Key::load_blocks_num)]; - result["load_duration"] = data_[static_cast(Key::load_duration)]; - result["load_speed"] = data_[static_cast(Key::load_speed)]; - result["interval_lookup_hit_rates"] = - data_[static_cast(Key::interval_lookup_hit_rates)]; - return result; -} - -Key ConnStats::KeyFromString(const std::string& k) -{ - if (k == "save_requests_num") return Key::save_requests_num; - if (k == "save_blocks_num") return Key::save_blocks_num; - if (k == "save_duration") return Key::save_duration; - if (k == "save_speed") return Key::save_speed; - if (k == "load_requests_num") return Key::load_requests_num; - if (k == "load_blocks_num") return Key::load_blocks_num; - if (k == "load_duration") return Key::load_duration; - if (k == "load_speed") return Key::load_speed; - if (k == "interval_lookup_hit_rates") return Key::interval_lookup_hit_rates; - return Key::COUNT; -} - -void ConnStats::EmplaceBack(Key id, double value) -{ - data_[static_cast(id)].push_back(value); -} - -static Registrar registrar; - -} // namespace UC::Metrics \ No newline at end of file diff --git a/ucm/shared/metrics/cpy/metrics.py.cc b/ucm/shared/metrics/cpy/metrics.py.cc index 10bfc2f97..c23b5a310 100644 --- a/ucm/shared/metrics/cpy/metrics.py.cc +++ b/ucm/shared/metrics/cpy/metrics.py.cc @@ -21,21 +21,64 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ +#include #include #include -#include "stats_monitor.h" +#include "stats/istats.h" +#include "stats_monitor_api.h" namespace py = pybind11; namespace UC::Metrics { +class PythonStatsWrapper : public IStats { +public: + PythonStatsWrapper(py::object py_obj) : py_obj_(py::cast(py_obj)) {} + + std::string Name() const override { return py_obj_.attr("Name")().cast(); } + + void Update(const std::unordered_map& params) override + { + py_obj_.attr("Update")(params); + } + + void Reset() override { py_obj_.attr("Reset")(); } + + std::unordered_map> Data() override + { + return py_obj_.attr("Data")().cast>>(); + } + +private: + py::object py_obj_; +}; + void bind_monitor(py::module_& m) { - py::class_(m, "StatsMonitor") - .def_static("get_instance", &StatsMonitor::GetInstance, py::return_value_policy::reference) - .def("update_stats", &StatsMonitor::UpdateStats) - .def("reset_all", &StatsMonitor::ResetAllStats) - .def("get_stats", &StatsMonitor::GetStats) - .def("get_stats_and_clear", &StatsMonitor::GetStatsAndClear); + py::class_(m, "StatsResult") + .def(py::init<>()) + .def_readonly("data", &StatsResult::data); + m.def("create_stats", &CreateStats); + m.def("update_stats", &UpdateStats); + m.def("reset_stats", &ResetStats); + m.def("reset_all", &ResetAllStats); + m.def("get_stats", &GetStats); + m.def("get_stats_and_clear", &GetStatsAndClear); + m.def("get_all_stats_and_clear", &GetAllStatsAndClear); + + m.def( + "register_stats", + [](const std::string& name, py::object py_obj) { + if (!py::hasattr(py_obj, "Name") || !py::hasattr(py_obj, "Update") || + !py::hasattr(py_obj, "Reset") || !py::hasattr(py_obj, "Data")) { + throw std::runtime_error( + "Python object must implement Name/Update/Reset/Data methods"); + } + + RegistStats(name, [py_obj]() -> std::unique_ptr { + return std::make_unique(py_obj); + }); + }, + py::arg("name"), py::arg("py_obj")); } } // namespace UC::Metrics diff --git a/ucm/shared/test/CMakeLists.txt b/ucm/shared/test/CMakeLists.txt index b99d46a8f..cf5355f27 100644 --- a/ucm/shared/test/CMakeLists.txt +++ b/ucm/shared/test/CMakeLists.txt @@ -4,8 +4,8 @@ if(BUILD_UNIT_TESTS) add_executable(ucmshared.test ${UCMSHARED_TEST_SOURCE_FILES}) target_include_directories(ucmshared.test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/case) target_link_libraries(ucmshared.test PRIVATE - trans - gtest_main gtest + trans monitor_static + gtest_main gtest mockcpp ) gtest_discover_tests(ucmshared.test) endif() diff --git a/ucm/shared/test/case/metrics/monitor_test.cc b/ucm/shared/test/case/metrics/monitor_test.cc new file mode 100644 index 000000000..ee0eda392 --- /dev/null +++ b/ucm/shared/test/case/metrics/monitor_test.cc @@ -0,0 +1,153 @@ +/** + * MIT License + * + * Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * */ +#include +#include +#include "stats/istats.h" +#include "stats_monitor_api.h" + +using namespace UC::Metrics; + +class TestStats : public IStats { +public: + explicit TestStats(const std::string& name) : name_(name) {} + std::string Name() const override { return name_; } + void Update(const std::unordered_map& params) override + { + for (const auto& [key, val] : params) { data_[key].push_back(val); } + } + void Reset() override { data_.clear(); } + std::unordered_map> Data() override { return data_; } + +private: + std::string name_; + std::unordered_map> data_; +}; + +class UCStatsMonitorUT : public testing::Test { +protected: + void SetUp() override + { + try { + RegistStats("test_stats", [name = "test_stats"]() -> std::unique_ptr { + return std::make_unique(name); + }); + RegistStats("stats1", [name = "stats1"]() -> std::unique_ptr { + return std::make_unique(name); + }); + RegistStats("stats2", [name = "stats2"]() -> std::unique_ptr { + return std::make_unique(name); + }); + } catch (const std::exception& e) { + throw; + } + } +}; + +TEST_F(UCStatsMonitorUT, UpdateAndGetStats) +{ + std::string statsName = "test_stats"; + CreateStats(statsName); + + std::unordered_map params; + params["value1"] = 10.5; + params["value2"] = 20.0; + UpdateStats(statsName, params); + + StatsResult result = GetStats(statsName); + ASSERT_EQ(result.data.size(), 2); + ASSERT_EQ(result.data["value1"][0], 10.5); + ASSERT_EQ(result.data["value2"][0], 20.0); + + params["value1"] = 30.5; + UpdateStats(statsName, params); + + result = GetStats(statsName); + ASSERT_EQ(result.data["value1"].size(), 2); + ASSERT_EQ(result.data["value1"][1], 30.5); + ASSERT_EQ(result.data["value2"].size(), 2); + ASSERT_EQ(result.data["value2"][1], 20.0); + + StatsResult clearResult = GetStatsAndClear(statsName); + ASSERT_EQ(clearResult.data.size(), 2); + ASSERT_EQ(clearResult.data["value1"].size(), 2); + ASSERT_EQ(clearResult.data["value2"].size(), 2); + + result = GetStats(statsName); + EXPECT_TRUE(result.data.empty() || + (result.data["value1"].empty() && result.data["value2"].empty())); + + UpdateStats(statsName, params); + ResetStats(statsName); + result = GetStats(statsName); + EXPECT_TRUE(result.data.empty() || + (result.data["value1"].empty() && result.data["value2"].empty())); +} + +TEST_F(UCStatsMonitorUT, MultipleStatsAndResetAll) +{ + std::string stats1 = "stats1"; + std::string stats2 = "stats2"; + + CreateStats(stats1); + CreateStats(stats2); + + UpdateStats(stats1, { + {"a", 1.0}, + {"b", 2.0} + }); + UpdateStats(stats2, { + {"c", 3.0}, + {"d", 4.0} + }); + + ASSERT_EQ(GetStats(stats1).data.size(), 2); + ASSERT_EQ(GetStats(stats2).data.size(), 2); + + ResetAllStats(); + EXPECT_TRUE(GetStats(stats1).data.empty() || + (GetStats(stats1).data["a"].empty() && GetStats(stats1).data["b"].empty())); + EXPECT_TRUE(GetStats(stats2).data.empty() || + (GetStats(stats2).data["c"].empty() && GetStats(stats2).data["d"].empty())); +} + +TEST_F(UCStatsMonitorUT, MultipleStatsAndGetAll) +{ + std::string statsA = "stats1"; + std::string statsB = "stats2"; + + CreateStats(statsA); + CreateStats(statsB); + + UpdateStats(statsA, { + {"x", 100.0} + }); + UpdateStats(statsB, { + {"y", 200.0} + }); + + StatsResult allStats = GetAllStatsAndClear(); + ASSERT_EQ(allStats.data.size(), 2); + + EXPECT_TRUE(!allStats.data.count(statsA) || !allStats.data.count(statsB)); +} \ No newline at end of file diff --git a/ucm/shared/metrics/test/test.py b/ucm/shared/test/example/metrics/monitor_stats_example.py similarity index 82% rename from ucm/shared/metrics/test/test.py rename to ucm/shared/test/example/metrics/monitor_stats_example.py index 246e6f880..c6670b317 100644 --- a/ucm/shared/metrics/test/test.py +++ b/ucm/shared/test/example/metrics/monitor_stats_example.py @@ -28,23 +28,17 @@ import sys sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from ucm.integration.vllm.conn_stats import ConnStats from ucm.shared.metrics import ucmmonitor # import monitor -mon = ucmmonitor.StatsMonitor.get_instance() -mon.update_stats( - "ConnStats", - { - "save_duration": 1.2, - "save_speed": 300.5, - "load_duration": 0.8, - "load_speed": 450.0, - "interval_lookup_hit_rates": 0.95, - }, -) -mon.update_stats( - "ConnStats", +conn_stats1 = ConnStats(name="PyStats1") +ucmmonitor.register_stats("PyStats1", conn_stats1) + +ucmmonitor.create_stats("PyStats1") +ucmmonitor.update_stats( + "PyStats1", { "save_duration": 1.2, "save_speed": 300.5, @@ -54,5 +48,5 @@ }, ) -data = mon.get_stats("ConnStats") +data = ucmmonitor.get_stats("PyStats1").data print(data) From 0e57ca745f3654817d359e491862c59519abf8b6 Mon Sep 17 00:00:00 2001 From: flesher0813 <1208954694@qq.com> Date: Sat, 13 Dec 2025 16:06:08 +0800 Subject: [PATCH 2/9] [Fix] fix naming style and remove registry --- docs/source/developer-guide/add_metrics.md | 19 +--- ucm/integration/vllm/conn_stats.py | 20 ---- ucm/observability.py | 89 +++++---------- .../metrics/cc/api/stats_monitor_api.cc | 5 - ucm/shared/metrics/cc/api/stats_monitor_api.h | 3 - .../cc/domain/stats/{istats.h => stats.h} | 25 +++-- ucm/shared/metrics/cc/domain/stats_monitor.cc | 39 +++---- ucm/shared/metrics/cc/domain/stats_monitor.h | 15 ++- .../metrics/cc/domain/stats_registry.cc | 59 ---------- ucm/shared/metrics/cc/domain/stats_registry.h | 60 ---------- ucm/shared/metrics/cpy/metrics.py.cc | 43 +------- ucm/shared/test/case/metrics/monitor_test.cc | 36 +----- .../example/metrics/monitor_stats_example.py | 104 ++++++++++++++---- 13 files changed, 153 insertions(+), 364 deletions(-) delete mode 100644 ucm/integration/vllm/conn_stats.py rename ucm/shared/metrics/cc/domain/stats/{istats.h => stats.h} (70%) delete mode 100644 ucm/shared/metrics/cc/domain/stats_registry.cc delete mode 100644 ucm/shared/metrics/cc/domain/stats_registry.h diff --git a/docs/source/developer-guide/add_metrics.md b/docs/source/developer-guide/add_metrics.md index 39561966b..a629944da 100644 --- a/docs/source/developer-guide/add_metrics.md +++ b/docs/source/developer-guide/add_metrics.md @@ -3,8 +3,8 @@ UCM supports custom metrics with bidirectional updates from both Python and C++ ## Architecture Overview The metrics consists of these components below: -- **metrics** : Central stats registry that manages all metric lifecycle operations (registration, creation, updates, queries) -- **observability.py** : Prometheus integration layer that handles metric exposition and multiprocess collection +- **monitor** : Central stats registry that manages all metric lifecycle operations (registration, creation, updates, queries) +- **observability.py** : Prometheus integration layer that handles metric exposition - **metrics_config.yaml** : Declarative configuration that defines which custom metrics to register and their properties ## Getting Started @@ -31,7 +31,7 @@ prometheus: # Gauge metrics configuration gauges: - - name: "lookup_hit_rate" + - name: "external_lookup_hit_rate" documentation: "Hit rate of ucm lookup requests" multiprocess_mode: "livemostrecent" @@ -43,7 +43,7 @@ prometheus: ``` ### Use Monitor APIs to Update Stats -The monitor provides a unified interface for metric operations. Note that the workflow requires registering a stats class before creating an instance. +The monitor provides a unified interface for metric operations. Users only need to create stats and update them, while the observability component is responsible for fetching the stats and pushing them to Prometheus. :::::{tab-set} :sync-group: install @@ -51,7 +51,6 @@ The monitor provides a unified interface for metric operations. Note that the wo :selected: :sync: py **Lifecycle Methods** -- `register_istats(name, py::object)`: Register a new stats class implementation. - `create_stats(name)`: Create and initialize a registered stats object. **Operation Methods** @@ -64,11 +63,8 @@ The monitor provides a unified interface for metric operations. Note that the wo **Example:** Using built-in ConnStats ```python -from ucm.integration.vllm.conn_stats import ConnStats from ucm.shared.metrics import ucmmonitor -conn_stats = ConnStats(name="ConnStats") -ucmmonitor.register_stats("ConnStats", conn_stats) # Register stats ucmmonitor.create_stats("ConnStats") # Create a stats obj # Update stats @@ -85,7 +81,6 @@ See more detailed example in [test case](https://github.com/ModelEngine-Group/un ::::{tab-item} C++ side interfaces :sync: cc **Lifecycle Methods** -- `RegistStats(std::string name, Creator creator)`: Register a new stats class implementation. - `CreateStats(const std::string& name)`: Create and initialize a registered stats object. **Operation Methods** @@ -102,10 +97,8 @@ UCM supports custom metrics by following steps: ```c++ target_link_libraries(xxxstore PUBLIC storeinfra monitor_static) ``` -- Step 2: Inheriting from the IStats class to implement custom stats classes -- Step 3: Register stats class to monitor -- Step 4: Create stats object in monitor -- Step 5: Update or get stats info using operation methods +- Step 2: Create stats object using function **CreateStats** +- Step 3: Update using function **UpdateStats** See more detailed example in [test case](https://github.com/ModelEngine-Group/unified-cache-management/tree/develop/ucm/shared/test/case). diff --git a/ucm/integration/vllm/conn_stats.py b/ucm/integration/vllm/conn_stats.py deleted file mode 100644 index 94f868f2c..000000000 --- a/ucm/integration/vllm/conn_stats.py +++ /dev/null @@ -1,20 +0,0 @@ -from ucm.shared.metrics import ucmmonitor - - -class ConnStats: - def __init__(self, name: str = "PyStats1"): - self._name = name - self._data = {} - - def Name(self) -> str: - return self._name - - def Update(self, params): - for k, v in params.items(): - self._data.setdefault(k, []).append(v) - - def Reset(self): - self._data.clear() - - def Data(self): - return self._data.copy() diff --git a/ucm/observability.py b/ucm/observability.py index 796460322..dd195a1cf 100644 --- a/ucm/observability.py +++ b/ucm/observability.py @@ -21,22 +21,17 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # - +# Be impressed by https://github.com/LMCache/LMCache/blob/dev/lmcache/observability.py import os import threading import time from dataclasses import dataclass -from pathlib import Path -from typing import Any, Dict, List, Optional, Union +from typing import Any, List import prometheus_client import yaml -# Third Party -from prometheus_client import REGISTRY -from vllm.distributed.parallel_state import get_world_group - from ucm.logger import init_logger from ucm.shared.metrics import ucmmonitor @@ -56,7 +51,7 @@ class PrometheusLogger: _counter_cls = prometheus_client.Counter _histogram_cls = prometheus_client.Histogram - def __init__(self, metadata: UCMEngineMetadata, config: Dict[str, Any]): + def __init__(self, metadata: UCMEngineMetadata, config: dict[str, Any]): # Ensure PROMETHEUS_MULTIPROC_DIR is set before any metric registration prometheus_config = config.get("prometheus", {}) multiproc_dir = prometheus_config.get("multiproc_dir", "/vllm-workspace") @@ -74,7 +69,7 @@ def __init__(self, metadata: UCMEngineMetadata, config: Dict[str, Any]): self._init_metrics_from_config(labelnames, prometheus_config) def _init_metrics_from_config( - self, labelnames: List[str], prometheus_config: Dict[str, Any] + self, labelnames: List[str], prometheus_config: dict[str, Any] ): """Initialize metrics based on configuration""" enabled = prometheus_config.get("enabled_metrics", {}) @@ -85,7 +80,7 @@ def _init_metrics_from_config( # Store metric mapping: metric_name -> (metric_type, attribute_name, stats_field_name) # This mapping will be used in log_prometheus to dynamically log metrics - self.metric_mappings: Dict[str, Dict[str, str]] = {} + self.metric_mappings: dict[str, dict[str, str]] = {} # Initialize counters if enabled.get("counters", True): @@ -172,23 +167,23 @@ def _init_metrics_from_config( "attr": attr_name, } - def _log_gauge(self, gauge, data: Union[int, float]) -> None: + def _set_gauge(self, gauge, data: List) -> None: # Convenience function for logging to gauge. + if not data: + return gauge.labels(**self.labels).set(data) - def _log_counter(self, counter, data: Union[int, float]) -> None: + def _inc_counter(self, counter, data: List) -> None: # Convenience function for logging to counter. # Prevent ValueError from negative increment - if data < 0: - return - counter.labels(**self.labels).inc(data) + counter.labels(**self.labels).inc(sum(data)) - def _log_histogram(self, histogram, data: Union[List[int], List[float]]) -> None: + def _observe_histogram(self, histogram, data: List) -> None: # Convenience function for logging to histogram. for value in data: histogram.labels(**self.labels).observe(value) - def log_prometheus(self, stats: Any): + def update_stats(self, stats: dict[str, List]): """Log metrics to Prometheus based on configuration file""" # Dynamically log metrics based on what's configured in YAML for stat_name, value in stats.items(): @@ -202,17 +197,14 @@ def log_prometheus(self, stats: Any): # Log based on metric type if metric_type == "counter": - self._log_counter(metric_obj, value) + self._set_gauge(metric_obj, value) elif metric_type == "gauge": - self._log_gauge(metric_obj, value) + self._inc_counter(metric_obj, value) elif metric_type == "histogram": # Histograms expect a list - if not isinstance(value, list): - if value: - value = [value] - else: - value = [] - self._log_histogram(metric_obj, value) + self._observe_histogram(metric_obj, value) + else: + logger.error(f"Not found metric type for {stat_name}") except Exception: logger.debug(f"Failed to log metric {stat_name}") @@ -223,40 +215,6 @@ def _metadata_to_labels(metadata: UCMEngineMetadata): "worker_id": metadata.worker_id, } - _instance = None - - @staticmethod - def GetOrCreate( - metadata: UCMEngineMetadata, - config_path: str = "", - ) -> "PrometheusLogger": - if PrometheusLogger._instance is None: - PrometheusLogger._instance = PrometheusLogger(metadata, config_path) - # assert PrometheusLogger._instance.metadata == metadata, \ - # "PrometheusLogger instance already created with different metadata" - if PrometheusLogger._instance.metadata != metadata: - logger.error( - "PrometheusLogger instance already created with" - "different metadata. This should not happen except " - "in test" - ) - return PrometheusLogger._instance - - @staticmethod - def GetInstance() -> "PrometheusLogger": - assert ( - PrometheusLogger._instance is not None - ), "PrometheusLogger instance not created yet" - return PrometheusLogger._instance - - @staticmethod - def GetInstanceOrNone() -> Optional["PrometheusLogger"]: - """ - Returns the singleton instance of PrometheusLogger if it exists, - otherwise returns None. - """ - return PrometheusLogger._instance - class UCMStatsLogger: def __init__(self, model_name: str, rank: int, config_path: str = ""): @@ -267,13 +225,13 @@ def __init__(self, model_name: str, rank: int, config_path: str = ""): # Load configuration config = self._load_config(config_path) self.log_interval = config.get("log_interval", 10) - self.prometheus_logger = PrometheusLogger.GetOrCreate(self.metadata, config) + self.prometheus_logger = PrometheusLogger(self.metadata, config) self.is_running = True self.thread = threading.Thread(target=self.log_worker, daemon=True) self.thread.start() - def _load_config(self, config_path: str) -> Dict[str, Any]: + def _load_config(self, config_path: str) -> dict[str, Any]: """Load configuration from YAML file""" try: with open(config_path, "r") as f: @@ -293,11 +251,16 @@ def _load_config(self, config_path: str) -> Dict[str, Any]: def log_worker(self): while self.is_running: - # Use UCMStatsMonitor.get_states_and_clear() from external import stats = ucmmonitor.get_all_stats_and_clear().data - self.prometheus_logger.log_prometheus(stats) + self.prometheus_logger.update_stats(stats) time.sleep(self.log_interval) def shutdown(self): self.is_running = False self.thread.join() + + def __del__(self): + try: + self.shutdown() + except Exception: + pass diff --git a/ucm/shared/metrics/cc/api/stats_monitor_api.cc b/ucm/shared/metrics/cc/api/stats_monitor_api.cc index 88845ebdd..f2f7c7aab 100644 --- a/ucm/shared/metrics/cc/api/stats_monitor_api.cc +++ b/ucm/shared/metrics/cc/api/stats_monitor_api.cc @@ -24,11 +24,6 @@ #include "stats_monitor_api.h" namespace UC::Metrics { -void RegistStats(std::string name, Creator creator) -{ - StatsRegistry::GetInstance().RegisterStats(name, creator); -} - void CreateStats(const std::string& name) { StatsMonitor::GetInstance().CreateStats(name); } void UpdateStats(const std::string& name, const std::unordered_map& params) diff --git a/ucm/shared/metrics/cc/api/stats_monitor_api.h b/ucm/shared/metrics/cc/api/stats_monitor_api.h index 7f53c0ae3..a77115d6f 100644 --- a/ucm/shared/metrics/cc/api/stats_monitor_api.h +++ b/ucm/shared/metrics/cc/api/stats_monitor_api.h @@ -23,8 +23,6 @@ * */ #ifndef UNIFIEDCACHE_MONITOR_API_H #define UNIFIEDCACHE_MONITOR_API_H -#include -#include #include "stats_monitor.h" namespace UC::Metrics { @@ -33,7 +31,6 @@ struct StatsResult { std::unordered_map> data; }; -void RegistStats(std::string name, Creator creator); void CreateStats(const std::string& name); void UpdateStats(const std::string& name, const std::unordered_map& params); void ResetStats(const std::string& name); diff --git a/ucm/shared/metrics/cc/domain/stats/istats.h b/ucm/shared/metrics/cc/domain/stats/stats.h similarity index 70% rename from ucm/shared/metrics/cc/domain/stats/istats.h rename to ucm/shared/metrics/cc/domain/stats/stats.h index 5e058f631..a535ed64c 100644 --- a/ucm/shared/metrics/cc/domain/stats/istats.h +++ b/ucm/shared/metrics/cc/domain/stats/stats.h @@ -21,24 +21,29 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ -#ifndef UNIFIEDCACHE_ISTATS_H -#define UNIFIEDCACHE_ISTATS_H +#ifndef UNIFIEDCACHE_STATS_H +#define UNIFIEDCACHE_STATS_H -#include -#include #include #include #include namespace UC::Metrics { -class IStats { +class Stats { public: - virtual ~IStats() = default; - virtual std::string Name() const = 0; - virtual void Update(const std::unordered_map& params) = 0; - virtual void Reset() = 0; - virtual std::unordered_map> Data() = 0; + explicit Stats(const std::string& name) : name_(name) {} + std::string Name() { return name_; } + void Update(const std::unordered_map& params) + { + for (const auto& [key, val] : params) { data_[key].push_back(val); } + } + void Reset() { data_.clear(); } + std::unordered_map> Data() { return data_; } + +private: + std::string name_; + std::unordered_map> data_; }; } // namespace UC::Metrics diff --git a/ucm/shared/metrics/cc/domain/stats_monitor.cc b/ucm/shared/metrics/cc/domain/stats_monitor.cc index 348789050..03a8dc2ea 100644 --- a/ucm/shared/metrics/cc/domain/stats_monitor.cc +++ b/ucm/shared/metrics/cc/domain/stats_monitor.cc @@ -22,40 +22,34 @@ * SOFTWARE. * */ #include "stats_monitor.h" -#include -#include namespace UC::Metrics { -StatsMonitor::StatsMonitor() -{ - auto& registry = StatsRegistry::GetInstance(); - for (const auto& name : registry.GetRegisteredStatsNames()) { - stats_map_[name] = registry.CreateStats(name); - } -} - void StatsMonitor::CreateStats(const std::string& name) { std::lock_guard lock(mutex_); - auto& registry = StatsRegistry::GetInstance(); - stats_map_[name] = registry.CreateStats(name); + stats_map_[name] = std::make_unique(name); } -std::unordered_map> StatsMonitor::GetStats(const std::string& name) +void StatsMonitor::UpdateStats(const std::string& name, + const std::unordered_map& params) { std::lock_guard lock(mutex_); auto it = stats_map_.find(name); - if (it == stats_map_.end() || !it->second) { return {}; } - return it->second->Data(); + if (it == stats_map_.end() || !it->second) { return; } + try { + it->second->Update(params); + } catch (...) { + return; + } } -void StatsMonitor::ResetStats(const std::string& name) +std::unordered_map> StatsMonitor::GetStats(const std::string& name) { std::lock_guard lock(mutex_); auto it = stats_map_.find(name); - if (it == stats_map_.end() || !it->second) { return; } - it->second->Reset(); + if (it == stats_map_.end() || !it->second) { return {}; } + return it->second->Data(); } std::unordered_map> @@ -84,17 +78,12 @@ std::unordered_map> StatsMonitor::GetAllStatsAn return all_stats; } -void StatsMonitor::UpdateStats(const std::string& name, - const std::unordered_map& params) +void StatsMonitor::ResetStats(const std::string& name) { std::lock_guard lock(mutex_); auto it = stats_map_.find(name); if (it == stats_map_.end() || !it->second) { return; } - try { - it->second->Update(params); - } catch (...) { - return; - } + it->second->Reset(); } void StatsMonitor::ResetAllStats() diff --git a/ucm/shared/metrics/cc/domain/stats_monitor.h b/ucm/shared/metrics/cc/domain/stats_monitor.h index 50ecdacc0..70d10d46f 100644 --- a/ucm/shared/metrics/cc/domain/stats_monitor.h +++ b/ucm/shared/metrics/cc/domain/stats_monitor.h @@ -29,8 +29,7 @@ #include #include #include -#include "stats/istats.h" -#include "stats_registry.h" +#include "stats/stats.h" namespace UC::Metrics { @@ -46,24 +45,24 @@ class StatsMonitor { void CreateStats(const std::string& name); - std::unordered_map> GetStats(const std::string& name); + void UpdateStats(const std::string& name, + const std::unordered_map& params); - void ResetStats(const std::string& name); + std::unordered_map> GetStats(const std::string& name); std::unordered_map> GetStatsAndClear(const std::string& name); std::unordered_map> GetAllStatsAndClear(); - void UpdateStats(const std::string& name, - const std::unordered_map& params); + void ResetStats(const std::string& name); void ResetAllStats(); private: std::mutex mutex_; - std::unordered_map> stats_map_; + std::unordered_map> stats_map_; - StatsMonitor(); + StatsMonitor() = default; StatsMonitor(const StatsMonitor&) = delete; StatsMonitor& operator=(const StatsMonitor&) = delete; }; diff --git a/ucm/shared/metrics/cc/domain/stats_registry.cc b/ucm/shared/metrics/cc/domain/stats_registry.cc deleted file mode 100644 index c2551d9ad..000000000 --- a/ucm/shared/metrics/cc/domain/stats_registry.cc +++ /dev/null @@ -1,59 +0,0 @@ -/** - * MIT License - * - * Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * */ -#include "stats_registry.h" - -namespace UC::Metrics { - -StatsRegistry& StatsRegistry::GetInstance() -{ - static StatsRegistry inst; - return inst; -} - -void StatsRegistry::RegisterStats(std::string name, Creator creator) -{ - auto& reg = GetInstance(); - std::lock_guard lk(reg.mutex_); - reg.registry_[name] = creator; -} - -std::unique_ptr StatsRegistry::CreateStats(const std::string& name) -{ - auto& reg = GetInstance(); - std::lock_guard lk(reg.mutex_); - if (auto it = reg.registry_.find(name); it != reg.registry_.end()) return it->second(); - return nullptr; -} - -std::vector StatsRegistry::GetRegisteredStatsNames() -{ - auto& reg = GetInstance(); - std::lock_guard lk(reg.mutex_); - std::vector names; - names.reserve(reg.registry_.size()); - for (auto& [n, _] : reg.registry_) names.push_back(n); - return names; -} - -} // namespace UC::Metrics \ No newline at end of file diff --git a/ucm/shared/metrics/cc/domain/stats_registry.h b/ucm/shared/metrics/cc/domain/stats_registry.h deleted file mode 100644 index 3b7a89609..000000000 --- a/ucm/shared/metrics/cc/domain/stats_registry.h +++ /dev/null @@ -1,60 +0,0 @@ -/** - * MIT License - * - * Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * */ -#ifndef UNIFIEDCACHE_REGISTRY_H -#define UNIFIEDCACHE_REGISTRY_H - -#include -#include -#include -#include "stats/istats.h" - -namespace UC::Metrics { - -using Creator = std::function()>; - -class StatsRegistry { -public: - static StatsRegistry& GetInstance(); - - static void RegisterStats(std::string name, Creator creator); - - std::unique_ptr CreateStats(const std::string& name); - - std::vector GetRegisteredStatsNames(); - -private: - StatsRegistry() = default; - ~StatsRegistry() = default; - StatsRegistry(const StatsRegistry&) = delete; - StatsRegistry& operator=(const StatsRegistry&) = delete; - StatsRegistry(StatsRegistry&&) = delete; - StatsRegistry& operator=(StatsRegistry&&) = delete; - - std::mutex mutex_; - std::unordered_map registry_; -}; - -} // namespace UC::Metrics - -#endif // UNIFIEDCACHE_REGISTRY_H \ No newline at end of file diff --git a/ucm/shared/metrics/cpy/metrics.py.cc b/ucm/shared/metrics/cpy/metrics.py.cc index c23b5a310..085f74033 100644 --- a/ucm/shared/metrics/cpy/metrics.py.cc +++ b/ucm/shared/metrics/cpy/metrics.py.cc @@ -21,37 +21,13 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ -#include #include #include -#include "stats/istats.h" #include "stats_monitor_api.h" namespace py = pybind11; namespace UC::Metrics { -class PythonStatsWrapper : public IStats { -public: - PythonStatsWrapper(py::object py_obj) : py_obj_(py::cast(py_obj)) {} - - std::string Name() const override { return py_obj_.attr("Name")().cast(); } - - void Update(const std::unordered_map& params) override - { - py_obj_.attr("Update")(params); - } - - void Reset() override { py_obj_.attr("Reset")(); } - - std::unordered_map> Data() override - { - return py_obj_.attr("Data")().cast>>(); - } - -private: - py::object py_obj_; -}; - void bind_monitor(py::module_& m) { py::class_(m, "StatsResult") @@ -59,26 +35,11 @@ void bind_monitor(py::module_& m) .def_readonly("data", &StatsResult::data); m.def("create_stats", &CreateStats); m.def("update_stats", &UpdateStats); - m.def("reset_stats", &ResetStats); - m.def("reset_all", &ResetAllStats); m.def("get_stats", &GetStats); m.def("get_stats_and_clear", &GetStatsAndClear); m.def("get_all_stats_and_clear", &GetAllStatsAndClear); - - m.def( - "register_stats", - [](const std::string& name, py::object py_obj) { - if (!py::hasattr(py_obj, "Name") || !py::hasattr(py_obj, "Update") || - !py::hasattr(py_obj, "Reset") || !py::hasattr(py_obj, "Data")) { - throw std::runtime_error( - "Python object must implement Name/Update/Reset/Data methods"); - } - - RegistStats(name, [py_obj]() -> std::unique_ptr { - return std::make_unique(py_obj); - }); - }, - py::arg("name"), py::arg("py_obj")); + m.def("reset_stats", &ResetStats); + m.def("reset_all", &ResetAllStats); } } // namespace UC::Metrics diff --git a/ucm/shared/test/case/metrics/monitor_test.cc b/ucm/shared/test/case/metrics/monitor_test.cc index ee0eda392..a53c0f419 100644 --- a/ucm/shared/test/case/metrics/monitor_test.cc +++ b/ucm/shared/test/case/metrics/monitor_test.cc @@ -23,41 +23,18 @@ * */ #include #include -#include "stats/istats.h" #include "stats_monitor_api.h" using namespace UC::Metrics; -class TestStats : public IStats { -public: - explicit TestStats(const std::string& name) : name_(name) {} - std::string Name() const override { return name_; } - void Update(const std::unordered_map& params) override - { - for (const auto& [key, val] : params) { data_[key].push_back(val); } - } - void Reset() override { data_.clear(); } - std::unordered_map> Data() override { return data_; } - -private: - std::string name_; - std::unordered_map> data_; -}; - class UCStatsMonitorUT : public testing::Test { protected: void SetUp() override { try { - RegistStats("test_stats", [name = "test_stats"]() -> std::unique_ptr { - return std::make_unique(name); - }); - RegistStats("stats1", [name = "stats1"]() -> std::unique_ptr { - return std::make_unique(name); - }); - RegistStats("stats2", [name = "stats2"]() -> std::unique_ptr { - return std::make_unique(name); - }); + CreateStats("test_stats"); + CreateStats("stats1"); + CreateStats("stats2"); } catch (const std::exception& e) { throw; } @@ -67,7 +44,6 @@ class UCStatsMonitorUT : public testing::Test { TEST_F(UCStatsMonitorUT, UpdateAndGetStats) { std::string statsName = "test_stats"; - CreateStats(statsName); std::unordered_map params; params["value1"] = 10.5; @@ -109,9 +85,6 @@ TEST_F(UCStatsMonitorUT, MultipleStatsAndResetAll) std::string stats1 = "stats1"; std::string stats2 = "stats2"; - CreateStats(stats1); - CreateStats(stats2); - UpdateStats(stats1, { {"a", 1.0}, {"b", 2.0} @@ -136,9 +109,6 @@ TEST_F(UCStatsMonitorUT, MultipleStatsAndGetAll) std::string statsA = "stats1"; std::string statsB = "stats2"; - CreateStats(statsA); - CreateStats(statsB); - UpdateStats(statsA, { {"x", 100.0} }); diff --git a/ucm/shared/test/example/metrics/monitor_stats_example.py b/ucm/shared/test/example/metrics/monitor_stats_example.py index c6670b317..aa7d68264 100644 --- a/ucm/shared/test/example/metrics/monitor_stats_example.py +++ b/ucm/shared/test/example/metrics/monitor_stats_example.py @@ -22,31 +22,87 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # +from functools import wraps +from ucm.shared.metrics import ucmmonitor -import os -import sys -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from ucm.integration.vllm.conn_stats import ConnStats -from ucm.shared.metrics import ucmmonitor +def test_wrap(func): + @wraps(func) + def wrapper(*args, **kwargs): + print(f"========>> Running in {func.__name__}:") + result = func(*args, **kwargs) + print() + return result + + return wrapper + + +@test_wrap +def metrics_with_update_stats(): + ucmmonitor.create_stats("PyStats") + ucmmonitor.update_stats( + "PyStats", + { + "save_duration": 1.2, + "save_speed": 300.5, + "load_duration": 0.8, + "load_speed": 450.0, + "interval_lookup_hit_rates": 0.95, + }, + ) + + data = ucmmonitor.get_stats("PyStats").data + assert data["save_duration"][0] == 1.2 + assert len(data) == 5 + print(f"Get PyStats stats: {data}") + + data = ucmmonitor.get_stats_and_clear("PyStats").data + assert data["save_duration"][0] == 1.2 + assert len(data) == 5 + print(f"Get PyStats stats and clear: {data}") + + data = ucmmonitor.get_stats_and_clear("PyStats").data + assert len(data) == 0 + print(f"After clear then get PyStats: {data}") + + +@test_wrap +def metrics_with_update_all_stats(): + ucmmonitor.create_stats("PyStats1") + ucmmonitor.create_stats("PyStats2") + ucmmonitor.update_stats( + "PyStats1", + { + "save_duration": 1.2, + "save_speed": 300.5, + }, + ) + + ucmmonitor.update_stats( + "PyStats2", + { + "load_duration": 0.8, + "load_speed": 450.0, + }, + ) + + data = ucmmonitor.get_stats("PyStats1").data + assert data["save_duration"][0] == 1.2 + assert len(data) == 2 + print(f"Only get PyStats1 stats: {data}") + + data = ucmmonitor.get_all_stats_and_clear().data + assert data["save_duration"][0] == 1.2 + assert data["load_duration"][0] == 0.8 + assert len(data) == 4 + print(f"Get all stats and clear: {data}") + + data = ucmmonitor.get_stats("PyStats2").data + assert len(data) == 0 + print(f"After clear then get PyStats2: {data}") + -# import monitor - -conn_stats1 = ConnStats(name="PyStats1") -ucmmonitor.register_stats("PyStats1", conn_stats1) - -ucmmonitor.create_stats("PyStats1") -ucmmonitor.update_stats( - "PyStats1", - { - "save_duration": 1.2, - "save_speed": 300.5, - "load_duration": 0.8, - "load_speed": 450.0, - "interval_lookup_hit_rates": 0.95, - }, -) - -data = ucmmonitor.get_stats("PyStats1").data -print(data) +if __name__ == "__main__": + metrics_with_update_stats() + metrics_with_update_all_stats() From ec18b55f4fbf21d210883bab50fb41b9f445b9cd Mon Sep 17 00:00:00 2001 From: flesher0813 <1208954694@qq.com> Date: Wed, 17 Dec 2025 11:21:21 +0800 Subject: [PATCH 3/9] metrics --- ucm/shared/metrics/CMakeLists.txt | 38 ++------ .../metrics/cc/api/stats_monitor_api.cc | 59 ----------- ucm/shared/metrics/cc/api/stats_monitor_api.h | 43 -------- ucm/shared/metrics/cc/domain/stats/stats.h | 51 ---------- ucm/shared/metrics/cc/domain/stats_monitor.cc | 97 ------------------- ucm/shared/metrics/cc/metrics.cc | 89 +++++++++++++++++ .../cc/{domain/stats_monitor.h => metrics.h} | 42 ++++---- ucm/shared/metrics/cpy/metrics.py.cc | 21 ++-- 8 files changed, 131 insertions(+), 309 deletions(-) delete mode 100644 ucm/shared/metrics/cc/api/stats_monitor_api.cc delete mode 100644 ucm/shared/metrics/cc/api/stats_monitor_api.h delete mode 100644 ucm/shared/metrics/cc/domain/stats/stats.h delete mode 100644 ucm/shared/metrics/cc/domain/stats_monitor.cc create mode 100644 ucm/shared/metrics/cc/metrics.cc rename ucm/shared/metrics/cc/{domain/stats_monitor.h => metrics.h} (63%) diff --git a/ucm/shared/metrics/CMakeLists.txt b/ucm/shared/metrics/CMakeLists.txt index 37dc054a8..67b1c4e68 100644 --- a/ucm/shared/metrics/CMakeLists.txt +++ b/ucm/shared/metrics/CMakeLists.txt @@ -1,37 +1,15 @@ -file(GLOB_RECURSE DOMAIN_SRCS - "${CMAKE_CURRENT_SOURCE_DIR}/cc/domain/*.cc" - "${CMAKE_CURRENT_SOURCE_DIR}/cc/domain/stats/*.cc" +file(GLOB_RECURSE UCMMETRICS_CC_SOURCE_FILES "./cc/*.cc") +add_library(metrics STATIC ${UCMMETRICS_CC_SOURCE_FILES}) +target_include_directories(metrics PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/cc ) -file(GLOB_RECURSE API_SRCS - "${CMAKE_CURRENT_SOURCE_DIR}/cc/api/*.cc" -) - -add_library(monitor_static STATIC - ${DOMAIN_SRCS} - ${API_SRCS} -) - -set_property(TARGET monitor_static PROPERTY POSITION_INDEPENDENT_CODE ON) - -target_include_directories(monitor_static PUBLIC - $ - $ - $ -) - -set_target_properties(monitor_static PROPERTIES OUTPUT_NAME monitor) - -file(GLOB_RECURSE BINDINGS_SRCS CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/cpy/*.cc") -pybind11_add_module(ucmmonitor ${BINDINGS_SRCS}) -target_link_libraries(ucmmonitor PRIVATE monitor_static) - -target_include_directories(ucmmonitor PRIVATE - "${CMAKE_CURRENT_SOURCE_DIR}/cc/api" -) +file(GLOB_RECURSE UCMMETRICS_CPY_SOURCE_FILES CONFIGURE_DEPENDS "./cpy/*.cc") +pybind11_add_module(ucmmetrics ${UCMMETRICS_CPY_SOURCE_FILES}) +target_link_libraries(ucmmetrics PRIVATE metrics) file(RELATIVE_PATH INSTALL_REL_PATH ${CMAKE_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ) -install(TARGETS ucmmonitor LIBRARY DESTINATION ${INSTALL_REL_PATH} COMPONENT ucm) \ No newline at end of file +install(TARGETS ucmmetrics LIBRARY DESTINATION ${INSTALL_REL_PATH} COMPONENT ucm) \ No newline at end of file diff --git a/ucm/shared/metrics/cc/api/stats_monitor_api.cc b/ucm/shared/metrics/cc/api/stats_monitor_api.cc deleted file mode 100644 index f2f7c7aab..000000000 --- a/ucm/shared/metrics/cc/api/stats_monitor_api.cc +++ /dev/null @@ -1,59 +0,0 @@ -/** - * MIT License - * - * Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * */ -#include "stats_monitor_api.h" -namespace UC::Metrics { - -void CreateStats(const std::string& name) { StatsMonitor::GetInstance().CreateStats(name); } - -void UpdateStats(const std::string& name, const std::unordered_map& params) -{ - StatsMonitor::GetInstance().UpdateStats(name, params); -} - -void ResetStats(const std::string& name) { StatsMonitor::GetInstance().ResetStats(name); } - -void ResetAllStats() { StatsMonitor::GetInstance().ResetAllStats(); } - -StatsResult GetStats(const std::string& name) -{ - StatsResult result; - result.data = StatsMonitor::GetInstance().GetStats(name); - return result; -} - -StatsResult GetStatsAndClear(const std::string& name) -{ - StatsResult result; - result.data = StatsMonitor::GetInstance().GetStatsAndClear(name); - return result; -} - -StatsResult GetAllStatsAndClear() -{ - StatsResult result; - result.data = StatsMonitor::GetInstance().GetAllStatsAndClear(); - return result; -} - -} // namespace UC::Metrics \ No newline at end of file diff --git a/ucm/shared/metrics/cc/api/stats_monitor_api.h b/ucm/shared/metrics/cc/api/stats_monitor_api.h deleted file mode 100644 index a77115d6f..000000000 --- a/ucm/shared/metrics/cc/api/stats_monitor_api.h +++ /dev/null @@ -1,43 +0,0 @@ -/** - * MIT License - * - * Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * */ -#ifndef UNIFIEDCACHE_MONITOR_API_H -#define UNIFIEDCACHE_MONITOR_API_H -#include "stats_monitor.h" - -namespace UC::Metrics { -struct StatsResult { - StatsResult() = default; - std::unordered_map> data; -}; - -void CreateStats(const std::string& name); -void UpdateStats(const std::string& name, const std::unordered_map& params); -void ResetStats(const std::string& name); -void ResetAllStats(); -StatsResult GetStats(const std::string& name); -StatsResult GetStatsAndClear(const std::string& name); -StatsResult GetAllStatsAndClear(); - -} // namespace UC::Metrics -#endif \ No newline at end of file diff --git a/ucm/shared/metrics/cc/domain/stats/stats.h b/ucm/shared/metrics/cc/domain/stats/stats.h deleted file mode 100644 index a535ed64c..000000000 --- a/ucm/shared/metrics/cc/domain/stats/stats.h +++ /dev/null @@ -1,51 +0,0 @@ -/** - * MIT License - * - * Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * */ -#ifndef UNIFIEDCACHE_STATS_H -#define UNIFIEDCACHE_STATS_H - -#include -#include -#include - -namespace UC::Metrics { - -class Stats { -public: - explicit Stats(const std::string& name) : name_(name) {} - std::string Name() { return name_; } - void Update(const std::unordered_map& params) - { - for (const auto& [key, val] : params) { data_[key].push_back(val); } - } - void Reset() { data_.clear(); } - std::unordered_map> Data() { return data_; } - -private: - std::string name_; - std::unordered_map> data_; -}; - -} // namespace UC::Metrics - -#endif \ No newline at end of file diff --git a/ucm/shared/metrics/cc/domain/stats_monitor.cc b/ucm/shared/metrics/cc/domain/stats_monitor.cc deleted file mode 100644 index 03a8dc2ea..000000000 --- a/ucm/shared/metrics/cc/domain/stats_monitor.cc +++ /dev/null @@ -1,97 +0,0 @@ -/** - * MIT License - * - * Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * */ -#include "stats_monitor.h" - -namespace UC::Metrics { - -void StatsMonitor::CreateStats(const std::string& name) -{ - std::lock_guard lock(mutex_); - stats_map_[name] = std::make_unique(name); -} - -void StatsMonitor::UpdateStats(const std::string& name, - const std::unordered_map& params) -{ - std::lock_guard lock(mutex_); - auto it = stats_map_.find(name); - if (it == stats_map_.end() || !it->second) { return; } - try { - it->second->Update(params); - } catch (...) { - return; - } -} - -std::unordered_map> StatsMonitor::GetStats(const std::string& name) -{ - std::lock_guard lock(mutex_); - auto it = stats_map_.find(name); - if (it == stats_map_.end() || !it->second) { return {}; } - return it->second->Data(); -} - -std::unordered_map> -StatsMonitor::GetStatsAndClear(const std::string& name) -{ - std::lock_guard lock(mutex_); - auto it = stats_map_.find(name); - if (it == stats_map_.end() || !it->second) { return {}; } - auto result = it->second->Data(); - it->second->Reset(); - return result; -} - -std::unordered_map> StatsMonitor::GetAllStatsAndClear() -{ - std::lock_guard lock(mutex_); - std::unordered_map> all_stats; - - for (const auto& [name, stats_ptr] : stats_map_) { - if (stats_ptr) { - auto data = stats_ptr->Data(); - all_stats.insert(data.begin(), data.end()); - stats_ptr->Reset(); - } - } - return all_stats; -} - -void StatsMonitor::ResetStats(const std::string& name) -{ - std::lock_guard lock(mutex_); - auto it = stats_map_.find(name); - if (it == stats_map_.end() || !it->second) { return; } - it->second->Reset(); -} - -void StatsMonitor::ResetAllStats() -{ - std::lock_guard lock(mutex_); - for (const auto& [name, stats_ptr] : stats_map_) { - if (stats_ptr) { stats_ptr->Reset(); } - } -} - -} // namespace UC::Metrics \ No newline at end of file diff --git a/ucm/shared/metrics/cc/metrics.cc b/ucm/shared/metrics/cc/metrics.cc new file mode 100644 index 000000000..7dd4d4cff --- /dev/null +++ b/ucm/shared/metrics/cc/metrics.cc @@ -0,0 +1,89 @@ +/** + * MIT License + * + * Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * */ +#include +#include "metrics.h" + +namespace UC::Metrics { + +void Metrics::CreateStats(const std::string& name, std::string& type) +{ + std::lock_guard lock(mutex_); + std::transform(type.begin(), type.end(), type.begin(), ::toupper); + if (stats_type_.count(name)) { + return; + } else { + if (type == "COUNTER") { + stats_type_[name] = MetricType::COUNTER; + } else if (type == "GUAGE") { + stats_type_[name] = MetricType::GUAGE; + } else if (type == "HISTOGRAM") { + stats_type_[name] = MetricType::HISTOGRAM; + } else { + return; + } + } +} + +void Metrics::UpdateStats(const std::string& name, double value) +{ + std::lock_guard lock(mutex_); + auto it = stats_type_.find(name); + if (it == stats_map_.end() || !it->second) { return; } + switch (it->second) + { + case MetricType::COUNTER: + counter_stats_[name] += value; + break; + case MetricType::GUAGE: + gauge_stats_[name] = value; + break; + case MetricType::HISTOGRAM: + histogram_stats_[name].push_back(value); + break; + + default: + break; + } +} + + +std::tuple< + std::unordered_map, + std::unordered_map, + std::unordered_map> + > Metrics::GetAllStatsAndClear() +{ + std::lock_guard lock(mutex_); + auto result = std::make_tuple( + std::move(counter_stats_), + std::move(gauge_stats_), + std::move(histogram_stats_) + ); + counter_stats_.clear(); + gauge_stats_.clear(); + histogram_stats_.clear(); + return result; +} + +} // namespace UC::Metrics \ No newline at end of file diff --git a/ucm/shared/metrics/cc/domain/stats_monitor.h b/ucm/shared/metrics/cc/metrics.h similarity index 63% rename from ucm/shared/metrics/cc/domain/stats_monitor.h rename to ucm/shared/metrics/cc/metrics.h index 70d10d46f..5e39871d0 100644 --- a/ucm/shared/metrics/cc/domain/stats_monitor.h +++ b/ucm/shared/metrics/cc/metrics.h @@ -29,42 +29,42 @@ #include #include #include -#include "stats/stats.h" +#include namespace UC::Metrics { -class StatsMonitor { +class Metrics { public: - static StatsMonitor& GetInstance() + static Metrics& GetInstance() { - static StatsMonitor inst; + static Metrics inst; return inst; } - ~StatsMonitor() = default; + ~Metrics() = default; - void CreateStats(const std::string& name); + void CreateStats(const std::string& name, std::string& type); - void UpdateStats(const std::string& name, - const std::unordered_map& params); + void UpdateStats(const std::string& name, double value); - std::unordered_map> GetStats(const std::string& name); - - std::unordered_map> GetStatsAndClear(const std::string& name); - - std::unordered_map> GetAllStatsAndClear(); - - void ResetStats(const std::string& name); - - void ResetAllStats(); + std::tuple< + std::unordered_map, + std::unordered_map, + std::unordered_map> + > GetAllStatsAndClear(); private: + enum class MetricType { COUNTER, GUAGE, HISTOGRAM }; + std::mutex mutex_; - std::unordered_map> stats_map_; + std::unordered_map counter_stats_; + std::unordered_map gauge_stats_; + std::unordered_map> histogram_stats_; + std::unordered_map stats_type_; - StatsMonitor() = default; - StatsMonitor(const StatsMonitor&) = delete; - StatsMonitor& operator=(const StatsMonitor&) = delete; + Metrics() = default; + Metrics(const Metrics&) = delete; + Metrics& operator=(const Metrics&) = delete; }; } // namespace UC::Metrics diff --git a/ucm/shared/metrics/cpy/metrics.py.cc b/ucm/shared/metrics/cpy/metrics.py.cc index 085f74033..6b41b3cd9 100644 --- a/ucm/shared/metrics/cpy/metrics.py.cc +++ b/ucm/shared/metrics/cpy/metrics.py.cc @@ -23,23 +23,28 @@ * */ #include #include -#include "stats_monitor_api.h" +#include "metrics.h" namespace py = pybind11; namespace UC::Metrics { +inline void CreateStats(const std::string& name, std::string& type) { + Metrics::GetInstance().CreateStats(name, type); +} + +inline void UpdateStats(const std::string& name, double value) { + Metrics::GetInstance().UpdateStats(name, value); +} + +inline auto GetAllStatsAndClear() { + return Metrics::GetInstance().GetAllStatsAndClear(); +} + void bind_monitor(py::module_& m) { - py::class_(m, "StatsResult") - .def(py::init<>()) - .def_readonly("data", &StatsResult::data); m.def("create_stats", &CreateStats); m.def("update_stats", &UpdateStats); - m.def("get_stats", &GetStats); - m.def("get_stats_and_clear", &GetStatsAndClear); m.def("get_all_stats_and_clear", &GetAllStatsAndClear); - m.def("reset_stats", &ResetStats); - m.def("reset_all", &ResetAllStats); } } // namespace UC::Metrics From 48b7f0d559d81f3e21a361567baecba3a7dba1b2 Mon Sep 17 00:00:00 2001 From: flesher0813 <1208954694@qq.com> Date: Wed, 17 Dec 2025 11:43:31 +0800 Subject: [PATCH 4/9] metrics --- ucm/shared/metrics/cc/metrics.cc | 8 ++++---- ucm/shared/metrics/cc/metrics.h | 4 ++-- ucm/shared/metrics/cpy/metrics.py.cc | 8 ++++++-- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/ucm/shared/metrics/cc/metrics.cc b/ucm/shared/metrics/cc/metrics.cc index 7dd4d4cff..d87bdaaf3 100644 --- a/ucm/shared/metrics/cc/metrics.cc +++ b/ucm/shared/metrics/cc/metrics.cc @@ -35,8 +35,8 @@ void Metrics::CreateStats(const std::string& name, std::string& type) } else { if (type == "COUNTER") { stats_type_[name] = MetricType::COUNTER; - } else if (type == "GUAGE") { - stats_type_[name] = MetricType::GUAGE; + } else if (type == "GAUGE") { + stats_type_[name] = MetricType::GAUGE; } else if (type == "HISTOGRAM") { stats_type_[name] = MetricType::HISTOGRAM; } else { @@ -49,13 +49,13 @@ void Metrics::UpdateStats(const std::string& name, double value) { std::lock_guard lock(mutex_); auto it = stats_type_.find(name); - if (it == stats_map_.end() || !it->second) { return; } + if (it == stats_type_.end()) { return; } switch (it->second) { case MetricType::COUNTER: counter_stats_[name] += value; break; - case MetricType::GUAGE: + case MetricType::GAUGE: gauge_stats_[name] = value; break; case MetricType::HISTOGRAM: diff --git a/ucm/shared/metrics/cc/metrics.h b/ucm/shared/metrics/cc/metrics.h index 5e39871d0..ee00f9083 100644 --- a/ucm/shared/metrics/cc/metrics.h +++ b/ucm/shared/metrics/cc/metrics.h @@ -54,13 +54,13 @@ class Metrics { > GetAllStatsAndClear(); private: - enum class MetricType { COUNTER, GUAGE, HISTOGRAM }; + enum class MetricType { COUNTER, GAUGE, HISTOGRAM }; std::mutex mutex_; std::unordered_map counter_stats_; std::unordered_map gauge_stats_; std::unordered_map> histogram_stats_; - std::unordered_map stats_type_; + std::unordered_map stats_type_; Metrics() = default; Metrics(const Metrics&) = delete; diff --git a/ucm/shared/metrics/cpy/metrics.py.cc b/ucm/shared/metrics/cpy/metrics.py.cc index 6b41b3cd9..a501170d2 100644 --- a/ucm/shared/metrics/cpy/metrics.py.cc +++ b/ucm/shared/metrics/cpy/metrics.py.cc @@ -36,7 +36,11 @@ inline void UpdateStats(const std::string& name, double value) { Metrics::GetInstance().UpdateStats(name, value); } -inline auto GetAllStatsAndClear() { +inline std::tuple< + std::unordered_map, + std::unordered_map, + std::unordered_map> +> GetAllStatsAndClear() { return Metrics::GetInstance().GetAllStatsAndClear(); } @@ -49,7 +53,7 @@ void bind_monitor(py::module_& m) } // namespace UC::Metrics -PYBIND11_MODULE(ucmmonitor, module) +PYBIND11_MODULE(ucmmetrics, module) { module.attr("project") = UCM_PROJECT_NAME; module.attr("version") = UCM_PROJECT_VERSION; From d76bcfefa3420b23e5c0a96acc7c5b93b4f97eab Mon Sep 17 00:00:00 2001 From: flesher0813 <1208954694@qq.com> Date: Wed, 17 Dec 2025 17:10:00 +0800 Subject: [PATCH 5/9] metrics --- examples/metrics/metrics_configs.yaml | 93 +++--- ucm/observability.py | 316 +++++++----------- ucm/shared/metrics/cc/metrics.cc | 6 + ucm/shared/metrics/cc/metrics.h | 2 + ucm/shared/metrics/cpy/metrics.py.cc | 7 +- .../example/metrics/monitor_stats_example.py | 80 ++--- 6 files changed, 191 insertions(+), 313 deletions(-) diff --git a/examples/metrics/metrics_configs.yaml b/examples/metrics/metrics_configs.yaml index 5ed07baa9..a4dba4fa3 100644 --- a/examples/metrics/metrics_configs.yaml +++ b/examples/metrics/metrics_configs.yaml @@ -2,55 +2,48 @@ # This file defines which metrics should be enabled and their configurations log_interval: 5 # Interval in seconds for logging metrics -prometheus: - multiproc_dir: "/vllm-workspace" # Directory for Prometheus multiprocess mode +multiproc_dir: "/vllm-workspace" # Directory for Prometheus multiprocess mode - metric_prefix: "ucm:" - - # Enable/disable metrics by category - enabled_metrics: - counters: true - gauges: true - histograms: true - - # Counter metrics configuration - # counters: - # - name: "received_requests" - # documentation: "Total number of requests sent to ucm" - - # Gauge metrics configuration - # gauges: - # - name: "lookup_hit_rate" - # documentation: "Hit rate of ucm lookup requests since last log" - # multiprocess_mode: "livemostrecent" - - # Histogram metrics configuration - histograms: - - name: "load_requests_num" - documentation: "Number of requests loaded from ucm" - buckets: [1, 5, 10, 20, 50, 100, 200, 500, 1000] - - name: "load_blocks_num" - documentation: "Number of blocks loaded from ucm" - buckets: [0, 50, 100, 150, 200, 250, 300, 350, 400, 550, 600, 750, 800, 850, 900, 950, 1000] - - name: "load_duration" - documentation: "Time to load from ucm (ms)" - buckets: [0, 50, 100, 150, 200, 250, 300, 350, 400, 550, 600, 750, 800, 850, 900, 950, 1000] - - name: "load_speed" - documentation: "Speed of loading from ucm (GB/s)" - buckets: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 50, 60, 70, 80, 90, 100] - - name: "save_requests_num" - documentation: "Number of requests saved to ucm" - buckets: [1, 5, 10, 20, 50, 100, 200, 500, 1000] - - name: "save_blocks_num" - documentation: "Number of blocks saved to ucm" - buckets: [0, 50, 100, 150, 200, 250, 300, 350, 400, 550, 600, 750, 800, 850, 900, 950, 1000] - - name: "save_duration" - documentation: "Time to save to ucm (ms)" - buckets: [0, 50, 100, 150, 200, 250, 300, 350, 400, 550, 600, 750, 800, 850, 900, 950, 1000] - - name: "save_speed" - documentation: "Speed of saving to ucm (GB/s)" - buckets: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 50, 60, 70, 80, 90, 100] - - name: "interval_lookup_hit_rates" - documentation: "Hit rates of ucm lookup requests" - buckets: [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] +metric_prefix: "ucm:" + +# Counter metrics configuration +# counter: +# - name: "received_requests" +# documentation: "Total number of requests sent to ucm" + +# Gauge metrics configuration +# gauge: +# - name: "lookup_hit_rate" +# documentation: "Hit rate of ucm lookup requests since last log" +# multiprocess_mode: "livemostrecent" + +# Histogram metrics configuration +histogram: + - name: "load_requests_num" + documentation: "Number of requests loaded from ucm" + buckets: [1, 5, 10, 20, 50, 100, 200, 500, 1000] + - name: "load_blocks_num" + documentation: "Number of blocks loaded from ucm" + buckets: [0, 50, 100, 150, 200, 250, 300, 350, 400, 550, 600, 750, 800, 850, 900, 950, 1000] + - name: "load_duration" + documentation: "Time to load from ucm (ms)" + buckets: [0, 50, 100, 150, 200, 250, 300, 350, 400, 550, 600, 750, 800, 850, 900, 950, 1000] + - name: "load_speed" + documentation: "Speed of loading from ucm (GB/s)" + buckets: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 50, 60, 70, 80, 90, 100] + - name: "save_requests_num" + documentation: "Number of requests saved to ucm" + buckets: [1, 5, 10, 20, 50, 100, 200, 500, 1000] + - name: "save_blocks_num" + documentation: "Number of blocks saved to ucm" + buckets: [0, 50, 100, 150, 200, 250, 300, 350, 400, 550, 600, 750, 800, 850, 900, 950, 1000] + - name: "save_duration" + documentation: "Time to save to ucm (ms)" + buckets: [0, 50, 100, 150, 200, 250, 300, 350, 400, 550, 600, 750, 800, 850, 900, 950, 1000] + - name: "save_speed" + documentation: "Speed of saving to ucm (GB/s)" + buckets: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 50, 60, 70, 80, 90, 100] + - name: "interval_lookup_hit_rates" + documentation: "Hit rates of ucm lookup requests" + buckets: [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] diff --git a/ucm/observability.py b/ucm/observability.py index dd195a1cf..e35138049 100644 --- a/ucm/observability.py +++ b/ucm/observability.py @@ -26,211 +26,18 @@ import os import threading import time -from dataclasses import dataclass -from typing import Any, List +from typing import Any, Union -import prometheus_client +from prometheus_client import Gauge, Counter, Histogram import yaml from ucm.logger import init_logger -from ucm.shared.metrics import ucmmonitor +from ucm.shared.metrics import ucmmetrics logger = init_logger(__name__) - -@dataclass -class UCMEngineMetadata: - """Metadata for UCM engine""" - - model_name: str - worker_id: str - - -class PrometheusLogger: - _gauge_cls = prometheus_client.Gauge - _counter_cls = prometheus_client.Counter - _histogram_cls = prometheus_client.Histogram - - def __init__(self, metadata: UCMEngineMetadata, config: dict[str, Any]): - # Ensure PROMETHEUS_MULTIPROC_DIR is set before any metric registration - prometheus_config = config.get("prometheus", {}) - multiproc_dir = prometheus_config.get("multiproc_dir", "/vllm-workspace") - if "PROMETHEUS_MULTIPROC_DIR" not in os.environ: - os.environ["PROMETHEUS_MULTIPROC_DIR"] = multiproc_dir - if not os.path.exists(multiproc_dir): - os.makedirs(multiproc_dir, exist_ok=True) - - self.metadata = metadata - self.config = config - self.labels = self._metadata_to_labels(metadata) - labelnames = list(self.labels.keys()) - - # Initialize metrics based on configuration - self._init_metrics_from_config(labelnames, prometheus_config) - - def _init_metrics_from_config( - self, labelnames: List[str], prometheus_config: dict[str, Any] - ): - """Initialize metrics based on configuration""" - enabled = prometheus_config.get("enabled_metrics", {}) - - # Get metric name prefix from config (e.g., "ucm:") - # If not specified, use empty string - metric_prefix = prometheus_config.get("metric_prefix", "ucm:") - - # Store metric mapping: metric_name -> (metric_type, attribute_name, stats_field_name) - # This mapping will be used in log_prometheus to dynamically log metrics - self.metric_mappings: dict[str, dict[str, str]] = {} - - # Initialize counters - if enabled.get("counters", True): - counters = prometheus_config.get("counters", []) - for counter_cfg in counters: - name = counter_cfg.get("name") - doc = counter_cfg.get("documentation", "") - # Prometheus metric name with prefix - prometheus_name = f"{metric_prefix}{name}" if metric_prefix else name - # Internal attribute name for storing the metric object - attr_name = f"counter_{name}" - - if not hasattr(self, attr_name): - setattr( - self, - attr_name, - self._counter_cls( - name=prometheus_name, - documentation=doc, - labelnames=labelnames, - ), - ) - # Store mapping for dynamic logging - self.metric_mappings[name] = { - "type": "counter", - "attr": attr_name, - } - - # Initialize gauges - if enabled.get("gauges", True): - gauges = prometheus_config.get("gauges", []) - for gauge_cfg in gauges: - name = gauge_cfg.get("name") - doc = gauge_cfg.get("documentation", "") - multiprocess_mode = gauge_cfg.get("multiprocess_mode", "live") - # Prometheus metric name with prefix - prometheus_name = f"{metric_prefix}{name}" if metric_prefix else name - # Internal attribute name - attr_name = f"gauge_{name}" - - if not hasattr(self, attr_name): - setattr( - self, - attr_name, - self._gauge_cls( - name=prometheus_name, - documentation=doc, - labelnames=labelnames, - multiprocess_mode=multiprocess_mode, - ), - ) - # Store mapping for dynamic logging - self.metric_mappings[name] = { - "type": "gauge", - "attr": attr_name, - } - - # Initialize histograms - if enabled.get("histograms", True): - histograms = prometheus_config.get("histograms", []) - for hist_cfg in histograms: - name = hist_cfg.get("name") - doc = hist_cfg.get("documentation", "") - buckets = hist_cfg.get("buckets", []) - # Prometheus metric name with prefix - prometheus_name = f"{metric_prefix}{name}" if metric_prefix else name - # Internal attribute name - attr_name = f"histogram_{name}" - - if not hasattr(self, attr_name): - setattr( - self, - attr_name, - self._histogram_cls( - name=prometheus_name, - documentation=doc, - labelnames=labelnames, - buckets=buckets, - ), - ) - # Store mapping for dynamic logging - self.metric_mappings[name] = { - "type": "histogram", - "attr": attr_name, - } - - def _set_gauge(self, gauge, data: List) -> None: - # Convenience function for logging to gauge. - if not data: - return - gauge.labels(**self.labels).set(data) - - def _inc_counter(self, counter, data: List) -> None: - # Convenience function for logging to counter. - # Prevent ValueError from negative increment - counter.labels(**self.labels).inc(sum(data)) - - def _observe_histogram(self, histogram, data: List) -> None: - # Convenience function for logging to histogram. - for value in data: - histogram.labels(**self.labels).observe(value) - - def update_stats(self, stats: dict[str, List]): - """Log metrics to Prometheus based on configuration file""" - # Dynamically log metrics based on what's configured in YAML - for stat_name, value in stats.items(): - try: - metric_mapped = self.metric_mappings[stat_name] - if metric_mapped is None: - logger.debug(f"Stat {stat_name} not initialized.") - continue - metric_obj = getattr(self, metric_mapped["attr"], None) - metric_type = metric_mapped["type"] - - # Log based on metric type - if metric_type == "counter": - self._set_gauge(metric_obj, value) - elif metric_type == "gauge": - self._inc_counter(metric_obj, value) - elif metric_type == "histogram": - # Histograms expect a list - self._observe_histogram(metric_obj, value) - else: - logger.error(f"Not found metric type for {stat_name}") - except Exception: - logger.debug(f"Failed to log metric {stat_name}") - - @staticmethod - def _metadata_to_labels(metadata: UCMEngineMetadata): - return { - "model_name": metadata.model_name, - "worker_id": metadata.worker_id, - } - - -class UCMStatsLogger: - def __init__(self, model_name: str, rank: int, config_path: str = ""): - # Create metadata - self.metadata = UCMEngineMetadata( - model_name=str(model_name), worker_id=str(rank) - ) - # Load configuration - config = self._load_config(config_path) - self.log_interval = config.get("log_interval", 10) - self.prometheus_logger = PrometheusLogger(self.metadata, config) - self.is_running = True - - self.thread = threading.Thread(target=self.log_worker, daemon=True) - self.thread.start() - +class PrometheusStatsLogger: + def _load_config(self, config_path: str) -> dict[str, Any]: """Load configuration from YAML file""" try: @@ -249,10 +56,117 @@ def _load_config(self, config_path: str) -> dict[str, Any]: logger.error(f"Error parsing YAML config file {config_path}: {e}") return {} - def log_worker(self): + def __init__(self, model_name, worker_id, config_path): + # Ensure PROMETHEUS_MULTIPROC_DIR is set before any metric registration + self.config = self._load_config(config_path) + self.log_interval = self.config.get("log_interval", 10) + multiproc_dir = self.config.get("multiproc_dir", "/vllm-workspace") + if "PROMETHEUS_MULTIPROC_DIR" not in os.environ: + os.environ["PROMETHEUS_MULTIPROC_DIR"] = multiproc_dir + if not os.path.exists(multiproc_dir): + os.makedirs(multiproc_dir, exist_ok=True) + + self.labels = { + "model_name": model_name, + "worker_id": worker_id, + } + self.labelnames = list(self.labels.keys()) + + # Initialize metrics based on configuration + self.metric_type_config = { + "counter": ( + Counter, + {} + ), + "gauge": ( + Gauge, + {"multiprocess_mode": "all"} + ), + "histogram": ( + Histogram, + {"buckets": []} + ) + } + self._init_metrics_from_config() + self.is_running = True + self.thread = threading.Thread(target=self.obtain_stats_thread, daemon=True) + self.thread.start() + + def _process_metric_group(self, group_name): + metric_cls, default_kwargs = self.metric_type_config[group_name] + cfg_list = self.config.get(group_name, []) + + for cfg in cfg_list: + name = cfg.get("name") + doc = cfg.get("documentation", "") + # Prometheus metric name with prefix + prometheus_name = f"{self.metric_prefix}{name}" + ucmmetrics.create_stats(name, group_name) + + metric_kwargs = { + "name": prometheus_name, + "documentation": doc, + "labelnames": self.labelnames, + **default_kwargs, + **{k: v for k, v in cfg.items() if k in default_kwargs} + } + + self.metric_mappings[name] = metric_cls(**metric_kwargs) + + def _init_metrics_from_config(self): + """Initialize metrics based on configuration""" + # Get metric name prefix from config (e.g., "ucm:") + self.metric_prefix = self.config.get("metric_prefix", "ucm:") + + # Store metric mapping: metric_name -> Union[Counter, Gauge, Histogram] + # This mapping will be used in update_stats to dynamically log metrics + self.metric_mappings: dict[str, Union[Counter, Gauge, Histogram]] = {} + + for group_name in self.metric_type_config.keys(): + self._process_metric_group(group_name) + + def _update_counter(self, metric, value): + if value < 0: + return + metric.inc(value) + + def _update_gauge(self, metric, value): + metric.set(value) + + def _update_histogram(self, metric, value): + for data in value: + metric.observe(data) + + def _update_with_func(self, update_func, stats: dict[str, Any], op_desc: str): + for stat_name, value in stats.items(): + if stat_name not in self.metric_mappings: + logger.error(f"Metric {stat_name} not found") + continue + + metric = self.metric_mappings[stat_name] + try: + metric_with_labels = metric.labels(**self.labels) + update_func(metric_with_labels, value) + except AttributeError as e: + logger.error(f"Metric {stat_name} does not support {op_desc}: {e}") + except Exception as e: + logger.debug(f"Failed to {op_desc} {stat_name}: {e}") + + def update_stats(self, counter_stats, gauge_stats, histogram_stats): + """Log metrics to Prometheus based on configuration file""" + # Dynamically log metrics based on what's configured in YAML + update_tasks = [ + (self._update_counter, counter_stats, "increment"), + (self._update_gauge, gauge_stats, "set"), + (self._update_histogram, histogram_stats, "observe"), + ] + for update_func, stats, op_desc in update_tasks: + self._update_with_func(update_func, stats, op_desc) + + def obtain_stats_thread(self): while self.is_running: - stats = ucmmonitor.get_all_stats_and_clear().data - self.prometheus_logger.update_stats(stats) + counter_stats, gauge_stats, histogram_stats = ucmmetrics.get_all_stats_and_clear() + self.update_stats(counter_stats, gauge_stats, histogram_stats) time.sleep(self.log_interval) def shutdown(self): diff --git a/ucm/shared/metrics/cc/metrics.cc b/ucm/shared/metrics/cc/metrics.cc index d87bdaaf3..87a0e2349 100644 --- a/ucm/shared/metrics/cc/metrics.cc +++ b/ucm/shared/metrics/cc/metrics.cc @@ -67,6 +67,12 @@ void Metrics::UpdateStats(const std::string& name, double value) } } +void Metrics::UpdateStats(const std::unordered_map& values) +{ + for(const auto& pair: values) { + UpdateStats(pair.first, pair.second); + } +} std::tuple< std::unordered_map, diff --git a/ucm/shared/metrics/cc/metrics.h b/ucm/shared/metrics/cc/metrics.h index ee00f9083..6e3a531f5 100644 --- a/ucm/shared/metrics/cc/metrics.h +++ b/ucm/shared/metrics/cc/metrics.h @@ -47,6 +47,8 @@ class Metrics { void UpdateStats(const std::string& name, double value); + void UpdateStats(const std::unordered_map& values); + std::tuple< std::unordered_map, std::unordered_map, diff --git a/ucm/shared/metrics/cpy/metrics.py.cc b/ucm/shared/metrics/cpy/metrics.py.cc index a501170d2..02dd98f0e 100644 --- a/ucm/shared/metrics/cpy/metrics.py.cc +++ b/ucm/shared/metrics/cpy/metrics.py.cc @@ -36,6 +36,10 @@ inline void UpdateStats(const std::string& name, double value) { Metrics::GetInstance().UpdateStats(name, value); } +inline void UpdateStats(const std::unordered_map& params) { + Metrics::GetInstance().UpdateStats(params); +} + inline std::tuple< std::unordered_map, std::unordered_map, @@ -47,7 +51,8 @@ inline std::tuple< void bind_monitor(py::module_& m) { m.def("create_stats", &CreateStats); - m.def("update_stats", &UpdateStats); + m.def("update_stats", py::overload_cast(&UpdateStats)); + m.def("update_stats", py::overload_cast&>(&UpdateStats)); m.def("get_all_stats_and_clear", &GetAllStatsAndClear); } diff --git a/ucm/shared/test/example/metrics/monitor_stats_example.py b/ucm/shared/test/example/metrics/monitor_stats_example.py index aa7d68264..2e86a4404 100644 --- a/ucm/shared/test/example/metrics/monitor_stats_example.py +++ b/ucm/shared/test/example/metrics/monitor_stats_example.py @@ -24,7 +24,7 @@ # from functools import wraps -from ucm.shared.metrics import ucmmonitor +from ucm.shared.metrics import ucmmetrics def test_wrap(func): @@ -40,69 +40,27 @@ def wrapper(*args, **kwargs): @test_wrap def metrics_with_update_stats(): - ucmmonitor.create_stats("PyStats") - ucmmonitor.update_stats( - "PyStats", + ucmmetrics.create_stats("counter_1", "counter") + ucmmetrics.update_stats("counter_1", 1.2) + + ucmmetrics.create_stats("gauge_1", "gauge") + ucmmetrics.update_stats("gauge_1", 2.2) + + ucmmetrics.create_stats("histogram_1", "histogram") + ucmmetrics.update_stats("histogram_1", 1) + + counters, gauges, histograms = ucmmetrics.get_all_stats_and_clear() + print(f"After clear then get counters: {counters}, gauges: {gauges}, histograms: {histograms}") + ucmmetrics.update_stats( { - "save_duration": 1.2, - "save_speed": 300.5, - "load_duration": 0.8, - "load_speed": 450.0, - "interval_lookup_hit_rates": 0.95, - }, + "counter_1": 5, + "gauge_1": 6, + "histogram_1": 7, + } ) - - data = ucmmonitor.get_stats("PyStats").data - assert data["save_duration"][0] == 1.2 - assert len(data) == 5 - print(f"Get PyStats stats: {data}") - - data = ucmmonitor.get_stats_and_clear("PyStats").data - assert data["save_duration"][0] == 1.2 - assert len(data) == 5 - print(f"Get PyStats stats and clear: {data}") - - data = ucmmonitor.get_stats_and_clear("PyStats").data - assert len(data) == 0 - print(f"After clear then get PyStats: {data}") - - -@test_wrap -def metrics_with_update_all_stats(): - ucmmonitor.create_stats("PyStats1") - ucmmonitor.create_stats("PyStats2") - ucmmonitor.update_stats( - "PyStats1", - { - "save_duration": 1.2, - "save_speed": 300.5, - }, - ) - - ucmmonitor.update_stats( - "PyStats2", - { - "load_duration": 0.8, - "load_speed": 450.0, - }, - ) - - data = ucmmonitor.get_stats("PyStats1").data - assert data["save_duration"][0] == 1.2 - assert len(data) == 2 - print(f"Only get PyStats1 stats: {data}") - - data = ucmmonitor.get_all_stats_and_clear().data - assert data["save_duration"][0] == 1.2 - assert data["load_duration"][0] == 0.8 - assert len(data) == 4 - print(f"Get all stats and clear: {data}") - - data = ucmmonitor.get_stats("PyStats2").data - assert len(data) == 0 - print(f"After clear then get PyStats2: {data}") + counters, gauges, histograms = ucmmetrics.get_all_stats_and_clear() + print(f"After clear then get counters: {counters}, gauges: {gauges}, histograms: {histograms}") if __name__ == "__main__": metrics_with_update_stats() - metrics_with_update_all_stats() From a2c86925a215f486b7666ca690dfef9f4ca12035 Mon Sep 17 00:00:00 2001 From: flesher0813 <1208954694@qq.com> Date: Wed, 17 Dec 2025 17:57:54 +0800 Subject: [PATCH 6/9] metrics --- ucm/shared/metrics/CMakeLists.txt | 3 +- ucm/shared/metrics/cc/api/metrics_api.cc | 48 +++++++++++++++++++ ucm/shared/metrics/cc/api/metrics_api.h | 42 ++++++++++++++++ ucm/shared/metrics/cc/{ => domain}/metrics.cc | 0 ucm/shared/metrics/cc/{ => domain}/metrics.h | 0 ucm/shared/metrics/cpy/metrics.py.cc | 22 +-------- 6 files changed, 93 insertions(+), 22 deletions(-) create mode 100644 ucm/shared/metrics/cc/api/metrics_api.cc create mode 100644 ucm/shared/metrics/cc/api/metrics_api.h rename ucm/shared/metrics/cc/{ => domain}/metrics.cc (100%) rename ucm/shared/metrics/cc/{ => domain}/metrics.h (100%) diff --git a/ucm/shared/metrics/CMakeLists.txt b/ucm/shared/metrics/CMakeLists.txt index 67b1c4e68..2aaa7bb77 100644 --- a/ucm/shared/metrics/CMakeLists.txt +++ b/ucm/shared/metrics/CMakeLists.txt @@ -1,7 +1,8 @@ file(GLOB_RECURSE UCMMETRICS_CC_SOURCE_FILES "./cc/*.cc") add_library(metrics STATIC ${UCMMETRICS_CC_SOURCE_FILES}) target_include_directories(metrics PUBLIC - ${CMAKE_CURRENT_SOURCE_DIR}/cc + ${CMAKE_CURRENT_SOURCE_DIR}/cc/api + ${CMAKE_CURRENT_SOURCE_DIR}/cc/domain ) file(GLOB_RECURSE UCMMETRICS_CPY_SOURCE_FILES CONFIGURE_DEPENDS "./cpy/*.cc") diff --git a/ucm/shared/metrics/cc/api/metrics_api.cc b/ucm/shared/metrics/cc/api/metrics_api.cc new file mode 100644 index 000000000..bd9fa93e7 --- /dev/null +++ b/ucm/shared/metrics/cc/api/metrics_api.cc @@ -0,0 +1,48 @@ +/** + * MIT License + * + * Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * */ +#include "metrics_api.h" +namespace UC::Metrics { + +void CreateStats(const std::string& name, std::string& type) { Metrics::GetInstance().CreateStats(name, type); } + +void UpdateStats(const std::string& name, double value) +{ + Metrics::GetInstance().UpdateStats(name, value); +} + +void UpdateStats(const std::unordered_map& values) +{ + Metrics::GetInstance().UpdateStats(values); +} + +std::tuple< + std::unordered_map, + std::unordered_map, + std::unordered_map> + > GetAllStatsAndClear() +{ + return Metrics::GetInstance().GetAllStatsAndClear(); +} + +} // namespace UC::Metrics \ No newline at end of file diff --git a/ucm/shared/metrics/cc/api/metrics_api.h b/ucm/shared/metrics/cc/api/metrics_api.h new file mode 100644 index 000000000..a9a3e64b3 --- /dev/null +++ b/ucm/shared/metrics/cc/api/metrics_api.h @@ -0,0 +1,42 @@ +/** + * MIT License + * + * Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * */ +#ifndef UNIFIEDCACHE_METRICS_API_H +#define UNIFIEDCACHE_METRICS_API_H +#include "metrics.h" + +namespace UC::Metrics { +void CreateStats(const std::string& name, std::string& type); + +void UpdateStats(const std::string& name, double value); + +void UpdateStats(const std::unordered_map& values); + +std::tuple< + std::unordered_map, + std::unordered_map, + std::unordered_map> + > GetAllStatsAndClear(); + +} // namespace UC::Metrics +#endif \ No newline at end of file diff --git a/ucm/shared/metrics/cc/metrics.cc b/ucm/shared/metrics/cc/domain/metrics.cc similarity index 100% rename from ucm/shared/metrics/cc/metrics.cc rename to ucm/shared/metrics/cc/domain/metrics.cc diff --git a/ucm/shared/metrics/cc/metrics.h b/ucm/shared/metrics/cc/domain/metrics.h similarity index 100% rename from ucm/shared/metrics/cc/metrics.h rename to ucm/shared/metrics/cc/domain/metrics.h diff --git a/ucm/shared/metrics/cpy/metrics.py.cc b/ucm/shared/metrics/cpy/metrics.py.cc index 02dd98f0e..cf149f1c0 100644 --- a/ucm/shared/metrics/cpy/metrics.py.cc +++ b/ucm/shared/metrics/cpy/metrics.py.cc @@ -23,31 +23,11 @@ * */ #include #include -#include "metrics.h" +#include "metrics_api.h" namespace py = pybind11; namespace UC::Metrics { -inline void CreateStats(const std::string& name, std::string& type) { - Metrics::GetInstance().CreateStats(name, type); -} - -inline void UpdateStats(const std::string& name, double value) { - Metrics::GetInstance().UpdateStats(name, value); -} - -inline void UpdateStats(const std::unordered_map& params) { - Metrics::GetInstance().UpdateStats(params); -} - -inline std::tuple< - std::unordered_map, - std::unordered_map, - std::unordered_map> -> GetAllStatsAndClear() { - return Metrics::GetInstance().GetAllStatsAndClear(); -} - void bind_monitor(py::module_& m) { m.def("create_stats", &CreateStats); From a2b04379b890783ea3413f9fc5a3127241a02cdf Mon Sep 17 00:00:00 2001 From: flesher0813 <1208954694@qq.com> Date: Thu, 18 Dec 2025 10:29:42 +0800 Subject: [PATCH 7/9] add example --- examples/metrics/metrics_configs.yaml | 15 ++-- ucm/integration/vllm/ucm_connector.py | 26 +++--- ucm/observability.py | 86 +++++++++++-------- ucm/shared/metrics/cc/domain/metrics.h | 4 +- .../example/metrics/monitor_stats_example.py | 12 ++- ucm/store/detail/task/task_shard.h | 12 +++ ucm/store/nfsstore/CMakeLists.txt | 2 +- 7 files changed, 93 insertions(+), 64 deletions(-) diff --git a/examples/metrics/metrics_configs.yaml b/examples/metrics/metrics_configs.yaml index a4dba4fa3..c16753a5d 100644 --- a/examples/metrics/metrics_configs.yaml +++ b/examples/metrics/metrics_configs.yaml @@ -19,10 +19,10 @@ metric_prefix: "ucm:" # Histogram metrics configuration histogram: - - name: "load_requests_num" + - name: "load_requests_total" documentation: "Number of requests loaded from ucm" buckets: [1, 5, 10, 20, 50, 100, 200, 500, 1000] - - name: "load_blocks_num" + - name: "load_blocks_total" documentation: "Number of blocks loaded from ucm" buckets: [0, 50, 100, 150, 200, 250, 300, 350, 400, 550, 600, 750, 800, 850, 900, 950, 1000] - name: "load_duration" @@ -31,10 +31,10 @@ histogram: - name: "load_speed" documentation: "Speed of loading from ucm (GB/s)" buckets: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 50, 60, 70, 80, 90, 100] - - name: "save_requests_num" + - name: "save_requests_total" documentation: "Number of requests saved to ucm" buckets: [1, 5, 10, 20, 50, 100, 200, 500, 1000] - - name: "save_blocks_num" + - name: "save_blocks_total" documentation: "Number of blocks saved to ucm" buckets: [0, 50, 100, 150, 200, 250, 300, 350, 400, 550, 600, 750, 800, 850, 900, 950, 1000] - name: "save_duration" @@ -46,4 +46,9 @@ histogram: - name: "interval_lookup_hit_rates" documentation: "Hit rates of ucm lookup requests" buckets: [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] - + - name: "s2d_bandwidth" + documentation: "Band width of uc store task s2d, copy tensors from storage to device" + buckets: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + - name: "d2s_bandwidth" + documentation: "Band width of uc store task d2s, copy tensors from device to storage" + buckets: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] diff --git a/ucm/integration/vllm/ucm_connector.py b/ucm/integration/vllm/ucm_connector.py index ee8558e84..adb9dab2c 100644 --- a/ucm/integration/vllm/ucm_connector.py +++ b/ucm/integration/vllm/ucm_connector.py @@ -18,8 +18,8 @@ from vllm.v1.core.sched.output import SchedulerOutput from ucm.logger import init_logger -from ucm.shared.metrics import ucmmonitor -from ucm.shared.metrics.observability import UCMStatsLogger +from ucm.observability import PrometheusStatsLogger +from ucm.shared.metrics import ucmmetrics from ucm.store.factory_v1 import UcmConnectorFactoryV1 from ucm.store.ucmstore_v1 import Task, UcmKVStoreBaseV1 from ucm.utils import Config @@ -160,12 +160,11 @@ def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): self.metrics_config = self.launch_config.get("metrics_config_path", "") if self.metrics_config: - self.stats_logger = UCMStatsLogger( + self.stats_logger = PrometheusStatsLogger( vllm_config.model_config.served_model_name, self.global_rank, self.metrics_config, ) - self.monitor = ucmmonitor.StatsMonitor.get_instance() self.synchronize = ( torch.cuda.synchronize @@ -295,8 +294,7 @@ def get_num_new_matched_tokens( f"hit external: {external_hit_blocks}" ) if self.metrics_config: - self.monitor.update_stats( - "ConnStats", + ucmmetrics.update_stats( {"interval_lookup_hit_rates": external_hit_blocks / len(ucm_block_ids)}, ) @@ -539,14 +537,13 @@ def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None: / 1024 ) # GB/s if self.metrics_config and is_load: - self.monitor.update_stats( - "ConnStats", + ucmmetrics.update_stats( { - "load_requests_num": num_loaded_request, - "load_blocks_num": num_loaded_block, + "load_requests_total": num_loaded_request, + "load_blocks_total": num_loaded_block, "load_duration": load_end_time - load_start_time, "load_speed": load_speed, - }, + } ) def wait_for_layer_load(self, layer_name: str) -> None: @@ -615,11 +612,10 @@ def wait_for_save(self) -> None: / 1024 ) # GB/s if self.metrics_config and is_save: - self.monitor.update_stats( - "ConnStats", + ucmmetrics.update_stats( { - "save_requests_num": num_saved_request, - "save_blocks_num": num_saved_block, + "save_requests_total": num_saved_request, + "save_blocks_total": num_saved_block, "save_duration": save_end_time - save_start_time, "save_speed": save_speed, }, diff --git a/ucm/observability.py b/ucm/observability.py index e35138049..f12153556 100644 --- a/ucm/observability.py +++ b/ucm/observability.py @@ -28,16 +28,17 @@ import time from typing import Any, Union -from prometheus_client import Gauge, Counter, Histogram import yaml +from prometheus_client import Counter, Gauge, Histogram from ucm.logger import init_logger from ucm.shared.metrics import ucmmetrics logger = init_logger(__name__) + class PrometheusStatsLogger: - + def _load_config(self, config_path: str) -> dict[str, Any]: """Load configuration from YAML file""" try: @@ -57,9 +58,14 @@ def _load_config(self, config_path: str) -> dict[str, Any]: return {} def __init__(self, model_name, worker_id, config_path): - # Ensure PROMETHEUS_MULTIPROC_DIR is set before any metric registration + """ + Load metrics config from YAML file (config_path), + register metrics using prometheus_client, and start a thread to get updated metrics. + """ + # Load metrics config self.config = self._load_config(config_path) self.log_interval = self.config.get("log_interval", 10) + multiproc_dir = self.config.get("multiproc_dir", "/vllm-workspace") if "PROMETHEUS_MULTIPROC_DIR" not in os.environ: os.environ["PROMETHEUS_MULTIPROC_DIR"] = multiproc_dir @@ -71,50 +77,46 @@ def __init__(self, model_name, worker_id, config_path): "worker_id": worker_id, } self.labelnames = list(self.labels.keys()) - - # Initialize metrics based on configuration + self.metric_type_config = { - "counter": ( - Counter, - {} - ), - "gauge": ( - Gauge, - {"multiprocess_mode": "all"} - ), - "histogram": ( - Histogram, - {"buckets": []} - ) + "counter": (Counter, {}), + "gauge": (Gauge, {"multiprocess_mode": "all"}), + "histogram": (Histogram, {"buckets": []}), } + # Initialize metrics based on config self._init_metrics_from_config() + + # Start thread to update metrics self.is_running = True - self.thread = threading.Thread(target=self.obtain_stats_thread, daemon=True) + self.thread = threading.Thread(target=self.update_stats_loop, daemon=True) self.thread.start() - - def _process_metric_group(self, group_name): - metric_cls, default_kwargs = self.metric_type_config[group_name] - cfg_list = self.config.get(group_name, []) - + + def _register_metrics_by_type(self, metric_type): + """ + Register metrics by different metric types. + """ + metric_cls, default_kwargs = self.metric_type_config[metric_type] + cfg_list = self.config.get(metric_type, []) + for cfg in cfg_list: name = cfg.get("name") doc = cfg.get("documentation", "") # Prometheus metric name with prefix prometheus_name = f"{self.metric_prefix}{name}" - ucmmetrics.create_stats(name, group_name) - + ucmmetrics.create_stats(name, metric_type) + metric_kwargs = { "name": prometheus_name, "documentation": doc, "labelnames": self.labelnames, **default_kwargs, - **{k: v for k, v in cfg.items() if k in default_kwargs} + **{k: v for k, v in cfg.items() if k in default_kwargs}, } - + self.metric_mappings[name] = metric_cls(**metric_kwargs) def _init_metrics_from_config(self): - """Initialize metrics based on configuration""" + """Initialize metrics based on config""" # Get metric name prefix from config (e.g., "ucm:") self.metric_prefix = self.config.get("metric_prefix", "ucm:") @@ -122,9 +124,9 @@ def _init_metrics_from_config(self): # This mapping will be used in update_stats to dynamically log metrics self.metric_mappings: dict[str, Union[Counter, Gauge, Histogram]] = {} - for group_name in self.metric_type_config.keys(): - self._process_metric_group(group_name) - + for metric_type in self.metric_type_config.keys(): + self._register_metrics_by_type(metric_type) + def _update_counter(self, metric, value): if value < 0: return @@ -136,13 +138,17 @@ def _update_gauge(self, metric, value): def _update_histogram(self, metric, value): for data in value: metric.observe(data) - + def _update_with_func(self, update_func, stats: dict[str, Any], op_desc: str): + """ + Generic update for Prometheus metrics: match metrics by name, bind labels, + and update values via the specified function (update_func). + """ for stat_name, value in stats.items(): if stat_name not in self.metric_mappings: logger.error(f"Metric {stat_name} not found") continue - + metric = self.metric_mappings[stat_name] try: metric_with_labels = metric.labels(**self.labels) @@ -153,8 +159,9 @@ def _update_with_func(self, update_func, stats: dict[str, Any], op_desc: str): logger.debug(f"Failed to {op_desc} {stat_name}: {e}") def update_stats(self, counter_stats, gauge_stats, histogram_stats): - """Log metrics to Prometheus based on configuration file""" - # Dynamically log metrics based on what's configured in YAML + """ + Update all Prometheus metrics (Counter/Gauge/Histogram) with given stats. + """ update_tasks = [ (self._update_counter, counter_stats, "increment"), (self._update_gauge, gauge_stats, "set"), @@ -163,9 +170,14 @@ def update_stats(self, counter_stats, gauge_stats, histogram_stats): for update_func, stats, op_desc in update_tasks: self._update_with_func(update_func, stats, op_desc) - def obtain_stats_thread(self): + def update_stats_loop(self): + """ + Periodically update Prometheus metrics in a loop until stopped. + """ while self.is_running: - counter_stats, gauge_stats, histogram_stats = ucmmetrics.get_all_stats_and_clear() + counter_stats, gauge_stats, histogram_stats = ( + ucmmetrics.get_all_stats_and_clear() + ) self.update_stats(counter_stats, gauge_stats, histogram_stats) time.sleep(self.log_interval) diff --git a/ucm/shared/metrics/cc/domain/metrics.h b/ucm/shared/metrics/cc/domain/metrics.h index 6e3a531f5..087bf402d 100644 --- a/ucm/shared/metrics/cc/domain/metrics.h +++ b/ucm/shared/metrics/cc/domain/metrics.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ -#ifndef UNIFIEDCACHE_MONITOR_H -#define UNIFIEDCACHE_MONITOR_H +#ifndef UNIFIEDCACHE_METRICS_H +#define UNIFIEDCACHE_METRICS_H #include #include diff --git a/ucm/shared/test/example/metrics/monitor_stats_example.py b/ucm/shared/test/example/metrics/monitor_stats_example.py index 2e86a4404..94b60ade1 100644 --- a/ucm/shared/test/example/metrics/monitor_stats_example.py +++ b/ucm/shared/test/example/metrics/monitor_stats_example.py @@ -42,15 +42,17 @@ def wrapper(*args, **kwargs): def metrics_with_update_stats(): ucmmetrics.create_stats("counter_1", "counter") ucmmetrics.update_stats("counter_1", 1.2) - + ucmmetrics.create_stats("gauge_1", "gauge") ucmmetrics.update_stats("gauge_1", 2.2) - + ucmmetrics.create_stats("histogram_1", "histogram") ucmmetrics.update_stats("histogram_1", 1) counters, gauges, histograms = ucmmetrics.get_all_stats_and_clear() - print(f"After clear then get counters: {counters}, gauges: {gauges}, histograms: {histograms}") + print( + f"After clear then get counters: {counters}, gauges: {gauges}, histograms: {histograms}" + ) ucmmetrics.update_stats( { "counter_1": 5, @@ -59,7 +61,9 @@ def metrics_with_update_stats(): } ) counters, gauges, histograms = ucmmetrics.get_all_stats_and_clear() - print(f"After clear then get counters: {counters}, gauges: {gauges}, histograms: {histograms}") + print( + f"After clear then get counters: {counters}, gauges: {gauges}, histograms: {histograms}" + ) if __name__ == "__main__": diff --git a/ucm/store/detail/task/task_shard.h b/ucm/store/detail/task/task_shard.h index 2f71738fa..8ab9e3827 100644 --- a/ucm/store/detail/task/task_shard.h +++ b/ucm/store/detail/task/task_shard.h @@ -35,6 +35,7 @@ #include #include "logger/logger.h" #include "task_waiter.h" +#include "metrics_api.h" namespace UC { @@ -129,6 +130,17 @@ class Task { auto wait = execTp_ - startTp_; auto exec = NowTp() - execTp_; auto bw = size_ / exec / 1024 / 1024 / 1024; + switch (type_) + { + case Type::DUMP: + UC::Metrics::UpdateStats("d2s_bandwidth", bw); + break; + case Type::LOAD: + UC::Metrics::UpdateStats("s2d_bandwidth", bw); + break; + default: + break; + } return fmt::format("wait={:.06f}s, exec={:.06f}s, bw={:.06f}GB/s", wait, exec, bw); } diff --git a/ucm/store/nfsstore/CMakeLists.txt b/ucm/store/nfsstore/CMakeLists.txt index b48376278..426d4aebe 100644 --- a/ucm/store/nfsstore/CMakeLists.txt +++ b/ucm/store/nfsstore/CMakeLists.txt @@ -6,7 +6,7 @@ target_include_directories(nfsstore PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/cc/api ${CMAKE_CURRENT_SOURCE_DIR}/cc/domain ) -target_link_libraries(nfsstore PUBLIC storeintf storedevice infra_logger) +target_link_libraries(nfsstore PUBLIC storeintf storedevice infra_logger metrics) file(GLOB_RECURSE UCMSTORE_NFS_CPY_SOURCE_FILES "./cpy/*.cc") pybind11_add_module(ucmnfsstore ${UCMSTORE_NFS_CPY_SOURCE_FILES}) From dfddaf1cb7f7f9b6ba6fb7edf1470f16e3785154 Mon Sep 17 00:00:00 2001 From: flesher0813 <1208954694@qq.com> Date: Thu, 18 Dec 2025 14:36:26 +0800 Subject: [PATCH 8/9] add example --- examples/metrics/grafana.json | 215 +++++++++++++++++- ucm/store/CMakeLists.txt | 2 +- ucm/store/detail/task/task_shard.h | 13 +- ucm/store/nfsstore/CMakeLists.txt | 2 +- .../pcstore/cc/domain/trans/trans_task.h | 12 + 5 files changed, 226 insertions(+), 18 deletions(-) diff --git a/examples/metrics/grafana.json b/examples/metrics/grafana.json index 72d175971..ceb8c8b8b 100644 --- a/examples/metrics/grafana.json +++ b/examples/metrics/grafana.json @@ -225,7 +225,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "rate(ucm:load_requests_num_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(ucm:load_requests_num_count{model_name=\"$model_name\"}[$__rate_interval])", + "expr": "rate(ucm:load_requests_total_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(ucm:load_requests_total_count{model_name=\"$model_name\"}[$__rate_interval])", "hide": false, "instant": false, "legendFormat": "worker-{{worker_id}}", @@ -328,7 +328,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "rate(ucm:load_blocks_num_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(ucm:load_blocks_num_count{model_name=\"$model_name\"}[$__rate_interval])", + "expr": "rate(ucm:load_blocks_total_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(ucm:load_blocks_total_count{model_name=\"$model_name\"}[$__rate_interval])", "hide": false, "instant": false, "legendFormat": "worker-{{worker_id}}", @@ -638,7 +638,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "rate(ucm:save_requests_num_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(ucm:save_requests_num_count{model_name=\"$model_name\"}[$__rate_interval])", + "expr": "rate(ucm:save_requests_total_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(ucm:save_requests_total_count{model_name=\"$model_name\"}[$__rate_interval])", "hide": false, "instant": false, "legendFormat": "worker-{{worker_id}}", @@ -741,7 +741,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "rate(ucm:save_blocks_num_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(ucm:save_blocks_num_count{model_name=\"$model_name\"}[$__rate_interval])", + "expr": "rate(ucm:save_blocks_total_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(ucm:save_blocks_total_count{model_name=\"$model_name\"}[$__rate_interval])", "hide": false, "instant": false, "legendFormat": "worker-{{worker_id}}", @@ -957,7 +957,214 @@ ], "title": "Connector Save Speed", "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "P50 S2D speed in GB/s for each start_load_kv.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 0.005 + }, + { + "color": "green", + "value": 0.01 + } + ] + }, + "unit": "gb/s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 21, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(ucm:s2d_bandwidth_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(ucm:s2d_bandwidth_count{model_name=\"$model_name\"}[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "worker-{{worker_id}}", + "range": true, + "refId": "A" + } + ], + "title": "NFS S2D BandWidth", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "P50 D2S speed in GB/s for each start_load_kv.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 0.005 + }, + { + "color": "green", + "value": 0.01 + } + ] + }, + "unit": "gb/s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 21, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(ucm:d2s_bandwidth_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(ucm:d2s_bandwidth_count{model_name=\"$model_name\"}[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "worker-{{worker_id}}", + "range": true, + "refId": "A" + } + ], + "title": "NFS D2S BandWidth", + "type": "timeseries" } + ], "refresh": "", "schemaVersion": 39, diff --git a/ucm/store/CMakeLists.txt b/ucm/store/CMakeLists.txt index b4bd46a44..8175933c8 100644 --- a/ucm/store/CMakeLists.txt +++ b/ucm/store/CMakeLists.txt @@ -1,7 +1,7 @@ add_subdirectory(detail) add_library(storeintf INTERFACE) target_include_directories(storeintf INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}) -target_link_libraries(storeintf INTERFACE storedetail infra_status) +target_link_libraries(storeintf INTERFACE storedetail infra_status metrics) add_subdirectory(nfsstore) add_subdirectory(pcstore) add_subdirectory(mooncakestore) diff --git a/ucm/store/detail/task/task_shard.h b/ucm/store/detail/task/task_shard.h index 8ab9e3827..b67c0cace 100644 --- a/ucm/store/detail/task/task_shard.h +++ b/ucm/store/detail/task/task_shard.h @@ -35,7 +35,7 @@ #include #include "logger/logger.h" #include "task_waiter.h" -#include "metrics_api.h" + namespace UC { @@ -130,17 +130,6 @@ class Task { auto wait = execTp_ - startTp_; auto exec = NowTp() - execTp_; auto bw = size_ / exec / 1024 / 1024 / 1024; - switch (type_) - { - case Type::DUMP: - UC::Metrics::UpdateStats("d2s_bandwidth", bw); - break; - case Type::LOAD: - UC::Metrics::UpdateStats("s2d_bandwidth", bw); - break; - default: - break; - } return fmt::format("wait={:.06f}s, exec={:.06f}s, bw={:.06f}GB/s", wait, exec, bw); } diff --git a/ucm/store/nfsstore/CMakeLists.txt b/ucm/store/nfsstore/CMakeLists.txt index 426d4aebe..b48376278 100644 --- a/ucm/store/nfsstore/CMakeLists.txt +++ b/ucm/store/nfsstore/CMakeLists.txt @@ -6,7 +6,7 @@ target_include_directories(nfsstore PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/cc/api ${CMAKE_CURRENT_SOURCE_DIR}/cc/domain ) -target_link_libraries(nfsstore PUBLIC storeintf storedevice infra_logger metrics) +target_link_libraries(nfsstore PUBLIC storeintf storedevice infra_logger) file(GLOB_RECURSE UCMSTORE_NFS_CPY_SOURCE_FILES "./cpy/*.cc") pybind11_add_module(ucmnfsstore ${UCMSTORE_NFS_CPY_SOURCE_FILES}) diff --git a/ucm/store/pcstore/cc/domain/trans/trans_task.h b/ucm/store/pcstore/cc/domain/trans/trans_task.h index 8fcb48fba..e0520cd34 100644 --- a/ucm/store/pcstore/cc/domain/trans/trans_task.h +++ b/ucm/store/pcstore/cc/domain/trans/trans_task.h @@ -33,6 +33,7 @@ #include #include #include +#include "metrics_api.h" namespace UC { @@ -74,6 +75,17 @@ class TransTask { auto total = ioSize * number_; auto costs = NowTp() - startTp; auto bw = double(total) / costs / 1e9; + switch (type) + { + case Type::DUMP: + UC::Metrics::UpdateStats("d2s_bandwidth", bw); + break; + case Type::LOAD: + UC::Metrics::UpdateStats("s2d_bandwidth", bw); + break; + default: + break; + } return fmt::format("Task({},{},{},{}) finished, costs={:.06f}s, bw={:.06f}GB/s.", id, brief_, number_, total, costs, bw); } From c71d023223c1e0928b2dbecb1e2d91a72db36468 Mon Sep 17 00:00:00 2001 From: flesher0813 <1208954694@qq.com> Date: Thu, 18 Dec 2025 15:13:17 +0800 Subject: [PATCH 9/9] metrics --- docs/source/developer-guide/add_metrics.md | 182 +++++++++++++-------- 1 file changed, 116 insertions(+), 66 deletions(-) diff --git a/docs/source/developer-guide/add_metrics.md b/docs/source/developer-guide/add_metrics.md index a629944da..94325a8dd 100644 --- a/docs/source/developer-guide/add_metrics.md +++ b/docs/source/developer-guide/add_metrics.md @@ -3,76 +3,113 @@ UCM supports custom metrics with bidirectional updates from both Python and C++ ## Architecture Overview The metrics consists of these components below: -- **monitor** : Central stats registry that manages all metric lifecycle operations (registration, creation, updates, queries) +- **metrics** : Defined in cpp. Central stats registry that manages all metric lifecycle operations (creation, updates, queries) - **observability.py** : Prometheus integration layer that handles metric exposition - **metrics_config.yaml** : Declarative configuration that defines which custom metrics to register and their properties ## Getting Started -### Define Metrics in YAML -Prometheus provides three fundamental metric types: Counter, Gauge, and Histogram. UCM implements corresponding wrappers for each type. The method for adding new metrics is as follows; please refer to the [example YAML](https://github.com/ModelEngine-Group/unified-cache-management/blob/develop/examples/metrics/metrics_configs.yaml) for more detailed information. +### Step 1: Define Metrics in YAML +Prometheus provides three fundamental metric types: Counter, Gauge, and Histogram. UCM implements corresponding wrappers for each type. After defining new metric in yaml, it will be registered to Promethues automatically by below function: +```python +def _register_metrics_by_type(self, metric_type): + """ + Register metrics by different metric types. + """ + metric_cls, default_kwargs = self.metric_type_config[metric_type] + cfg_list = self.config.get(metric_type, []) + + for cfg in cfg_list: + name = cfg.get("name") + doc = cfg.get("documentation", "") + # Prometheus metric name with prefix + prometheus_name = f"{self.metric_prefix}{name}" + ucmmetrics.create_stats(name, metric_type) + # Parameters for constructing a metric + metric_kwargs = { + "name": prometheus_name, + "documentation": doc, + "labelnames": self.labelnames, + **default_kwargs, + **{k: v for k, v in cfg.items() if k in default_kwargs}, + } + # Store metric mapping: metric_name -> Union[Counter, Gauge, Histogram] + self.metric_mappings[name] = metric_cls(**metric_kwargs) +``` + +Example of yaml below: ```yaml +# Prometheus Metrics Configuration +# This file defines which metrics should be enabled and their configurations log_interval: 5 # Interval in seconds for logging metrics -prometheus: - multiproc_dir: "/vllm-workspace" # Directory for Prometheus multiprocess mode - - metric_prefix: "ucm:" - - # Enable/disable metrics by category - enabled_metrics: - counters: true - gauges: true - histograms: true - - # Counter metrics configuration - counters: - - name: "received_requests" - documentation: "Total number of requests sent to ucm" - - # Gauge metrics configuration - gauges: - - name: "external_lookup_hit_rate" - documentation: "Hit rate of ucm lookup requests" - multiprocess_mode: "livemostrecent" - - # Histogram metrics configuration - histograms: - - name: "load_requests_num" - documentation: "Number of requests loaded from ucm" - buckets: [1, 5, 10, 20, 50, 100, 200, 500, 1000] +multiproc_dir: "/vllm-workspace" # Directory for Prometheus multiprocess mode + +metric_prefix: "ucm:" + +# Counter metrics configuration +# counter: +# - name: "received_requests" +# documentation: "Total number of requests sent to ucm" + +# Gauge metrics configuration +# gauge: +# - name: "lookup_hit_rate" +# documentation: "Hit rate of ucm lookup requests since last log" +# multiprocess_mode: "livemostrecent" + +# Histogram metrics configuration +histogram: + - name: "load_requests_total" + documentation: "Number of requests loaded from ucm" + buckets: [1, 5, 10, 20, 50, 100, 200, 500, 1000] + - name: "load_blocks_total" + documentation: "Number of blocks loaded from ucm" + buckets: [0, 50, 100, 150, 200, 250, 300, 350, 400, 550, 600, 750, 800, 850, 900, 950, 1000] + - name: "load_duration" + documentation: "Time to load from ucm (ms)" + buckets: [0, 50, 100, 150, 200, 250, 300, 350, 400, 550, 600, 750, 800, 850, 900, 950, 1000] + - name: "load_speed" + documentation: "Speed of loading from ucm (GB/s)" + buckets: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 50, 60, 70, 80, 90, 100] + - name: "interval_lookup_hit_rates" + documentation: "Hit rates of ucm lookup requests" + buckets: [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] + - name: "s2d_bandwidth" + documentation: "Band width of uc store task s2d, copy tensors from storage to device" + buckets: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + - name: "d2s_bandwidth" + documentation: "Band width of uc store task d2s, copy tensors from device to storage" + buckets: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] ``` +Please refer to the [example YAML](https://github.com/ModelEngine-Group/unified-cache-management/blob/develop/examples/metrics/metrics_configs.yaml) for more detailed information. -### Use Monitor APIs to Update Stats -The monitor provides a unified interface for metric operations. Users only need to create stats and update them, while the observability component is responsible for fetching the stats and pushing them to Prometheus. +### Step 2: Use Metrics APIs to Update Stats +The metrics provides a unified interface for metric operations. After defining metrics in yaml, users only need to link metrics/import ucmmetrics and update them in suitable position, while the observability component is responsible for fetching the stats and pushing them to Prometheus. :::::{tab-set} :sync-group: install ::::{tab-item} Python side interfaces :selected: :sync: py -**Lifecycle Methods** -- `create_stats(name)`: Create and initialize a registered stats object. - -**Operation Methods** -- `update_stats(name, dict)`: Update specific fields of a specific stats object. -- `get_stats(name)`: Retrieve current values of a specific stats object. -- `get_stats_and_clear(name)`: Retrieve and reset a specific stats object. -- `get_all_stats_and_clear()`: Retrieve and reset all stats objects. -- `reset_stats(name)`: Reset a specific stats object to initial state. -- `reset_all()`: Reset all stats registered in monitor. - **Example:** Using built-in ConnStats ```python -from ucm.shared.metrics import ucmmonitor - -ucmmonitor.create_stats("ConnStats") # Create a stats obj +# 1. Import ucmmetrics +from ucm.shared.metrics import ucmmetrics -# Update stats -ucmmonitor.update_stats( - "ConnStats", - {"interval_lookup_hit_rates": external_hit_blocks / len(ucm_block_ids)}, +# 2. Update a stat +ucmmetrics.update_stats( + {"interval_lookup_hit_rates": external_hit_blocks / len(ucm_block_ids)}, ) +# 2. Update stats +ucmmetrics.update_stats( + { + "load_requests_total": num_loaded_request, + "load_blocks_total": num_loaded_block, + "load_duration": load_end_time - load_start_time, + "load_speed": load_speed, + } +) ``` See more detailed example in [test case](https://github.com/ModelEngine-Group/unified-cache-management/tree/develop/ucm/shared/test/example). @@ -80,27 +117,40 @@ See more detailed example in [test case](https://github.com/ModelEngine-Group/un ::::{tab-item} C++ side interfaces :sync: cc -**Lifecycle Methods** -- `CreateStats(const std::string& name)`: Create and initialize a registered stats object. - -**Operation Methods** -- `UpdateStats(const std::string& name, const std::unordered_map& params)`: Update specific fields of a specific stats object. -- `ResetStats(const std::string& name)`: Retrieve current values of a specific stats object. -- `ResetAllStats()`: Retrieve and reset a specific stats object. -- `GetStats(const std::string& name)`: Retrieve and reset all stats objects. -- `GetStatsAndClear(const std::string& name)`: Reset a specific stats object to initial state. -- `GetAllStatsAndClear()`: Reset all stats registered in monitor. **Example:** Implementing custom stats in C++ UCM supports custom metrics by following steps: -- Step 1: linking the static library monitor_static +- Step 1: linking the static library metrics ```c++ target_link_libraries(xxxstore PUBLIC storeinfra monitor_static) ``` -- Step 2: Create stats object using function **CreateStats** +- - Step 3: Update using function **UpdateStats** - -See more detailed example in [test case](https://github.com/ModelEngine-Group/unified-cache-management/tree/develop/ucm/shared/test/case). - +```c++ +// 1. Include metrics api head file +#include "metrics_api.h" + +// 2. Update metrics defined in yaml +auto Epilog(const size_t ioSize) const noexcept + { + auto total = ioSize * number_; + auto costs = NowTp() - startTp; + auto bw = double(total) / costs / 1e9; + switch (type) + { + case Type::DUMP: + UC::Metrics::UpdateStats("d2s_bandwidth", bw); + break; + case Type::LOAD: + UC::Metrics::UpdateStats("s2d_bandwidth", bw); + break; + default: + break; + } + return fmt::format("Task({},{},{},{}) finished, costs={:.06f}s, bw={:.06f}GB/s.", id, + brief_, number_, total, costs, bw); + } +``` :::: -::::: \ No newline at end of file +::::: +