2828import time
2929from typing import Any , Union
3030
31- from prometheus_client import Gauge , Counter , Histogram
3231import yaml
32+ from prometheus_client import Counter , Gauge , Histogram
3333
3434from ucm .logger import init_logger
3535from ucm .shared .metrics import ucmmetrics
3636
3737logger = init_logger (__name__ )
3838
39+
3940class PrometheusStatsLogger :
40-
41+
4142 def _load_config (self , config_path : str ) -> dict [str , Any ]:
4243 """Load configuration from YAML file"""
4344 try :
@@ -57,9 +58,14 @@ def _load_config(self, config_path: str) -> dict[str, Any]:
5758 return {}
5859
5960 def __init__ (self , model_name , worker_id , config_path ):
60- # Ensure PROMETHEUS_MULTIPROC_DIR is set before any metric registration
61+ """
62+ Load metrics config from YAML file (config_path),
63+ register metrics using prometheus_client, and start a thread to get updated metrics.
64+ """
65+ # Load metrics config
6166 self .config = self ._load_config (config_path )
6267 self .log_interval = self .config .get ("log_interval" , 10 )
68+
6369 multiproc_dir = self .config .get ("multiproc_dir" , "/vllm-workspace" )
6470 if "PROMETHEUS_MULTIPROC_DIR" not in os .environ :
6571 os .environ ["PROMETHEUS_MULTIPROC_DIR" ] = multiproc_dir
@@ -71,60 +77,56 @@ def __init__(self, model_name, worker_id, config_path):
7177 "worker_id" : worker_id ,
7278 }
7379 self .labelnames = list (self .labels .keys ())
74-
75- # Initialize metrics based on configuration
80+
7681 self .metric_type_config = {
77- "counter" : (
78- Counter ,
79- {}
80- ),
81- "gauge" : (
82- Gauge ,
83- {"multiprocess_mode" : "all" }
84- ),
85- "histogram" : (
86- Histogram ,
87- {"buckets" : []}
88- )
82+ "counter" : (Counter , {}),
83+ "gauge" : (Gauge , {"multiprocess_mode" : "all" }),
84+ "histogram" : (Histogram , {"buckets" : []}),
8985 }
86+ # Initialize metrics based on config
9087 self ._init_metrics_from_config ()
88+
89+ # Start thread to update metrics
9190 self .is_running = True
92- self .thread = threading .Thread (target = self .obtain_stats_thread , daemon = True )
91+ self .thread = threading .Thread (target = self .update_stats_loop , daemon = True )
9392 self .thread .start ()
94-
95- def _process_metric_group (self , group_name ):
96- metric_cls , default_kwargs = self .metric_type_config [group_name ]
97- cfg_list = self .config .get (group_name , [])
98-
93+
94+ def _register_metrics_by_type (self , metric_type ):
95+ """
96+ Register metrics by different metric types.
97+ """
98+ metric_cls , default_kwargs = self .metric_type_config [metric_type ]
99+ cfg_list = self .config .get (metric_type , [])
100+
99101 for cfg in cfg_list :
100102 name = cfg .get ("name" )
101103 doc = cfg .get ("documentation" , "" )
102104 # Prometheus metric name with prefix
103105 prometheus_name = f"{ self .metric_prefix } { name } "
104- ucmmetrics .create_stats (name , group_name )
105-
106+ ucmmetrics .create_stats (name , metric_type )
107+
106108 metric_kwargs = {
107109 "name" : prometheus_name ,
108110 "documentation" : doc ,
109111 "labelnames" : self .labelnames ,
110112 ** default_kwargs ,
111- ** {k : v for k , v in cfg .items () if k in default_kwargs }
113+ ** {k : v for k , v in cfg .items () if k in default_kwargs },
112114 }
113-
115+
114116 self .metric_mappings [name ] = metric_cls (** metric_kwargs )
115117
116118 def _init_metrics_from_config (self ):
117- """Initialize metrics based on configuration """
119+ """Initialize metrics based on config """
118120 # Get metric name prefix from config (e.g., "ucm:")
119121 self .metric_prefix = self .config .get ("metric_prefix" , "ucm:" )
120122
121123 # Store metric mapping: metric_name -> Union[Counter, Gauge, Histogram]
122124 # This mapping will be used in update_stats to dynamically log metrics
123125 self .metric_mappings : dict [str , Union [Counter , Gauge , Histogram ]] = {}
124126
125- for group_name in self .metric_type_config .keys ():
126- self ._process_metric_group ( group_name )
127-
127+ for metric_type in self .metric_type_config .keys ():
128+ self ._register_metrics_by_type ( metric_type )
129+
128130 def _update_counter (self , metric , value ):
129131 if value < 0 :
130132 return
@@ -136,13 +138,17 @@ def _update_gauge(self, metric, value):
136138 def _update_histogram (self , metric , value ):
137139 for data in value :
138140 metric .observe (data )
139-
141+
140142 def _update_with_func (self , update_func , stats : dict [str , Any ], op_desc : str ):
143+ """
144+ Generic update for Prometheus metrics: match metrics by name, bind labels,
145+ and update values via the specified function (update_func).
146+ """
141147 for stat_name , value in stats .items ():
142148 if stat_name not in self .metric_mappings :
143149 logger .error (f"Metric { stat_name } not found" )
144150 continue
145-
151+
146152 metric = self .metric_mappings [stat_name ]
147153 try :
148154 metric_with_labels = metric .labels (** self .labels )
@@ -153,8 +159,9 @@ def _update_with_func(self, update_func, stats: dict[str, Any], op_desc: str):
153159 logger .debug (f"Failed to { op_desc } { stat_name } : { e } " )
154160
155161 def update_stats (self , counter_stats , gauge_stats , histogram_stats ):
156- """Log metrics to Prometheus based on configuration file"""
157- # Dynamically log metrics based on what's configured in YAML
162+ """
163+ Update all Prometheus metrics (Counter/Gauge/Histogram) with given stats.
164+ """
158165 update_tasks = [
159166 (self ._update_counter , counter_stats , "increment" ),
160167 (self ._update_gauge , gauge_stats , "set" ),
@@ -163,9 +170,14 @@ def update_stats(self, counter_stats, gauge_stats, histogram_stats):
163170 for update_func , stats , op_desc in update_tasks :
164171 self ._update_with_func (update_func , stats , op_desc )
165172
166- def obtain_stats_thread (self ):
173+ def update_stats_loop (self ):
174+ """
175+ Periodically update Prometheus metrics in a loop until stopped.
176+ """
167177 while self .is_running :
168- counter_stats , gauge_stats , histogram_stats = ucmmetrics .get_all_stats_and_clear ()
178+ counter_stats , gauge_stats , histogram_stats = (
179+ ucmmetrics .get_all_stats_and_clear ()
180+ )
169181 self .update_stats (counter_stats , gauge_stats , histogram_stats )
170182 time .sleep (self .log_interval )
171183
0 commit comments