22import os
33
44from fastapi import FastAPI
5- from opentelemetry import _logs , trace
5+ from opentelemetry import _logs , metrics , trace
66from opentelemetry .exporter .otlp .proto .grpc ._log_exporter import OTLPLogExporter
77from opentelemetry .exporter .otlp .proto .grpc .trace_exporter import OTLPSpanExporter
88from opentelemetry .instrumentation .fastapi import FastAPIInstrumentor
@@ -44,23 +44,62 @@ def _instrument_google_generativeai():
4444
4545
4646def configure (name : str = "kagent" , namespace : str = "kagent" , fastapi_app : FastAPI | None = None ):
47- """Configure OpenTelemetry tracing and logging for this service.
47+ """Configure OpenTelemetry tracing, logging, and metrics for this service.
4848
49- This sets up OpenTelemetry providers and exporters for tracing and logging,
50- using environment variables to determine whether each is enabled.
49+ This sets up OpenTelemetry providers and exporters for tracing, logging,
50+ and metrics, using environment variables to determine whether each is enabled.
51+
52+ Providers are configured before instrumentors so that instrumentors can
53+ discover and use all available providers (TracerProvider, MeterProvider, etc.).
5154
5255 Args:
5356 name: service name to report to OpenTelemetry (used as ``service.name``). Default is "kagent".
5457 namespace: logical namespace for the service (used as ``service.namespace``). Default is "kagent".
5558 fastapi_app: Optional FastAPI application instance to instrument. If
5659 provided and tracing is enabled, FastAPI routes will be instrumented.
60+ If metrics is enabled, a ``/metrics`` endpoint will be added for
61+ Prometheus scraping.
5762 """
5863 tracing_enabled = os .getenv ("OTEL_TRACING_ENABLED" , "false" ).lower () == "true"
5964 logging_enabled = os .getenv ("OTEL_LOGGING_ENABLED" , "false" ).lower () == "true"
65+ metrics_enabled = os .getenv ("OTEL_METRICS_ENABLED" , "false" ).lower () == "true"
6066
6167 resource = Resource ({"service.name" : name , "service.namespace" : namespace })
6268
63- # Configure tracing if enabled
69+ # ------------------------------------------------------------------ #
70+ # 1. Configure providers BEFORE instrumentors so that instrumentors #
71+ # can discover MeterProvider, TracerProvider, etc. at init time. #
72+ # ------------------------------------------------------------------ #
73+
74+ # 1a. Metrics provider (Prometheus pull endpoint)
75+ if metrics_enabled :
76+ logging .info ("Enabling Prometheus metrics" )
77+ try :
78+ from opentelemetry .exporter .prometheus import PrometheusMetricReader
79+ from opentelemetry .sdk .metrics import MeterProvider
80+
81+ reader = PrometheusMetricReader ()
82+ meter_provider = MeterProvider (resource = resource , metric_readers = [reader ])
83+ metrics .set_meter_provider (meter_provider )
84+ logging .info ("MeterProvider configured with Prometheus exporter" )
85+
86+ if fastapi_app :
87+ from prometheus_client import CONTENT_TYPE_LATEST , generate_latest
88+ from starlette .responses import Response
89+
90+ @fastapi_app .get ("/metrics" )
91+ async def metrics_endpoint ():
92+ return Response (content = generate_latest (), media_type = CONTENT_TYPE_LATEST )
93+
94+ logging .info ("Added /metrics endpoint for Prometheus scraping" )
95+ except ImportError :
96+ logging .warning (
97+ "opentelemetry-exporter-prometheus is not installed; "
98+ "metrics endpoint will not be available. "
99+ "Install it with: pip install opentelemetry-exporter-prometheus"
100+ )
101+
102+ # 1b. Tracing provider
64103 if tracing_enabled :
65104 logging .info ("Enabling tracing" )
66105 # Check standard OTEL env vars: signal-specific endpoint first, then general endpoint
@@ -90,10 +129,8 @@ def configure(name: str = "kagent", namespace: str = "kagent", fastapi_app: Fast
90129 trace .set_tracer_provider (tracer_provider )
91130 logging .info ("Created new TracerProvider" )
92131
93- HTTPXClientInstrumentor ().instrument ()
94- if fastapi_app :
95- FastAPIInstrumentor ().instrument_app (fastapi_app )
96- # Configure logging if enabled
132+ # 1c. Logging provider
133+ event_logger_provider = None
97134 if logging_enabled :
98135 logging .info ("Enabling logging for GenAI events" )
99136 logger_provider = LoggerProvider (resource = resource )
@@ -114,15 +151,128 @@ def configure(name: str = "kagent", namespace: str = "kagent", fastapi_app: Fast
114151
115152 _logs .set_logger_provider (logger_provider )
116153 logging .info ("Log provider configured with OTLP" )
117- # When logging is enabled, use new event-based approach (input/output as log events in Body)
118- logging .info ("OpenAI instrumentation configured with event logging capability" )
119- # Create event logger provider using the configured logger provider
154+ # Create event logger provider for instrumentors
120155 event_logger_provider = EventLoggerProvider (logger_provider )
156+
157+ # ------------------------------------------------------------------ #
158+ # 2. Instrument libraries — all providers are now available. #
159+ # ------------------------------------------------------------------ #
160+
161+ if tracing_enabled :
162+ HTTPXClientInstrumentor ().instrument ()
163+ if fastapi_app :
164+ FastAPIInstrumentor ().instrument_app (fastapi_app )
165+
166+ if event_logger_provider :
167+ # Event logging mode: input/output as log events in Body
168+ logging .info ("OpenAI instrumentation configured with event logging capability" )
121169 OpenAIInstrumentor (use_legacy_attributes = False ).instrument (event_logger_provider = event_logger_provider )
122170 _instrument_anthropic (event_logger_provider )
123171 else :
124- # Use legacy attributes ( input/output as GenAI span attributes)
172+ # Legacy attributes mode: input/output as GenAI span attributes
125173 logging .info ("OpenAI instrumentation configured with legacy GenAI span attributes" )
126174 OpenAIInstrumentor ().instrument ()
127175 _instrument_anthropic ()
128176 _instrument_google_generativeai ()
177+
178+ # ------------------------------------------------------------------ #
179+ # 3. LiteLLM metrics callback for providers that bypass their SDK. #
180+ # LiteLLM uses raw httpx for some providers (e.g., Anthropic), #
181+ # so the SDK instrumentors never fire. This callback fills the gap.#
182+ # ------------------------------------------------------------------ #
183+
184+ if metrics_enabled :
185+ _register_litellm_metrics_callback ()
186+
187+
188+ def _register_litellm_metrics_callback ():
189+ """Register a LiteLLM callback that records GenAI metrics for providers
190+ where LiteLLM bypasses the provider's Python SDK (e.g., Anthropic).
191+
192+ LiteLLM uses raw httpx POST requests for some providers instead of their
193+ official Python SDKs. This means the OpenTelemetry instrumentors for those
194+ SDKs never fire and no metrics are recorded. This callback fills that gap
195+ by recording metrics directly from LiteLLM's success/failure callbacks.
196+
197+ Providers where LiteLLM uses the SDK directly (e.g., OpenAI) are skipped
198+ to avoid double-counting with the existing instrumentor metrics.
199+ """
200+ try :
201+ import litellm
202+ from litellm .integrations .custom_logger import CustomLogger
203+ except ImportError :
204+ logging .debug ("litellm not installed; skipping LiteLLM metrics callback" )
205+ return
206+
207+ meter = metrics .get_meter ("kagent.litellm" )
208+ token_histogram = meter .create_histogram (
209+ name = "gen_ai.client.token.usage" ,
210+ unit = "token" ,
211+ description = "Measures number of input and output tokens used" ,
212+ )
213+ duration_histogram = meter .create_histogram (
214+ name = "gen_ai.client.operation.duration" ,
215+ unit = "s" ,
216+ description = "GenAI operation duration" ,
217+ )
218+
219+ # Providers where LiteLLM uses the Python SDK directly, so the
220+ # SDK instrumentor already captures metrics. Skip these to avoid
221+ # double-counting.
222+ SDK_INSTRUMENTED_PROVIDERS = frozenset ({
223+ "openai" , "azure" , "azure_text" , "azure_ai" ,
224+ })
225+
226+ class _MetricsCallback (CustomLogger ):
227+ def _record_metrics (self , kwargs , response_obj , start_time , end_time ):
228+ provider = kwargs .get ("custom_llm_provider" , "" )
229+ if provider in SDK_INSTRUMENTED_PROVIDERS :
230+ return
231+
232+ model = kwargs .get ("model" , "unknown" )
233+ # Match attribute names used by the Google GenAI instrumentor
234+ # so all providers appear with consistent labels in Prometheus.
235+ base_attrs = {
236+ "gen_ai.provider.name" : provider .capitalize () if provider else "Unknown" ,
237+ "gen_ai.response.model" : model ,
238+ }
239+
240+ duration_s = (end_time - start_time ).total_seconds ()
241+ duration_histogram .record (duration_s , attributes = base_attrs )
242+
243+ usage = getattr (response_obj , "usage" , None )
244+ if usage is None and isinstance (response_obj , dict ):
245+ usage = response_obj .get ("usage" )
246+ if usage is None :
247+ return
248+
249+ input_tokens = getattr (usage , "prompt_tokens" , None )
250+ if input_tokens is None and isinstance (usage , dict ):
251+ input_tokens = usage .get ("prompt_tokens" , 0 )
252+ output_tokens = getattr (usage , "completion_tokens" , None )
253+ if output_tokens is None and isinstance (usage , dict ):
254+ output_tokens = usage .get ("completion_tokens" , 0 )
255+
256+ if input_tokens :
257+ token_histogram .record (
258+ input_tokens ,
259+ attributes = {** base_attrs , "gen_ai.token.type" : "input" },
260+ )
261+ if output_tokens :
262+ token_histogram .record (
263+ output_tokens ,
264+ attributes = {** base_attrs , "gen_ai.token.type" : "output" },
265+ )
266+
267+ def log_success_event (self , kwargs , response_obj , start_time , end_time ):
268+ try :
269+ self ._record_metrics (kwargs , response_obj , start_time , end_time )
270+ except Exception :
271+ logging .debug ("Failed to record LiteLLM metrics" , exc_info = True )
272+
273+ async def async_log_success_event (self , kwargs , response_obj , start_time , end_time ):
274+ self .log_success_event (kwargs , response_obj , start_time , end_time )
275+
276+ litellm .callbacks .append (_MetricsCallback ())
277+ logging .info ("Registered LiteLLM metrics callback for non-SDK providers" )
278+
0 commit comments