Skip to content

Commit 316d263

Browse files
committed
Enable /metrics endpoint
Signed-off-by: Dobes Vandermeer <dobes.vandermeer@newsela.com>
1 parent 05d0bde commit 316d263

3 files changed

Lines changed: 189 additions & 13 deletions

File tree

python/packages/kagent-core/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ dependencies = [
1818
"opentelemetry-instrumentation-httpx >= 0.52.0",
1919
"opentelemetry-instrumentation-fastapi>=0.52.0",
2020
"opentelemetry-instrumentation-google-generativeai>=0.52.5",
21+
"opentelemetry-exporter-prometheus>=0.52b0",
2122
"typing-extensions>=4.0.0",
2223
]
2324

python/packages/kagent-core/src/kagent/core/tracing/_utils.py

Lines changed: 163 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import os
33

44
from fastapi import FastAPI
5-
from opentelemetry import _logs, trace
5+
from opentelemetry import _logs, metrics, trace
66
from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter
77
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
88
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
@@ -44,23 +44,62 @@ def _instrument_google_generativeai():
4444

4545

4646
def configure(name: str = "kagent", namespace: str = "kagent", fastapi_app: FastAPI | None = None):
47-
"""Configure OpenTelemetry tracing and logging for this service.
47+
"""Configure OpenTelemetry tracing, logging, and metrics for this service.
4848
49-
This sets up OpenTelemetry providers and exporters for tracing and logging,
50-
using environment variables to determine whether each is enabled.
49+
This sets up OpenTelemetry providers and exporters for tracing, logging,
50+
and metrics, using environment variables to determine whether each is enabled.
51+
52+
Providers are configured before instrumentors so that instrumentors can
53+
discover and use all available providers (TracerProvider, MeterProvider, etc.).
5154
5255
Args:
5356
name: service name to report to OpenTelemetry (used as ``service.name``). Default is "kagent".
5457
namespace: logical namespace for the service (used as ``service.namespace``). Default is "kagent".
5558
fastapi_app: Optional FastAPI application instance to instrument. If
5659
provided and tracing is enabled, FastAPI routes will be instrumented.
60+
If metrics is enabled, a ``/metrics`` endpoint will be added for
61+
Prometheus scraping.
5762
"""
5863
tracing_enabled = os.getenv("OTEL_TRACING_ENABLED", "false").lower() == "true"
5964
logging_enabled = os.getenv("OTEL_LOGGING_ENABLED", "false").lower() == "true"
65+
metrics_enabled = os.getenv("OTEL_METRICS_ENABLED", "false").lower() == "true"
6066

6167
resource = Resource({"service.name": name, "service.namespace": namespace})
6268

63-
# Configure tracing if enabled
69+
# ------------------------------------------------------------------ #
70+
# 1. Configure providers BEFORE instrumentors so that instrumentors #
71+
# can discover MeterProvider, TracerProvider, etc. at init time. #
72+
# ------------------------------------------------------------------ #
73+
74+
# 1a. Metrics provider (Prometheus pull endpoint)
75+
if metrics_enabled:
76+
logging.info("Enabling Prometheus metrics")
77+
try:
78+
from opentelemetry.exporter.prometheus import PrometheusMetricReader
79+
from opentelemetry.sdk.metrics import MeterProvider
80+
81+
reader = PrometheusMetricReader()
82+
meter_provider = MeterProvider(resource=resource, metric_readers=[reader])
83+
metrics.set_meter_provider(meter_provider)
84+
logging.info("MeterProvider configured with Prometheus exporter")
85+
86+
if fastapi_app:
87+
from prometheus_client import CONTENT_TYPE_LATEST, generate_latest
88+
from starlette.responses import Response
89+
90+
@fastapi_app.get("/metrics")
91+
async def metrics_endpoint():
92+
return Response(content=generate_latest(), media_type=CONTENT_TYPE_LATEST)
93+
94+
logging.info("Added /metrics endpoint for Prometheus scraping")
95+
except ImportError:
96+
logging.warning(
97+
"opentelemetry-exporter-prometheus is not installed; "
98+
"metrics endpoint will not be available. "
99+
"Install it with: pip install opentelemetry-exporter-prometheus"
100+
)
101+
102+
# 1b. Tracing provider
64103
if tracing_enabled:
65104
logging.info("Enabling tracing")
66105
# Check standard OTEL env vars: signal-specific endpoint first, then general endpoint
@@ -90,10 +129,8 @@ def configure(name: str = "kagent", namespace: str = "kagent", fastapi_app: Fast
90129
trace.set_tracer_provider(tracer_provider)
91130
logging.info("Created new TracerProvider")
92131

93-
HTTPXClientInstrumentor().instrument()
94-
if fastapi_app:
95-
FastAPIInstrumentor().instrument_app(fastapi_app)
96-
# Configure logging if enabled
132+
# 1c. Logging provider
133+
event_logger_provider = None
97134
if logging_enabled:
98135
logging.info("Enabling logging for GenAI events")
99136
logger_provider = LoggerProvider(resource=resource)
@@ -114,15 +151,128 @@ def configure(name: str = "kagent", namespace: str = "kagent", fastapi_app: Fast
114151

115152
_logs.set_logger_provider(logger_provider)
116153
logging.info("Log provider configured with OTLP")
117-
# When logging is enabled, use new event-based approach (input/output as log events in Body)
118-
logging.info("OpenAI instrumentation configured with event logging capability")
119-
# Create event logger provider using the configured logger provider
154+
# Create event logger provider for instrumentors
120155
event_logger_provider = EventLoggerProvider(logger_provider)
156+
157+
# ------------------------------------------------------------------ #
158+
# 2. Instrument libraries — all providers are now available. #
159+
# ------------------------------------------------------------------ #
160+
161+
if tracing_enabled:
162+
HTTPXClientInstrumentor().instrument()
163+
if fastapi_app:
164+
FastAPIInstrumentor().instrument_app(fastapi_app)
165+
166+
if event_logger_provider:
167+
# Event logging mode: input/output as log events in Body
168+
logging.info("OpenAI instrumentation configured with event logging capability")
121169
OpenAIInstrumentor(use_legacy_attributes=False).instrument(event_logger_provider=event_logger_provider)
122170
_instrument_anthropic(event_logger_provider)
123171
else:
124-
# Use legacy attributes (input/output as GenAI span attributes)
172+
# Legacy attributes mode: input/output as GenAI span attributes
125173
logging.info("OpenAI instrumentation configured with legacy GenAI span attributes")
126174
OpenAIInstrumentor().instrument()
127175
_instrument_anthropic()
128176
_instrument_google_generativeai()
177+
178+
# ------------------------------------------------------------------ #
179+
# 3. LiteLLM metrics callback for providers that bypass their SDK. #
180+
# LiteLLM uses raw httpx for some providers (e.g., Anthropic), #
181+
# so the SDK instrumentors never fire. This callback fills the gap.#
182+
# ------------------------------------------------------------------ #
183+
184+
if metrics_enabled:
185+
_register_litellm_metrics_callback()
186+
187+
188+
def _register_litellm_metrics_callback():
189+
"""Register a LiteLLM callback that records GenAI metrics for providers
190+
where LiteLLM bypasses the provider's Python SDK (e.g., Anthropic).
191+
192+
LiteLLM uses raw httpx POST requests for some providers instead of their
193+
official Python SDKs. This means the OpenTelemetry instrumentors for those
194+
SDKs never fire and no metrics are recorded. This callback fills that gap
195+
by recording metrics directly from LiteLLM's success/failure callbacks.
196+
197+
Providers where LiteLLM uses the SDK directly (e.g., OpenAI) are skipped
198+
to avoid double-counting with the existing instrumentor metrics.
199+
"""
200+
try:
201+
import litellm
202+
from litellm.integrations.custom_logger import CustomLogger
203+
except ImportError:
204+
logging.debug("litellm not installed; skipping LiteLLM metrics callback")
205+
return
206+
207+
meter = metrics.get_meter("kagent.litellm")
208+
token_histogram = meter.create_histogram(
209+
name="gen_ai.client.token.usage",
210+
unit="token",
211+
description="Measures number of input and output tokens used",
212+
)
213+
duration_histogram = meter.create_histogram(
214+
name="gen_ai.client.operation.duration",
215+
unit="s",
216+
description="GenAI operation duration",
217+
)
218+
219+
# Providers where LiteLLM uses the Python SDK directly, so the
220+
# SDK instrumentor already captures metrics. Skip these to avoid
221+
# double-counting.
222+
SDK_INSTRUMENTED_PROVIDERS = frozenset({
223+
"openai", "azure", "azure_text", "azure_ai",
224+
})
225+
226+
class _MetricsCallback(CustomLogger):
227+
def _record_metrics(self, kwargs, response_obj, start_time, end_time):
228+
provider = kwargs.get("custom_llm_provider", "")
229+
if provider in SDK_INSTRUMENTED_PROVIDERS:
230+
return
231+
232+
model = kwargs.get("model", "unknown")
233+
# Match attribute names used by the Google GenAI instrumentor
234+
# so all providers appear with consistent labels in Prometheus.
235+
base_attrs = {
236+
"gen_ai.provider.name": provider.capitalize() if provider else "Unknown",
237+
"gen_ai.response.model": model,
238+
}
239+
240+
duration_s = (end_time - start_time).total_seconds()
241+
duration_histogram.record(duration_s, attributes=base_attrs)
242+
243+
usage = getattr(response_obj, "usage", None)
244+
if usage is None and isinstance(response_obj, dict):
245+
usage = response_obj.get("usage")
246+
if usage is None:
247+
return
248+
249+
input_tokens = getattr(usage, "prompt_tokens", None)
250+
if input_tokens is None and isinstance(usage, dict):
251+
input_tokens = usage.get("prompt_tokens", 0)
252+
output_tokens = getattr(usage, "completion_tokens", None)
253+
if output_tokens is None and isinstance(usage, dict):
254+
output_tokens = usage.get("completion_tokens", 0)
255+
256+
if input_tokens:
257+
token_histogram.record(
258+
input_tokens,
259+
attributes={**base_attrs, "gen_ai.token.type": "input"},
260+
)
261+
if output_tokens:
262+
token_histogram.record(
263+
output_tokens,
264+
attributes={**base_attrs, "gen_ai.token.type": "output"},
265+
)
266+
267+
def log_success_event(self, kwargs, response_obj, start_time, end_time):
268+
try:
269+
self._record_metrics(kwargs, response_obj, start_time, end_time)
270+
except Exception:
271+
logging.debug("Failed to record LiteLLM metrics", exc_info=True)
272+
273+
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
274+
self.log_success_event(kwargs, response_obj, start_time, end_time)
275+
276+
litellm.callbacks.append(_MetricsCallback())
277+
logging.info("Registered LiteLLM metrics callback for non-SDK providers")
278+

python/uv.lock

Lines changed: 25 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments (0)