From b3006e6388e2331e1ade305ad99d75b1d943683a Mon Sep 17 00:00:00 2001 From: hhhhsc <1710496817@qq.com> Date: Mon, 27 Apr 2026 11:24:05 +0800 Subject: [PATCH 01/17] =?UTF-8?q?=E2=9C=A8=20Feat:=20Update=20monitoring?= =?UTF-8?q?=20configuration=20to=20use=20OpenTelemetry=20OTLP=20protocol?= =?UTF-8?q?=20and=20enhance=20observability=20features?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/consts/const.py | 27 +- backend/utils/monitoring.py | 50 +- doc/docs/en/sdk/monitoring.md | 370 +++--- doc/docs/zh/sdk/monitoring.md | 370 +++--- docker/.env.example | 24 +- docker/deploy.sh | 2 +- docker/docker-compose-monitoring.yml | 77 +- .../dashboards/nexent-llm-performance.json | 544 -------- .../provisioning/dashboards/dashboards.yml | 13 - .../provisioning/datasources/datasources.yml | 16 - docker/monitoring/monitoring.env | 19 +- docker/monitoring/monitoring.env.example | 20 +- docker/monitoring/otel-collector-config.yml | 94 +- docker/monitoring/prometheus.yml | 39 - sdk/nexent/monitor/__init__.py | 24 +- sdk/nexent/monitor/monitoring.py | 461 +++++-- sdk/pyproject.toml | 20 +- test/backend/utils/test_monitoring.py | 128 +- test/sdk/monitor/test_monitoring.py | 1098 ++++++----------- 19 files changed, 1276 insertions(+), 2120 deletions(-) delete mode 100644 docker/monitoring/grafana/dashboards/nexent-llm-performance.json delete mode 100644 docker/monitoring/grafana/provisioning/dashboards/dashboards.yml delete mode 100644 docker/monitoring/grafana/provisioning/datasources/datasources.yml delete mode 100644 docker/monitoring/prometheus.yml diff --git a/backend/consts/const.py b/backend/consts/const.py index 796db4987..9448723b7 100644 --- a/backend/consts/const.py +++ b/backend/consts/const.py @@ -316,19 +316,34 @@ class VectorDatabaseType(str, Enum): THINK_END_PATTERN = "" -# Telemetry and Monitoring Configuration +# Telemetry and Monitoring Configuration (OTLP Protocol) ENABLE_TELEMETRY = os.getenv("ENABLE_TELEMETRY", "false").lower() == "true" -SERVICE_NAME = os.getenv("SERVICE_NAME", "nexent-backend") -JAEGER_ENDPOINT = os.getenv( - "JAEGER_ENDPOINT", "http://localhost:14268/api/traces") -PROMETHEUS_PORT = int(os.getenv("PROMETHEUS_PORT", "8000")) +OTEL_SERVICE_NAME = os.getenv("OTEL_SERVICE_NAME", "nexent-backend") +OTEL_EXPORTER_OTLP_ENDPOINT = os.getenv( + "OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318") +OTEL_EXPORTER_OTLP_PROTOCOL = os.getenv("OTEL_EXPORTER_OTLP_PROTOCOL", "http") +OTEL_EXPORTER_OTLP_HEADERS = os.getenv("OTEL_EXPORTER_OTLP_HEADERS", "") TELEMETRY_SAMPLE_RATE = float(os.getenv("TELEMETRY_SAMPLE_RATE", "1.0")) +# Parse OTLP headers into dict format +def _parse_otlp_headers(headers_str: str) -> dict: + """Parse OTLP headers string into dict. 
Format: 'key1=value1,key2=value2'""" + if not headers_str: + return {} + headers = {} + for pair in headers_str.split(","): + if "=" in pair: + key, value = pair.split("=", 1) + headers[key.strip()] = value.strip() + return headers + +OTLP_HEADERS = _parse_otlp_headers(OTEL_EXPORTER_OTLP_HEADERS) + # Performance monitoring thresholds LLM_SLOW_REQUEST_THRESHOLD_SECONDS = float( os.getenv("LLM_SLOW_REQUEST_THRESHOLD_SECONDS", "5.0")) LLM_SLOW_TOKEN_RATE_THRESHOLD = float( - os.getenv("LLM_SLOW_TOKEN_RATE_THRESHOLD", "10.0")) # tokens per second + os.getenv("LLM_SLOW_TOKEN_RATE_THRESHOLD", "10.0")) DEFAULT_ZH_TITLE = "新对话" diff --git a/backend/utils/monitoring.py b/backend/utils/monitoring.py index eb20d88ec..28aaaef51 100644 --- a/backend/utils/monitoring.py +++ b/backend/utils/monitoring.py @@ -2,8 +2,8 @@ Global Monitoring Manager for Backend This module initializes and configures the global monitoring manager instance -with backend environment variables. All other backend modules should import -`monitoring_manager` directly from this module. +with backend environment variables using OTLP protocol. All other backend modules +should import `monitoring_manager` directly from this module. Usage: from utils.monitoring import monitoring_manager @@ -17,25 +17,24 @@ async def my_function(): MonitoringConfig, get_monitoring_manager ) -# Import configuration from backend (support both relative and absolute imports) try: - # Try relative import first (when running from backend directory) from consts.const import ( ENABLE_TELEMETRY, - SERVICE_NAME, - JAEGER_ENDPOINT, - PROMETHEUS_PORT, + OTEL_SERVICE_NAME, + OTEL_EXPORTER_OTLP_ENDPOINT, + OTEL_EXPORTER_OTLP_PROTOCOL, + OTLP_HEADERS, TELEMETRY_SAMPLE_RATE, LLM_SLOW_REQUEST_THRESHOLD_SECONDS, LLM_SLOW_TOKEN_RATE_THRESHOLD ) except ImportError: - # Fallback to absolute import (when running from project root) from backend.consts.const import ( ENABLE_TELEMETRY, - SERVICE_NAME, - JAEGER_ENDPOINT, - PROMETHEUS_PORT, + OTEL_SERVICE_NAME, + OTEL_EXPORTER_OTLP_ENDPOINT, + OTEL_EXPORTER_OTLP_PROTOCOL, + OTLP_HEADERS, TELEMETRY_SAMPLE_RATE, LLM_SLOW_REQUEST_THRESHOLD_SECONDS, LLM_SLOW_TOKEN_RATE_THRESHOLD @@ -45,39 +44,30 @@ async def my_function(): logger = logging.getLogger(__name__) -# ============================================================================ -# Global Monitoring Manager Instance -# ============================================================================ - -# Get the global monitoring manager instance monitoring_manager = get_monitoring_manager() -# Initialize monitoring configuration immediately when this module is imported - def _initialize_monitoring(): - """Initialize monitoring configuration with backend environment variables.""" + """Initialize monitoring configuration with OTLP settings.""" config = MonitoringConfig( enable_telemetry=ENABLE_TELEMETRY, - service_name=SERVICE_NAME, - jaeger_endpoint=JAEGER_ENDPOINT, - prometheus_port=PROMETHEUS_PORT, + service_name=OTEL_SERVICE_NAME, + otlp_endpoint=OTEL_EXPORTER_OTLP_ENDPOINT, + otlp_protocol=OTEL_EXPORTER_OTLP_PROTOCOL, + otlp_headers=OTLP_HEADERS, telemetry_sample_rate=TELEMETRY_SAMPLE_RATE, llm_slow_request_threshold_seconds=LLM_SLOW_REQUEST_THRESHOLD_SECONDS, llm_slow_token_rate_threshold=LLM_SLOW_TOKEN_RATE_THRESHOLD ) - # Configure the SDK monitoring system using the singleton monitoring_manager.configure(config) logger.info( - f"Global monitoring initialized: service_name={SERVICE_NAME}, enable_telemetry={ENABLE_TELEMETRY}") + f"OTLP monitoring initialized: 
service_name={OTEL_SERVICE_NAME}, " + f"enable_telemetry={ENABLE_TELEMETRY}, endpoint={OTEL_EXPORTER_OTLP_ENDPOINT}, " + f"protocol={OTEL_EXPORTER_OTLP_PROTOCOL}" + ) -# Initialize monitoring when module is imported _initialize_monitoring() - -# Export the global monitoring manager instance -__all__ = [ - 'monitoring_manager' -] +__all__ = ['monitoring_manager'] \ No newline at end of file diff --git a/doc/docs/en/sdk/monitoring.md b/doc/docs/en/sdk/monitoring.md index 4aa625132..64b211401 100644 --- a/doc/docs/en/sdk/monitoring.md +++ b/doc/docs/en/sdk/monitoring.md @@ -1,289 +1,227 @@ -# 🚀 Nexent LLM Monitoring System +# Nexent Agent Observability (OTLP) -Enterprise-grade monitoring solution specifically designed for monitoring LLM token generation speed and performance. +Enterprise-grade observability for AI agents using OpenTelemetry OTLP protocol. Supports integration with AI observability platforms like Arize Phoenix, Langfuse, and more. -## 📊 System Architecture +## Architecture ``` -┌─────────────────────────────────────────────────────────┐ -│ Nexent LLM Monitoring System │ -├─────────────────────────────────────────────────────────┤ -│ │ -│ Nexent API ──► OpenTelemetry ──► Jaeger (Tracing) │ -│ │ │ │ -│ │ └──────► Prometheus (Metrics) │ -│ │ │ │ -│ └─► OpenAI LLM └──► Grafana (Visualization) │ -│ (Token Monitoring) │ -└─────────────────────────────────────────────────────────┘ +NexentAgent ──► OpenTelemetry SDK ──► OTLP Collector ──► Arize Phoenix / Langfuse / Jaeger + │ │ + │ OpenInference Semantics │ + │ (llm.*, agent.* attributes) │ + └────────────────────────────────────────┘ ``` -## ⚡ Quick Start (5 minutes) +## Quick Start ```bash -# 1. Start monitoring services -./docker/start-monitoring.sh +cd docker +cp .env.example .env -# 2. Install performance monitoring dependencies -uv sync --extra performance - -# 3. Enable monitoring -export ENABLE_TELEMETRY=true +vim .env +ENABLE_TELEMETRY=true +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 +OTEL_EXPORTER_OTLP_PROTOCOL=http -# 4. Start backend service -python backend/config_service.py -python backend/runtime_service.py +docker-compose -f docker-compose-monitoring.yml up -d ``` -## 📊 Access Monitoring Interfaces +## AI Observability Platforms -| Interface | URL | Purpose | -|-----------|-----|---------| -| **Grafana Dashboard** | http://localhost:3005 | LLM Performance Monitoring | -| **Jaeger Tracing** | http://localhost:16686 | Request Trace Analysis | -| **Prometheus Metrics** | http://localhost:9090 | Raw Monitoring Data | +### Arize Phoenix -### 🔐 Grafana Login Information +Arize Phoenix provides AI-specific observability with OpenInference semantic support. 
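+
+In addition to the environment variables shown below, the same settings can be applied programmatically through the SDK. A minimal sketch, assuming the `MonitoringConfig` fields introduced in this patch; the endpoint and API key simply mirror the placeholder values from the configuration below:
+
+```python
+from nexent.monitor import MonitoringConfig, get_monitoring_manager
+
+# Placeholder endpoint and key -- substitute your Phoenix project values
+config = MonitoringConfig(
+    enable_telemetry=True,
+    service_name="nexent-backend",
+    otlp_endpoint="https://phoenix.arize.com/v1",
+    otlp_protocol="http",
+    otlp_headers={"x-api-key": "YOUR_PHOENIX_API_KEY"},
+)
+get_monitoring_manager().configure(config)
+```
+
+The backend performs the equivalent wiring automatically in `backend/utils/monitoring.py`, so the snippet is only needed for standalone SDK usage.
+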
-When first accessing Grafana (http://localhost:3005), you need to login: +**Configuration:** -``` -Username: admin -Password: admin +```bash +OTEL_EXPORTER_OTLP_ENDPOINT=https://phoenix.arize.com/v1 +OTEL_EXPORTER_OTLP_HEADERS=x-api-key=YOUR_PHOENIX_API_KEY +OTEL_EXPORTER_OTLP_PROTOCOL=http ``` -**After first login, you'll be prompted to change password:** -- Set a new password (recommended) -- Click "Skip" to skip (development environment) +**Features:** +- LLM trace visualization with prompt/completion +- Token-level performance metrics +- Agent step tracing +- Cost analysis -**After login, you can see:** -- 📊 **LLM Performance Dashboard** - Pre-configured performance dashboard -- 📈 **Data Source Configuration** - Auto-connected to Prometheus and Jaeger -- 🎯 **Real-time Monitoring Panel** - Key metrics like token generation speed, latency +### Langfuse -## 🎯 Core Features +Langfuse offers prompt management and LLM observability with OTLP support. -### ⚡ LLM-Specific Monitoring -- **Token Generation Speed**: Real-time monitoring of tokens generated per second -- **TTFT (Time to First Token)**: First token return latency -- **Streaming Response Analysis**: Generation timestamp for each token -- **Model Performance Comparison**: Performance benchmarks across different models +**Configuration:** -### 🔍 Distributed Tracing -- **Complete Request Chain**: End-to-end tracing from HTTP to LLM -- **Performance Bottleneck Detection**: Automatically identify slow queries and anomalies -- **Error Root Cause Analysis**: Quickly locate problem sources +```bash +OTEL_EXPORTER_OTLP_ENDPOINT=https://cloud.langfuse.com/api/public/otel/v1 -### 🛠️ Developer-Friendly Design -- **One-Line Integration**: Quick monitoring with decorators -- **Zero-Dependency Degradation**: Auto-skip when monitoring dependencies are missing -- **Zero-Touch Usage**: No need to manually check monitoring status, handled automatically -- **Flexible Configuration**: Environment variable controlled behavior +LANGFUSE_PUBLIC_KEY=pk-xxx +LANGFUSE_SECRET_KEY=sk-xxx -## 🛠️ Adding Monitoring to Code +OTEL_EXPORTER_OTLP_HEADERS=Authorization=Basic BASE64_ENCODED_KEY +``` -### 🎯 Recommended Approach: Singleton Pattern (v2.1+) +Generate the encoded key: -```python -# Backend service usage - directly use globally configured monitoring_manager -from utils.monitoring import monitoring_manager +```bash +echo -n "$LANGFUSE_PUBLIC_KEY:$LANGFUSE_SECRET_KEY" | base64 +``` -# API endpoint monitoring -@monitoring_manager.monitor_endpoint("my_service.my_function") -async def my_api_function(): - return {"status": "ok"} +**Features:** +- Prompt versioning and management +- Session-based trace grouping +- User feedback collection +- Model cost tracking -# LLM call monitoring -@monitoring_manager.monitor_llm_call("gpt-4", "chat_completion") -def call_llm(messages): - # Automatically get token-level monitoring - return llm_response +### Local Jaeger (OTLP) -# Manual monitoring events -monitoring_manager.add_span_event("custom_event", {"key": "value"}) -monitoring_manager.set_span_attributes(user_id="123", action="process") -``` +For local development, Jaeger still works via OTLP. 
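+
+Before pointing the backend at it, you can sanity-check that the OTLP/HTTP receiver is reachable. A minimal check, assuming Jaeger is exposed on the default port 4318 as in the compose snippet below; an HTTP 2xx response means the receiver is accepting trace exports:
+
+```bash
+# Empty OTLP export request -- verifies connectivity only, records no spans
+curl -i -X POST http://localhost:4318/v1/traces \
+  -H 'Content-Type: application/json' \
+  -d '{}'
+```
+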
-### 📦 Direct SDK Usage +**Configuration:** -```python -from nexent.monitor import get_monitoring_manager - -# Get global monitoring manager - already configured in backend -monitor = get_monitoring_manager() - -# Use decorators -@monitor.monitor_llm_call("claude-3", "completion") -def my_llm_function(): - return "response" - -# Or use directly in business logic -with monitor.trace_llm_request("custom_operation", "my_model") as span: - # Execute business logic - result = process_data() - monitor.add_span_event("processing_completed") - return result +```bash +OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318 +OTEL_EXPORTER_OTLP_PROTOCOL=http ``` -### ✨ Global Configuration Automation - -Monitoring configuration is auto-initialized in `backend/utils/monitoring.py`: +**Docker setup:** -```python -# No manual configuration needed - auto-completed at system startup -# monitoring_manager already configured with environment variables -from utils.monitoring import monitoring_manager +```yaml +jaeger: + image: jaegertracing/all-in-one:1.52 + environment: + - COLLECTOR_OTLP_ENABLED=true + ports: + - "16686:16686" + - "4318:4318" +``` -# Direct usage without checking if enabled -@monitoring_manager.monitor_endpoint("my_function") -def my_function(): - pass +## Environment Variables -# FastAPI application initialization -monitoring_manager.setup_fastapi_app(app) -``` +| Variable | Default | Description | +|----------|---------|-------------| +| `ENABLE_TELEMETRY` | `false` | Enable/disable monitoring | +| `OTEL_SERVICE_NAME` | `nexent-backend` | Service identifier | +| `OTEL_EXPORTER_OTLP_ENDPOINT` | `http://localhost:4318` | OTLP receiver endpoint | +| `OTEL_EXPORTER_OTLP_PROTOCOL` | `http` | Protocol: `http` or `grpc` | +| `OTEL_EXPORTER_OTLP_HEADERS` | (empty) | Auth headers (comma-separated) | -### 🔒 Auto Start/Stop Design +## Code Integration -- **Smart Monitoring**: Auto start/stop based on `ENABLE_TELEMETRY` environment variable -- **Zero-Touch Usage**: External code doesn't need to check monitoring status, use all features directly -- **Graceful Degradation**: Silent no-effect when disabled, normal operation when enabled -- **Default Off**: Auto-disabled when not configured +### Endpoint Monitoring -```bash -# Enable monitoring -export ENABLE_TELEMETRY=true +```python +from utils.monitoring import monitoring_manager -# Disable monitoring -export ENABLE_TELEMETRY=false +@monitoring_manager.monitor_endpoint("my_service.my_function") +async def my_api_function(): + return {"status": "ok"} ``` -## 📊 Core Monitoring Metrics +### LLM Call Monitoring -| Metric | Description | Importance | -|--------|-------------|------------| -| `llm_token_generation_rate` | Token generation speed (tokens/s) | ⭐⭐⭐ | -| `llm_time_to_first_token_seconds` | First token latency | ⭐⭐⭐ | -| `llm_request_duration_seconds` | Complete request duration | ⭐⭐⭐ | -| `llm_total_tokens` | Input/output token count | ⭐⭐ | -| `llm_error_count` | LLM call error count | ⭐⭐⭐ | +```python +@monitoring_manager.monitor_llm_call("gpt-4", "chat_completion") +def call_llm(messages): + return llm_response +``` -## 🔧 Environment Configuration +### Agent Step Tracing -```bash -# Add to .env file -cat >> .env << EOF -ENABLE_TELEMETRY=true -SERVICE_NAME=nexent-backend -JAEGER_ENDPOINT=http://localhost:14268/api/traces -LLM_SLOW_REQUEST_THRESHOLD_SECONDS=5.0 -LLM_SLOW_TOKEN_RATE_THRESHOLD=10.0 -TELEMETRY_SAMPLE_RATE=1.0 # Development environment, production recommended 0.1 -EOF +```python +with monitoring_manager.trace_agent_step("web_search", 
"research_agent", "tool_call") as span: + result = execute_tool() + monitoring_manager.set_tool_output(result) ``` -## 🛠️ System Verification +### Tool Call Tracing -```bash -# Check metrics endpoint -curl http://localhost:8000/metrics - -# Verify dependency installation -python -c "from backend.utils.monitoring import MONITORING_AVAILABLE; print(f'Monitoring Available: {MONITORING_AVAILABLE}')" +```python +with monitoring_manager.trace_tool_call("web_search", "agent_name", {"query": "test"}) as span: + results = search_web("test") + monitoring_manager.set_tool_output({"results": results}) ``` -## 🆘 Troubleshooting - -### No monitoring data? -```bash -# Check service status -docker-compose -f docker/docker-compose-monitoring.yml ps +## OpenInference Semantic Attributes -# Check dependency installation -python -c "import opentelemetry; print('✅ Monitoring dependencies installed')" -``` +The system uses OpenInference semantic conventions for AI-specific observability: -### Port conflicts? -```bash -# Check port usage -lsof -i :3005 -i :9090 -i :16686 -``` +### LLM Attributes -### Dependency installation issues? -```bash -# Reinstall performance dependencies -uv sync --extra performance +| Attribute | Description | +|-----------|-------------| +| `llm.model_name` | Model identifier (e.g., `gpt-4`) | +| `llm.operation.name` | Operation type (e.g., `chat_completion`) | +| `llm.token_count.prompt` | Input token count | +| `llm.token_count.completion` | Output token count | +| `llm.invocation_parameters` | Model parameters (JSON) | +| `llm.time_to_first_token` | TTFT in seconds | -# Check performance configuration in pyproject.toml -cat backend/pyproject.toml | grep -A 20 "performance" -``` +### Agent Attributes -### Service name shows as unknown_service? 
-```bash -# Check environment variable configuration -echo "SERVICE_NAME: $SERVICE_NAME" +| Attribute | Description | +|-----------|-------------| +| `agent.name` | Agent identifier | +| `agent.step.name` | Step name (e.g., `web_search`) | +| `agent.step.type` | Step type: `tool_call`, `reasoning`, `action_selection` | +| `agent.tool.name` | Tool name | +| `agent.tool.input` | Tool input (JSON) | +| `agent.tool.output` | Tool output (JSON) | -# Restart monitoring service to apply new configuration -./docker/start-monitoring.sh -``` +## Metrics -## 🧹 Data Management +| Metric | Description | +|--------|-------------| +| `llm.request.duration` | Request latency | +| `llm.token.generation_rate` | Tokens per second | +| `llm.time_to_first_token` | TTFT | +| `llm.token_count.prompt` | Input tokens | +| `llm.token_count.completion` | Output tokens | +| `agent.step.count` | Agent step count | +| `agent.execution.duration` | Agent execution time | +| `agent.error.count` | Agent errors | -### Clean Jaeger Trace Data -```bash -# Method 1: Restart Jaeger container (simplest) -docker-compose -f docker/docker-compose-monitoring.yml restart nexent-jaeger +## Collector Configuration -# Method 2: Completely rebuild Jaeger container and data -docker-compose -f docker/docker-compose-monitoring.yml stop nexent-jaeger -docker-compose -f docker/docker-compose-monitoring.yml rm -f nexent-jaeger -docker-compose -f docker/docker-compose-monitoring.yml up -d nexent-jaeger +The OpenTelemetry Collector routes data to your chosen backend: -# Method 3: Clean all monitoring data (rebuild all containers) -docker-compose -f docker/docker-compose-monitoring.yml down -docker-compose -f docker/docker-compose-monitoring.yml up -d +```yaml +exporters: + otlp: + endpoint: ${OTEL_EXPORTER_OTLP_ENDPOINT} + headers: + authorization: ${OTEL_EXPORTER_OTLP_HEADERS} ``` -### Clean Prometheus Metrics Data -```bash -# Restart Prometheus container -docker-compose -f docker/docker-compose-monitoring.yml restart nexent-prometheus +See `docker/monitoring/otel-collector-config.yml` for full configuration with platform examples. -# Completely clean Prometheus data -docker-compose -f docker/docker-compose-monitoring.yml stop nexent-prometheus -docker volume rm docker_prometheus_data 2>/dev/null || true -docker-compose -f docker/docker-compose-monitoring.yml up -d nexent-prometheus -``` +## Graceful Degradation -### Clean Grafana Configuration -```bash -# Reset Grafana configuration and dashboards -docker-compose -f docker/docker-compose-monitoring.yml stop nexent-grafana -docker volume rm docker_grafana_data 2>/dev/null || true -docker-compose -f docker/docker-compose-monitoring.yml up -d nexent-grafana +When OpenTelemetry dependencies are not installed, monitoring gracefully disables: + +```python +pip install nexent # Basic package - no monitoring +pip install nexent[performance] # With OTLP support ``` -## 📈 Typical Problem Analysis +All monitoring methods work without errors when disabled - decorators pass through, context managers yield None. -### Slow token generation (< 5 tokens/s) -1. **Analysis**: Grafana → Token Generation Rate panel -2. **Solution**: Check model service load, optimize input prompt length +## Troubleshooting -### Slow request response (> 10s) -1. **Analysis**: Jaeger → View complete trace chain -2. **Solution**: Locate bottleneck (database/LLM/network) +### No data appearing -### Error rate spike (> 10%) -1. **Analysis**: Prometheus → llm_error_count metric -2. 
**Solution**: Check model service availability, verify API keys +1. Check `ENABLE_TELEMETRY=true` in `.env` +2. Verify OTLP endpoint is reachable +3. Check authentication headers are correct -## 🎉 Getting Started +### Connection errors -After setup completion, you can: +1. Test endpoint: `curl -v $OTEL_EXPORTER_OTLP_ENDPOINT/v1/traces` +2. Verify protocol matches endpoint (`http` vs `grpc`) +3. Check Collector logs: `docker logs nexent-otel-collector` -1. 📊 View **LLM Performance Dashboard** in Grafana -2. 🔍 Trace complete request chains in Jaeger -3. 📈 Analyze token generation speed and performance bottlenecks -4. 🚨 Set performance alerts and thresholds +### Wrong attributes -Enjoy efficient LLM performance monitoring! 🚀 +1. Verify OpenInference attributes in platform UI +2. Check span attribute naming: `llm.model_name` not `model_name` +3. Review platform-specific attribute requirements \ No newline at end of file diff --git a/doc/docs/zh/sdk/monitoring.md b/doc/docs/zh/sdk/monitoring.md index c592df267..bb1bf000e 100644 --- a/doc/docs/zh/sdk/monitoring.md +++ b/doc/docs/zh/sdk/monitoring.md @@ -1,289 +1,227 @@ -# 🚀 Nexent LLM 监控系统 +# Nexent Agent 可观测性(OTLP) -专门监控大模型 Token 生成速度和性能的企业级监控解决方案。 +基于 OpenTelemetry OTLP 协议的 AI Agent 企业级可观测性方案。支持对接 Arize Phoenix、Langfuse 等主流 AI 可观测性平台。 -## 📊 系统架构 +## 系统架构 ``` -┌─────────────────────────────────────────────────────────┐ -│ Nexent LLM 监控系统 │ -├─────────────────────────────────────────────────────────┤ -│ │ -│ Nexent API ──► OpenTelemetry ──► Jaeger (链路追踪) │ -│ │ │ │ -│ │ └──────► Prometheus (指标收集) │ -│ │ │ │ -│ └─► OpenAI LLM └──► Grafana (可视化) │ -│ (Token 监控) │ -└─────────────────────────────────────────────────────────┘ +NexentAgent ──► OpenTelemetry SDK ──► OTLP Collector ──► Arize Phoenix / Langfuse / Jaeger + │ │ + │ OpenInference 语义约定 │ + │ (llm.*, agent.* 属性) │ + └────────────────────────────────────────┘ ``` -## ⚡ 快速启动(5分钟) +## 快速启动 ```bash -# 1. 启动监控服务 -./docker/start-monitoring.sh +cd docker +cp .env.example .env -# 2. 安装性能监控依赖 -uv sync --extra performance - -# 3. 启用监控 -export ENABLE_TELEMETRY=true +vim .env +ENABLE_TELEMETRY=true +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 +OTEL_EXPORTER_OTLP_PROTOCOL=http -# 4. 
启动后端服务 -python backend/config_service.py -python backend/runtime_service.py +docker-compose -f docker-compose-monitoring.yml up -d ``` -## 📊 访问监控界面 +## AI 可观测性平台对接 -| 界面 | 地址 | 用途 | -|------|------|------| -| **Grafana 仪表板** | http://localhost:3005 | LLM 性能监控 | -| **Jaeger 链路追踪** | http://localhost:16686 | 请求链路分析 | -| **Prometheus 指标** | http://localhost:9090 | 原始监控数据 | +### Arize Phoenix -### 🔐 Grafana 登录信息 +Arize Phoenix 提供针对 AI 的专业可观测性,原生支持 OpenInference 语义。 -首次访问 Grafana (http://localhost:3005) 时需要登录: +**配置:** -``` -用户名: admin -密码: admin +```bash +OTEL_EXPORTER_OTLP_ENDPOINT=https://phoenix.arize.com/v1 +OTEL_EXPORTER_OTLP_HEADERS=x-api-key=YOUR_PHOENIX_API_KEY +OTEL_EXPORTER_OTLP_PROTOCOL=http ``` -**首次登录后会要求修改密码,可以:** -- 设置新密码(推荐) -- 点击 "Skip" 跳过(开发环境) +**功能特性:** +- LLM 调用链可视化(Prompt/Completion) +- Token 级性能指标 +- Agent 步骤追踪 +- 成本分析 -**登录后可以看到:** -- 📊 **LLM Performance Dashboard** - 预配置的性能仪表板 -- 📈 **数据源配置** - 自动连接到 Prometheus 和 Jaeger -- 🎯 **实时监控面板** - Token 生成速度、延迟等关键指标 +### Langfuse -## 🎯 核心功能特性 +Langfuse 提供 Prompt 管理和 LLM 可观测性,支持 OTLP 协议。 -### ⚡ LLM 专用监控 -- **Token 生成速度**: 实时监控每秒生成的 token 数量 -- **TTFT (Time to First Token)**: 首个 token 返回延迟 -- **流式响应分析**: 每个 token 的生成时间戳 -- **模型性能对比**: 不同模型的性能基准 +**配置:** -### 🔍 分布式链路追踪 -- **完整请求链路**: 从 HTTP 到 LLM 的端到端追踪 -- **性能瓶颈识别**: 自动定位慢查询和异常 -- **错误根因分析**: 快速定位问题根源 +```bash +OTEL_EXPORTER_OTLP_ENDPOINT=https://cloud.langfuse.com/api/public/otel/v1 -### 🛠️ 开发友好设计 -- **一行代码接入**: 使用装饰器快速添加监控 -- **零依赖降级**: 未安装监控依赖时自动跳过 -- **零感知使用**: 无需手动检查监控状态,自动处理 -- **灵活配置**: 环境变量控制监控行为 +LANGFUSE_PUBLIC_KEY=pk-xxx +LANGFUSE_SECRET_KEY=sk-xxx -## 🛠️ 添加监控到代码 +OTEL_EXPORTER_OTLP_HEADERS=Authorization=Basic BASE64_ENCODED_KEY +``` -### 🎯 推荐方式:单例模式 (v2.1+) +生成认证 Key: -```python -# 后端服务中使用 - 直接使用全局配置好的 monitoring_manager -from utils.monitoring import monitoring_manager +```bash +echo -n "$LANGFUSE_PUBLIC_KEY:$LANGFUSE_SECRET_KEY" | base64 +``` -# API 端点监控 -@monitoring_manager.monitor_endpoint("my_service.my_function") -async def my_api_function(): - return {"status": "ok"} +**功能特性:** +- Prompt 版本管理 +- 会话级 Trace 分组 +- 用户反馈收集 +- 模型成本追踪 -# LLM 调用监控 -@monitoring_manager.monitor_llm_call("gpt-4", "chat_completion") -def call_llm(messages): - # 自动获得 Token 级别监控 - return llm_response +### 本地 Jaeger(OTLP) -# 手动添加监控事件 -monitoring_manager.add_span_event("custom_event", {"key": "value"}) -monitoring_manager.set_span_attributes(user_id="123", action="process") -``` +本地开发可继续使用 Jaeger,通过 OTLP 协议对接。 -### 📦 SDK中直接使用 +**配置:** -```python -from nexent.monitor import get_monitoring_manager - -# 获取全局监控管理器 - 在backend已自动配置 -monitor = get_monitoring_manager() - -# 使用装饰器 -@monitor.monitor_llm_call("claude-3", "completion") -def my_llm_function(): - return "response" - -# 或者在业务逻辑中直接使用 -with monitor.trace_llm_request("custom_operation", "my_model") as span: - # 执行业务逻辑 - result = process_data() - monitor.add_span_event("processing_completed") - return result +```bash +OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318 +OTEL_EXPORTER_OTLP_PROTOCOL=http ``` -### ✨ 全局配置自动化 - -监控配置已在 `backend/utils/monitoring.py` 中自动初始化: +**Docker 配置:** -```python -# 无需手动配置 - 系统启动时自动完成 -# monitoring_manager 已经使用环境变量配置完成 -from utils.monitoring import monitoring_manager +```yaml +jaeger: + image: jaegertracing/all-in-one:1.52 + environment: + - COLLECTOR_OTLP_ENABLED=true + ports: + - "16686:16686" + - "4318:4318" +``` -# 直接使用即可,无需检查是否开启 -@monitoring_manager.monitor_endpoint("my_function") -def my_function(): - pass +## 环境变量 -# FastAPI应用初始化 -monitoring_manager.setup_fastapi_app(app) -``` +| 变量 | 默认值 | 说明 | 
+|------|--------|------| +| `ENABLE_TELEMETRY` | `false` | 启用/禁用监控 | +| `OTEL_SERVICE_NAME` | `nexent-backend` | 服务标识 | +| `OTEL_EXPORTER_OTLP_ENDPOINT` | `http://localhost:4318` | OTLP 接收端点 | +| `OTEL_EXPORTER_OTLP_PROTOCOL` | `http` | 协议:`http` 或 `grpc` | +| `OTEL_EXPORTER_OTLP_HEADERS` | (空) | 认证头(逗号分隔) | -### 🔒 自动启停设计 +## 代码集成 -- **智能监控**: 根据 `ENABLE_TELEMETRY` 环境变量自动启停 -- **零感知使用**: 外部代码无需检查监控状态,直接使用所有功能 -- **优雅降级**: 未开启时静默无效果,开启时正常工作 -- **默认关闭**: 未配置时自动视为关闭状态 +### 端点监控 -```bash -# 开启监控 -export ENABLE_TELEMETRY=true +```python +from utils.monitoring import monitoring_manager -# 关闭监控 -export ENABLE_TELEMETRY=false +@monitoring_manager.monitor_endpoint("my_service.my_function") +async def my_api_function(): + return {"status": "ok"} ``` -## 📊 核心监控指标 +### LLM 调用监控 -| 指标 | 描述 | 重要性 | -|------|------|-------| -| `llm_token_generation_rate` | Token 生成速度 (tokens/s) | ⭐⭐⭐ | -| `llm_time_to_first_token_seconds` | 首 Token 延迟 | ⭐⭐⭐ | -| `llm_request_duration_seconds` | 完整请求耗时 | ⭐⭐⭐ | -| `llm_total_tokens` | 输入/输出 Token 数量 | ⭐⭐ | -| `llm_error_count` | LLM 调用错误数 | ⭐⭐⭐ | +```python +@monitoring_manager.monitor_llm_call("gpt-4", "chat_completion") +def call_llm(messages): + return llm_response +``` -## 🔧 环境配置 +### Agent 步骤追踪 -```bash -# 添加到 .env 文件 -cat >> .env << EOF -ENABLE_TELEMETRY=true -SERVICE_NAME=nexent-backend -JAEGER_ENDPOINT=http://localhost:14268/api/traces -LLM_SLOW_REQUEST_THRESHOLD_SECONDS=5.0 -LLM_SLOW_TOKEN_RATE_THRESHOLD=10.0 -TELEMETRY_SAMPLE_RATE=1.0 # 开发环境,生产环境推荐 0.1 -EOF +```python +with monitoring_manager.trace_agent_step("web_search", "research_agent", "tool_call") as span: + result = execute_tool() + monitoring_manager.set_tool_output(result) ``` -## 🛠️ 验证系统 +### 工具调用追踪 -```bash -# 检查指标端点 -curl http://localhost:8000/metrics - -# 验证依赖安装 -python -c "from backend.utils.monitoring import MONITORING_AVAILABLE; print(f'监控可用: {MONITORING_AVAILABLE}')" +```python +with monitoring_manager.trace_tool_call("web_search", "agent_name", {"query": "test"}) as span: + results = search_web("test") + monitoring_manager.set_tool_output({"results": results}) ``` -## 🆘 故障排除 - -### 监控数据为空? -```bash -# 检查服务状态 -docker-compose -f docker/docker-compose-monitoring.yml ps +## OpenInference 语义属性 -# 检查依赖安装 -python -c "import opentelemetry; print('✅ 监控依赖已安装')" -``` +系统使用 OpenInference 语义约定,专为 AI 可观测性设计: -### 端口冲突? -```bash -# 检查端口占用 -lsof -i :3005 -i :9090 -i :16686 -``` +### LLM 属性 -### 依赖安装问题? -```bash -# 重新安装性能依赖 -uv sync --extra performance +| 属性 | 说明 | +|------|------| +| `llm.model_name` | 模型标识(如 `gpt-4`) | +| `llm.operation.name` | 操作类型(如 `chat_completion`) | +| `llm.token_count.prompt` | 输入 Token 数 | +| `llm.token_count.completion` | 输出 Token 数 | +| `llm.invocation_parameters` | 模型参数(JSON) | +| `llm.time_to_first_token` | TTFT(秒) | -# 检查 pyproject.toml 中的 performance 配置 -cat backend/pyproject.toml | grep -A 20 "performance" -``` +### Agent 属性 -### 服务名显示为 unknown_service? 
-```bash -# 检查环境变量配置 -echo "SERVICE_NAME: $SERVICE_NAME" +| 属性 | 说明 | +|------|------| +| `agent.name` | Agent 标识 | +| `agent.step.name` | 步骤名称(如 `web_search`) | +| `agent.step.type` | 步骤类型:`tool_call`、`reasoning`、`action_selection` | +| `agent.tool.name` | 工具名称 | +| `agent.tool.input` | 工具输入(JSON) | +| `agent.tool.output` | 工具输出(JSON) | -# 重启监控服务以应用新配置 -./docker/start-monitoring.sh -``` +## 指标 -## 🧹 数据管理 +| 指标 | 说明 | +|------|------| +| `llm.request.duration` | 请求延迟 | +| `llm.token.generation_rate` | Token 生成速率 | +| `llm.time_to_first_token` | TTFT | +| `llm.token_count.prompt` | 输入 Token | +| `llm.token_count.completion` | 输出 Token | +| `agent.step.count` | Agent 步骤数 | +| `agent.execution.duration` | Agent 执行时间 | +| `agent.error.count` | Agent 错误数 | -### 清理 Jaeger 追踪数据 -```bash -# 方法1: 重启 Jaeger 容器(最简单) -docker-compose -f docker/docker-compose-monitoring.yml restart nexent-jaeger +## Collector 配置 -# 方法2: 完全重建 Jaeger 容器和数据 -docker-compose -f docker/docker-compose-monitoring.yml stop nexent-jaeger -docker-compose -f docker/docker-compose-monitoring.yml rm -f nexent-jaeger -docker-compose -f docker/docker-compose-monitoring.yml up -d nexent-jaeger +OpenTelemetry Collector 将数据路由到选定的后端: -# 方法3: 清理所有监控数据(重建所有容器) -docker-compose -f docker/docker-compose-monitoring.yml down -docker-compose -f docker/docker-compose-monitoring.yml up -d +```yaml +exporters: + otlp: + endpoint: ${OTEL_EXPORTER_OTLP_ENDPOINT} + headers: + authorization: ${OTEL_EXPORTER_OTLP_HEADERS} ``` -### 清理 Prometheus 指标数据 -```bash -# 重启 Prometheus 容器 -docker-compose -f docker/docker-compose-monitoring.yml restart nexent-prometheus +完整配置见 `docker/monitoring/otel-collector-config.yml`。 -# 完全清理 Prometheus 数据 -docker-compose -f docker/docker-compose-monitoring.yml stop nexent-prometheus -docker volume rm docker_prometheus_data 2>/dev/null || true -docker-compose -f docker/docker-compose-monitoring.yml up -d nexent-prometheus -``` +## 优雅降级 -### 清理 Grafana 配置 -```bash -# 重置 Grafana 配置和仪表板 -docker-compose -f docker/docker-compose-monitoring.yml stop nexent-grafana -docker volume rm docker_grafana_data 2>/dev/null || true -docker-compose -f docker/docker-compose-monitoring.yml up -d nexent-grafana +未安装 OpenTelemetry 依赖时,监控自动禁用: + +```python +pip install nexent # 基础包 - 无监控 +pip install nexent[performance] # 包含 OTLP 支持 ``` -## 📈 典型问题分析 +禁用时所有监控方法均正常工作 - 装饰器透传,上下文管理器返回 None。 -### Token 生成速度慢 (< 5 tokens/s) -1. **分析**: Grafana → Token Generation Rate 面板 -2. **解决**: 检查模型服务负载、优化输入 prompt 长度 +## 故障排除 -### 请求响应慢 (> 10s) -1. **分析**: Jaeger → 查看完整链路追踪 -2. **解决**: 定位瓶颈环节(数据库/LLM/网络) +### 数据未显示 -### 错误率突增 (> 10%) -1. **分析**: Prometheus → llm_error_count 指标 -2. **解决**: 检查模型服务可用性、验证 API 密钥 +1. 检查 `.env` 中 `ENABLE_TELEMETRY=true` +2. 验证 OTLP 端点可访问 +3. 检查认证头配置正确 -## 🎉 开始使用 +### 连接错误 -设置完成后你可以: +1. 测试端点:`curl -v $OTEL_EXPORTER_OTLP_ENDPOINT/v1/traces` +2. 确认协议匹配端点(`http` vs `grpc`) +3. 查看 Collector 日志:`docker logs nexent-otel-collector` -1. 📊 在 Grafana 中查看 **LLM Performance Dashboard** -2. 🔍 在 Jaeger 中追踪每个请求的完整链路 -3. 📈 分析 Token 生成速度和性能瓶颈 -4. 🚨 设置性能告警和阈值 +### 属性错误 -享受高效的 LLM 性能监控! 🚀 +1. 在平台 UI 中验证 OpenInference 属性 +2. 检查 Span 属性命名:使用 `llm.model_name` 而非 `model_name` +3. 
查看平台特定属性要求 \ No newline at end of file diff --git a/docker/.env.example b/docker/.env.example index 888609e04..9bc680feb 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -152,11 +152,27 @@ WORKER_CONCURRENCY=4 # Skills Configuration SKILLS_PATH=/mnt/nexent/skills -# Telemetry and Monitoring Configuration +# Telemetry and Monitoring Configuration (OTLP Protocol) +# Enable OpenTelemetry monitoring for agent observability ENABLE_TELEMETRY=false -SERVICE_NAME=nexent-backend -JAEGER_ENDPOINT=http://localhost:14268/api/traces -PROMETHEUS_PORT=8000 + +# Service name for identifying traces in observability platforms +OTEL_SERVICE_NAME=nexent-backend + +# OTLP endpoint - can be: +# - http://otel-collector:4318 (through OpenTelemetry Collector) +# - Direct connection to Arize Phoenix: https://phoenix.arize.com/v1/traces +# - Direct connection to Langfuse: https://cloud.langfuse.com/api/public/otel/v1/traces +OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 + +# Protocol: "http" or "grpc" +OTEL_EXPORTER_OTLP_PROTOCOL=http + +# Authentication headers (format: key1=value1,key2=value2) +# For Arize Phoenix: x-api-key=YOUR_API_KEY +# For Langfuse: Authorization=Basic base64(public_key:secret_key) +OTEL_EXPORTER_OTLP_HEADERS= + TELEMETRY_SAMPLE_RATE=1.0 LLM_SLOW_REQUEST_THRESHOLD_SECONDS=5.0 LLM_SLOW_TOKEN_RATE_THRESHOLD=10.0 diff --git a/docker/deploy.sh b/docker/deploy.sh index e30e6e75a..4f6e91549 100755 --- a/docker/deploy.sh +++ b/docker/deploy.sh @@ -1041,7 +1041,7 @@ main_deploy() { echo "--------------------------------" echo "" - APP_VERSION="$(get_app_version)" + APP_VERSION="latest" if [ -z "$APP_VERSION" ]; then echo "❌ Failed to get app version, please check the backend/consts/const.py file" exit 1 diff --git a/docker/docker-compose-monitoring.yml b/docker/docker-compose-monitoring.yml index fb4aa5eaf..20cadb0a9 100644 --- a/docker/docker-compose-monitoring.yml +++ b/docker/docker-compose-monitoring.yml @@ -1,65 +1,4 @@ services: - # Jaeger - Distributed Tracing - jaeger: - image: jaegertracing/all-in-one:1.52 - container_name: nexent-jaeger - ports: - - "16686:16686" # Jaeger UI - - "14268:14268" # Jaeger collector HTTP - - "14250:14250" # Jaeger collector gRPC - - "6831:6831/udp" # Agent UDP - - "6832:6832/udp" # Agent UDP - environment: - - COLLECTOR_OTLP_ENABLED=true - - COLLECTOR_ZIPKIN_HOST_PORT=:9411 - networks: - - nexent-network - restart: unless-stopped - volumes: - - jaeger-data:/tmp - - # Prometheus - Metrics Collection - prometheus: - image: prom/prometheus:v2.48.0 - container_name: nexent-prometheus - ports: - - "9090:9090" - command: - - '--config.file=/etc/prometheus/prometheus.yml' - - '--storage.tsdb.path=/prometheus' - - '--web.console.libraries=/etc/prometheus/console_libraries' - - '--web.console.templates=/etc/prometheus/consoles' - - '--storage.tsdb.retention.time=15d' - - '--web.enable-lifecycle' - - '--web.enable-admin-api' - volumes: - - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml - - prometheus-data:/prometheus - networks: - - nexent-network - restart: unless-stopped - - # Grafana - Metrics Visualization - grafana: - image: grafana/grafana:10.2.0 - container_name: nexent-grafana - ports: - - "3005:3000" - environment: - - GF_SECURITY_ADMIN_PASSWORD=admin - - GF_USERS_ALLOW_SIGN_UP=false - - GF_INSTALL_PLUGINS=grafana-piechart-panel - volumes: - - grafana-data:/var/lib/grafana - - ./monitoring/grafana/provisioning:/etc/grafana/provisioning - - ./monitoring/grafana/dashboards:/var/lib/grafana/dashboards - networks: - - nexent-network - 
restart: unless-stopped - depends_on: - - prometheus - - # OpenTelemetry Collector (Optional - for advanced setups) otel-collector: image: otel/opentelemetry-collector-contrib:0.89.0 container_name: nexent-otel-collector @@ -67,22 +6,12 @@ services: volumes: - ./monitoring/otel-collector-config.yml:/etc/otel-collector-config.yml ports: - - "4317:4317" # OTLP gRPC receiver - - "4318:4318" # OTLP HTTP receiver - - "8888:8888" # Prometheus metrics exposed by the collector - - "8889:8889" # Prometheus exporter metrics - depends_on: - - jaeger - - prometheus + - "4317:4317" + - "4318:4318" networks: - nexent-network restart: unless-stopped -volumes: - jaeger-data: - prometheus-data: - grafana-data: - networks: nexent-network: - external: true + external: true \ No newline at end of file diff --git a/docker/monitoring/grafana/dashboards/nexent-llm-performance.json b/docker/monitoring/grafana/dashboards/nexent-llm-performance.json deleted file mode 100644 index ec8d0434a..000000000 --- a/docker/monitoring/grafana/dashboards/nexent-llm-performance.json +++ /dev/null @@ -1,544 +0,0 @@ -{ - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "grafana", - "uid": "-- Grafana --" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 0, - "id": null, - "links": [], - "liveNow": false, - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "vis": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 0 - }, - "id": 1, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "expr": "histogram_quantile(0.95, rate(llm_request_duration_seconds_bucket[5m]))", - "interval": "", - "legendFormat": "95th percentile", - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "expr": "histogram_quantile(0.50, rate(llm_request_duration_seconds_bucket[5m]))", - "interval": "", - "legendFormat": "50th percentile (median)", - "refId": "B" - } - ], - "title": "LLM Request Duration", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - 
"gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "vis": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "tokens/s" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 0 - }, - "id": 2, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "expr": "histogram_quantile(0.95, rate(llm_token_generation_rate_bucket[5m]))", - "interval": "", - "legendFormat": "95th percentile", - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "expr": "histogram_quantile(0.50, rate(llm_token_generation_rate_bucket[5m]))", - "interval": "", - "legendFormat": "50th percentile (median)", - "refId": "B" - } - ], - "title": "Token Generation Rate", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "vis": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 8 - }, - "id": 3, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "expr": "histogram_quantile(0.95, rate(llm_time_to_first_token_seconds_bucket[5m]))", - "interval": "", - "legendFormat": "95th percentile TTFT", - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "expr": "histogram_quantile(0.50, rate(llm_time_to_first_token_seconds_bucket[5m]))", - "interval": "", - "legendFormat": "50th percentile TTFT", - "refId": "B" - } - ], - "title": "Time to First Token (TTFT)", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "vis": false - }, - 
"lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "tokens" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 8 - }, - "id": 4, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "expr": "rate(llm_total_tokens_total{type=\"input\"}[5m])", - "interval": "", - "legendFormat": "Input tokens/sec", - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "expr": "rate(llm_total_tokens_total{type=\"output\"}[5m])", - "interval": "", - "legendFormat": "Output tokens/sec", - "refId": "B" - } - ], - "title": "Token Throughput", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "vis": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "errors/sec" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 16 - }, - "id": 5, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "expr": "rate(llm_error_count_total[5m])", - "interval": "", - "legendFormat": "Error rate by model: {{model}}", - "refId": "A" - } - ], - "title": "LLM Error Rate", - "type": "timeseries" - } - ], - "refresh": "5s", - "schemaVersion": 37, - "style": "dark", - "tags": ["nexent", "llm", "performance"], - "templating": { - "list": [] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": {}, - "timezone": "", - "title": "Nexent LLM Performance Dashboard", - "uid": "nexent-llm-perf", - "version": 1, - "weekStart": "" -} - diff --git a/docker/monitoring/grafana/provisioning/dashboards/dashboards.yml b/docker/monitoring/grafana/provisioning/dashboards/dashboards.yml deleted file mode 100644 index b89a1fa81..000000000 --- a/docker/monitoring/grafana/provisioning/dashboards/dashboards.yml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: 1 - -providers: - - name: 'Nexent LLM Monitoring' - orgId: 1 - folder: 'Nexent' - type: file - disableDeletion: false - updateIntervalSeconds: 10 - allowUiUpdates: true - options: - path: 
/var/lib/grafana/dashboards - diff --git a/docker/monitoring/grafana/provisioning/datasources/datasources.yml b/docker/monitoring/grafana/provisioning/datasources/datasources.yml deleted file mode 100644 index 9bdc40d61..000000000 --- a/docker/monitoring/grafana/provisioning/datasources/datasources.yml +++ /dev/null @@ -1,16 +0,0 @@ -apiVersion: 1 - -datasources: - - name: Prometheus - type: prometheus - access: proxy - url: http://prometheus:9090 - isDefault: true - editable: true - - - name: Jaeger - type: jaeger - access: proxy - url: http://jaeger:16686 - editable: true - diff --git a/docker/monitoring/monitoring.env b/docker/monitoring/monitoring.env index 2506c03a6..ec88b61f2 100644 --- a/docker/monitoring/monitoring.env +++ b/docker/monitoring/monitoring.env @@ -1,21 +1,12 @@ -# Telemetry and Monitoring Configuration ENABLE_TELEMETRY=true -SERVICE_NAME=nexent-backend -JAEGER_ENDPOINT=http://localhost:14268/api/traces -PROMETHEUS_PORT=8000 +OTEL_SERVICE_NAME=nexent-backend +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 +OTEL_EXPORTER_OTLP_PROTOCOL=http +OTEL_EXPORTER_OTLP_HEADERS= TELEMETRY_SAMPLE_RATE=1.0 -# Performance monitoring thresholds LLM_SLOW_REQUEST_THRESHOLD_SECONDS=5.0 LLM_SLOW_TOKEN_RATE_THRESHOLD=10.0 -# Grafana Configuration -GF_SECURITY_ADMIN_PASSWORD=admin -GF_USERS_ALLOW_SIGN_UP=false - -# Service ports -JAEGER_UI_PORT=16686 -PROMETHEUS_UI_PORT=9090 -GRAFANA_UI_PORT=3000 OTEL_COLLECTOR_GRPC_PORT=4317 -OTEL_COLLECTOR_HTTP_PORT=4318 +OTEL_COLLECTOR_HTTP_PORT=4318 \ No newline at end of file diff --git a/docker/monitoring/monitoring.env.example b/docker/monitoring/monitoring.env.example index 26ab041c8..ec88b61f2 100644 --- a/docker/monitoring/monitoring.env.example +++ b/docker/monitoring/monitoring.env.example @@ -1,22 +1,12 @@ -# Telemetry and Monitoring Configuration ENABLE_TELEMETRY=true -SERVICE_NAME=nexent-backend -JAEGER_ENDPOINT=http://localhost:14268/api/traces -PROMETHEUS_PORT=8000 +OTEL_SERVICE_NAME=nexent-backend +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 +OTEL_EXPORTER_OTLP_PROTOCOL=http +OTEL_EXPORTER_OTLP_HEADERS= TELEMETRY_SAMPLE_RATE=1.0 -# Performance monitoring thresholds LLM_SLOW_REQUEST_THRESHOLD_SECONDS=5.0 LLM_SLOW_TOKEN_RATE_THRESHOLD=10.0 -# Grafana Configuration -GF_SECURITY_ADMIN_PASSWORD=admin -GF_USERS_ALLOW_SIGN_UP=false - -# Service ports -JAEGER_UI_PORT=16686 -PROMETHEUS_UI_PORT=9090 -GRAFANA_UI_PORT=3000 OTEL_COLLECTOR_GRPC_PORT=4317 -OTEL_COLLECTOR_HTTP_PORT=4318 - +OTEL_COLLECTOR_HTTP_PORT=4318 \ No newline at end of file diff --git a/docker/monitoring/otel-collector-config.yml b/docker/monitoring/otel-collector-config.yml index f14f427b5..c8d16f98e 100644 --- a/docker/monitoring/otel-collector-config.yml +++ b/docker/monitoring/otel-collector-config.yml @@ -5,22 +5,16 @@ receivers: endpoint: 0.0.0.0:4317 http: endpoint: 0.0.0.0:4318 - - # Prometheus receiver to collect metrics from instrumented apps - prometheus: - config: - scrape_configs: - - job_name: 'nexent-backend-otel' - static_configs: - - targets: ['host.docker.internal:8000'] - scrape_interval: 5s processors: batch: timeout: 1s send_batch_size: 512 - # Resource processor to add common attributes + memory_limiter: + limit_mib: 256 + check_interval: 1s + resource: attributes: - key: service.name @@ -30,51 +24,65 @@ processors: from_attribute: version action: insert - # Memory limiter to prevent OOM - memory_limiter: - limit_mib: 256 - check_interval: 1s - - # Add attributes specifically for LLM monitoring - attributes: - actions: - - key: llm.system - 
value: openai - action: insert - - key: deployment.environment - value: development - action: insert - exporters: - # Export traces to Jaeger via OTLP - otlp/jaeger: - endpoint: jaeger:14250 - tls: - insecure: true - - # Export metrics to Prometheus - prometheus: - endpoint: "0.0.0.0:8889" - resource_to_telemetry_conversion: - enabled: true - - # Logging exporter for debugging + otlp: + endpoint: ${OTEL_EXPORTER_OTLP_ENDPOINT:-http://localhost:4318} + headers: + authorization: ${OTEL_EXPORTER_OTLP_HEADERS:-} + logging: verbosity: normal service: - extensions: [] pipelines: traces: receivers: [otlp] processors: [memory_limiter, resource, batch] - exporters: [otlp/jaeger, logging] + exporters: [otlp, logging] metrics: - receivers: [otlp, prometheus] - processors: [memory_limiter, resource, attributes, batch] - exporters: [prometheus, logging] + receivers: [otlp] + processors: [memory_limiter, resource, batch] + exporters: [otlp, logging] telemetry: logs: level: "info" + +# Example configurations for AI observability platforms: +# +# === Arize Phoenix === +# Set environment variables: +# OTEL_EXPORTER_OTLP_ENDPOINT=https://phoenix.arize.com/v1 +# OTEL_EXPORTER_OTLP_HEADERS=x-api-key=YOUR_PHOENIX_API_KEY +# +# Or configure directly in exporters section: +# otlp/arize: +# endpoint: https://phoenix.arize.com/v1 +# headers: +# x-api-key: YOUR_PHOENIX_API_KEY +# +# === Langfuse === +# Set environment variables: +# OTEL_EXPORTER_OTLP_ENDPOINT=https://cloud.langfuse.com/api/public/otel +# OTEL_EXPORTER_OTLP_HEADERS=Authorization=Basic BASE64_ENCODED_KEY +# +# Where BASE64_ENCODED_KEY = base64(public_key:secret_key) +# +# Or configure directly: +# otlp/langfuse: +# endpoint: https://cloud.langfuse.com/api/public/otel/v1 +# headers: +# Authorization: Basic BASE64_ENCODED_KEY +# +# === Local Jaeger (OTLP) === +# For gradual migration, you can still use Jaeger via OTLP: +# otlp/jaeger: +# endpoint: jaeger:4317 +# tls: +# insecure: true +# +# === Multiple Exporters === +# To export to multiple backends simultaneously, create multiple exporters +# and add them to the pipelines: +# exporters: [otlp/arize, otlp/langfuse, logging] \ No newline at end of file diff --git a/docker/monitoring/prometheus.yml b/docker/monitoring/prometheus.yml deleted file mode 100644 index 49258c097..000000000 --- a/docker/monitoring/prometheus.yml +++ /dev/null @@ -1,39 +0,0 @@ -global: - scrape_interval: 15s - evaluation_interval: 15s - -rule_files: - # Load rules once and periodically evaluate them according to the global 'evaluation_interval'. 
- - "nexent_alerts.yml" - -scrape_configs: - # Nexent Backend - LLM Metrics - - job_name: 'nexent-backend' - static_configs: - - targets: ['host.docker.internal:8000'] # Adjust based on your backend service - scrape_interval: 15s - metrics_path: /metrics - scrape_timeout: 10s - - # OpenTelemetry Collector - - job_name: 'otel-collector' - static_configs: - - targets: ['otel-collector:8888'] - scrape_interval: 10s - - # Prometheus self-monitoring - - job_name: 'prometheus' - static_configs: - - targets: ['localhost:9090'] - - # Jaeger Metrics - - job_name: 'jaeger' - static_configs: - - targets: ['jaeger:14269'] - -# Alertmanager configuration (optional) -# alerting: -# alertmanagers: -# - static_configs: -# - targets: -# - alertmanager:9093 diff --git a/sdk/nexent/monitor/__init__.py b/sdk/nexent/monitor/__init__.py index a0216d382..02a4ff1b4 100644 --- a/sdk/nexent/monitor/__init__.py +++ b/sdk/nexent/monitor/__init__.py @@ -1,12 +1,24 @@ """ Nexent Monitor Package - LLM Performance Monitoring System -A comprehensive monitoring solution specifically designed for LLM applications. -Provides distributed tracing, token-level performance monitoring, and seamless -integration with OpenTelemetry, Jaeger, Prometheus, and Grafana. +A comprehensive monitoring solution using OpenTelemetry OTLP protocol. +Provides distributed tracing, token-level performance monitoring, and seamless +integration with AI observability platforms like Arize Phoenix and Langfuse. """ -from .monitoring import * - -__version__ = "0.1.0" +from .monitoring import ( + MonitoringConfig, + MonitoringManager, + LLMTokenTracker, + get_monitoring_manager, + is_opentelemetry_available, +) +__version__ = "0.2.0" +__all__ = [ + 'MonitoringConfig', + 'MonitoringManager', + 'LLMTokenTracker', + 'get_monitoring_manager', + 'is_opentelemetry_available', +] \ No newline at end of file diff --git a/sdk/nexent/monitor/monitoring.py b/sdk/nexent/monitor/monitoring.py index 7163a69cc..b108e7c01 100644 --- a/sdk/nexent/monitor/monitoring.py +++ b/sdk/nexent/monitor/monitoring.py @@ -2,8 +2,9 @@ Nexent LLM Performance Monitoring System A comprehensive monitoring solution specifically designed for LLM applications. -Provides distributed tracing, token-level performance monitoring, and seamless -integration with OpenTelemetry, Jaeger, Prometheus, and Grafana. +Provides distributed tracing, token-level performance monitoring, and seamless +integration with OpenTelemetry OTLP protocol for AI observability platforms +like Arize Phoenix, Langfuse, and others. This module uses a singleton pattern for consistent monitoring across the SDK. 
When OpenTelemetry dependencies are not available, the module gracefully degrades @@ -17,24 +18,29 @@ # Optional OpenTelemetry imports - gracefully handle missing dependencies try: from opentelemetry.trace.status import Status, StatusCode - from opentelemetry.exporter.prometheus import PrometheusMetricReader + from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter as OTLPSpanExporterHTTP + from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter as OTLPSpanExporterGRPC + from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter as OTLPMetricExporterHTTP + from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter as OTLPMetricExporterGRPC from opentelemetry.sdk.metrics import MeterProvider + from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader from opentelemetry.sdk.trace.export import BatchSpanProcessor from opentelemetry.sdk.trace import TracerProvider from opentelemetry.instrumentation.requests import RequestsInstrumentor from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor - from opentelemetry.exporter.jaeger.thrift import JaegerExporter from opentelemetry import trace, metrics from opentelemetry.sdk.resources import Resource OPENTELEMETRY_AVAILABLE = True except ImportError: OPENTELEMETRY_AVAILABLE = False + import logging import time import functools +import json from contextlib import contextmanager from typing import Any, Dict, Optional, Callable, TypeVar, cast, Iterator -from dataclasses import dataclass +from dataclasses import dataclass, field logger = logging.getLogger(__name__) @@ -45,13 +51,20 @@ def is_opentelemetry_available() -> bool: """Check if OpenTelemetry dependencies are available.""" return OPENTELEMETRY_AVAILABLE + @dataclass class MonitoringConfig: - """Configuration for monitoring system.""" + """ + Configuration for monitoring system using OTLP protocol. + + Supports HTTP and gRPC protocols for exporting traces and metrics + to any OpenTelemetry-compatible backend (Arize Phoenix, Langfuse, etc). + """ enable_telemetry: bool = False - service_name: str = "nexent-sdk" - jaeger_endpoint: str = "http://localhost:14268/api/traces" - prometheus_port: int = 8000 + service_name: str = "nexent-backend" + otlp_endpoint: str = "http://localhost:4318" + otlp_protocol: str = "http" # "http" or "grpc" + otlp_headers: Dict[str, str] = field(default_factory=dict) telemetry_sample_rate: float = 1.0 llm_slow_request_threshold_seconds: float = 5.0 llm_slow_token_rate_threshold: float = 10.0 @@ -64,6 +77,13 @@ def __post_init__(self): "Install with: pip install nexent[performance]" ) self.enable_telemetry = False + + # Validate protocol + if self.otlp_protocol not in ("http", "grpc"): + logger.warning( + f"Invalid OTLP protocol '{self.otlp_protocol}'. Using 'http'." 
+ ) + self.otlp_protocol = "http" class MonitoringManager: @@ -87,13 +107,19 @@ def __init__(self): self._tracer: Optional[Any] = None self._meter: Optional[Any] = None - # LLM-specific metrics + # LLM-specific metrics (OpenInference semantics) self._llm_request_duration: Optional[Any] = None self._llm_token_generation_rate: Optional[Any] = None self._llm_ttft_duration: Optional[Any] = None - self._llm_total_tokens: Optional[Any] = None + self._llm_token_count_prompt: Optional[Any] = None + self._llm_token_count_completion: Optional[Any] = None self._llm_error_count: Optional[Any] = None + # Agent-specific metrics (OpenInference semantics) + self._agent_step_count: Optional[Any] = None + self._agent_execution_duration: Optional[Any] = None + self._agent_error_count: Optional[Any] = None + self._initialized = True logger.info("MonitoringManager singleton created") @@ -101,13 +127,15 @@ def configure(self, config: MonitoringConfig) -> None: """Configure the monitoring system.""" self._config = config logger.info( - f"Monitoring configured: enabled={config.enable_telemetry}, service={config.service_name}") + f"Monitoring configured: enabled={config.enable_telemetry}, " + f"service={config.service_name}, protocol={config.otlp_protocol}" + ) if config.enable_telemetry: - self._init_telemetry() + self._init_telemetry_otlp() - def _init_telemetry(self) -> None: - """Initialize OpenTelemetry tracing and metrics.""" + def _init_telemetry_otlp(self) -> None: + """Initialize OpenTelemetry tracing and metrics with OTLP exporters.""" if not self._config or not self._config.enable_telemetry: logger.info("Telemetry is disabled by configuration") return @@ -120,75 +148,142 @@ def _init_telemetry(self) -> None: return try: - # Setup tracing with proper service name resource + # Setup resource with service name resource = Resource.create({ "service.name": self._config.service_name, "service.version": "1.0.0", "service.instance.id": "nexent-instance-1" }) + + # Initialize TracerProvider with OTLP exporter self._tracer_provider = TracerProvider(resource=resource) trace.set_tracer_provider(self._tracer_provider) - # Jaeger exporter - jaeger_exporter = JaegerExporter( - agent_host_name="localhost", - agent_port=14268, - collector_endpoint=self._config.jaeger_endpoint, - ) + # Choose exporter based on protocol + if self._config.otlp_protocol == "grpc": + span_exporter = OTLPSpanExporterGRPC( + endpoint=self._config.otlp_endpoint, + headers=self._config.otlp_headers + ) + else: + # HTTP protocol (default) + # For HTTP, append /v1/traces to endpoint if not already present + trace_endpoint = self._config.otlp_endpoint + if not trace_endpoint.endswith("/v1/traces"): + trace_endpoint = trace_endpoint.rstrip("/") + "/v1/traces" + span_exporter = OTLPSpanExporterHTTP( + endpoint=trace_endpoint, + headers=self._config.otlp_headers + ) - span_processor = BatchSpanProcessor(jaeger_exporter) + # BatchSpanProcessor for efficient export + span_processor = BatchSpanProcessor( + span_exporter, + max_queue_size=512, + schedule_delay_millis=1000, # 1 second + max_export_batch_size=512 + ) self._tracer_provider.add_span_processor(span_processor) - # Setup metrics with Prometheus exporter - prometheus_reader = PrometheusMetricReader() + # Initialize MeterProvider with OTLP exporter + if self._config.otlp_protocol == "grpc": + metric_exporter = OTLPMetricExporterGRPC( + endpoint=self._config.otlp_endpoint, + headers=self._config.otlp_headers + ) + else: + # HTTP protocol + metric_endpoint = self._config.otlp_endpoint + if not 
metric_endpoint.endswith("/v1/metrics"): + metric_endpoint = metric_endpoint.rstrip("/") + "/v1/metrics" + metric_exporter = OTLPMetricExporterHTTP( + endpoint=metric_endpoint, + headers=self._config.otlp_headers + ) + + # PeriodicExportingMetricReader for batch export + metric_reader = PeriodicExportingMetricReader( + exporter=metric_exporter, + export_interval_millis=60000 # 60 seconds + ) + self._meter_provider = MeterProvider( resource=resource, - metric_readers=[prometheus_reader]) + metric_readers=[metric_reader] + ) metrics.set_meter_provider(self._meter_provider) # Get tracer and meter instances self._tracer = trace.get_tracer(self._config.service_name) self._meter = metrics.get_meter(self._config.service_name) - # Create LLM-specific metrics + # Create LLM-specific metrics (OpenInference semantic conventions) self._llm_request_duration = self._meter.create_histogram( - name="llm_request_duration_seconds", + name="llm.request.duration", description="Duration of LLM requests in seconds", unit="s" ) self._llm_token_generation_rate = self._meter.create_histogram( - name="llm_token_generation_rate", + name="llm.token.generation_rate", description="Token generation rate (tokens per second)", unit="tokens/s" ) self._llm_ttft_duration = self._meter.create_histogram( - name="llm_time_to_first_token_seconds", + name="llm.time_to_first_token", description="Time to first token (TTFT) in seconds", unit="s" ) - self._llm_total_tokens = self._meter.create_counter( - name="llm_total_tokens", - description="Total tokens processed", + self._llm_token_count_prompt = self._meter.create_counter( + name="llm.token_count.prompt", + description="Number of prompt/input tokens", + unit="tokens" + ) + + self._llm_token_count_completion = self._meter.create_counter( + name="llm.token_count.completion", + description="Number of completion/output tokens", unit="tokens" ) self._llm_error_count = self._meter.create_counter( - name="llm_error_count", + name="llm.error.count", description="Number of LLM errors", unit="errors" ) + # Create Agent-specific metrics (OpenInference semantic conventions) + self._agent_step_count = self._meter.create_counter( + name="agent.step.count", + description="Number of agent execution steps", + unit="steps" + ) + + self._agent_execution_duration = self._meter.create_histogram( + name="agent.execution.duration", + description="Duration of agent execution in seconds", + unit="s" + ) + + self._agent_error_count = self._meter.create_counter( + name="agent.error.count", + description="Number of agent execution errors", + unit="errors" + ) + # Auto-instrument other libraries RequestsInstrumentor().instrument() logger.info( - f"Telemetry initialized successfully for service: {self._config.service_name}") + f"OTLP telemetry initialized successfully for service: {self._config.service_name}, " + f"endpoint: {self._config.otlp_endpoint}, protocol: {self._config.otlp_protocol}" + ) except Exception as e: - logger.error(f"Failed to initialize telemetry: {str(e)}") + logger.error(f"Failed to initialize OTLP telemetry: {str(e)}") + # Do not raise - allow application to continue without monitoring @property def is_enabled(self) -> bool: @@ -208,7 +303,8 @@ def setup_fastapi_app(self, app) -> bool: if self.is_enabled and app and OPENTELEMETRY_AVAILABLE: FastAPIInstrumentor.instrument_app(app) logger.info( - "FastAPI application monitoring initialized successfully") + "FastAPI application monitoring initialized successfully" + ) return True elif not OPENTELEMETRY_AVAILABLE: logger.warning( @@ 
-222,18 +318,25 @@ def setup_fastapi_app(self, app) -> bool: @contextmanager def trace_llm_request(self, operation_name: str, model_name: str, **attributes: Any) -> Iterator[Optional[Any]]: - """Context manager for tracing LLM requests with comprehensive metrics.""" + """ + Context manager for tracing LLM requests with comprehensive metrics. + Uses OpenInference semantic conventions for attribute naming. + """ if not self.is_enabled or not OPENTELEMETRY_AVAILABLE or not self._tracer: yield None return + # OpenInference semantic attributes + openinference_attrs = { + "llm.model_name": model_name, + "llm.operation.name": operation_name, + } + # Add user-provided attributes + openinference_attrs.update(attributes) + with self._tracer.start_as_current_span( operation_name, - attributes={ - "llm.model_name": model_name, - "llm.operation": operation_name, - **attributes - } + attributes=openinference_attrs ) as span: start_time = time.time() try: @@ -242,13 +345,143 @@ def trace_llm_request(self, operation_name: str, model_name: str, **attributes: span.set_status(Status(StatusCode.ERROR, str(e))) if self._llm_error_count: self._llm_error_count.add( - 1, {"model": model_name, "operation": operation_name}) + 1, {"llm.model_name": model_name, "llm.operation.name": operation_name} + ) raise finally: duration = time.time() - start_time if self._llm_request_duration: self._llm_request_duration.record( - duration, {"model": model_name, "operation": operation_name}) + duration, {"llm.model_name": model_name, "llm.operation.name": operation_name} + ) + + @contextmanager + def trace_agent_step(self, step_name: str, agent_name: str, step_type: str, **attributes: Any) -> Iterator[Optional[Any]]: + """ + Context manager for tracing Agent execution steps. + Uses OpenInference semantic conventions for attribute naming. + + Args: + step_name: Name of the step (e.g., "web_search", "reasoning_step_1") + agent_name: Name of the agent + step_type: Type of step - "tool_call", "reasoning", or "action_selection" + **attributes: Additional attributes to add to the span + """ + if not self.is_enabled or not OPENTELEMETRY_AVAILABLE or not self._tracer: + yield None + return + + # OpenInference semantic attributes for agent + openinference_attrs = { + "agent.name": agent_name, + "agent.step.name": step_name, + "agent.step.type": step_type, + } + openinference_attrs.update(attributes) + + span_name = f"agent.{step_name}" + + with self._tracer.start_as_current_span( + span_name, + attributes=openinference_attrs + ) as span: + start_time = time.time() + try: + yield span + except Exception as e: + span.set_status(Status(StatusCode.ERROR, str(e))) + if self._agent_error_count: + self._agent_error_count.add( + 1, {"agent.name": agent_name, "error.type": type(e).__name__} + ) + raise + finally: + duration = time.time() - start_time + if self._agent_step_count: + self._agent_step_count.add( + 1, {"agent.name": agent_name, "agent.step.type": step_type} + ) + + @contextmanager + def trace_tool_call(self, tool_name: str, agent_name: str, tool_input: Optional[Dict] = None, **attributes: Any) -> Iterator[Optional[Any]]: + """ + Context manager for tracing Agent tool calls. + Uses OpenInference semantic conventions for attribute naming. 
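A hedged usage sketch for the two agent tracing context managers defined here; the agent, step and tool names are made up for illustration, and set_tool_output is defined a little further below:

    from nexent.monitor import get_monitoring_manager

    manager = get_monitoring_manager()

    # Reasoning step: the yielded span is None when telemetry is disabled, so guard attribute writes.
    with manager.trace_agent_step("analyze_query", "planner_agent", "reasoning") as span:
        if span:
            span.set_attribute("agent.step.detail", "decompose user question")

    # Tool call: the input dict is JSON-serialized onto the span; attach the result via set_tool_output.
    with manager.trace_tool_call("web_search", "planner_agent", {"query": "nexent otlp"}):
        manager.set_tool_output({"results": ["..."]})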
+ + Args: + tool_name: Name of the tool being called + agent_name: Name of the agent making the call + tool_input: Input parameters for the tool (will be JSON serialized) + **attributes: Additional attributes to add to the span + """ + if not self.is_enabled or not OPENTELEMETRY_AVAILABLE or not self._tracer: + yield None + return + + # OpenInference semantic attributes for tool call + openinference_attrs = { + "agent.name": agent_name, + "agent.step.name": tool_name, + "agent.step.type": "tool_call", + "agent.tool.name": tool_name, + } + + # Add tool input as JSON string + if tool_input: + try: + openinference_attrs["agent.tool.input"] = json.dumps(tool_input, ensure_ascii=False) + except (TypeError, ValueError): + openinference_attrs["agent.tool.input"] = str(tool_input) + + openinference_attrs.update(attributes) + + span_name = f"agent.tool.{tool_name}" + + with self._tracer.start_as_current_span( + span_name, + attributes=openinference_attrs + ) as span: + start_time = time.time() + try: + yield span + except Exception as e: + span.set_status(Status(StatusCode.ERROR, str(e))) + span.set_attribute("error.type", type(e).__name__) + span.set_attribute("error.message", str(e)) + if self._agent_error_count: + self._agent_error_count.add( + 1, {"agent.name": agent_name, "error.type": type(e).__name__, "agent.tool.name": tool_name} + ) + raise + finally: + duration = time.time() - start_time + duration_ms = duration * 1000 + span.set_attribute("agent.tool.duration_ms", duration_ms) + if self._agent_step_count: + self._agent_step_count.add( + 1, {"agent.name": agent_name, "agent.step.type": "tool_call", "agent.tool.name": tool_name} + ) + + def set_tool_output(self, output: Any) -> None: + """ + Set the output of a tool call on the current span. + Call this within a trace_tool_call context manager. + + Args: + output: Tool output (will be JSON serialized) + """ + if not self.is_enabled or not OPENTELEMETRY_AVAILABLE: + return + + span = trace.get_current_span() + if span and span.is_recording(): + try: + if isinstance(output, str): + span.set_attribute("agent.tool.output", output) + else: + span.set_attribute("agent.tool.output", json.dumps(output, ensure_ascii=False)) + except (TypeError, ValueError): + span.set_attribute("agent.tool.output", str(output)) def get_current_span(self) -> Optional[Any]: """Get the current active span.""" @@ -279,16 +512,34 @@ def create_token_tracker(self, model_name: str, span: Optional[Any] = None) -> ' return LLMTokenTracker(self, model_name, span) def record_llm_metrics(self, metric_type: str, value: float, attributes: Dict[str, Any]) -> None: - """Record LLM-specific metrics.""" + """ + Record LLM-specific metrics using OpenInference semantic conventions. 
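For reference, a hedged sketch of the metric types accepted here and by record_agent_metrics, based on the branches below; the attribute values are illustrative:

    from nexent.monitor import get_monitoring_manager

    manager = get_monitoring_manager()

    # LLM metrics: "ttft", "token_rate", "tokens_prompt", "tokens_completion"
    manager.record_llm_metrics("ttft", 0.42, {"llm.model_name": "example-model"})
    manager.record_llm_metrics("token_rate", 35.0, {"llm.model_name": "example-model"})
    manager.record_llm_metrics("tokens_prompt", 120, {"llm.model_name": "example-model"})
    manager.record_llm_metrics("tokens_completion", 340, {"llm.model_name": "example-model"})

    # Agent metrics: "duration" feeds the agent execution-duration histogram.
    manager.record_agent_metrics("duration", 1.8, {"agent.name": "planner_agent"})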
+ """ if not self.is_enabled or not OPENTELEMETRY_AVAILABLE: return + # Ensure attributes use OpenInference naming + if "model" in attributes and "llm.model_name" not in attributes: + attributes["llm.model_name"] = attributes["model"] + if metric_type == "ttft" and self._llm_ttft_duration: self._llm_ttft_duration.record(value, attributes) elif metric_type == "token_rate" and self._llm_token_generation_rate: self._llm_token_generation_rate.record(value, attributes) - elif metric_type == "tokens" and self._llm_total_tokens: - self._llm_total_tokens.add(value, attributes) + elif metric_type == "tokens_prompt" and self._llm_token_count_prompt: + self._llm_token_count_prompt.add(value, attributes) + elif metric_type == "tokens_completion" and self._llm_token_count_completion: + self._llm_token_count_completion.add(value, attributes) + + def record_agent_metrics(self, metric_type: str, value: float, attributes: Dict[str, Any]) -> None: + """ + Record Agent-specific metrics using OpenInference semantic conventions. + """ + if not self.is_enabled or not OPENTELEMETRY_AVAILABLE: + return + + if metric_type == "duration" and self._agent_execution_duration: + self._agent_execution_duration.record(value, attributes) def monitor_endpoint(self, operation_name: Optional[str] = None, include_params: bool = True, exclude_params: Optional[list] = None) -> Callable[[F], F]: """ @@ -324,8 +575,8 @@ async def async_wrapper(*args, **kwargs): except Exception as e: duration = time.time() - start_time self.add_span_event(f"{op_name}.error", { - "error_type": type(e).__name__, - "error_message": str(e), + "error.type": type(e).__name__, + "error.message": str(e), "duration": duration }) raise @@ -355,8 +606,8 @@ def sync_wrapper(*args, **kwargs): except Exception as e: duration = time.time() - start_time self.add_span_event(f"{op_name}.error", { - "error_type": type(e).__name__, - "error_message": str(e), + "error.type": type(e).__name__, + "error.message": str(e), "duration": duration }) raise @@ -373,6 +624,7 @@ def monitor_llm_call(self, model_name: str, operation: str = "llm_completion"): """ Specialized decorator for LLM calls with token tracking. Monitoring is automatically enabled/disabled based on configuration. + Uses OpenInference semantic conventions for attribute naming. """ def decorator(func: F) -> F: @functools.wraps(func) @@ -389,8 +641,8 @@ async def async_wrapper(*args, **kwargs): return result except Exception as e: self.add_span_event("llm_call_error", { - "error_type": type(e).__name__, - "error_message": str(e) + "error.type": type(e).__name__, + "error.message": str(e) }) raise @@ -409,8 +661,8 @@ def sync_wrapper(*args, **kwargs): return result except Exception as e: self.add_span_event("llm_call_error", { - "error_type": type(e).__name__, - "error_message": str(e) + "error.type": type(e).__name__, + "error.message": str(e) }) raise @@ -421,9 +673,72 @@ def sync_wrapper(*args, **kwargs): return decorator + def monitor_agent_execution(self, agent_name: str): + """ + Decorator to add monitoring to Agent execution. + Tracks overall execution duration and error count. 
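A usage sketch, assuming the decorated coroutine is the agent's entry point; the agent name and function are illustrative:

    from nexent.monitor import get_monitoring_manager

    manager = get_monitoring_manager()

    @manager.monitor_agent_execution("planner_agent")
    async def run_agent(query: str) -> dict:
        # Exceptions raised here increment agent.error.count; duration is recorded either way.
        return {"answer": f"processed: {query}"}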
+ + Args: + agent_name: Name of the agent being monitored + """ + def decorator(func: F) -> F: + @functools.wraps(func) + async def async_wrapper(*args, **kwargs): + start_time = time.time() + status = "success" + + try: + result = await func(*args, **kwargs) + return result + except Exception as e: + status = "error" + if self._agent_error_count: + self._agent_error_count.add( + 1, {"agent.name": agent_name, "error.type": type(e).__name__} + ) + raise + finally: + duration = time.time() - start_time + if self._agent_execution_duration: + self._agent_execution_duration.record( + duration, {"agent.name": agent_name, "agent.status": status} + ) + + @functools.wraps(func) + def sync_wrapper(*args, **kwargs): + start_time = time.time() + status = "success" + + try: + result = func(*args, **kwargs) + return result + except Exception as e: + status = "error" + if self._agent_error_count: + self._agent_error_count.add( + 1, {"agent.name": agent_name, "error.type": type(e).__name__} + ) + raise + finally: + duration = time.time() - start_time + if self._agent_execution_duration: + self._agent_execution_duration.record( + duration, {"agent.name": agent_name, "agent.status": status} + ) + + if hasattr(func, '__code__') and func.__code__.co_flags & 0x80: + return cast(F, async_wrapper) + else: + return cast(F, sync_wrapper) + + return decorator + class LLMTokenTracker: - """Tracks token generation metrics for streaming LLM responses.""" + """ + Tracks token generation metrics for streaming LLM responses. + Uses OpenInference semantic conventions for attribute naming. + """ def __init__(self, manager: MonitoringManager, model_name: str, span: Optional[Any] = None): self.manager = manager @@ -446,10 +761,10 @@ def record_first_token(self) -> None: if self.span: self.span.add_event("first_token_received", - {"ttft_seconds": ttft}) + {"llm.time_to_first_token": ttft}) self.manager.record_llm_metrics( - "ttft", ttft, {"model": self.model_name}) + "ttft", ttft, {"llm.model_name": self.model_name}) def record_token(self, token: str) -> None: """Record a new token generated.""" @@ -468,7 +783,7 @@ def record_token(self, token: str) -> None: }) def record_completion(self, input_tokens: int = 0, output_tokens: int = 0) -> None: - """Record completion metrics.""" + """Record completion metrics using OpenInference semantic conventions.""" if not self.manager.is_enabled: return @@ -481,23 +796,23 @@ def record_completion(self, input_tokens: int = 0, output_tokens: int = 0) -> No if total_duration > 0 and self.token_count > 0: generation_rate = self.token_count / total_duration self.manager.record_llm_metrics("token_rate", generation_rate, { - "model": self.model_name}) + "llm.model_name": self.model_name}) - # Record total tokens - self.manager.record_llm_metrics("tokens", input_tokens, { - "model": self.model_name, "type": "input"}) - self.manager.record_llm_metrics("tokens", output_tokens, { - "model": self.model_name, "type": "output"}) + # Record token counts using OpenInference naming + self.manager.record_llm_metrics("tokens_prompt", input_tokens, { + "llm.model_name": self.model_name}) + self.manager.record_llm_metrics("tokens_completion", output_tokens, { + "llm.model_name": self.model_name}) - # Add span attributes + # Add span attributes using OpenInference naming if self.span: self.span.set_attributes({ - "llm.input_tokens": input_tokens, - "llm.output_tokens": output_tokens, - "llm.total_tokens": input_tokens + output_tokens, + "llm.token_count.prompt": input_tokens, + "llm.token_count.completion": 
output_tokens, + "llm.token_count.total": input_tokens + output_tokens, "llm.generation_rate": generation_rate, - "llm.total_duration": total_duration, - "llm.ttft": self.first_token_time - self.start_time if self.first_token_time else 0 + "llm.duration.total": total_duration, + "llm.time_to_first_token": self.first_token_time - self.start_time if self.first_token_time else 0 }) @@ -534,4 +849,4 @@ async def my_function(): 'LLMTokenTracker', 'get_monitoring_manager', 'is_opentelemetry_available', -] +] \ No newline at end of file diff --git a/sdk/pyproject.toml b/sdk/pyproject.toml index 30dfcf784..12d9ae275 100644 --- a/sdk/pyproject.toml +++ b/sdk/pyproject.toml @@ -74,19 +74,13 @@ data_process = [ "unstructured[all-docs]" ] performance = [ - # OpenTelemetry Core Components - "opentelemetry-api==1.20.0", - "opentelemetry-sdk==1.20.0", - "opentelemetry-semantic-conventions==0.41b0", - # OpenTelemetry Instrumentation - "opentelemetry-instrumentation==0.41b0", - "opentelemetry-instrumentation-fastapi==0.41b0", - "opentelemetry-instrumentation-requests==0.41b0", - # OpenTelemetry Exporters - "opentelemetry-exporter-jaeger", - "opentelemetry-exporter-prometheus", - # Additional monitoring dependencies - "prometheus-client" + "opentelemetry-api>=1.20.0", + "opentelemetry-sdk>=1.20.0", + "opentelemetry-semantic-conventions>=0.41b0", + "opentelemetry-instrumentation>=0.41b0", + "opentelemetry-instrumentation-fastapi>=0.41b0", + "opentelemetry-instrumentation-requests>=0.41b0", + "opentelemetry-exporter-otlp>=1.20.0", ] dev = [ "nexent[quality, data_process, performance]" diff --git a/test/backend/utils/test_monitoring.py b/test/backend/utils/test_monitoring.py index 4cd5b44e2..5e263a83b 100644 --- a/test/backend/utils/test_monitoring.py +++ b/test/backend/utils/test_monitoring.py @@ -1,7 +1,7 @@ """ -Unit tests for backend monitoring utilities. +Unit tests for backend monitoring utilities (OTLP-based). -Tests the actual functionality and integration of the monitoring system. +Tests the actual functionality and integration of the OTLP monitoring system. 
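For context on the streaming paths these tests exercise, a hedged sketch of driving the token tracker during a streamed completion; the model name and token counts are illustrative:

    from nexent.monitor import get_monitoring_manager

    manager = get_monitoring_manager()
    tracker = manager.create_token_tracker("example-model")

    tracker.record_first_token()          # records time-to-first-token (ttft)
    for chunk in ("Hello", ",", " world"):
        tracker.record_token(chunk)       # counts generated tokens
    tracker.record_completion(input_tokens=12, output_tokens=3)  # prompt/completion counters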
""" import pytest @@ -18,15 +18,16 @@ def test_monitoring_manager_exists(self): assert hasattr(monitoring_manager, 'configure') assert hasattr(monitoring_manager, 'monitor_endpoint') assert hasattr(monitoring_manager, 'monitor_llm_call') + assert hasattr(monitoring_manager, 'trace_agent_step') + assert hasattr(monitoring_manager, 'trace_tool_call') def test_monitoring_manager_methods_callable(self): """Test that monitoring manager methods are callable.""" - # These should not raise exceptions when called monitoring_manager.add_span_event("test_event") monitoring_manager.set_span_attributes(key="value") monitoring_manager.record_llm_metrics("ttft", 0.5, {}) + monitoring_manager.record_agent_metrics("duration", 1.0, {}) - # Property access should work is_enabled = monitoring_manager.is_enabled assert isinstance(is_enabled, bool) @@ -36,7 +37,6 @@ def test_monitoring_manager_decorators(self): def test_function(): return {"result": "success"} - # Function should work normally result = test_function() assert result == {"result": "success"} @@ -44,17 +44,38 @@ def test_monitoring_manager_llm_decorator(self): """Test that LLM monitoring decorator works.""" @monitoring_manager.monitor_llm_call("test_model") def test_llm_function(**kwargs): - # Should handle the _token_tracker kwarg return {"result": "llm_success"} - # Function should work normally result = test_llm_function() assert result == {"result": "llm_success"} + def test_monitoring_manager_agent_decorator(self): + """Test that agent execution decorator works.""" + @monitoring_manager.monitor_agent_execution("test_agent") + def test_agent_function(): + return {"result": "agent_success"} + + result = test_agent_function() + assert result == {"result": "agent_success"} + + def test_agent_step_tracing(self): + """Test agent step tracing context manager.""" + with monitoring_manager.trace_agent_step("test_step", "test_agent", "tool_call") as span: + pass + + with monitoring_manager.trace_agent_step("reasoning", "test_agent", "reasoning") as span: + pass + + def test_tool_call_tracing(self): + """Test tool call tracing context manager.""" + tool_input = {"query": "test"} + + with monitoring_manager.trace_tool_call("web_search", "test_agent", tool_input) as span: + monitoring_manager.set_tool_output({"results": []}) + def test_monitoring_manager_context_manager(self): """Test that monitoring context manager works.""" with monitoring_manager.trace_llm_request("test_op", "test_model") as span: - # Should work whether span is None or a real span pass def test_token_tracker_creation(self): @@ -62,7 +83,6 @@ def test_token_tracker_creation(self): tracker = monitoring_manager.create_token_tracker("test_model") assert tracker is not None - # Should be able to call methods without errors tracker.record_first_token() tracker.record_token("test_token") tracker.record_completion(input_tokens=10, output_tokens=15) @@ -71,38 +91,48 @@ def test_fastapi_app_setup(self): """Test FastAPI app setup functionality.""" mock_app = MagicMock() - # Should return a boolean and not raise exceptions result = monitoring_manager.setup_fastapi_app(mock_app) assert isinstance(result, bool) - # Should handle None app gracefully result = monitoring_manager.setup_fastapi_app(None) assert result is False - def test_configuration_methods(self): - """Test configuration-related methods.""" + def test_otlp_configuration(self): + """Test OTLP configuration methods.""" from sdk.nexent.monitor.monitoring import MonitoringConfig - # Should be able to configure without errors config = 
MonitoringConfig( enable_telemetry=False, - service_name="test-service" + service_name="test-service", + otlp_endpoint="http://localhost:4318", + otlp_protocol="http", + otlp_headers={} + ) + + monitoring_manager.configure(config) + + def test_grpc_protocol_config(self): + """Test gRPC protocol configuration.""" + from sdk.nexent.monitor.monitoring import MonitoringConfig + + config = MonitoringConfig( + enable_telemetry=False, + service_name="test-service", + otlp_endpoint="http://localhost:4317", + otlp_protocol="grpc" ) - # Should not raise exceptions monitoring_manager.configure(config) def test_error_resilience(self): """Test that monitoring handles errors gracefully.""" - # These should not raise exceptions even if monitoring has issues try: monitoring_manager.add_span_event("test_event", {"key": "value"}) monitoring_manager.set_span_attributes(test_attr="test_value") - monitoring_manager.record_llm_metrics( - "token_rate", 10.0, {"model": "test"}) + monitoring_manager.record_llm_metrics("token_rate", 10.0, {"llm.model_name": "test"}) + monitoring_manager.record_agent_metrics("duration", 1.5, {"agent.name": "test"}) except Exception as e: - pytest.fail( - f"Monitoring methods should handle errors gracefully: {e}") + pytest.fail(f"Monitoring methods should handle errors gracefully: {e}") def test_complex_decorator_scenario(self): """Test complex decorator usage scenarios.""" @@ -114,7 +144,6 @@ async def async_function(username, password, debug=False): def sync_function(data): return {"processed": data} - # Both should work import asyncio result1 = asyncio.run(async_function("user1", "secret", debug=True)) assert result1["username"] == "user1" @@ -129,7 +158,6 @@ def test_monitoring_with_exceptions(self): def error_function(): raise ValueError("Test error") - # Exception should be propagated with pytest.raises(ValueError, match="Test error"): error_function() @@ -137,10 +165,7 @@ def test_module_attributes(self): """Test that the module has correct attributes.""" import backend.utils.monitoring as monitoring_module - # Should have monitoring_manager assert hasattr(monitoring_module, 'monitoring_manager') - - # Should have __all__ export list assert hasattr(monitoring_module, '__all__') assert 'monitoring_manager' in monitoring_module.__all__ @@ -149,22 +174,8 @@ def test_singleton_behavior(self): from backend.utils.monitoring import monitoring_manager as manager1 from backend.utils.monitoring import monitoring_manager as manager2 - # Should be the same instance assert manager1 is manager2 - def test_edge_case_parameters(self): - """Test monitoring with edge case parameters.""" - # Empty strings - monitoring_manager.add_span_event("") - monitoring_manager.set_span_attributes() - - # Large data - large_data = {"key": "x" * 1000} - monitoring_manager.add_span_event("large_event", large_data) - - # None values - monitoring_manager.add_span_event("none_test", None) - def test_concurrent_usage(self): """Test concurrent usage of monitoring manager.""" import threading @@ -174,8 +185,7 @@ def test_concurrent_usage(self): def worker(): try: monitoring_manager.add_span_event("concurrent_test") - monitoring_manager.set_span_attributes( - worker_id=threading.current_thread().ident) + monitoring_manager.set_span_attributes(worker_id=threading.current_thread().ident) results.append("success") except Exception as e: results.append(f"error: {e}") @@ -187,7 +197,6 @@ def worker(): for t in threads: t.join() - # All workers should complete successfully assert len(results) == 5 assert all(r == "success" 
for r in results) @@ -197,7 +206,6 @@ def test_decorator_parameter_filtering(self): def function_with_secrets(public_data, secret, debug=True): return {"public": public_data, "debug": debug} - # Should work without exposing secret parameter result = function_with_secrets("visible", "hidden", debug=False) assert result["public"] == "visible" assert result["debug"] is False @@ -206,11 +214,9 @@ def test_llm_decorator_with_token_tracker(self): """Test LLM decorator properly handles token tracker parameter.""" @monitoring_manager.monitor_llm_call("gpt-4") def mock_llm_call(**kwargs): - # Should receive _token_tracker parameter assert "_token_tracker" in kwargs token_tracker = kwargs["_token_tracker"] - # Should be able to use token tracker (may be None when disabled) if token_tracker: token_tracker.record_first_token() token_tracker.record_token("test") @@ -221,32 +227,10 @@ def mock_llm_call(**kwargs): result = mock_llm_call() assert result == "LLM response" - def test_context_manager_error_handling(self): - """Test context manager handles errors properly.""" - try: - with monitoring_manager.trace_llm_request("error_op", "test_model") as span: - # Should be able to work with span even if it's None - if span: - span.set_attribute("test", "value") - # Raise an error to test error handling - raise RuntimeError("Test error in context") - except RuntimeError: - # Error should be properly propagated - pass - - def test_metrics_recording_all_types(self): - """Test all types of metrics recording.""" - # Should handle different metric types - monitoring_manager.record_llm_metrics("ttft", 0.5, {"model": "test"}) - monitoring_manager.record_llm_metrics( - "token_rate", 10.5, {"model": "test"}) - monitoring_manager.record_llm_metrics( - "tokens", 100, {"model": "test", "type": "input"}) - monitoring_manager.record_llm_metrics( - "unknown_type", 42, {"model": "test"}) - def test_get_current_span(self): """Test getting current span functionality.""" span = monitoring_manager.get_current_span() - # Should return None when monitoring is disabled or no active span - # Should not raise an exception + + def test_get_tracer(self): + """Test getting tracer property.""" + tracer = monitoring_manager.tracer \ No newline at end of file diff --git a/test/sdk/monitor/test_monitoring.py b/test/sdk/monitor/test_monitoring.py index 7196458fb..63e69ac6d 100644 --- a/test/sdk/monitor/test_monitoring.py +++ b/test/sdk/monitor/test_monitoring.py @@ -1,21 +1,23 @@ """ -Comprehensive unit tests for SDK monitoring module. +Comprehensive unit tests for SDK monitoring module (OTLP-based). 
Tests cover: -- MonitoringConfig dataclass +- MonitoringConfig dataclass (OTLP fields) - MonitoringManager singleton behavior -- Telemetry initialization and configuration -- LLM request tracing and metrics +- OTLP telemetry initialization +- LLM request tracing with OpenInference semantics +- Agent step and tool tracing - Token tracking and performance metrics -- Decorator functionality for endpoint and LLM monitoring -- Error handling and edge cases +- Decorator functionality +- Error handling and graceful degradation """ from sdk.nexent.monitor.monitoring import ( MonitoringConfig, MonitoringManager, LLMTokenTracker, - get_monitoring_manager + get_monitoring_manager, + is_opentelemetry_available ) import pytest import asyncio @@ -23,27 +25,29 @@ class TestMonitoringConfig: - """Test MonitoringConfig dataclass.""" + """Test MonitoringConfig dataclass with OTLP fields.""" def test_default_config(self): """Test default configuration values.""" config = MonitoringConfig() assert config.enable_telemetry is False - assert config.service_name == "nexent-sdk" - assert config.jaeger_endpoint == "http://localhost:14268/api/traces" - assert config.prometheus_port == 8000 + assert config.service_name == "nexent-backend" + assert config.otlp_endpoint == "http://localhost:4318" + assert config.otlp_protocol == "http" + assert config.otlp_headers == {} assert config.telemetry_sample_rate == 1.0 assert config.llm_slow_request_threshold_seconds == 5.0 assert config.llm_slow_token_rate_threshold == 10.0 def test_custom_config(self): - """Test configuration with custom values.""" + """Test configuration with custom OTLP values.""" config = MonitoringConfig( enable_telemetry=True, service_name="test-service", - jaeger_endpoint="http://test:14268/api/traces", - prometheus_port=9000, + otlp_endpoint="https://phoenix.arize.com/v1", + otlp_protocol="grpc", + otlp_headers={"x-api-key": "test-key"}, telemetry_sample_rate=0.5, llm_slow_request_threshold_seconds=10.0, llm_slow_token_rate_threshold=20.0 @@ -51,12 +55,22 @@ def test_custom_config(self): assert config.enable_telemetry is True assert config.service_name == "test-service" - assert config.jaeger_endpoint == "http://test:14268/api/traces" - assert config.prometheus_port == 9000 + assert config.otlp_endpoint == "https://phoenix.arize.com/v1" + assert config.otlp_protocol == "grpc" + assert config.otlp_headers == {"x-api-key": "test-key"} assert config.telemetry_sample_rate == 0.5 assert config.llm_slow_request_threshold_seconds == 10.0 assert config.llm_slow_token_rate_threshold == 20.0 + def test_invalid_protocol_defaults_to_http(self): + """Test that invalid protocol defaults to http.""" + with patch('sdk.nexent.monitor.monitoring.OPENTELEMETRY_AVAILABLE', True): + config = MonitoringConfig( + enable_telemetry=True, + otlp_protocol="invalid" + ) + assert config.otlp_protocol == "http" + class TestMonitoringManager: """Test MonitoringManager singleton and core functionality.""" @@ -74,769 +88,396 @@ def test_singleton_behavior(self): assert manager1 is manager2 assert id(manager1) == id(manager2) - def test_initialization_only_once(self): - """Test that initialization only happens once.""" - manager1 = MonitoringManager() - original_config = manager1._config - - manager2 = MonitoringManager() - assert manager2._config is original_config - - def test_configure_disabled_telemetry(self): - """Test configuration with telemetry disabled.""" - manager = MonitoringManager() - config = MonitoringConfig(enable_telemetry=False) - - with patch.object(manager, 
'_init_telemetry') as mock_init: - manager.configure(config) - - assert manager._config is config - mock_init.assert_not_called() - - def test_configure_enabled_telemetry(self): - """Test configuration with telemetry enabled.""" - manager = MonitoringManager() - config = MonitoringConfig(enable_telemetry=True) - - with patch.object(manager, '_init_telemetry') as mock_init: - manager.configure(config) - - assert manager._config is config - mock_init.assert_called_once() - def test_is_enabled_property(self): """Test is_enabled property behavior.""" manager = MonitoringManager() - # No config set assert manager.is_enabled is False - # Config with telemetry disabled config_disabled = MonitoringConfig(enable_telemetry=False) manager.configure(config_disabled) assert manager.is_enabled is False - # Config with telemetry enabled - config_enabled = MonitoringConfig(enable_telemetry=True) - manager.configure(config_enabled) - assert manager.is_enabled is True + @patch('sdk.nexent.monitor.monitoring.OPENTELEMETRY_AVAILABLE', False) + def test_telemetry_disabled_when_otlp_not_available(self): + """Test telemetry is disabled when OpenTelemetry not installed.""" + config = MonitoringConfig(enable_telemetry=True) + assert config.enable_telemetry is False @patch('sdk.nexent.monitor.monitoring.trace') @patch('sdk.nexent.monitor.monitoring.metrics') @patch('sdk.nexent.monitor.monitoring.TracerProvider') @patch('sdk.nexent.monitor.monitoring.MeterProvider') - @patch('sdk.nexent.monitor.monitoring.JaegerExporter') + @patch('sdk.nexent.monitor.monitoring.OTLPSpanExporterHTTP') + @patch('sdk.nexent.monitor.monitoring.OTLPMetricExporterHTTP') @patch('sdk.nexent.monitor.monitoring.BatchSpanProcessor') - @patch('sdk.nexent.monitor.monitoring.PrometheusMetricReader') + @patch('sdk.nexent.monitor.monitoring.PeriodicExportingMetricReader') @patch('sdk.nexent.monitor.monitoring.Resource') @patch('sdk.nexent.monitor.monitoring.RequestsInstrumentor') - def test_init_telemetry_success(self, mock_requests_instr, mock_resource, - mock_prometheus, mock_batch_processor, - mock_jaeger, mock_meter_provider, - mock_tracer_provider, mock_metrics, mock_trace): - """Test successful telemetry initialization.""" - manager = MonitoringManager() - config = MonitoringConfig( - enable_telemetry=True, - service_name="test-service", - jaeger_endpoint="http://test:14268/api/traces" - ) - - # Mock return values - mock_resource_instance = MagicMock() - mock_resource.create.return_value = mock_resource_instance - - mock_tracer_provider_instance = MagicMock() - mock_tracer_provider.return_value = mock_tracer_provider_instance + def test_init_telemetry_http(self, mock_requests_instr, mock_resource, + mock_periodic_reader, mock_batch_processor, + mock_metric_exporter_http, mock_span_exporter_http, + mock_meter_provider, mock_tracer_provider, + mock_metrics, mock_trace): + """Test telemetry initialization with HTTP protocol.""" + with patch('sdk.nexent.monitor.monitoring.OPENTELEMETRY_AVAILABLE', True): + manager = MonitoringManager() + config = MonitoringConfig( + enable_telemetry=True, + service_name="test-service", + otlp_endpoint="http://localhost:4318", + otlp_protocol="http" + ) - mock_meter_provider_instance = MagicMock() - mock_meter_provider.return_value = mock_meter_provider_instance + mock_resource_instance = MagicMock() + mock_resource.create.return_value = mock_resource_instance - mock_tracer = MagicMock() - mock_trace.get_tracer.return_value = mock_tracer + mock_tracer_provider_instance = MagicMock() + 
mock_tracer_provider.return_value = mock_tracer_provider_instance - mock_meter = MagicMock() - mock_metrics.get_meter.return_value = mock_meter + mock_meter_provider_instance = MagicMock() + mock_meter_provider.return_value = mock_meter_provider_instance - # Configure will call _init_telemetry internally - manager.configure(config) + mock_tracer = MagicMock() + mock_trace.get_tracer.return_value = mock_tracer - # Verify resource creation (called once during configure) - mock_resource.create.assert_called_with({ - "service.name": "test-service", - "service.version": "1.0.0", - "service.instance.id": "nexent-instance-1" - }) + mock_meter = MagicMock() + mock_metrics.get_meter.return_value = mock_meter - # Verify tracer provider setup - mock_tracer_provider.assert_called_once_with( - resource=mock_resource_instance) - mock_trace.set_tracer_provider.assert_called_once_with( - mock_tracer_provider_instance) - - # Verify metrics setup - mock_meter_provider.assert_called_once() - mock_metrics.set_meter_provider.assert_called_once() + manager.configure(config) - # Verify instrumentation - mock_requests_instr().instrument.assert_called_once() + mock_resource.create.assert_called() + mock_tracer_provider.assert_called_once() + mock_span_exporter_http.assert_called_once() + mock_batch_processor.assert_called_once() + mock_requests_instr().instrument.assert_called_once() - def test_init_telemetry_disabled(self): - """Test telemetry initialization when disabled.""" - manager = MonitoringManager() - config = MonitoringConfig(enable_telemetry=False) - manager.configure(config) + @patch('sdk.nexent.monitor.monitoring.trace') + @patch('sdk.nexent.monitor.monitoring.metrics') + @patch('sdk.nexent.monitor.monitoring.TracerProvider') + @patch('sdk.nexent.monitor.monitoring.MeterProvider') + @patch('sdk.nexent.monitor.monitoring.OTLPSpanExporterGRPC') + @patch('sdk.nexent.monitor.monitoring.OTLPMetricExporterGRPC') + @patch('sdk.nexent.monitor.monitoring.BatchSpanProcessor') + @patch('sdk.nexent.monitor.monitoring.PeriodicExportingMetricReader') + @patch('sdk.nexent.monitor.monitoring.Resource') + def test_init_telemetry_grpc(self, mock_resource, mock_periodic_reader, + mock_batch_processor, mock_metric_exporter_grpc, + mock_span_exporter_grpc, mock_meter_provider, + mock_tracer_provider, mock_metrics, mock_trace): + """Test telemetry initialization with gRPC protocol.""" + with patch('sdk.nexent.monitor.monitoring.OPENTELEMETRY_AVAILABLE', True): + manager = MonitoringManager() + config = MonitoringConfig( + enable_telemetry=True, + service_name="test-service", + otlp_endpoint="http://localhost:4317", + otlp_protocol="grpc" + ) - with patch('sdk.nexent.monitor.monitoring.trace') as mock_trace: - manager._init_telemetry() - mock_trace.set_tracer_provider.assert_not_called() + mock_resource_instance = MagicMock() + mock_resource.create.return_value = mock_resource_instance + mock_tracer_provider.return_value = MagicMock() + mock_meter_provider.return_value = MagicMock() + mock_trace.get_tracer.return_value = MagicMock() + mock_metrics.get_meter.return_value = MagicMock() - def test_init_telemetry_no_config(self): - """Test telemetry initialization with no config.""" - manager = MonitoringManager() + manager.configure(config) - with patch('sdk.nexent.monitor.monitoring.trace') as mock_trace: - manager._init_telemetry() - mock_trace.set_tracer_provider.assert_not_called() + mock_span_exporter_grpc.assert_called_once() + mock_metric_exporter_grpc.assert_called_once() def 
test_init_telemetry_exception_handling(self): - """Test telemetry initialization with exceptions.""" - manager = MonitoringManager() - config = MonitoringConfig(enable_telemetry=True) - manager.configure(config) - - with patch('sdk.nexent.monitor.monitoring.TracerProvider', side_effect=Exception("Test error")): - with patch('sdk.nexent.monitor.monitoring.logger') as mock_logger: - manager._init_telemetry() - mock_logger.error.assert_called_once() - - def test_setup_fastapi_app_enabled(self): - """Test FastAPI app setup when monitoring is enabled.""" - manager = MonitoringManager() - config = MonitoringConfig(enable_telemetry=True) - manager.configure(config) - - mock_app = MagicMock() - - with patch('sdk.nexent.monitor.monitoring.FastAPIInstrumentor') as mock_instrumentor: - result = manager.setup_fastapi_app(mock_app) - - assert result is True - mock_instrumentor.instrument_app.assert_called_once_with(mock_app) - - def test_setup_fastapi_app_disabled(self): - """Test FastAPI app setup when monitoring is disabled.""" - manager = MonitoringManager() - config = MonitoringConfig(enable_telemetry=False) - manager.configure(config) - - mock_app = MagicMock() - result = manager.setup_fastapi_app(mock_app) - - assert result is False - - def test_setup_fastapi_app_no_app(self): - """Test FastAPI app setup with None app.""" - manager = MonitoringManager() - config = MonitoringConfig(enable_telemetry=True) - manager.configure(config) - - result = manager.setup_fastapi_app(None) - assert result is False - - def test_setup_fastapi_app_exception(self): - """Test FastAPI app setup with exception.""" - manager = MonitoringManager() - config = MonitoringConfig(enable_telemetry=True) - manager.configure(config) - - mock_app = MagicMock() - - with patch('sdk.nexent.monitor.monitoring.FastAPIInstrumentor') as mock_instrumentor: - mock_instrumentor.instrument_app.side_effect = Exception( - "Test error") + """Test telemetry initialization handles exceptions gracefully.""" + with patch('sdk.nexent.monitor.monitoring.OPENTELEMETRY_AVAILABLE', True): + manager = MonitoringManager() + config = MonitoringConfig(enable_telemetry=True) - result = manager.setup_fastapi_app(mock_app) - assert result is False + with patch('sdk.nexent.monitor.monitoring.Resource.create', side_effect=Exception("Test error")): + manager.configure(config) @patch('sdk.nexent.monitor.monitoring.trace') - def test_trace_llm_request_enabled(self, mock_trace): - """Test LLM request tracing when enabled.""" - manager = MonitoringManager() - config = MonitoringConfig(enable_telemetry=True) - manager.configure(config) - manager._tracer = MagicMock() - - mock_span = MagicMock() - manager._tracer.start_as_current_span.return_value.__enter__ = Mock( - return_value=mock_span) - manager._tracer.start_as_current_span.return_value.__exit__ = Mock( - return_value=None) - - with manager.trace_llm_request("test_op", "test_model", param1="value1") as span: - assert span is mock_span - - manager._tracer.start_as_current_span.assert_called_once_with( - "test_op", - attributes={ - "llm.model_name": "test_model", - "llm.operation": "test_op", - "param1": "value1" - } - ) - - def test_trace_llm_request_disabled(self): - """Test LLM request tracing when disabled.""" - manager = MonitoringManager() - config = MonitoringConfig(enable_telemetry=False) - manager.configure(config) - - with manager.trace_llm_request("test_op", "test_model") as span: - assert span is None + def test_trace_llm_request_openinference_attrs(self, mock_trace): + """Test LLM request tracing 
uses OpenInference attribute names.""" + with patch('sdk.nexent.monitor.monitoring.OPENTELEMETRY_AVAILABLE', True): + manager = MonitoringManager() + config = MonitoringConfig(enable_telemetry=True) + manager.configure(config) + manager._tracer = MagicMock() - def test_trace_llm_request_no_tracer(self): - """Test LLM request tracing when tracer is None.""" - manager = MonitoringManager() - config = MonitoringConfig(enable_telemetry=True) - manager.configure(config) - manager._tracer = None + mock_span = MagicMock() + manager._tracer.start_as_current_span.return_value.__enter__ = Mock(return_value=mock_span) + manager._tracer.start_as_current_span.return_value.__exit__ = Mock(return_value=None) - with manager.trace_llm_request("test_op", "test_model") as span: - assert span is None + with manager.trace_llm_request("test_op", "gpt-4", extra="value") as span: + pass - @patch('sdk.nexent.monitor.monitoring.trace') - def test_trace_llm_request_with_exception(self, mock_trace): - """Test LLM request tracing with exception.""" - manager = MonitoringManager() - config = MonitoringConfig(enable_telemetry=True) - manager.configure(config) - manager._tracer = MagicMock() - manager._llm_error_count = MagicMock() + call_args = manager._tracer.start_as_current_span.call_args + attributes = call_args[1]['attributes'] - mock_span = MagicMock() - manager._tracer.start_as_current_span.return_value.__enter__ = Mock( - return_value=mock_span) - manager._tracer.start_as_current_span.return_value.__exit__ = Mock( - return_value=None) + assert "llm.model_name" in attributes + assert attributes["llm.model_name"] == "gpt-4" + assert "llm.operation.name" in attributes + assert attributes["llm.operation.name"] == "test_op" - test_error = ValueError("Test error") - with pytest.raises(ValueError): - with manager.trace_llm_request("test_op", "test_model") as span: - raise test_error +class TestAgentStepTracing: + """Test Agent step tracing functionality.""" - # Verify error handling - mock_span.set_status.assert_called_once() - manager._llm_error_count.add.assert_called_once_with( - 1, {"model": "test_model", "operation": "test_op"} - ) + def setup_method(self): + """Reset singleton state before each test.""" + MonitoringManager._instance = None + MonitoringManager._initialized = False @patch('sdk.nexent.monitor.monitoring.trace') - def test_get_current_span_enabled(self, mock_trace): - """Test getting current span when enabled.""" - manager = MonitoringManager() - config = MonitoringConfig(enable_telemetry=True) - manager.configure(config) + def test_trace_agent_step_tool_call(self, mock_trace): + """Test tracing agent tool call step.""" + with patch('sdk.nexent.monitor.monitoring.OPENTELEMETRY_AVAILABLE', True): + manager = MonitoringManager() + config = MonitoringConfig(enable_telemetry=True) + manager.configure(config) + manager._tracer = MagicMock() - mock_span = MagicMock() - mock_trace.get_current_span.return_value = mock_span + mock_span = MagicMock() + manager._tracer.start_as_current_span.return_value.__enter__ = Mock(return_value=mock_span) + manager._tracer.start_as_current_span.return_value.__exit__ = Mock(return_value=None) - result = manager.get_current_span() - assert result is mock_span - mock_trace.get_current_span.assert_called_once() + with manager.trace_agent_step("web_search", "test_agent", "tool_call") as span: + pass - def test_get_current_span_disabled(self): - """Test getting current span when disabled.""" - manager = MonitoringManager() - config = MonitoringConfig(enable_telemetry=False) - 
manager.configure(config) + call_args = manager._tracer.start_as_current_span.call_args + attributes = call_args[1]['attributes'] - result = manager.get_current_span() - assert result is None + assert "agent.name" in attributes + assert attributes["agent.name"] == "test_agent" + assert "agent.step.name" in attributes + assert attributes["agent.step.name"] == "web_search" + assert "agent.step.type" in attributes + assert attributes["agent.step.type"] == "tool_call" @patch('sdk.nexent.monitor.monitoring.trace') - def test_add_span_event_enabled(self, mock_trace): - """Test adding span event when enabled.""" - manager = MonitoringManager() - config = MonitoringConfig(enable_telemetry=True) - manager.configure(config) + def test_trace_agent_step_reasoning(self, mock_trace): + """Test tracing agent reasoning step.""" + with patch('sdk.nexent.monitor.monitoring.OPENTELEMETRY_AVAILABLE', True): + manager = MonitoringManager() + config = MonitoringConfig(enable_telemetry=True) + manager.configure(config) + manager._tracer = MagicMock() - mock_span = MagicMock() - mock_trace.get_current_span.return_value = mock_span + mock_span = MagicMock() + manager._tracer.start_as_current_span.return_value.__enter__ = Mock(return_value=mock_span) + manager._tracer.start_as_current_span.return_value.__exit__ = Mock(return_value=None) - manager.add_span_event("test_event", {"key": "value"}) + with manager.trace_agent_step("analyze_query", "test_agent", "reasoning") as span: + pass - mock_span.add_event.assert_called_once_with( - "test_event", {"key": "value"}) + call_args = manager._tracer.start_as_current_span.call_args + attributes = call_args[1]['attributes'] - @patch('sdk.nexent.monitor.monitoring.trace') - def test_add_span_event_no_attributes(self, mock_trace): - """Test adding span event without attributes.""" - manager = MonitoringManager() - config = MonitoringConfig(enable_telemetry=True) - manager.configure(config) + assert attributes["agent.step.type"] == "reasoning" - mock_span = MagicMock() - mock_trace.get_current_span.return_value = mock_span + @patch('sdk.nexent.monitor.monitoring.trace') + def test_trace_agent_step_action_selection(self, mock_trace): + """Test tracing agent action selection step.""" + with patch('sdk.nexent.monitor.monitoring.OPENTELEMETRY_AVAILABLE', True): + manager = MonitoringManager() + config = MonitoringConfig(enable_telemetry=True) + manager.configure(config) + manager._tracer = MagicMock() - manager.add_span_event("test_event") + mock_span = MagicMock() + manager._tracer.start_as_current_span.return_value.__enter__ = Mock(return_value=mock_span) + manager._tracer.start_as_current_span.return_value.__exit__ = Mock(return_value=None) - mock_span.add_event.assert_called_once_with("test_event", {}) + with manager.trace_agent_step("decide_next", "test_agent", "action_selection") as span: + pass - def test_add_span_event_disabled(self): - """Test adding span event when disabled.""" - manager = MonitoringManager() - config = MonitoringConfig(enable_telemetry=False) - manager.configure(config) + call_args = manager._tracer.start_as_current_span.call_args + attributes = call_args[1]['attributes'] - # Should not raise any exception - manager.add_span_event("test_event", {"key": "value"}) + assert attributes["agent.step.type"] == "action_selection" @patch('sdk.nexent.monitor.monitoring.trace') - def test_add_span_event_no_span(self, mock_trace): - """Test adding span event when no current span.""" - manager = MonitoringManager() - config = MonitoringConfig(enable_telemetry=True) - 
manager.configure(config) + def test_trace_tool_call_with_input_output(self, mock_trace): + """Test tracing tool call with input and output.""" + with patch('sdk.nexent.monitor.monitoring.OPENTELEMETRY_AVAILABLE', True): + manager = MonitoringManager() + config = MonitoringConfig(enable_telemetry=True) + manager.configure(config) + manager._tracer = MagicMock() - mock_trace.get_current_span.return_value = None + mock_span = MagicMock() + manager._tracer.start_as_current_span.return_value.__enter__ = Mock(return_value=mock_span) + manager._tracer.start_as_current_span.return_value.__exit__ = Mock(return_value=None) - # Should not raise any exception - manager.add_span_event("test_event", {"key": "value"}) + tool_input = {"query": "test search", "limit": 10} - @patch('sdk.nexent.monitor.monitoring.trace') - def test_set_span_attributes_enabled(self, mock_trace): - """Test setting span attributes when enabled.""" - manager = MonitoringManager() - config = MonitoringConfig(enable_telemetry=True) - manager.configure(config) + with manager.trace_tool_call("web_search", "test_agent", tool_input) as span: + manager.set_tool_output({"results": ["item1", "item2"]}) - mock_span = MagicMock() - mock_trace.get_current_span.return_value = mock_span + call_args = manager._tracer.start_as_current_span.call_args + attributes = call_args[1]['attributes'] - manager.set_span_attributes(key1="value1", key2="value2") + assert "agent.tool.name" in attributes + assert attributes["agent.tool.name"] == "web_search" + assert "agent.tool.input" in attributes + assert "query" in attributes["agent.tool.input"] - mock_span.set_attributes.assert_called_once_with( - {"key1": "value1", "key2": "value2"}) + mock_span.set_attribute.assert_called() - def test_set_span_attributes_disabled(self): - """Test setting span attributes when disabled.""" + def test_trace_agent_step_disabled(self): + """Test agent step tracing when disabled.""" manager = MonitoringManager() config = MonitoringConfig(enable_telemetry=False) manager.configure(config) - # Should not raise any exception - manager.set_span_attributes(key1="value1", key2="value2") - - def test_create_token_tracker(self): - """Test creating token tracker.""" - manager = MonitoringManager() - mock_span = MagicMock() - - tracker = manager.create_token_tracker("test_model", mock_span) - - assert isinstance(tracker, LLMTokenTracker) - assert tracker.manager is manager - assert tracker.model_name == "test_model" - assert tracker.span is mock_span + with manager.trace_agent_step("test_step", "test_agent", "tool_call") as span: + assert span is None - def test_record_llm_metrics_disabled(self): - """Test recording LLM metrics when disabled.""" + def test_trace_tool_call_disabled(self): + """Test tool call tracing when disabled.""" manager = MonitoringManager() config = MonitoringConfig(enable_telemetry=False) manager.configure(config) - # Should not raise any exception - manager.record_llm_metrics("ttft", 0.5, {"model": "test"}) - - def test_record_llm_metrics_ttft(self): - """Test recording TTFT metrics.""" - manager = MonitoringManager() - config = MonitoringConfig(enable_telemetry=True) - manager.configure(config) - manager._llm_ttft_duration = MagicMock() - - manager.record_llm_metrics("ttft", 0.5, {"model": "test"}) - - manager._llm_ttft_duration.record.assert_called_once_with( - 0.5, {"model": "test"}) - - def test_record_llm_metrics_token_rate(self): - """Test recording token rate metrics.""" - manager = MonitoringManager() - config = MonitoringConfig(enable_telemetry=True) - 
manager.configure(config) - manager._llm_token_generation_rate = MagicMock() - - manager.record_llm_metrics("token_rate", 10.5, {"model": "test"}) - - manager._llm_token_generation_rate.record.assert_called_once_with(10.5, { - "model": "test"}) - - def test_record_llm_metrics_tokens(self): - """Test recording token count metrics.""" - manager = MonitoringManager() - config = MonitoringConfig(enable_telemetry=True) - manager.configure(config) - manager._llm_total_tokens = MagicMock() - - manager.record_llm_metrics("tokens", 100, {"model": "test"}) - - manager._llm_total_tokens.add.assert_called_once_with( - 100, {"model": "test"}) - - def test_monitor_endpoint_decorator_async(self): - """Test monitor_endpoint decorator with async function.""" - manager = MonitoringManager() - config = MonitoringConfig(enable_telemetry=True) - manager.configure(config) - - with patch.object(manager, 'trace_llm_request') as mock_trace: - mock_context = MagicMock() - mock_trace.return_value.__enter__ = Mock(return_value=MagicMock()) - mock_trace.return_value.__exit__ = Mock(return_value=None) - - @manager.monitor_endpoint("test_operation") - async def test_function(param1, param2="default"): - return {"result": "success"} - - # Test the decorated function - result = asyncio.run(test_function("value1", param2="value2")) - - assert result == {"result": "success"} - - def test_monitor_endpoint_decorator_sync(self): - """Test monitor_endpoint decorator with sync function.""" - manager = MonitoringManager() - config = MonitoringConfig(enable_telemetry=True) - manager.configure(config) - - with patch.object(manager, 'trace_llm_request') as mock_trace: - mock_context = MagicMock() - mock_trace.return_value.__enter__ = Mock(return_value=MagicMock()) - mock_trace.return_value.__exit__ = Mock(return_value=None) - - @manager.monitor_endpoint("test_operation") - def test_function(param1, param2="default"): - return {"result": "success"} - - # Test the decorated function - result = test_function("value1", param2="value2") - - assert result == {"result": "success"} - - def test_monitor_endpoint_decorator_with_exception(self): - """Test monitor_endpoint decorator with exception.""" - manager = MonitoringManager() - config = MonitoringConfig(enable_telemetry=True) - manager.configure(config) - - with patch.object(manager, 'trace_llm_request') as mock_trace: - mock_context = MagicMock() - mock_trace.return_value.__enter__ = Mock(return_value=MagicMock()) - mock_trace.return_value.__exit__ = Mock(return_value=None) - - @manager.monitor_endpoint("test_operation") - def test_function(): - raise ValueError("Test error") - - # Test that exception is re-raised - with pytest.raises(ValueError, match="Test error"): - test_function() - - def test_monitor_endpoint_exclude_params(self): - """Test monitor_endpoint decorator with excluded parameters.""" - manager = MonitoringManager() - config = MonitoringConfig(enable_telemetry=True) - manager.configure(config) - - with patch.object(manager, 'trace_llm_request') as mock_trace, \ - patch.object(manager, 'set_span_attributes') as mock_set_attrs: - - mock_span = MagicMock() - mock_trace.return_value.__enter__ = Mock(return_value=mock_span) - mock_trace.return_value.__exit__ = Mock(return_value=None) - - @manager.monitor_endpoint("test_operation", exclude_params=["password"]) - def test_function(username, password, debug=True): - return {"result": "success"} - - test_function(username="user1", password="secret123", debug=False) - - # Verify that password was excluded and other params included 
- mock_set_attrs.assert_called() - call_args = mock_set_attrs.call_args[1] - assert "param.username" in call_args - assert call_args["param.username"] == "user1" - assert "param.debug" in call_args - assert call_args["param.debug"] is False - assert "param.password" not in call_args - - def test_monitor_llm_call_decorator_sync(self): - """Test monitor_llm_call decorator with sync function.""" - manager = MonitoringManager() - config = MonitoringConfig(enable_telemetry=True) - manager.configure(config) - - with patch.object(manager, 'trace_llm_request') as mock_trace, \ - patch.object(manager, 'create_token_tracker') as mock_create_tracker: - - mock_span = MagicMock() - mock_trace.return_value.__enter__ = Mock(return_value=mock_span) - mock_trace.return_value.__exit__ = Mock(return_value=None) - - mock_tracker = MagicMock() - mock_create_tracker.return_value = mock_tracker - - @manager.monitor_llm_call("test_model", "completion") - def test_llm_function(**kwargs): - # Verify token tracker is passed - assert "_token_tracker" in kwargs - assert kwargs["_token_tracker"] is mock_tracker - return {"result": "success"} - - result = test_llm_function() - assert result == {"result": "success"} + with manager.trace_tool_call("test_tool", "test_agent", {"input": "data"}) as span: + assert span is None - def test_monitor_llm_call_decorator_async(self): - """Test monitor_llm_call decorator with async function.""" - manager = MonitoringManager() - config = MonitoringConfig(enable_telemetry=True) - manager.configure(config) - with patch.object(manager, 'trace_llm_request') as mock_trace, \ - patch.object(manager, 'create_token_tracker') as mock_create_tracker: +class TestAgentMetrics: + """Test Agent metrics functionality.""" - mock_span = MagicMock() - mock_trace.return_value.__enter__ = Mock(return_value=mock_span) - mock_trace.return_value.__exit__ = Mock(return_value=None) + def setup_method(self): + """Reset singleton state before each test.""" + MonitoringManager._instance = None + MonitoringManager._initialized = False - mock_tracker = MagicMock() - mock_create_tracker.return_value = mock_tracker + def test_record_agent_metrics_duration(self): + """Test recording agent execution duration.""" + with patch('sdk.nexent.monitor.monitoring.OPENTELEMETRY_AVAILABLE', True): + manager = MonitoringManager() + config = MonitoringConfig(enable_telemetry=True) + manager.configure(config) + manager._agent_execution_duration = MagicMock() - @manager.monitor_llm_call("test_model", "completion") - async def test_llm_function(**kwargs): - # Verify token tracker is passed - assert "_token_tracker" in kwargs - assert kwargs["_token_tracker"] is mock_tracker - return {"result": "success"} + manager.record_agent_metrics("duration", 1.5, {"agent.name": "test"}) - result = asyncio.run(test_llm_function()) - assert result == {"result": "success"} + manager._agent_execution_duration.record.assert_called_once() class TestLLMTokenTracker: - """Test LLMTokenTracker functionality.""" + """Test LLMTokenTracker with OpenInference semantics.""" def setup_method(self): """Set up test fixtures.""" self.manager = MagicMock() self.span = MagicMock() - self.model_name = "test_model" - - def test_initialization(self): - """Test LLMTokenTracker initialization.""" - with patch('time.time', return_value=123.456): - tracker = LLMTokenTracker(self.manager, self.model_name, self.span) - - assert tracker.manager is self.manager - assert tracker.model_name == self.model_name - assert tracker.span is self.span - assert tracker.start_time == 
123.456 - assert tracker.first_token_time is None - assert tracker.token_count == 0 - assert tracker.input_tokens == 0 - assert tracker.output_tokens == 0 - - def test_record_first_token_enabled(self): - """Test recording first token when monitoring is enabled.""" - self.manager.is_enabled = True - - # 0.5 second difference - with patch('time.time', side_effect=[123.456, 123.956]): - tracker = LLMTokenTracker(self.manager, self.model_name, self.span) - tracker.record_first_token() + self.model_name = "gpt-4" - assert tracker.first_token_time == 123.956 - - # Verify span event - self.span.add_event.assert_called_once_with( - "first_token_received", {"ttft_seconds": 0.5} - ) - - # Verify metrics recording - self.manager.record_llm_metrics.assert_called_once_with( - "ttft", 0.5, {"model": self.model_name} - ) - - def test_record_first_token_disabled(self): - """Test recording first token when monitoring is disabled.""" - self.manager.is_enabled = False - - tracker = LLMTokenTracker(self.manager, self.model_name, self.span) - tracker.record_first_token() - - assert tracker.first_token_time is None - self.span.add_event.assert_not_called() - self.manager.record_llm_metrics.assert_not_called() - - def test_record_first_token_multiple_calls(self): - """Test that first token is only recorded once.""" + def test_record_completion_openinference_attrs(self): + """Test completion uses OpenInference attribute names.""" self.manager.is_enabled = True - with patch('time.time', side_effect=[123.456, 123.956, 124.456]): + with patch('time.time', side_effect=[123.456, 123.956, 125.456]): tracker = LLMTokenTracker(self.manager, self.model_name, self.span) - - # First call should record tracker.record_first_token() - first_time = tracker.first_token_time + tracker.token_count = 10 - # Second call should not change the time - tracker.record_first_token() + tracker.record_completion(input_tokens=20, output_tokens=30) - assert tracker.first_token_time == first_time - assert self.span.add_event.call_count == 1 + expected_attrs = { + "llm.token_count.prompt": 20, + "llm.token_count.completion": 30, + "llm.token_count.total": 50, + "llm.generation_rate": 5.0, + "llm.duration.total": 2.0, + "llm.time_to_first_token": 0.5 + } + self.span.set_attributes.assert_called_once_with(expected_attrs) - def test_record_token_enabled(self): - """Test recording token when monitoring is enabled.""" + def test_record_metrics_openinference_labels(self): + """Test metrics recording uses OpenInference labels.""" self.manager.is_enabled = True - with patch('time.time', side_effect=[123.456, 123.956]): - tracker = LLMTokenTracker(self.manager, self.model_name, self.span) - tracker.record_token("test_token") - - assert tracker.token_count == 1 - assert tracker.first_token_time == 123.956 # Should auto-record first token - - # Verify span event - self.span.add_event.assert_called_with( - "token_generated", { - "token_count": 1, - "token_length": len("test_token") - } - ) - - def test_record_token_disabled(self): - """Test recording token when monitoring is disabled.""" - self.manager.is_enabled = False - tracker = LLMTokenTracker(self.manager, self.model_name, self.span) - tracker.record_token("test_token") - - assert tracker.token_count == 0 - assert tracker.first_token_time is None - self.span.add_event.assert_not_called() - - def test_record_token_multiple_tokens(self): - """Test recording multiple tokens.""" - self.manager.is_enabled = True - - with patch('time.time', side_effect=[123.456, 123.956, 124.056, 124.156]): - tracker = 
LLMTokenTracker(self.manager, self.model_name, self.span) - - tracker.record_token("token1") - tracker.record_token("token2") - tracker.record_token("token3") - - assert tracker.token_count == 3 - # First token time should not change after initial recording - assert tracker.first_token_time == 123.956 - def test_record_completion_enabled(self): - """Test recording completion metrics when monitoring is enabled.""" - self.manager.is_enabled = True - - # 2.5 second total - with patch('time.time', side_effect=[123.456, 123.956, 125.956]): - tracker = LLMTokenTracker(self.manager, self.model_name, self.span) - tracker.record_first_token() # Set first token time (creates duration of 0.5s) - tracker.token_count = 5 # Simulate 5 tokens generated - - tracker.record_completion(input_tokens=10, output_tokens=15) - - assert tracker.input_tokens == 10 - assert tracker.output_tokens == 15 + with patch('time.time', side_effect=[123.456, 124.456]): + tracker.record_completion(input_tokens=10, output_tokens=5) - # Verify metrics recording - the actual rate calculation: 5 tokens / 2.5 seconds = 2.0 tokens/sec - expected_rate = 2.0 # 5 tokens / 2.5 seconds - self.manager.record_llm_metrics.assert_any_call( - "token_rate", expected_rate, {"model": self.model_name} - ) self.manager.record_llm_metrics.assert_any_call( - "tokens", 10, {"model": self.model_name, "type": "input"} + "tokens_prompt", 10, {"llm.model_name": self.model_name} ) self.manager.record_llm_metrics.assert_any_call( - "tokens", 15, {"model": self.model_name, "type": "output"} + "tokens_completion", 5, {"llm.model_name": self.model_name} ) - def test_record_completion_disabled(self): - """Test recording completion metrics when monitoring is disabled.""" - self.manager.is_enabled = False - tracker = LLMTokenTracker(self.manager, self.model_name, self.span) - tracker.record_completion(input_tokens=10, output_tokens=15) +class TestDecorators: + """Test monitoring decorators.""" - self.manager.record_llm_metrics.assert_not_called() + def setup_method(self): + """Reset singleton state before each test.""" + MonitoringManager._instance = None + MonitoringManager._initialized = False - def test_record_completion_span_attributes(self): - """Test that completion sets span attributes correctly.""" - self.manager.is_enabled = True + def test_monitor_endpoint_decorator_sync(self): + """Test monitor_endpoint decorator with sync function.""" + manager = MonitoringManager() + config = MonitoringConfig(enable_telemetry=False) + manager.configure(config) - # 2 second total - with patch('time.time', side_effect=[123.456, 123.956, 125.456]): - tracker = LLMTokenTracker(self.manager, self.model_name, self.span) - tracker.record_first_token() - tracker.token_count = 10 + @manager.monitor_endpoint("test_operation") + def test_function(param1, param2="default"): + return {"result": "success"} - tracker.record_completion(input_tokens=20, output_tokens=30) + result = test_function("value1", param2="value2") + assert result == {"result": "success"} - # Verify span attributes - expected_attrs = { - "llm.input_tokens": 20, - "llm.output_tokens": 30, - "llm.total_tokens": 50, - "llm.generation_rate": 5.0, # 10 tokens / 2 seconds - "llm.total_duration": 2.0, - "llm.ttft": 0.5 # first_token_time - start_time - } - self.span.set_attributes.assert_called_once_with(expected_attrs) + def test_monitor_endpoint_decorator_async(self): + """Test monitor_endpoint decorator with async function.""" + manager = MonitoringManager() + config = MonitoringConfig(enable_telemetry=False) + 
manager.configure(config) - def test_record_completion_zero_duration(self): - """Test recording completion with zero duration.""" - self.manager.is_enabled = True + @manager.monitor_endpoint("test_operation") + async def test_function(param1, param2="default"): + return {"result": "success"} - with patch('time.time', return_value=123.456): # Same time for all calls - tracker = LLMTokenTracker(self.manager, self.model_name, self.span) - tracker.token_count = 5 + result = asyncio.run(test_function("value1", param2="value2")) + assert result == {"result": "success"} - tracker.record_completion(input_tokens=10, output_tokens=15) + def test_monitor_llm_call_decorator(self): + """Test monitor_llm_call decorator.""" + manager = MonitoringManager() + config = MonitoringConfig(enable_telemetry=False) + manager.configure(config) - # Should handle zero duration gracefully - assert tracker.input_tokens == 10 - assert tracker.output_tokens == 15 + @manager.monitor_llm_call("gpt-4", "completion") + def test_llm_function(**kwargs): + return {"result": "llm_success"} - def test_record_completion_no_tokens(self): - """Test recording completion with no tokens generated.""" - self.manager.is_enabled = True + result = test_llm_function() + assert result == {"result": "llm_success"} - # 1 second total - with patch('time.time', side_effect=[123.456, 124.456]): - tracker = LLMTokenTracker(self.manager, self.model_name, self.span) - # Don't set token_count (remains 0) + def test_monitor_agent_execution_decorator(self): + """Test monitor_agent_execution decorator.""" + manager = MonitoringManager() + config = MonitoringConfig(enable_telemetry=False) + manager.configure(config) - tracker.record_completion(input_tokens=10, output_tokens=15) + @manager.monitor_agent_execution("test_agent") + def test_agent_function(): + return {"result": "agent_success"} - # Should handle zero tokens gracefully - assert tracker.input_tokens == 10 - assert tracker.output_tokens == 15 + result = test_agent_function() + assert result == {"result": "agent_success"} class TestGlobalFunctions: """Test global functions.""" def test_get_monitoring_manager_singleton(self): - """Test that get_monitoring_manager returns the same instance.""" - # Reset singleton + """Test get_monitoring_manager returns singleton.""" MonitoringManager._instance = None MonitoringManager._initialized = False @@ -846,108 +487,115 @@ def test_get_monitoring_manager_singleton(self): assert manager1 is manager2 assert isinstance(manager1, MonitoringManager) + def test_is_opentelemetry_available(self): + """Test is_opentelemetry_available function.""" + result = is_opentelemetry_available() + assert isinstance(result, bool) -class TestIntegrationScenarios: - """Test integration scenarios and edge cases.""" + +class TestProtocolSwitching: + """Test HTTP/gRPC protocol switching.""" def setup_method(self): """Reset singleton state before each test.""" MonitoringManager._instance = None MonitoringManager._initialized = False - def test_full_monitoring_lifecycle(self): - """Test complete monitoring lifecycle from config to metrics.""" - manager = get_monitoring_manager() + @patch('sdk.nexent.monitor.monitoring.OPENTELEMETRY_AVAILABLE', True) + @patch('sdk.nexent.monitor.monitoring.OTLPSpanExporterHTTP') + def test_http_protocol_uses_http_exporter(self, mock_http_exporter): + """Test that http protocol uses HTTP exporter.""" + manager = MonitoringManager() config = MonitoringConfig( - enable_telemetry=True, service_name="test-service") + enable_telemetry=True, + 
otlp_endpoint="http://localhost:4318", + otlp_protocol="http" + ) + + with patch('sdk.nexent.monitor.monitoring.TracerProvider'), \ + patch('sdk.nexent.monitor.monitoring.Resource.create'), \ + patch('sdk.nexent.monitor.monitoring.trace'), \ + patch('sdk.nexent.monitor.monitoring.metrics'), \ + patch('sdk.nexent.monitor.monitoring.MeterProvider'), \ + patch('sdk.nexent.monitor.monitoring.BatchSpanProcessor'), \ + patch('sdk.nexent.monitor.monitoring.RequestsInstrumentor'): - with patch.object(manager, '_init_telemetry'): manager.configure(config) - # Test that all methods work with enabled monitoring - assert manager.is_enabled is True + mock_http_exporter.assert_called_once() + + @patch('sdk.nexent.monitor.monitoring.OPENTELEMETRY_AVAILABLE', True) + @patch('sdk.nexent.monitor.monitoring.OTLPSpanExporterGRPC') + def test_grpc_protocol_uses_grpc_exporter(self, mock_grpc_exporter): + """Test that grpc protocol uses gRPC exporter.""" + manager = MonitoringManager() + config = MonitoringConfig( + enable_telemetry=True, + otlp_endpoint="http://localhost:4317", + otlp_protocol="grpc" + ) - tracker = manager.create_token_tracker("test_model") - assert isinstance(tracker, LLMTokenTracker) + with patch('sdk.nexent.monitor.monitoring.TracerProvider'), \ + patch('sdk.nexent.monitor.monitoring.Resource.create'), \ + patch('sdk.nexent.monitor.monitoring.trace'), \ + patch('sdk.nexent.monitor.monitoring.metrics'), \ + patch('sdk.nexent.monitor.monitoring.MeterProvider'), \ + patch('sdk.nexent.monitor.monitoring.BatchSpanProcessor'), \ + patch('sdk.nexent.monitor.monitoring.RequestsInstrumentor'): - # Test decorators work - @manager.monitor_endpoint("test_op") - def test_func(): - return "success" + manager.configure(config) - result = test_func() - assert result == "success" + mock_grpc_exporter.assert_called_once() - def test_monitoring_disabled_lifecycle(self): - """Test monitoring lifecycle when disabled.""" - manager = get_monitoring_manager() - config = MonitoringConfig(enable_telemetry=False) - manager.configure(config) +class TestErrorHandling: + """Test error handling and graceful degradation.""" - # All methods should work without errors when disabled - assert manager.is_enabled is False + def setup_method(self): + """Reset singleton state before each test.""" + MonitoringManager._instance = None + MonitoringManager._initialized = False + + def test_methods_work_when_disabled(self): + """Test all methods work gracefully when monitoring is disabled.""" + manager = MonitoringManager() + config = MonitoringConfig(enable_telemetry=False) + manager.configure(config) manager.add_span_event("test_event") manager.set_span_attributes(key="value") manager.record_llm_metrics("ttft", 0.5, {}) + manager.record_agent_metrics("duration", 1.0, {}) - # Decorators should still work - @manager.monitor_endpoint("test_op") - def test_func(): - return "success" - - result = test_func() - assert result == "success" - - def test_concurrent_access(self): - """Test concurrent access to singleton.""" - import threading - - managers = [] - - def create_manager(): - managers.append(get_monitoring_manager()) - - threads = [threading.Thread(target=create_manager) for _ in range(10)] + with manager.trace_llm_request("test", "model") as span: + assert span is None - for t in threads: - t.start() - for t in threads: - t.join() + with manager.trace_agent_step("step", "agent", "tool_call") as span: + assert span is None - # All managers should be the same instance - assert len(set(id(m) for m in managers)) == 1 + with 
manager.trace_tool_call("tool", "agent", {"input": "data"}) as span: + assert span is None - def test_error_resilience(self): - """Test that monitoring errors don't break application flow.""" - manager = get_monitoring_manager() - config = MonitoringConfig(enable_telemetry=True) + def test_decorators_propagate_exceptions(self): + """Test decorators properly propagate exceptions.""" + manager = MonitoringManager() + config = MonitoringConfig(enable_telemetry=False) manager.configure(config) - # Test that when monitoring is disabled, methods handle gracefully - manager._config.enable_telemetry = False - - # These should not raise exceptions when disabled - manager.add_span_event("test_event") - manager.set_span_attributes(key="value") - manager.record_llm_metrics("ttft", 0.5, {}) + @manager.monitor_endpoint("test") + def error_func(): + raise ValueError("Test error") - # Re-enable for decorator test - manager._config.enable_telemetry = True + with pytest.raises(ValueError, match="Test error"): + error_func() - # Test decorator with mocked internal error handling - with patch.object(manager, 'trace_llm_request') as mock_trace: - # Mock context manager that handles errors gracefully - mock_context = MagicMock() - mock_context.__enter__ = Mock(return_value=None) - mock_context.__exit__ = Mock(return_value=None) - mock_trace.return_value = mock_context + def test_exporter_error_does_not_crash(self): + """Test exporter errors don't crash application.""" + with patch('sdk.nexent.monitor.monitoring.OPENTELEMETRY_AVAILABLE', True): + manager = MonitoringManager() - @manager.monitor_endpoint("test_op") - def test_func(): - return "success" + with patch('sdk.nexent.monitor.monitoring.Resource.create', side_effect=Exception("Export error")): + config = MonitoringConfig(enable_telemetry=True) + manager.configure(config) - # Function should work normally - result = test_func() - assert result == "success" + assert manager._tracer is None \ No newline at end of file From 4351d67fff1ca625d8775d4465e45983f2341c87 Mon Sep 17 00:00:00 2001 From: hhhhsc Date: Wed, 29 Apr 2026 09:09:36 +0800 Subject: [PATCH 02/17] Refine OpenTelemetry monitoring and multi-platform config --- backend/consts/const.py | 45 +- backend/utils/monitoring.py | 123 ++++- doc/docs/.vitepress/config.mts | 1 + doc/docs/en/sdk/monitoring.md | 69 ++- doc/docs/zh/sdk/monitoring.md | 69 ++- doc/docs/zh/sdk/opentelemetry-design.md | 255 ++++++++++ docker/.env.example | 25 +- docker/monitoring/monitoring.env | 14 +- docker/monitoring/monitoring.env.example | 14 +- docker/monitoring/otel-collector-config.yml | 38 +- docker/start-monitoring.sh | 20 +- sdk/nexent/monitor/monitoring.py | 529 +++++++++++++++----- test/sdk/monitor/conftest.py | 61 ++- test/sdk/monitor/test_monitoring.py | 174 ++++++- 14 files changed, 1228 insertions(+), 209 deletions(-) create mode 100644 doc/docs/zh/sdk/opentelemetry-design.md diff --git a/backend/consts/const.py b/backend/consts/const.py index 9448723b7..543d64ba1 100644 --- a/backend/consts/const.py +++ b/backend/consts/const.py @@ -317,13 +317,32 @@ class VectorDatabaseType(str, Enum): # Telemetry and Monitoring Configuration (OTLP Protocol) -ENABLE_TELEMETRY = os.getenv("ENABLE_TELEMETRY", "false").lower() == "true" -OTEL_SERVICE_NAME = os.getenv("OTEL_SERVICE_NAME", "nexent-backend") -OTEL_EXPORTER_OTLP_ENDPOINT = os.getenv( - "OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318") -OTEL_EXPORTER_OTLP_PROTOCOL = os.getenv("OTEL_EXPORTER_OTLP_PROTOCOL", "http") -OTEL_EXPORTER_OTLP_HEADERS = 
os.getenv("OTEL_EXPORTER_OTLP_HEADERS", "") -TELEMETRY_SAMPLE_RATE = float(os.getenv("TELEMETRY_SAMPLE_RATE", "1.0")) +MONITORING_CONFIG_FILE = os.getenv("MONITORING_CONFIG_FILE", "") +MONITORING_PROVIDER = os.getenv("MONITORING_PROVIDER", "") +ENABLE_TELEMETRY_RAW = os.getenv("ENABLE_TELEMETRY") +ENABLE_TELEMETRY = (ENABLE_TELEMETRY_RAW or "false").lower() == "true" +OTEL_SERVICE_NAME_RAW = os.getenv("OTEL_SERVICE_NAME") +OTEL_SERVICE_NAME = OTEL_SERVICE_NAME_RAW or "nexent-backend" +OTEL_EXPORTER_OTLP_ENDPOINT_RAW = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT") +OTEL_EXPORTER_OTLP_ENDPOINT = OTEL_EXPORTER_OTLP_ENDPOINT_RAW or "http://localhost:4318" +OTEL_EXPORTER_OTLP_TRACES_ENDPOINT = os.getenv("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT", "") +OTEL_EXPORTER_OTLP_METRICS_ENDPOINT = os.getenv("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT", "") +OTEL_EXPORTER_OTLP_PROTOCOL_RAW = os.getenv("OTEL_EXPORTER_OTLP_PROTOCOL") +OTEL_EXPORTER_OTLP_PROTOCOL = OTEL_EXPORTER_OTLP_PROTOCOL_RAW or "http" +OTEL_EXPORTER_OTLP_HEADERS_RAW = os.getenv("OTEL_EXPORTER_OTLP_HEADERS") +OTEL_EXPORTER_OTLP_HEADERS = OTEL_EXPORTER_OTLP_HEADERS_RAW or "" +OTEL_EXPORTER_OTLP_AUTHORIZATION = os.getenv("OTEL_EXPORTER_OTLP_AUTHORIZATION", "") +OTEL_EXPORTER_OTLP_X_API_KEY = os.getenv("OTEL_EXPORTER_OTLP_X_API_KEY", "") +OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION = os.getenv( + "OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION", "") +OTEL_EXPORTER_OTLP_METRICS_ENABLED_RAW = os.getenv("OTEL_EXPORTER_OTLP_METRICS_ENABLED") +OTEL_EXPORTER_OTLP_METRICS_ENABLED = ( + OTEL_EXPORTER_OTLP_METRICS_ENABLED_RAW or "true").lower() == "true" +MONITORING_USE_PLATFORM_SDK_RAW = os.getenv("MONITORING_USE_PLATFORM_SDK") +MONITORING_USE_PLATFORM_SDK = (MONITORING_USE_PLATFORM_SDK_RAW or "false").lower() == "true" +MONITORING_PROJECT_NAME = os.getenv("MONITORING_PROJECT_NAME", "") +TELEMETRY_SAMPLE_RATE_RAW = os.getenv("TELEMETRY_SAMPLE_RATE") +TELEMETRY_SAMPLE_RATE = float(TELEMETRY_SAMPLE_RATE_RAW or "1.0") # Parse OTLP headers into dict format def _parse_otlp_headers(headers_str: str) -> dict: @@ -338,12 +357,20 @@ def _parse_otlp_headers(headers_str: str) -> dict: return headers OTLP_HEADERS = _parse_otlp_headers(OTEL_EXPORTER_OTLP_HEADERS) +if OTEL_EXPORTER_OTLP_AUTHORIZATION: + OTLP_HEADERS["Authorization"] = OTEL_EXPORTER_OTLP_AUTHORIZATION +if OTEL_EXPORTER_OTLP_X_API_KEY: + OTLP_HEADERS["x-api-key"] = OTEL_EXPORTER_OTLP_X_API_KEY +if OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION: + OTLP_HEADERS["x-langfuse-ingestion-version"] = OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION # Performance monitoring thresholds +LLM_SLOW_REQUEST_THRESHOLD_SECONDS_RAW = os.getenv("LLM_SLOW_REQUEST_THRESHOLD_SECONDS") LLM_SLOW_REQUEST_THRESHOLD_SECONDS = float( - os.getenv("LLM_SLOW_REQUEST_THRESHOLD_SECONDS", "5.0")) + LLM_SLOW_REQUEST_THRESHOLD_SECONDS_RAW or "5.0") +LLM_SLOW_TOKEN_RATE_THRESHOLD_RAW = os.getenv("LLM_SLOW_TOKEN_RATE_THRESHOLD") LLM_SLOW_TOKEN_RATE_THRESHOLD = float( - os.getenv("LLM_SLOW_TOKEN_RATE_THRESHOLD", "10.0")) + LLM_SLOW_TOKEN_RATE_THRESHOLD_RAW or "10.0") DEFAULT_ZH_TITLE = "新对话" diff --git a/backend/utils/monitoring.py b/backend/utils/monitoring.py index 28aaaef51..254809ca6 100644 --- a/backend/utils/monitoring.py +++ b/backend/utils/monitoring.py @@ -7,7 +7,7 @@ Usage: from utils.monitoring import monitoring_manager - + @monitoring_manager.monitor_endpoint("my_service.my_function") async def my_function(): return {"status": "ok"} @@ -20,23 +20,63 @@ async def my_function(): try: from consts.const import ( ENABLE_TELEMETRY, + 
ENABLE_TELEMETRY_RAW, + MONITORING_CONFIG_FILE, + MONITORING_PROVIDER, + MONITORING_USE_PLATFORM_SDK_RAW, + MONITORING_USE_PLATFORM_SDK, + MONITORING_PROJECT_NAME, + OTEL_SERVICE_NAME_RAW, OTEL_SERVICE_NAME, + OTEL_EXPORTER_OTLP_ENDPOINT_RAW, OTEL_EXPORTER_OTLP_ENDPOINT, + OTEL_EXPORTER_OTLP_TRACES_ENDPOINT, + OTEL_EXPORTER_OTLP_METRICS_ENDPOINT, + OTEL_EXPORTER_OTLP_PROTOCOL_RAW, OTEL_EXPORTER_OTLP_PROTOCOL, + OTEL_EXPORTER_OTLP_HEADERS_RAW, + OTEL_EXPORTER_OTLP_AUTHORIZATION, + OTEL_EXPORTER_OTLP_X_API_KEY, + OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION, + OTEL_EXPORTER_OTLP_METRICS_ENABLED_RAW, + OTEL_EXPORTER_OTLP_METRICS_ENABLED, OTLP_HEADERS, + TELEMETRY_SAMPLE_RATE_RAW, TELEMETRY_SAMPLE_RATE, + LLM_SLOW_REQUEST_THRESHOLD_SECONDS_RAW, LLM_SLOW_REQUEST_THRESHOLD_SECONDS, + LLM_SLOW_TOKEN_RATE_THRESHOLD_RAW, LLM_SLOW_TOKEN_RATE_THRESHOLD ) except ImportError: from backend.consts.const import ( ENABLE_TELEMETRY, + ENABLE_TELEMETRY_RAW, + MONITORING_CONFIG_FILE, + MONITORING_PROVIDER, + MONITORING_USE_PLATFORM_SDK_RAW, + MONITORING_USE_PLATFORM_SDK, + MONITORING_PROJECT_NAME, + OTEL_SERVICE_NAME_RAW, OTEL_SERVICE_NAME, + OTEL_EXPORTER_OTLP_ENDPOINT_RAW, OTEL_EXPORTER_OTLP_ENDPOINT, + OTEL_EXPORTER_OTLP_TRACES_ENDPOINT, + OTEL_EXPORTER_OTLP_METRICS_ENDPOINT, + OTEL_EXPORTER_OTLP_PROTOCOL_RAW, OTEL_EXPORTER_OTLP_PROTOCOL, + OTEL_EXPORTER_OTLP_HEADERS_RAW, + OTEL_EXPORTER_OTLP_AUTHORIZATION, + OTEL_EXPORTER_OTLP_X_API_KEY, + OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION, + OTEL_EXPORTER_OTLP_METRICS_ENABLED_RAW, + OTEL_EXPORTER_OTLP_METRICS_ENABLED, OTLP_HEADERS, + TELEMETRY_SAMPLE_RATE_RAW, TELEMETRY_SAMPLE_RATE, + LLM_SLOW_REQUEST_THRESHOLD_SECONDS_RAW, LLM_SLOW_REQUEST_THRESHOLD_SECONDS, + LLM_SLOW_TOKEN_RATE_THRESHOLD_RAW, LLM_SLOW_TOKEN_RATE_THRESHOLD ) @@ -47,27 +87,84 @@ async def my_function(): monitoring_manager = get_monitoring_manager() +def _is_explicit_non_default(raw_value: str | None, default_value: str) -> bool: + """Return True when an env value should override a config-file value.""" + return raw_value not in (None, "", default_value) + + +def _build_env_overrides() -> dict: + """Build config overrides from environment-derived constants.""" + overrides = {} + if ENABLE_TELEMETRY_RAW is not None: + overrides["enable_telemetry"] = ENABLE_TELEMETRY + if MONITORING_PROVIDER: + overrides["provider"] = MONITORING_PROVIDER + if _is_explicit_non_default(OTEL_SERVICE_NAME_RAW, "nexent-backend"): + overrides["service_name"] = OTEL_SERVICE_NAME + if _is_explicit_non_default(OTEL_EXPORTER_OTLP_ENDPOINT_RAW, "http://localhost:4318"): + overrides["otlp_endpoint"] = OTEL_EXPORTER_OTLP_ENDPOINT + if OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: + overrides["otlp_traces_endpoint"] = OTEL_EXPORTER_OTLP_TRACES_ENDPOINT + if OTEL_EXPORTER_OTLP_METRICS_ENDPOINT: + overrides["otlp_metrics_endpoint"] = OTEL_EXPORTER_OTLP_METRICS_ENDPOINT + if _is_explicit_non_default(OTEL_EXPORTER_OTLP_PROTOCOL_RAW, "http"): + overrides["otlp_protocol"] = OTEL_EXPORTER_OTLP_PROTOCOL + if ( + OTEL_EXPORTER_OTLP_HEADERS_RAW + or OTEL_EXPORTER_OTLP_AUTHORIZATION + or OTEL_EXPORTER_OTLP_X_API_KEY + or OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION + ): + overrides["otlp_headers"] = OTLP_HEADERS + if OTEL_EXPORTER_OTLP_METRICS_ENABLED_RAW is not None: + overrides["export_metrics"] = OTEL_EXPORTER_OTLP_METRICS_ENABLED + if MONITORING_USE_PLATFORM_SDK_RAW is not None: + overrides["use_platform_sdk"] = MONITORING_USE_PLATFORM_SDK + if MONITORING_PROJECT_NAME: + overrides["project_name"] = MONITORING_PROJECT_NAME + if 
_is_explicit_non_default(TELEMETRY_SAMPLE_RATE_RAW, "1.0"): + overrides["telemetry_sample_rate"] = TELEMETRY_SAMPLE_RATE + if _is_explicit_non_default(LLM_SLOW_REQUEST_THRESHOLD_SECONDS_RAW, "5.0"): + overrides["llm_slow_request_threshold_seconds"] = LLM_SLOW_REQUEST_THRESHOLD_SECONDS + if _is_explicit_non_default(LLM_SLOW_TOKEN_RATE_THRESHOLD_RAW, "10.0"): + overrides["llm_slow_token_rate_threshold"] = LLM_SLOW_TOKEN_RATE_THRESHOLD + return overrides + + def _initialize_monitoring(): """Initialize monitoring configuration with OTLP settings.""" - config = MonitoringConfig( - enable_telemetry=ENABLE_TELEMETRY, - service_name=OTEL_SERVICE_NAME, - otlp_endpoint=OTEL_EXPORTER_OTLP_ENDPOINT, - otlp_protocol=OTEL_EXPORTER_OTLP_PROTOCOL, - otlp_headers=OTLP_HEADERS, - telemetry_sample_rate=TELEMETRY_SAMPLE_RATE, - llm_slow_request_threshold_seconds=LLM_SLOW_REQUEST_THRESHOLD_SECONDS, - llm_slow_token_rate_threshold=LLM_SLOW_TOKEN_RATE_THRESHOLD - ) + if MONITORING_CONFIG_FILE: + config = MonitoringConfig.from_file( + MONITORING_CONFIG_FILE, + overrides=_build_env_overrides() + ) + else: + config = MonitoringConfig( + enable_telemetry=ENABLE_TELEMETRY, + service_name=OTEL_SERVICE_NAME, + provider=MONITORING_PROVIDER or "otlp", + otlp_endpoint=OTEL_EXPORTER_OTLP_ENDPOINT, + otlp_traces_endpoint=OTEL_EXPORTER_OTLP_TRACES_ENDPOINT or None, + otlp_metrics_endpoint=OTEL_EXPORTER_OTLP_METRICS_ENDPOINT or None, + otlp_protocol=OTEL_EXPORTER_OTLP_PROTOCOL, + otlp_headers=OTLP_HEADERS, + export_metrics=OTEL_EXPORTER_OTLP_METRICS_ENABLED, + use_platform_sdk=MONITORING_USE_PLATFORM_SDK, + project_name=MONITORING_PROJECT_NAME or None, + telemetry_sample_rate=TELEMETRY_SAMPLE_RATE, + llm_slow_request_threshold_seconds=LLM_SLOW_REQUEST_THRESHOLD_SECONDS, + llm_slow_token_rate_threshold=LLM_SLOW_TOKEN_RATE_THRESHOLD + ) monitoring_manager.configure(config) logger.info( f"OTLP monitoring initialized: service_name={OTEL_SERVICE_NAME}, " - f"enable_telemetry={ENABLE_TELEMETRY}, endpoint={OTEL_EXPORTER_OTLP_ENDPOINT}, " + f"enable_telemetry={config.enable_telemetry}, provider={config.provider}, " + f"endpoint={config.otlp_endpoint}, trace_endpoint={config.get_trace_endpoint()}, " f"protocol={OTEL_EXPORTER_OTLP_PROTOCOL}" ) _initialize_monitoring() -__all__ = ['monitoring_manager'] \ No newline at end of file +__all__ = ['monitoring_manager'] diff --git a/doc/docs/.vitepress/config.mts b/doc/docs/.vitepress/config.mts index 6ee76ff5d..87e79a831 100644 --- a/doc/docs/.vitepress/config.mts +++ b/doc/docs/.vitepress/config.mts @@ -385,6 +385,7 @@ export default defineConfig({ ], }, { text: "性能监控", link: "/zh/sdk/monitoring" }, + { text: "OpenTelemetry 设计", link: "/zh/sdk/opentelemetry-design" }, { text: "向量数据库", link: "/zh/sdk/vector-database" }, { text: "数据处理", link: "/zh/sdk/data-process" }, ], diff --git a/doc/docs/en/sdk/monitoring.md b/doc/docs/en/sdk/monitoring.md index 64b211401..fb3a79a08 100644 --- a/doc/docs/en/sdk/monitoring.md +++ b/doc/docs/en/sdk/monitoring.md @@ -20,6 +20,7 @@ cp .env.example .env vim .env ENABLE_TELEMETRY=true +MONITORING_PROVIDER=otlp OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 OTEL_EXPORTER_OTLP_PROTOCOL=http @@ -35,9 +36,18 @@ Arize Phoenix provides AI-specific observability with OpenInference semantic sup **Configuration:** ```bash -OTEL_EXPORTER_OTLP_ENDPOINT=https://phoenix.arize.com/v1 -OTEL_EXPORTER_OTLP_HEADERS=x-api-key=YOUR_PHOENIX_API_KEY +MONITORING_PROVIDER=phoenix +OTEL_EXPORTER_OTLP_ENDPOINT=https://app.phoenix.arize.com/s/YOUR_SPACE 
+OTEL_EXPORTER_OTLP_AUTHORIZATION="Bearer YOUR_PHOENIX_API_KEY" OTEL_EXPORTER_OTLP_PROTOCOL=http +OTEL_EXPORTER_OTLP_METRICS_ENABLED=false +``` + +To let the Phoenix SDK handle part of the OpenTelemetry setup, also enable this. When the SDK returns a tracer provider, Nexent reuses it to avoid registering a second global OpenTelemetry provider: + +```bash +MONITORING_USE_PLATFORM_SDK=true +MONITORING_PROJECT_NAME=nexent-production ``` **Features:** @@ -53,12 +63,14 @@ Langfuse offers prompt management and LLM observability with OTLP support. **Configuration:** ```bash -OTEL_EXPORTER_OTLP_ENDPOINT=https://cloud.langfuse.com/api/public/otel/v1 +MONITORING_PROVIDER=langfuse +OTEL_EXPORTER_OTLP_ENDPOINT=https://cloud.langfuse.com/api/public/otel LANGFUSE_PUBLIC_KEY=pk-xxx LANGFUSE_SECRET_KEY=sk-xxx -OTEL_EXPORTER_OTLP_HEADERS=Authorization=Basic BASE64_ENCODED_KEY +OTEL_EXPORTER_OTLP_AUTHORIZATION=Basic BASE64_ENCODED_KEY +OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION=4 ``` Generate the encoded key: @@ -101,10 +113,39 @@ jaeger: | Variable | Default | Description | |----------|---------|-------------| | `ENABLE_TELEMETRY` | `false` | Enable/disable monitoring | +| `MONITORING_CONFIG_FILE` | (empty) | JSON/YAML monitoring config file path | +| `MONITORING_PROVIDER` | `otlp` | Provider profile: `otlp`, `phoenix`, `langfuse`, `jaeger`, `custom` | +| `MONITORING_USE_PLATFORM_SDK` | `false` | Whether to also initialize a provider SDK | +| `MONITORING_PROJECT_NAME` | `nexent` | Observability platform project name | | `OTEL_SERVICE_NAME` | `nexent-backend` | Service identifier | -| `OTEL_EXPORTER_OTLP_ENDPOINT` | `http://localhost:4318` | OTLP receiver endpoint | +| `OTEL_EXPORTER_OTLP_ENDPOINT` | `http://localhost:4318` | OTLP base endpoint; SDK derives `/v1/traces` and `/v1/metrics` | +| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | (empty) | Optional trace-specific endpoint | +| `OTEL_EXPORTER_OTLP_METRICS_ENDPOINT` | (empty) | Optional metric-specific endpoint | | `OTEL_EXPORTER_OTLP_PROTOCOL` | `http` | Protocol: `http` or `grpc` | -| `OTEL_EXPORTER_OTLP_HEADERS` | (empty) | Auth headers (comma-separated) | +| `OTEL_EXPORTER_OTLP_HEADERS` | (empty) | Generic auth headers (comma-separated) | +| `OTEL_EXPORTER_OTLP_AUTHORIZATION` | (empty) | `Authorization` header, commonly used by Phoenix bearer auth and Langfuse | +| `OTEL_EXPORTER_OTLP_X_API_KEY` | (empty) | `x-api-key` header for platforms that require it | +| `OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION` | (empty) | Langfuse ingestion version, for example `4` | +| `OTEL_EXPORTER_OTLP_METRICS_ENABLED` | `true` | Whether to export OTLP metrics | + +## Configuration File + +You can also set `MONITORING_CONFIG_FILE` to a JSON/YAML file. Explicit non-default environment variables override file values. + +```yaml +monitoring: + enable_telemetry: true + service_name: nexent-backend + project_name: nexent-production + exporter: + provider: langfuse + protocol: http + endpoint: https://cloud.langfuse.com/api/public/otel + headers: + Authorization: Basic BASE64_ENCODED_KEY + x-langfuse-ingestion-version: "4" + export_metrics: false +``` ## Code Integration @@ -183,14 +224,20 @@ The system uses OpenInference semantic conventions for AI-specific observability ## Collector Configuration -The OpenTelemetry Collector routes data to your chosen backend: +By default, the OpenTelemetry Collector only logs data through the logging exporter. This avoids forwarding data back into itself when no external backend is configured. 
To forward through the Collector, add a platform exporter: ```yaml exporters: - otlp: - endpoint: ${OTEL_EXPORTER_OTLP_ENDPOINT} + otlphttp/langfuse: + endpoint: https://cloud.langfuse.com/api/public/otel headers: - authorization: ${OTEL_EXPORTER_OTLP_HEADERS} + Authorization: Basic BASE64_ENCODED_KEY + x-langfuse-ingestion-version: "4" + +service: + pipelines: + traces: + exporters: [otlphttp/langfuse, logging] ``` See `docker/monitoring/otel-collector-config.yml` for full configuration with platform examples. @@ -224,4 +271,4 @@ All monitoring methods work without errors when disabled - decorators pass throu 1. Verify OpenInference attributes in platform UI 2. Check span attribute naming: `llm.model_name` not `model_name` -3. Review platform-specific attribute requirements \ No newline at end of file +3. Review platform-specific attribute requirements diff --git a/doc/docs/zh/sdk/monitoring.md b/doc/docs/zh/sdk/monitoring.md index bb1bf000e..5b93d47b1 100644 --- a/doc/docs/zh/sdk/monitoring.md +++ b/doc/docs/zh/sdk/monitoring.md @@ -20,6 +20,7 @@ cp .env.example .env vim .env ENABLE_TELEMETRY=true +MONITORING_PROVIDER=otlp OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 OTEL_EXPORTER_OTLP_PROTOCOL=http @@ -35,9 +36,18 @@ Arize Phoenix 提供针对 AI 的专业可观测性,原生支持 OpenInference **配置:** ```bash -OTEL_EXPORTER_OTLP_ENDPOINT=https://phoenix.arize.com/v1 -OTEL_EXPORTER_OTLP_HEADERS=x-api-key=YOUR_PHOENIX_API_KEY +MONITORING_PROVIDER=phoenix +OTEL_EXPORTER_OTLP_ENDPOINT=https://app.phoenix.arize.com/s/YOUR_SPACE +OTEL_EXPORTER_OTLP_AUTHORIZATION="Bearer YOUR_PHOENIX_API_KEY" OTEL_EXPORTER_OTLP_PROTOCOL=http +OTEL_EXPORTER_OTLP_METRICS_ENABLED=false +``` + +如果希望使用 Phoenix 官方 SDK 负责部分 OpenTelemetry 初始化,可额外启用。启用后 SDK 返回的 tracer provider 会被复用,避免重复注册 OpenTelemetry 全局 provider: + +```bash +MONITORING_USE_PLATFORM_SDK=true +MONITORING_PROJECT_NAME=nexent-production ``` **功能特性:** @@ -53,12 +63,14 @@ Langfuse 提供 Prompt 管理和 LLM 可观测性,支持 OTLP 协议。 **配置:** ```bash -OTEL_EXPORTER_OTLP_ENDPOINT=https://cloud.langfuse.com/api/public/otel/v1 +MONITORING_PROVIDER=langfuse +OTEL_EXPORTER_OTLP_ENDPOINT=https://cloud.langfuse.com/api/public/otel LANGFUSE_PUBLIC_KEY=pk-xxx LANGFUSE_SECRET_KEY=sk-xxx -OTEL_EXPORTER_OTLP_HEADERS=Authorization=Basic BASE64_ENCODED_KEY +OTEL_EXPORTER_OTLP_AUTHORIZATION=Basic BASE64_ENCODED_KEY +OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION=4 ``` 生成认证 Key: @@ -101,10 +113,39 @@ jaeger: | 变量 | 默认值 | 说明 | |------|--------|------| | `ENABLE_TELEMETRY` | `false` | 启用/禁用监控 | +| `MONITORING_CONFIG_FILE` | (空) | JSON/YAML 监控配置文件路径 | +| `MONITORING_PROVIDER` | `otlp` | 平台配置:`otlp`、`phoenix`、`langfuse`、`jaeger`、`custom` | +| `MONITORING_USE_PLATFORM_SDK` | `false` | 是否额外初始化平台 SDK | +| `MONITORING_PROJECT_NAME` | `nexent` | 监控平台项目名 | | `OTEL_SERVICE_NAME` | `nexent-backend` | 服务标识 | -| `OTEL_EXPORTER_OTLP_ENDPOINT` | `http://localhost:4318` | OTLP 接收端点 | +| `OTEL_EXPORTER_OTLP_ENDPOINT` | `http://localhost:4318` | OTLP base endpoint,SDK 会派生 `/v1/traces` 和 `/v1/metrics` | +| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | (空) | 可选 trace 专用 endpoint | +| `OTEL_EXPORTER_OTLP_METRICS_ENDPOINT` | (空) | 可选 metric 专用 endpoint | | `OTEL_EXPORTER_OTLP_PROTOCOL` | `http` | 协议:`http` 或 `grpc` | -| `OTEL_EXPORTER_OTLP_HEADERS` | (空) | 认证头(逗号分隔) | +| `OTEL_EXPORTER_OTLP_HEADERS` | (空) | 通用认证头(逗号分隔) | +| `OTEL_EXPORTER_OTLP_AUTHORIZATION` | (空) | `Authorization` header,常用于 Phoenix bearer auth 和 Langfuse | +| `OTEL_EXPORTER_OTLP_X_API_KEY` | (空) | `x-api-key` header,用于兼容需要该 header 的平台 | +| 
`OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION` | (空) | Langfuse 实时摄取版本,例如 `4` | +| `OTEL_EXPORTER_OTLP_METRICS_ENABLED` | `true` | 是否导出 OTLP metrics | + +## 配置文件 + +除环境变量外,也可以通过 `MONITORING_CONFIG_FILE` 指定 JSON/YAML 文件。环境变量中显式设置的非默认值会覆盖文件配置。 + +```yaml +monitoring: + enable_telemetry: true + service_name: nexent-backend + project_name: nexent-production + exporter: + provider: langfuse + protocol: http + endpoint: https://cloud.langfuse.com/api/public/otel + headers: + Authorization: Basic BASE64_ENCODED_KEY + x-langfuse-ingestion-version: "4" + export_metrics: false +``` ## 代码集成 @@ -183,14 +224,20 @@ with monitoring_manager.trace_tool_call("web_search", "agent_name", {"query": "t ## Collector 配置 -OpenTelemetry Collector 将数据路由到选定的后端: +OpenTelemetry Collector 默认只通过 logging exporter 打印数据,避免没有外部后端时把数据转发回自身。需要通过 Collector 转发到平台时,增加对应 exporter: ```yaml exporters: - otlp: - endpoint: ${OTEL_EXPORTER_OTLP_ENDPOINT} + otlphttp/langfuse: + endpoint: https://cloud.langfuse.com/api/public/otel headers: - authorization: ${OTEL_EXPORTER_OTLP_HEADERS} + Authorization: Basic BASE64_ENCODED_KEY + x-langfuse-ingestion-version: "4" + +service: + pipelines: + traces: + exporters: [otlphttp/langfuse, logging] ``` 完整配置见 `docker/monitoring/otel-collector-config.yml`。 @@ -224,4 +271,4 @@ pip install nexent[performance] # 包含 OTLP 支持 1. 在平台 UI 中验证 OpenInference 属性 2. 检查 Span 属性命名:使用 `llm.model_name` 而非 `model_name` -3. 查看平台特定属性要求 \ No newline at end of file +3. 查看平台特定属性要求 diff --git a/doc/docs/zh/sdk/opentelemetry-design.md b/doc/docs/zh/sdk/opentelemetry-design.md new file mode 100644 index 000000000..fc78194d1 --- /dev/null +++ b/doc/docs/zh/sdk/opentelemetry-design.md @@ -0,0 +1,255 @@ +# Nexent OpenTelemetry 可观测性设计 + +生成日期:2026-04-28 +基准分支:`dev/opentelemetry` + +## 设计目标 + +Nexent 的监控能力以 OpenTelemetry 为主干,SDK 和后端只负责生成标准 span、event、metric,并通过 OTLP 导出。Phoenix、Langfuse、Jaeger 等平台只作为可配置的 exporter 或可选 SDK 增强层,避免把业务代码绑定到单一平台。 + +目标: + +- Agent 流式运行期间保持 trace 上下文,完整覆盖 API、服务准备、Agent 线程、LLM 流式输出、工具调用。 +- 支持 `otlp`、`phoenix`、`langfuse`、`jaeger`、`custom` provider profile。 +- 同时支持环境变量和 JSON/YAML 配置文件,环境变量可覆盖文件配置。 +- 支持 base endpoint 和 signal-specific endpoint,避免 `/v1/traces`、`/v1/metrics` 路径重复拼接。 +- 保持 OpenTelemetry 原生实现,平台 SDK 只通过 `MONITORING_USE_PLATFORM_SDK=true` 显式启用。 + +## 技术栈 + +| 分类 | 实现 | +|------|------| +| 标准框架 | OpenTelemetry API/SDK | +| 导出协议 | OTLP HTTP、OTLP gRPC | +| Trace exporter | `opentelemetry-exporter-otlp` HTTP/gRPC trace exporter | +| Metric exporter | `opentelemetry-exporter-otlp` HTTP/gRPC metric exporter | +| 自动埋点 | FastAPI instrumentation、requests instrumentation | +| AI 语义 | OpenInference 风格属性:`llm.*`、`agent.*`、`agent.tool.*` | +| 配置 | 环境变量、`MONITORING_CONFIG_FILE` JSON/YAML | +| Collector | `otel/opentelemetry-collector-contrib`,使用 `otlphttp` 转发 HTTP 平台 | +| 可选 SDK | `phoenix.otel.register`、`langfuse.get_client`,默认不启用;Phoenix SDK 成功注册时复用其 tracer provider | + +## 配置模型 + +### 环境变量 + +| 变量 | 说明 | +|------|------| +| `ENABLE_TELEMETRY` | 总开关 | +| `MONITORING_CONFIG_FILE` | JSON/YAML 配置文件路径 | +| `MONITORING_PROVIDER` | `otlp`、`phoenix`、`langfuse`、`jaeger`、`custom` | +| `MONITORING_USE_PLATFORM_SDK` | 是否额外初始化平台 SDK | +| `MONITORING_PROJECT_NAME` | 平台项目名 | +| `OTEL_SERVICE_NAME` | OpenTelemetry service name | +| `OTEL_EXPORTER_OTLP_ENDPOINT` | OTLP base endpoint | +| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | 可选 trace 专用 endpoint | +| `OTEL_EXPORTER_OTLP_METRICS_ENDPOINT` | 可选 metric 专用 endpoint | +| `OTEL_EXPORTER_OTLP_PROTOCOL` | `http` 或 `grpc` | +| `OTEL_EXPORTER_OTLP_HEADERS` | 通用 
`key=value,key2=value2` header | +| `OTEL_EXPORTER_OTLP_AUTHORIZATION` | `Authorization` header,常用于 Phoenix bearer auth 和 Langfuse Basic Auth | +| `OTEL_EXPORTER_OTLP_X_API_KEY` | `x-api-key` header,用于兼容需要该 header 的平台 | +| `OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION` | Langfuse 摄取版本,例如 `4` | +| `OTEL_EXPORTER_OTLP_METRICS_ENABLED` | 是否导出 metric | + +### 配置文件 + +```yaml +monitoring: + enable_telemetry: true + service_name: nexent-backend + project_name: nexent-production + exporter: + provider: langfuse + protocol: http + endpoint: https://cloud.langfuse.com/api/public/otel + headers: + Authorization: Basic BASE64_ENCODED_KEY + x-langfuse-ingestion-version: "4" + export_traces: true + export_metrics: false + use_platform_sdk: false +``` + +环境变量中显式设置的非默认值会覆盖配置文件,便于同一镜像在不同环境接入不同平台。 + +## Endpoint 规则 + +HTTP exporter 支持两种输入: + +- base endpoint:`https://cloud.langfuse.com/api/public/otel` +- signal endpoint:`https://cloud.langfuse.com/api/public/otel/v1/traces` + +SDK 会按 signal 派生最终地址: + +| 输入 | Trace endpoint | Metric endpoint | +|------|----------------|-----------------| +| `https://host/api/public/otel` | `https://host/api/public/otel/v1/traces` | `https://host/api/public/otel/v1/metrics` | +| `https://host/api/public/otel/v1/traces` | 原值 | `https://host/api/public/otel/v1/metrics` | +| `https://host/api/public/otel/v1/metrics` | `https://host/api/public/otel/v1/traces` | 原值 | + +## 平台接入 + +### 纯 OTLP / 自建 Collector + +```bash +MONITORING_PROVIDER=otlp +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 +OTEL_EXPORTER_OTLP_PROTOCOL=http +``` + +### Phoenix + +Phoenix 支持通过 OTLP HTTP 接收 traces,也提供 `phoenix.otel` SDK 包装 OpenTelemetry。 + +```bash +MONITORING_PROVIDER=phoenix +OTEL_EXPORTER_OTLP_ENDPOINT=https://app.phoenix.arize.com/s/YOUR_SPACE +OTEL_EXPORTER_OTLP_AUTHORIZATION="Bearer YOUR_PHOENIX_API_KEY" +OTEL_EXPORTER_OTLP_METRICS_ENABLED=false +MONITORING_PROJECT_NAME=nexent-production +``` + +可选启用平台 SDK。启用后如果 `phoenix.otel.register` 成功返回 tracer provider,Nexent 会复用该 provider,避免重复注册全局 OpenTelemetry tracer provider: + +```bash +MONITORING_USE_PLATFORM_SDK=true +``` + +### Langfuse + +Langfuse 的 OTLP HTTP base endpoint 是 `/api/public/otel`,使用 Basic Auth。实时摄取建议带 `x-langfuse-ingestion-version=4`。 + +```bash +MONITORING_PROVIDER=langfuse +OTEL_EXPORTER_OTLP_ENDPOINT=https://cloud.langfuse.com/api/public/otel +OTEL_EXPORTER_OTLP_AUTHORIZATION="Basic BASE64_PUBLIC_SECRET" +OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION=4 +OTEL_EXPORTER_OTLP_METRICS_ENABLED=false +``` + +## 埋点信息 + +| 埋点 | 位置 | 内容 | 目的 | +|------|------|------|------| +| FastAPI 自动 span | `backend/apps/app_factory.py` | route、method、status、duration | API 入口耗时和错误定位 | +| requests 自动 span | `MonitoringManager` 初始化 | 外部 HTTP 调用 | 观测模型服务、工具服务、MCP 等依赖 | +| `agent.run` | `backend/apps/agent_app.py` | `/agent/run` 请求 | Agent 运行入口追踪 | +| `agent_service.run_agent_stream` | `backend/services/agent_service.py` | `agent_id`、`conversation_id`、debug、文件数、记忆开关、策略、准备耗时 | 分析 SSE 创建前的准备阶段 | +| `user_resolution.*` | `run_agent_stream` | 用户、租户、语言和耗时 | 鉴权与租户解析定位 | +| `user_message_save.*` | `run_agent_stream` | 保存或跳过原因、耗时 | 判断会话写入是否正常 | +| `memory_context_build.*` | `run_agent_stream` | 记忆开关、共享策略、耗时 | 定位记忆上下文瓶颈 | +| `streaming_strategy.*` | `run_agent_stream` | `with_memory` 或 `no_memory` | 判断实际执行分支 | +| `generate_stream_no_memory.*` | `generate_stream_no_memory` | 准备与流式输出事件 | 追踪无记忆流式执行 | +| `agent_run` | `sdk/nexent/core/agents/run_agent.py` | 线程启动、缓存读取、消息 yield | 追踪 Agent 流式输出 | +| `agent_run_thread` | `run_agent.py` | Agent 创建、MCP 工具装载、执行错误 | 
追踪实际 Agent 执行线程 | +| `chat_completion` | `openai_llm.py` | 模型、温度、top_p、消息数、token、TTFT、chunk 数、输出长度 | LLM 性能、成本和异常分析 | +| `trace_agent_step` | SDK 公共 API | `agent.name`、`agent.step.name`、`agent.step.type` | 供后续推理步骤、工具选择等细粒度埋点扩展 | +| `trace_tool_call` | SDK 公共 API | 工具名、输入、输出、耗时、错误 | 工具可用性和延迟分析 | + +### 事件清单 + +| Span / 位置 | Event | 主要属性 | 目的 | +|-------------|-------|----------|------| +| `monitor_endpoint` 通用装饰器 | `.started` / `.completed` / `.error` | `param.*`、`duration`、`error.*` | 统一记录接口和服务函数的开始、结束、异常 | +| `agent_service.run_agent_stream` | `user_resolution.started` / `user_resolution.completed` | `duration`、`user_id`、`tenant_id`、`language` | 定位用户、租户、语言解析耗时和结果 | +| `agent_service.run_agent_stream` | `user_message_save.started` / `user_message_save.completed` / `user_message_save.skipped` | `duration`、`reason` | 判断用户消息是否写入,以及跳过原因 | +| `agent_service.run_agent_stream` | `memory_context_build.started` / `memory_context_build.completed` | `duration`、`memory_enabled`、`agent_share_option`、`debug_mode` | 观测记忆上下文构建耗时和开关状态 | +| `agent_service.run_agent_stream` | `streaming_strategy.selected` / `streaming_strategy.completed` | `strategy`、`selected_strategy`、`duration` | 识别实际流式分支与选择耗时 | +| `agent_service.run_agent_stream` | `stream_generator.memory_stream.creating` / `stream_generator.no_memory_stream.creating` | 无 | 标记 generator 创建分支 | +| `agent_service.run_agent_stream` | `streaming_response.creating` / `streaming_response.created` / `run_agent_stream.preparation_completed` | `duration`、`media_type`、`total_preparation_time` | 观测 SSE 响应创建和整体准备耗时 | +| `generate_stream_no_memory` | `generate_stream_no_memory.started` / `generate_stream_no_memory.completed` / `generate_stream_no_memory.streaming.started` / `generate_stream_no_memory.streaming.completed` | 无 | 观测无记忆路径的准备和流式消费边界 | +| `agent_run` | `agent_run.started` / `agent_run.thread_started` / `agent_run.get_cached_message` / `agent_run.get_cached_message_completed` / `agent_run.yield_message` | 无 | 观测 Agent 线程启动、缓存轮询和消息 yield | +| `monitor_llm_call` | `llm_call_started` / `llm_call_completed` / `llm_call_error` | `error.*` | 统一记录 LLM 调用生命周期 | +| `openai_chat.chat_completion` | `completion_started` / `completion_finished` / `model_stopped` / `error_occurred` | `model_id`、`temperature`、`top_p`、`message_count`、`total_duration`、`output_length`、`chunk_count`、`error.*` | 分析模型参数、流式输出耗时、停止和异常 | +| `trace_tool_call` | span 属性 `agent.tool.input` / `agent.tool.output` | JSON 字符串、`agent.tool.duration_ms`、`error.*` | 分析工具输入输出、耗时和异常 | + +## 指标 + +| 指标 | 类型 | 维度 | 用途 | +|------|------|------|------| +| `llm.request.duration` | histogram | model、operation | LLM 请求延迟 | +| `llm.token.generation_rate` | histogram | model | token/s | +| `llm.time_to_first_token` | histogram | model | 首 token 延迟 | +| `llm.token_count.prompt` | counter | model | 输入 token 成本 | +| `llm.token_count.completion` | counter | model | 输出 token 成本 | +| `llm.error.count` | counter | model、operation | LLM 错误率 | +| `agent.step.count` | counter | agent、step type、tool | Agent 步骤和工具调用量 | +| `agent.execution.duration` | histogram | agent、status | Agent 总耗时 | +| `agent.error.count` | counter | agent、error type | Agent 异常统计 | + +## Agent 运行数据流 + +```mermaid +flowchart TD + U[用户] --> FE[前端 Chat] + FE --> API[POST /agent/run] + API --> S1[agent.run span] + S1 --> S2[agent_service.run_agent_stream span] + S2 --> A[user_resolution] + S2 --> B[user_message_save] + S2 --> C[memory_context_build] + C --> D{streaming_strategy} + D -->|with_memory| E[generate_stream_with_memory] + D -->|no_memory| 
F[generate_stream_no_memory span] + E --> G[StreamingResponse] + F --> G + G --> H[agent_run async generator span] + H --> I[agent_run_thread span] + I --> J[NexentAgent] + J --> K[Tool / MCP / HTTP spans] + J --> L[chat_completion span] + L --> M[token events and LLM metrics] + K --> OTel[OpenTelemetry Tracer/Meter Provider] + M --> OTel + OTel --> Collector[OTLP Collector] + Collector --> Phoenix[Phoenix] + Collector --> Langfuse[Langfuse] + Collector --> Other[Jaeger / Custom Backend] +``` + +## 监控页面结构 + +```mermaid +flowchart TB + Page[Agent 监控页] --> Filters[筛选区: 时间 / 租户 / 用户 / Agent / 会话 / 模型 / 状态] + Page --> KPIs[指标区: 成功率 / P95 / TTFT / tokens/s / token 成本 / 错误数] + Page --> List[Trace 列表] + Page --> Detail[Trace 详情] + Detail --> Waterfall[Span 瀑布图] + Detail --> Timeline[Agent 时间线] + Detail --> LLM[LLM 调用面板] + Detail --> Tool[工具调用面板] + Detail --> Raw[原始 OTel 属性] + Detail --> Eval[反馈和评估] +``` + +与 Phoenix 和 Langfuse 对比: + +| 方案 | 优点 | 不足 | +|------|------|------| +| Phoenix | OpenInference 生态匹配好,适合 trace debug、实验、评估;`phoenix.otel` 可降低接入成本 | Nexent 的租户、权限、Agent 配置需要额外映射 | +| Langfuse | Trace、session、user、prompt、evaluation、dashboard 能力完整,OTLP endpoint 和 SDK 都基于 OpenTelemetry | 需要补充 `langfuse.*` 属性才能获得更好的筛选聚合体验 | +| Nexent 自建页 | 可直接关联租户、会话、Agent 配置和权限,适合产品内闭环 | 需要自建 trace 存储、查询、聚合和瀑布图 | + +推荐路径: + +1. 短期使用 OTLP 对接 Phoenix/Langfuse,先满足调试和分析。 +2. 中期在 Nexent 增加 trace 跳转、轻量指标概览。 +3. 长期按租户、会话、Agent 版本建立自有监控页,同时保留 OTLP 双写能力。 + +## 已修复的设计风险 + +| 风险 | 修复 | +|------|------| +| async generator span 提前结束 | `monitor_endpoint` 使用 `inspect.isasyncgenfunction`,在 `async for` 消费期间保持 span 打开 | +| `/v1/traces` 路径重复拼接 | SDK 支持 base endpoint 和 signal endpoint 自动归一化 | +| Collector header 无法兼容平台 | Collector 默认只 logging;平台转发示例改用 `otlphttp/` exporter,并拆分 `Authorization`、`x-api-key`、`x-langfuse-ingestion-version` | +| 单测漏掉流式函数 | 增加 async generator 装饰器测试 | + +## 参考 + +- Phoenix Setup Tracing: https://arize.com/docs/phoenix/tracing/how-to-tracing/setup-tracing +- Phoenix Setup OTEL: https://arize.com/docs/phoenix/tracing/how-to-tracing/setup-tracing/setup-using-phoenix-otel +- Phoenix Authentication: https://arize.com/docs/phoenix/deployment/authentication +- Langfuse OpenTelemetry: https://langfuse.com/integrations/native/opentelemetry +- Langfuse Overview: https://langfuse.com/docs diff --git a/docker/.env.example b/docker/.env.example index 9bc680feb..9c9c7c3b5 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -156,22 +156,37 @@ SKILLS_PATH=/mnt/nexent/skills # Enable OpenTelemetry monitoring for agent observability ENABLE_TELEMETRY=false +# Optional JSON/YAML config file. Environment variables override file values when set. +MONITORING_CONFIG_FILE= + +# Provider profile: otlp, phoenix, langfuse, jaeger, custom +MONITORING_PROVIDER=otlp +MONITORING_USE_PLATFORM_SDK=false +MONITORING_PROJECT_NAME=nexent + # Service name for identifying traces in observability platforms OTEL_SERVICE_NAME=nexent-backend -# OTLP endpoint - can be: +# OTLP base endpoint - can be: # - http://otel-collector:4318 (through OpenTelemetry Collector) -# - Direct connection to Arize Phoenix: https://phoenix.arize.com/v1/traces -# - Direct connection to Langfuse: https://cloud.langfuse.com/api/public/otel/v1/traces +# - Direct connection to Langfuse: https://cloud.langfuse.com/api/public/otel +# - Direct connection to your OTLP-compatible collector or gateway OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 +# Optional signal-specific endpoints. Leave empty unless the backend requires them. 
+OTEL_EXPORTER_OTLP_TRACES_ENDPOINT= +OTEL_EXPORTER_OTLP_METRICS_ENDPOINT= + # Protocol: "http" or "grpc" OTEL_EXPORTER_OTLP_PROTOCOL=http # Authentication headers (format: key1=value1,key2=value2) -# For Arize Phoenix: x-api-key=YOUR_API_KEY -# For Langfuse: Authorization=Basic base64(public_key:secret_key) +# Prefer platform-specific variables when using the Collector. OTEL_EXPORTER_OTLP_HEADERS= +OTEL_EXPORTER_OTLP_AUTHORIZATION= +OTEL_EXPORTER_OTLP_X_API_KEY= +OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION= +OTEL_EXPORTER_OTLP_METRICS_ENABLED=true TELEMETRY_SAMPLE_RATE=1.0 LLM_SLOW_REQUEST_THRESHOLD_SECONDS=5.0 diff --git a/docker/monitoring/monitoring.env b/docker/monitoring/monitoring.env index ec88b61f2..b15c8f097 100644 --- a/docker/monitoring/monitoring.env +++ b/docker/monitoring/monitoring.env @@ -1,12 +1,24 @@ ENABLE_TELEMETRY=true OTEL_SERVICE_NAME=nexent-backend +MONITORING_PROVIDER=otlp +MONITORING_CONFIG_FILE= +MONITORING_USE_PLATFORM_SDK=false +MONITORING_PROJECT_NAME=nexent + +# Use a base OTLP HTTP endpoint. SDK code derives /v1/traces and /v1/metrics. OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 +OTEL_EXPORTER_OTLP_TRACES_ENDPOINT= +OTEL_EXPORTER_OTLP_METRICS_ENDPOINT= OTEL_EXPORTER_OTLP_PROTOCOL=http OTEL_EXPORTER_OTLP_HEADERS= +OTEL_EXPORTER_OTLP_AUTHORIZATION= +OTEL_EXPORTER_OTLP_X_API_KEY= +OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION= +OTEL_EXPORTER_OTLP_METRICS_ENABLED=true TELEMETRY_SAMPLE_RATE=1.0 LLM_SLOW_REQUEST_THRESHOLD_SECONDS=5.0 LLM_SLOW_TOKEN_RATE_THRESHOLD=10.0 OTEL_COLLECTOR_GRPC_PORT=4317 -OTEL_COLLECTOR_HTTP_PORT=4318 \ No newline at end of file +OTEL_COLLECTOR_HTTP_PORT=4318 diff --git a/docker/monitoring/monitoring.env.example b/docker/monitoring/monitoring.env.example index ec88b61f2..b15c8f097 100644 --- a/docker/monitoring/monitoring.env.example +++ b/docker/monitoring/monitoring.env.example @@ -1,12 +1,24 @@ ENABLE_TELEMETRY=true OTEL_SERVICE_NAME=nexent-backend +MONITORING_PROVIDER=otlp +MONITORING_CONFIG_FILE= +MONITORING_USE_PLATFORM_SDK=false +MONITORING_PROJECT_NAME=nexent + +# Use a base OTLP HTTP endpoint. SDK code derives /v1/traces and /v1/metrics. 
OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 +OTEL_EXPORTER_OTLP_TRACES_ENDPOINT= +OTEL_EXPORTER_OTLP_METRICS_ENDPOINT= OTEL_EXPORTER_OTLP_PROTOCOL=http OTEL_EXPORTER_OTLP_HEADERS= +OTEL_EXPORTER_OTLP_AUTHORIZATION= +OTEL_EXPORTER_OTLP_X_API_KEY= +OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION= +OTEL_EXPORTER_OTLP_METRICS_ENABLED=true TELEMETRY_SAMPLE_RATE=1.0 LLM_SLOW_REQUEST_THRESHOLD_SECONDS=5.0 LLM_SLOW_TOKEN_RATE_THRESHOLD=10.0 OTEL_COLLECTOR_GRPC_PORT=4317 -OTEL_COLLECTOR_HTTP_PORT=4318 \ No newline at end of file +OTEL_COLLECTOR_HTTP_PORT=4318 diff --git a/docker/monitoring/otel-collector-config.yml b/docker/monitoring/otel-collector-config.yml index c8d16f98e..58f278848 100644 --- a/docker/monitoring/otel-collector-config.yml +++ b/docker/monitoring/otel-collector-config.yml @@ -10,7 +10,7 @@ processors: batch: timeout: 1s send_batch_size: 512 - + memory_limiter: limit_mib: 256 check_interval: 1s @@ -25,11 +25,6 @@ processors: action: insert exporters: - otlp: - endpoint: ${OTEL_EXPORTER_OTLP_ENDPOINT:-http://localhost:4318} - headers: - authorization: ${OTEL_EXPORTER_OTLP_HEADERS:-} - logging: verbosity: normal @@ -38,13 +33,13 @@ service: traces: receivers: [otlp] processors: [memory_limiter, resource, batch] - exporters: [otlp, logging] - + exporters: [logging] + metrics: receivers: [otlp] processors: [memory_limiter, resource, batch] - exporters: [otlp, logging] - + exporters: [logging] + telemetry: logs: level: "info" @@ -53,27 +48,32 @@ service: # # === Arize Phoenix === # Set environment variables: -# OTEL_EXPORTER_OTLP_ENDPOINT=https://phoenix.arize.com/v1 -# OTEL_EXPORTER_OTLP_HEADERS=x-api-key=YOUR_PHOENIX_API_KEY +# OTEL_EXPORTER_OTLP_ENDPOINT=https://app.phoenix.arize.com/s/YOUR_SPACE +# OTEL_EXPORTER_OTLP_AUTHORIZATION=Bearer YOUR_PHOENIX_API_KEY +# OTEL_EXPORTER_OTLP_METRICS_ENABLED=false # # Or configure directly in exporters section: -# otlp/arize: -# endpoint: https://phoenix.arize.com/v1 +# otlphttp/arize: +# endpoint: https://app.phoenix.arize.com/s/YOUR_SPACE # headers: -# x-api-key: YOUR_PHOENIX_API_KEY +# Authorization: Bearer YOUR_PHOENIX_API_KEY +# Then add otlphttp/arize to the traces pipeline exporters. # # === Langfuse === # Set environment variables: # OTEL_EXPORTER_OTLP_ENDPOINT=https://cloud.langfuse.com/api/public/otel -# OTEL_EXPORTER_OTLP_HEADERS=Authorization=Basic BASE64_ENCODED_KEY +# OTEL_EXPORTER_OTLP_AUTHORIZATION=Basic BASE64_ENCODED_KEY +# OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION=4 # # Where BASE64_ENCODED_KEY = base64(public_key:secret_key) # # Or configure directly: -# otlp/langfuse: -# endpoint: https://cloud.langfuse.com/api/public/otel/v1 +# otlphttp/langfuse: +# endpoint: https://cloud.langfuse.com/api/public/otel # headers: # Authorization: Basic BASE64_ENCODED_KEY +# x-langfuse-ingestion-version: "4" +# Then add otlphttp/langfuse to the traces pipeline exporters. 
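+#
+# Minimal pipeline sketch (assumption: it reuses the otlphttp/langfuse exporter
+# name from the example above; substitute whichever exporter keys you define):
+# service:
+#   pipelines:
+#     traces:
+#       receivers: [otlp]
+#       processors: [memory_limiter, resource, batch]
+#       exporters: [otlphttp/langfuse, logging]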
# # === Local Jaeger (OTLP) === # For gradual migration, you can still use Jaeger via OTLP: @@ -85,4 +85,4 @@ service: # === Multiple Exporters === # To export to multiple backends simultaneously, create multiple exporters # and add them to the pipelines: -# exporters: [otlp/arize, otlp/langfuse, logging] \ No newline at end of file +# exporters: [otlphttp/arize, otlphttp/langfuse, logging] diff --git a/docker/start-monitoring.sh b/docker/start-monitoring.sh index 8cd8561f0..fb8304816 100755 --- a/docker/start-monitoring.sh +++ b/docker/start-monitoring.sh @@ -1,7 +1,7 @@ #!/bin/bash # Nexent LLM Performance Monitoring Setup Script -# This script sets up OpenTelemetry + Jaeger + Prometheus + Grafana for monitoring +# This script starts the OpenTelemetry Collector used by Nexent monitoring. set -e @@ -47,7 +47,7 @@ check_service() { local name=$1 local url=$2 local port=$3 - + if curl -s --max-time 5 --connect-timeout 3 "$url" > /dev/null 2>&1; then echo "✅ $name is running at http://localhost:$port" return 0 @@ -57,22 +57,16 @@ check_service() { fi } -# Check Jaeger -check_service "Jaeger" "http://localhost:16686/api/services" "16686" || true - -# Check Prometheus -check_service "Prometheus" "http://localhost:9090/-/healthy" "9090" || true - -# Check Grafana -check_service "Grafana" "http://localhost:3005/api/health" "3005" || true +# Check OpenTelemetry Collector HTTP receiver +check_service "OpenTelemetry Collector HTTP receiver" "http://localhost:4318" "4318" || true echo "" echo "🎉 Monitoring setup complete!" echo "" echo "📊 Access your monitoring tools:" -echo " • Jaeger UI: http://localhost:16686" -echo " • Prometheus: http://localhost:9090" -echo " • Grafana: http://localhost:3005 (admin/admin)" +echo " • OTLP HTTP receiver: http://localhost:4318" +echo " • OTLP gRPC receiver: localhost:4317" +echo " • Configure Phoenix, Langfuse, Jaeger, or another OTLP backend in monitoring.env" echo "" echo "🔧 To enable monitoring in your Nexent backend:" echo " 1. 
Set ENABLE_TELEMETRY=true in your .env file" diff --git a/sdk/nexent/monitor/monitoring.py b/sdk/nexent/monitor/monitoring.py index b108e7c01..1dc809148 100644 --- a/sdk/nexent/monitor/monitoring.py +++ b/sdk/nexent/monitor/monitoring.py @@ -38,6 +38,8 @@ import time import functools import json +import inspect +from pathlib import Path from contextlib import contextmanager from typing import Any, Dict, Optional, Callable, TypeVar, cast, Iterator from dataclasses import dataclass, field @@ -46,6 +48,122 @@ F = TypeVar('F', bound=Callable[..., Any]) +DEFAULT_OTLP_ENDPOINT = "http://localhost:4318" +TRACE_PATH = "/v1/traces" +METRIC_PATH = "/v1/metrics" +SUPPORTED_PROVIDERS = {"otlp", "phoenix", "langfuse", "jaeger", "custom"} + + +def _as_bool(value: Any, default: bool = False) -> bool: + """Convert common configuration values to bool.""" + if value is None: + return default + if isinstance(value, bool): + return value + if isinstance(value, (int, float)): + return bool(value) + if isinstance(value, str): + return value.strip().lower() in {"1", "true", "yes", "y", "on"} + return default + + +def _as_float(value: Any, default: float) -> float: + """Convert common configuration values to float.""" + try: + return float(value) + except (TypeError, ValueError): + return default + + +def _compact_dict(data: Dict[str, Any]) -> Dict[str, Any]: + """Drop empty values from a configuration dictionary.""" + return {key: value for key, value in data.items() if value not in (None, "")} + + +def _load_mapping_file(path: str) -> Dict[str, Any]: + """Load a JSON or YAML mapping from disk.""" + if not path: + return {} + + config_path = Path(path) + if not config_path.exists(): + logger.warning(f"Monitoring config file does not exist: {path}") + return {} + + raw_text = config_path.read_text(encoding="utf-8") + if config_path.suffix.lower() == ".json": + loaded = json.loads(raw_text) + else: + try: + import yaml + except ImportError as exc: + raise RuntimeError( + "PyYAML is required to read YAML monitoring config files." + ) from exc + loaded = yaml.safe_load(raw_text) + + if not loaded: + return {} + if not isinstance(loaded, dict): + raise ValueError("Monitoring config file must contain a mapping object.") + return loaded + + +def _normalize_header_value(value: Any) -> str: + """Normalize header values from config files or environment variables.""" + if isinstance(value, (list, tuple)): + return ",".join(str(item) for item in value) + return str(value) + + +def _parse_headers(headers: Any) -> Dict[str, str]: + """Parse headers from a dict or a key=value comma-separated string.""" + if not headers: + return {} + if isinstance(headers, dict): + return { + str(key).strip(): _normalize_header_value(value).strip() + for key, value in headers.items() + if str(key).strip() and value not in (None, "") + } + if isinstance(headers, str): + parsed = {} + for pair in headers.split(","): + if "=" not in pair: + continue + key, value = pair.split("=", 1) + key = key.strip() + if key: + parsed[key] = value.strip() + return parsed + return {} + + +def _merge_headers(*header_sets: Any) -> Dict[str, str]: + """Merge multiple header sources, with later sources taking precedence.""" + merged: Dict[str, str] = {} + for headers in header_sets: + merged.update(_parse_headers(headers)) + return merged + + +def _derive_http_signal_endpoint(endpoint: str, signal_path: str) -> str: + """ + Build a signal-specific OTLP HTTP endpoint from a base or signal endpoint. 
+ + This accepts both base endpoints like `/api/public/otel` and existing signal + endpoints like `/api/public/otel/v1/traces`, avoiding duplicated `/v1/*` + suffixes. + """ + endpoint = (endpoint or DEFAULT_OTLP_ENDPOINT).rstrip("/") + if endpoint.endswith(signal_path): + return endpoint + if endpoint.endswith(TRACE_PATH): + return endpoint[: -len(TRACE_PATH)] + signal_path + if endpoint.endswith(METRIC_PATH): + return endpoint[: -len(METRIC_PATH)] + signal_path + return endpoint + signal_path + def is_opentelemetry_available() -> bool: """Check if OpenTelemetry dependencies are available.""" @@ -56,35 +174,142 @@ def is_opentelemetry_available() -> bool: class MonitoringConfig: """ Configuration for monitoring system using OTLP protocol. - + Supports HTTP and gRPC protocols for exporting traces and metrics to any OpenTelemetry-compatible backend (Arize Phoenix, Langfuse, etc). """ enable_telemetry: bool = False service_name: str = "nexent-backend" - otlp_endpoint: str = "http://localhost:4318" + provider: str = "otlp" + otlp_endpoint: str = DEFAULT_OTLP_ENDPOINT + otlp_traces_endpoint: Optional[str] = None + otlp_metrics_endpoint: Optional[str] = None otlp_protocol: str = "http" # "http" or "grpc" otlp_headers: Dict[str, str] = field(default_factory=dict) + export_traces: bool = True + export_metrics: bool = True + use_platform_sdk: bool = False + project_name: Optional[str] = None telemetry_sample_rate: float = 1.0 llm_slow_request_threshold_seconds: float = 5.0 llm_slow_token_rate_threshold: float = 10.0 - + + @classmethod + def from_file(cls, config_file: str, overrides: Optional[Dict[str, Any]] = None) -> "MonitoringConfig": + """ + Build monitoring config from JSON/YAML and optional overrides. + + Supported shape: + + monitoring: + enable_telemetry: true + service_name: nexent-backend + exporter: + provider: langfuse + protocol: http + endpoint: https://cloud.langfuse.com/api/public/otel + traces_endpoint: https://cloud.langfuse.com/api/public/otel/v1/traces + headers: + Authorization: Basic xxx + export_metrics: false + """ + loaded = _load_mapping_file(config_file) + data = loaded.get("monitoring", loaded) + if not isinstance(data, dict): + raise ValueError("The monitoring config section must be a mapping object.") + + exporter = data.get("exporter", {}) + if exporter is None: + exporter = {} + if not isinstance(exporter, dict): + raise ValueError("The monitoring exporter section must be a mapping object.") + + headers = _merge_headers(data.get("otlp_headers"), data.get("headers"), exporter.get("headers")) + config_data = _compact_dict({ + "enable_telemetry": data.get("enable_telemetry"), + "service_name": data.get("service_name"), + "provider": exporter.get("provider", data.get("provider")), + "otlp_endpoint": exporter.get("endpoint", data.get("otlp_endpoint")), + "otlp_traces_endpoint": exporter.get("traces_endpoint", data.get("otlp_traces_endpoint")), + "otlp_metrics_endpoint": exporter.get("metrics_endpoint", data.get("otlp_metrics_endpoint")), + "otlp_protocol": exporter.get("protocol", data.get("otlp_protocol")), + "otlp_headers": headers, + "export_traces": exporter.get("export_traces", data.get("export_traces")), + "export_metrics": exporter.get("export_metrics", data.get("export_metrics")), + "use_platform_sdk": exporter.get("use_platform_sdk", data.get("use_platform_sdk")), + "project_name": exporter.get("project_name", data.get("project_name")), + "telemetry_sample_rate": data.get("telemetry_sample_rate"), + "llm_slow_request_threshold_seconds": 
data.get("llm_slow_request_threshold_seconds"), + "llm_slow_token_rate_threshold": data.get("llm_slow_token_rate_threshold"), + }) + + if overrides: + merged_headers = _merge_headers(config_data.get("otlp_headers"), overrides.get("otlp_headers")) + config_data.update(_compact_dict(overrides)) + if merged_headers: + config_data["otlp_headers"] = merged_headers + + return cls(**config_data) + def __post_init__(self): """Validate configuration and adjust based on OpenTelemetry availability.""" + self.provider = (self.provider or "otlp").strip().lower() + if self.provider not in SUPPORTED_PROVIDERS: + logger.warning( + f"Unknown monitoring provider '{self.provider}'. Using 'custom'." + ) + self.provider = "custom" + + self.enable_telemetry = _as_bool(self.enable_telemetry) + self.export_traces = _as_bool(self.export_traces, True) + self.export_metrics = _as_bool(self.export_metrics, True) + self.use_platform_sdk = _as_bool(self.use_platform_sdk) + self.telemetry_sample_rate = _as_float(self.telemetry_sample_rate, 1.0) + self.llm_slow_request_threshold_seconds = _as_float( + self.llm_slow_request_threshold_seconds, 5.0) + self.llm_slow_token_rate_threshold = _as_float( + self.llm_slow_token_rate_threshold, 10.0) + self.otlp_headers = _parse_headers(self.otlp_headers) + if self.enable_telemetry and not OPENTELEMETRY_AVAILABLE: logger.warning( "OpenTelemetry dependencies not available. Disabling telemetry. " "Install with: pip install nexent[performance]" ) self.enable_telemetry = False - + # Validate protocol + self.otlp_protocol = (self.otlp_protocol or "http").strip().lower() if self.otlp_protocol not in ("http", "grpc"): logger.warning( f"Invalid OTLP protocol '{self.otlp_protocol}'. Using 'http'." ) self.otlp_protocol = "http" + if self.provider in {"phoenix", "langfuse"} and self.otlp_protocol == "grpc": + logger.warning( + f"{self.provider} OTLP integration only supports HTTP in this configuration. Using 'http'." 
+ ) + self.otlp_protocol = "http" + + def get_trace_endpoint(self) -> str: + """Return the resolved trace exporter endpoint.""" + if self.otlp_protocol == "grpc": + return self.otlp_traces_endpoint or self.otlp_endpoint + return _derive_http_signal_endpoint( + self.otlp_traces_endpoint or self.otlp_endpoint, + TRACE_PATH, + ) + + def get_metric_endpoint(self) -> str: + """Return the resolved metric exporter endpoint.""" + if self.otlp_protocol == "grpc": + return self.otlp_metrics_endpoint or self.otlp_endpoint + return _derive_http_signal_endpoint( + self.otlp_metrics_endpoint or self.otlp_endpoint, + METRIC_PATH, + ) + class MonitoringManager: """Singleton monitoring manager for the entire SDK.""" @@ -128,7 +353,8 @@ def configure(self, config: MonitoringConfig) -> None: self._config = config logger.info( f"Monitoring configured: enabled={config.enable_telemetry}, " - f"service={config.service_name}, protocol={config.otlp_protocol}" + f"service={config.service_name}, provider={config.provider}, " + f"protocol={config.otlp_protocol}" ) if config.enable_telemetry: @@ -152,64 +378,68 @@ def _init_telemetry_otlp(self) -> None: resource = Resource.create({ "service.name": self._config.service_name, "service.version": "1.0.0", - "service.instance.id": "nexent-instance-1" + "service.instance.id": "nexent-instance-1", + "telemetry.provider": self._config.provider, }) - # Initialize TracerProvider with OTLP exporter - self._tracer_provider = TracerProvider(resource=resource) - trace.set_tracer_provider(self._tracer_provider) - - # Choose exporter based on protocol - if self._config.otlp_protocol == "grpc": - span_exporter = OTLPSpanExporterGRPC( - endpoint=self._config.otlp_endpoint, - headers=self._config.otlp_headers - ) - else: - # HTTP protocol (default) - # For HTTP, append /v1/traces to endpoint if not already present - trace_endpoint = self._config.otlp_endpoint - if not trace_endpoint.endswith("/v1/traces"): - trace_endpoint = trace_endpoint.rstrip("/") + "/v1/traces" - span_exporter = OTLPSpanExporterHTTP( - endpoint=trace_endpoint, - headers=self._config.otlp_headers - ) + platform_tracer_provider = None + if self._config.use_platform_sdk and self._config.export_traces: + platform_tracer_provider = self._initialize_platform_sdk() - # BatchSpanProcessor for efficient export - span_processor = BatchSpanProcessor( - span_exporter, - max_queue_size=512, - schedule_delay_millis=1000, # 1 second - max_export_batch_size=512 - ) - self._tracer_provider.add_span_processor(span_processor) + # Initialize TracerProvider with OTLP exporter + self._tracer_provider = platform_tracer_provider or TracerProvider(resource=resource) + if platform_tracer_provider is None: + trace.set_tracer_provider(self._tracer_provider) + + if self._config.export_traces and platform_tracer_provider is None: + # Choose exporter based on protocol + if self._config.otlp_protocol == "grpc": + span_exporter = OTLPSpanExporterGRPC( + endpoint=self._config.get_trace_endpoint(), + headers=self._config.otlp_headers + ) + else: + span_exporter = OTLPSpanExporterHTTP( + endpoint=self._config.get_trace_endpoint(), + headers=self._config.otlp_headers + ) - # Initialize MeterProvider with OTLP exporter - if self._config.otlp_protocol == "grpc": - metric_exporter = OTLPMetricExporterGRPC( - endpoint=self._config.otlp_endpoint, - headers=self._config.otlp_headers + # BatchSpanProcessor for efficient export + span_processor = BatchSpanProcessor( + span_exporter, + max_queue_size=512, + schedule_delay_millis=1000, # 1 second + 
max_export_batch_size=512 ) - else: - # HTTP protocol - metric_endpoint = self._config.otlp_endpoint - if not metric_endpoint.endswith("/v1/metrics"): - metric_endpoint = metric_endpoint.rstrip("/") + "/v1/metrics" - metric_exporter = OTLPMetricExporterHTTP( - endpoint=metric_endpoint, - headers=self._config.otlp_headers + self._tracer_provider.add_span_processor(span_processor) + elif self._config.export_traces: + logger.info( + "Using platform SDK tracer provider; skipping explicit OTLP span exporter" ) - # PeriodicExportingMetricReader for batch export - metric_reader = PeriodicExportingMetricReader( - exporter=metric_exporter, - export_interval_millis=60000 # 60 seconds - ) + metric_readers = [] + if self._config.export_metrics: + # Initialize MeterProvider with OTLP exporter + if self._config.otlp_protocol == "grpc": + metric_exporter = OTLPMetricExporterGRPC( + endpoint=self._config.get_metric_endpoint(), + headers=self._config.otlp_headers + ) + else: + metric_exporter = OTLPMetricExporterHTTP( + endpoint=self._config.get_metric_endpoint(), + headers=self._config.otlp_headers + ) + + # PeriodicExportingMetricReader for batch export + metric_readers.append(PeriodicExportingMetricReader( + exporter=metric_exporter, + export_interval_millis=60000 # 60 seconds + )) self._meter_provider = MeterProvider( resource=resource, - metric_readers=[metric_reader] + metric_readers=metric_readers ) metrics.set_meter_provider(self._meter_provider) @@ -278,18 +508,52 @@ def _init_telemetry_otlp(self) -> None: logger.info( f"OTLP telemetry initialized successfully for service: {self._config.service_name}, " - f"endpoint: {self._config.otlp_endpoint}, protocol: {self._config.otlp_protocol}" + f"provider: {self._config.provider}, trace_endpoint: {self._config.get_trace_endpoint()}, " + f"metric_endpoint: {self._config.get_metric_endpoint()}, protocol: {self._config.otlp_protocol}" ) except Exception as e: logger.error(f"Failed to initialize OTLP telemetry: {str(e)}") # Do not raise - allow application to continue without monitoring + def _initialize_platform_sdk(self) -> Optional[Any]: + """Optionally initialize provider SDKs that wrap OpenTelemetry.""" + if not self._config: + return None + + if self._config.provider == "phoenix": + try: + from phoenix.otel import register + + kwargs = { + "project_name": self._config.project_name or self._config.service_name, + "endpoint": self._config.otlp_endpoint, + "protocol": "http/protobuf" if self._config.otlp_protocol == "http" else "grpc", + "headers": self._config.otlp_headers, + "auto_instrument": False, + } + tracer_provider = register(**kwargs) + logger.info("Phoenix SDK initialized for OpenTelemetry tracing") + return tracer_provider + except Exception as exc: + logger.warning(f"Phoenix SDK initialization skipped: {exc}") + elif self._config.provider == "langfuse": + try: + from langfuse import get_client + + client = get_client() + if hasattr(client, "auth_check"): + client.auth_check() + logger.info("Langfuse SDK client initialized") + except Exception as exc: + logger.warning(f"Langfuse SDK initialization skipped: {exc}") + return None + @property def is_enabled(self) -> bool: """Check if monitoring is enabled.""" - return (self._config is not None and - self._config.enable_telemetry and + return (self._config is not None and + self._config.enable_telemetry and OPENTELEMETRY_AVAILABLE) @property @@ -356,11 +620,17 @@ def trace_llm_request(self, operation_name: str, model_name: str, **attributes: ) @contextmanager - def trace_agent_step(self, 
step_name: str, agent_name: str, step_type: str, **attributes: Any) -> Iterator[Optional[Any]]: + def trace_agent_step( + self, + step_name: str, + agent_name: str, + step_type: str, + **attributes: Any + ) -> Iterator[Optional[Any]]: """ Context manager for tracing Agent execution steps. Uses OpenInference semantic conventions for attribute naming. - + Args: step_name: Name of the step (e.g., "web_search", "reasoning_step_1") agent_name: Name of the agent @@ -380,7 +650,7 @@ def trace_agent_step(self, step_name: str, agent_name: str, step_type: str, **at openinference_attrs.update(attributes) span_name = f"agent.{step_name}" - + with self._tracer.start_as_current_span( span_name, attributes=openinference_attrs @@ -403,11 +673,17 @@ def trace_agent_step(self, step_name: str, agent_name: str, step_type: str, **at ) @contextmanager - def trace_tool_call(self, tool_name: str, agent_name: str, tool_input: Optional[Dict] = None, **attributes: Any) -> Iterator[Optional[Any]]: + def trace_tool_call( + self, + tool_name: str, + agent_name: str, + tool_input: Optional[Dict] = None, + **attributes: Any + ) -> Iterator[Optional[Any]]: """ Context manager for tracing Agent tool calls. Uses OpenInference semantic conventions for attribute naming. - + Args: tool_name: Name of the tool being called agent_name: Name of the agent making the call @@ -425,18 +701,18 @@ def trace_tool_call(self, tool_name: str, agent_name: str, tool_input: Optional[ "agent.step.type": "tool_call", "agent.tool.name": tool_name, } - + # Add tool input as JSON string if tool_input: try: openinference_attrs["agent.tool.input"] = json.dumps(tool_input, ensure_ascii=False) except (TypeError, ValueError): openinference_attrs["agent.tool.input"] = str(tool_input) - + openinference_attrs.update(attributes) span_name = f"agent.tool.{tool_name}" - + with self._tracer.start_as_current_span( span_name, attributes=openinference_attrs @@ -466,7 +742,7 @@ def set_tool_output(self, output: Any) -> None: """ Set the output of a tool call on the current span. Call this within a trace_tool_call context manager. - + Args: output: Tool output (will be JSON serialized) """ @@ -541,7 +817,12 @@ def record_agent_metrics(self, metric_type: str, value: float, attributes: Dict[ if metric_type == "duration" and self._agent_execution_duration: self._agent_execution_duration.record(value, attributes) - def monitor_endpoint(self, operation_name: Optional[str] = None, include_params: bool = True, exclude_params: Optional[list] = None) -> Callable[[F], F]: + def monitor_endpoint( + self, + operation_name: Optional[str] = None, + include_params: bool = True, + exclude_params: Optional[list] = None + ) -> Callable[[F], F]: """ Decorator to add monitoring to any endpoint or service function. Monitoring is automatically enabled/disabled based on configuration. 
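# Illustrative usage sketch for the streaming-aware decorator below. It assumes the
# module-level get_monitoring_manager() accessor from this file; the operation name,
# function, and parameters (stream_answer, prompt, api_key) are hypothetical.
manager = get_monitoring_manager()

@manager.monitor_endpoint("chat.stream_answer", exclude_params=["api_key"])
async def stream_answer(prompt: str, api_key: str = ""):
    # With the async-generator wrapper, the span stays open until the caller has
    # consumed the whole stream instead of closing after the first yield.
    yield f"echo: {prompt}"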
@@ -550,73 +831,95 @@ def decorator(func: F) -> F: op_name = operation_name or f"{func.__module__}.{func.__name__}" exclude_set = set(exclude_params or []) + def prepare_span(span, kwargs: Dict[str, Any]) -> None: + if span and include_params: + safe_params = { + k: v for k, v in kwargs.items() + if k not in exclude_set and isinstance(v, (str, int, float, bool)) + } + if safe_params: + self.set_span_attributes(**{f"param.{k}": v for k, v in safe_params.items()}) + self.add_span_event(f"{op_name}.started") + + def complete_span(start_time: float) -> None: + duration = time.time() - start_time + self.add_span_event(f"{op_name}.completed", {"duration": duration}) + + def fail_span(start_time: float, error: Exception) -> None: + duration = time.time() - start_time + self.add_span_event(f"{op_name}.error", { + "error.type": type(error).__name__, + "error.message": str(error), + "duration": duration + }) + @functools.wraps(func) async def async_wrapper(*args, **kwargs): # Always execute monitoring logic - internal methods handle enabled state with self.trace_llm_request(op_name, "nexent-service") as span: - if span and include_params: - safe_params = { - k: v for k, v in kwargs.items() - if k not in exclude_set and isinstance(v, (str, int, float, bool)) - } - if safe_params: - self.set_span_attributes( - **{f"param.{k}": v for k, v in safe_params.items()}) - - self.add_span_event(f"{op_name}.started") + prepare_span(span, kwargs) start_time = time.time() try: result = await func(*args, **kwargs) - duration = time.time() - start_time - self.add_span_event( - f"{op_name}.completed", {"duration": duration}) + complete_span(start_time) return result except Exception as e: - duration = time.time() - start_time - self.add_span_event(f"{op_name}.error", { - "error.type": type(e).__name__, - "error.message": str(e), - "duration": duration - }) + fail_span(start_time, e) + raise + + @functools.wraps(func) + async def async_generator_wrapper(*args, **kwargs): + # Keep the span open while the streaming response is consumed. 
+ with self.trace_llm_request(op_name, "nexent-service") as span: + prepare_span(span, kwargs) + start_time = time.time() + + try: + async for item in func(*args, **kwargs): + yield item + complete_span(start_time) + except Exception as e: + fail_span(start_time, e) raise @functools.wraps(func) def sync_wrapper(*args, **kwargs): # Always execute monitoring logic - internal methods handle enabled state with self.trace_llm_request(op_name, "nexent-service") as span: - if span and include_params: - safe_params = { - k: v for k, v in kwargs.items() - if k not in exclude_set and isinstance(v, (str, int, float, bool)) - } - if safe_params: - self.set_span_attributes( - **{f"param.{k}": v for k, v in safe_params.items()}) - - self.add_span_event(f"{op_name}.started") + prepare_span(span, kwargs) start_time = time.time() try: result = func(*args, **kwargs) - duration = time.time() - start_time - self.add_span_event( - f"{op_name}.completed", {"duration": duration}) + complete_span(start_time) return result except Exception as e: - duration = time.time() - start_time - self.add_span_event(f"{op_name}.error", { - "error.type": type(e).__name__, - "error.message": str(e), - "duration": duration - }) + fail_span(start_time, e) + raise + + @functools.wraps(func) + def generator_wrapper(*args, **kwargs): + with self.trace_llm_request(op_name, "nexent-service") as span: + prepare_span(span, kwargs) + start_time = time.time() + + try: + for item in func(*args, **kwargs): + yield item + complete_span(start_time) + except Exception as e: + fail_span(start_time, e) raise # Return appropriate wrapper based on function type - if hasattr(func, '__code__') and func.__code__.co_flags & 0x80: + if inspect.isasyncgenfunction(func): + return cast(F, async_generator_wrapper) + if inspect.iscoroutinefunction(func): return cast(F, async_wrapper) - else: - return cast(F, sync_wrapper) + if inspect.isgeneratorfunction(func): + return cast(F, generator_wrapper) + return cast(F, sync_wrapper) return decorator @@ -666,7 +969,7 @@ def sync_wrapper(*args, **kwargs): }) raise - if hasattr(func, '__code__') and func.__code__.co_flags & 0x80: + if inspect.iscoroutinefunction(func): return cast(F, async_wrapper) else: return cast(F, sync_wrapper) @@ -677,7 +980,7 @@ def monitor_agent_execution(self, agent_name: str): """ Decorator to add monitoring to Agent execution. Tracks overall execution duration and error count. 
- + Args: agent_name: Name of the agent being monitored """ @@ -686,7 +989,7 @@ def decorator(func: F) -> F: async def async_wrapper(*args, **kwargs): start_time = time.time() status = "success" - + try: result = await func(*args, **kwargs) return result @@ -708,7 +1011,7 @@ async def async_wrapper(*args, **kwargs): def sync_wrapper(*args, **kwargs): start_time = time.time() status = "success" - + try: result = func(*args, **kwargs) return result @@ -726,7 +1029,7 @@ def sync_wrapper(*args, **kwargs): duration, {"agent.name": agent_name, "agent.status": status} ) - if hasattr(func, '__code__') and func.__code__.co_flags & 0x80: + if inspect.iscoroutinefunction(func): return cast(F, async_wrapper) else: return cast(F, sync_wrapper) @@ -849,4 +1152,4 @@ async def my_function(): 'LLMTokenTracker', 'get_monitoring_manager', 'is_opentelemetry_available', -] \ No newline at end of file +] diff --git a/test/sdk/monitor/conftest.py b/test/sdk/monitor/conftest.py index 565bfab83..1f4eb8cf6 100644 --- a/test/sdk/monitor/conftest.py +++ b/test/sdk/monitor/conftest.py @@ -8,6 +8,9 @@ """ import sys +import types +import importlib.util +from pathlib import Path from unittest.mock import MagicMock @@ -25,11 +28,17 @@ def pytest_configure(config): mock_opentelemetry.metrics = MagicMock() mock_opentelemetry.trace.status = MagicMock() mock_opentelemetry.exporter = MagicMock() - mock_opentelemetry.exporter.prometheus = MagicMock() - mock_opentelemetry.exporter.jaeger = MagicMock() - mock_opentelemetry.exporter.jaeger.thrift = MagicMock() + mock_opentelemetry.exporter.otlp = MagicMock() + mock_opentelemetry.exporter.otlp.proto = MagicMock() + mock_opentelemetry.exporter.otlp.proto.http = MagicMock() + mock_opentelemetry.exporter.otlp.proto.http.trace_exporter = MagicMock() + mock_opentelemetry.exporter.otlp.proto.http.metric_exporter = MagicMock() + mock_opentelemetry.exporter.otlp.proto.grpc = MagicMock() + mock_opentelemetry.exporter.otlp.proto.grpc.trace_exporter = MagicMock() + mock_opentelemetry.exporter.otlp.proto.grpc.metric_exporter = MagicMock() mock_opentelemetry.sdk = MagicMock() mock_opentelemetry.sdk.metrics = MagicMock() + mock_opentelemetry.sdk.metrics.export = MagicMock() mock_opentelemetry.sdk.trace = MagicMock() mock_opentelemetry.sdk.trace.export = MagicMock() mock_opentelemetry.sdk.resources = MagicMock() @@ -44,11 +53,25 @@ def pytest_configure(config): 'opentelemetry.metrics': mock_opentelemetry.metrics, 'opentelemetry.trace.status': mock_opentelemetry.trace.status, 'opentelemetry.exporter': mock_opentelemetry.exporter, - 'opentelemetry.exporter.prometheus': mock_opentelemetry.exporter.prometheus, - 'opentelemetry.exporter.jaeger': mock_opentelemetry.exporter.jaeger, - 'opentelemetry.exporter.jaeger.thrift': mock_opentelemetry.exporter.jaeger.thrift, + 'opentelemetry.exporter.otlp': mock_opentelemetry.exporter.otlp, + 'opentelemetry.exporter.otlp.proto': mock_opentelemetry.exporter.otlp.proto, + 'opentelemetry.exporter.otlp.proto.http': mock_opentelemetry.exporter.otlp.proto.http, + 'opentelemetry.exporter.otlp.proto.http.trace_exporter': ( + mock_opentelemetry.exporter.otlp.proto.http.trace_exporter + ), + 'opentelemetry.exporter.otlp.proto.http.metric_exporter': ( + mock_opentelemetry.exporter.otlp.proto.http.metric_exporter + ), + 'opentelemetry.exporter.otlp.proto.grpc': mock_opentelemetry.exporter.otlp.proto.grpc, + 'opentelemetry.exporter.otlp.proto.grpc.trace_exporter': ( + mock_opentelemetry.exporter.otlp.proto.grpc.trace_exporter + ), + 
'opentelemetry.exporter.otlp.proto.grpc.metric_exporter': ( + mock_opentelemetry.exporter.otlp.proto.grpc.metric_exporter + ), 'opentelemetry.sdk': mock_opentelemetry.sdk, 'opentelemetry.sdk.metrics': mock_opentelemetry.sdk.metrics, + 'opentelemetry.sdk.metrics.export': mock_opentelemetry.sdk.metrics.export, 'opentelemetry.sdk.trace': mock_opentelemetry.sdk.trace, 'opentelemetry.sdk.trace.export': mock_opentelemetry.sdk.trace.export, 'opentelemetry.sdk.resources': mock_opentelemetry.sdk.resources, @@ -64,6 +87,31 @@ def pytest_configure(config): original_modules[module_name] = sys.modules[module_name] sys.modules[module_name] = modules_to_mock[module_name] + # Load the monitoring module directly so these tests do not import the full SDK package. + # The package __init__ imports data-processing dependencies that are unrelated here. + package_modules = { + "sdk": types.ModuleType("sdk"), + "sdk.nexent": types.ModuleType("sdk.nexent"), + "sdk.nexent.monitor": types.ModuleType("sdk.nexent.monitor"), + } + for module_name, module in package_modules.items(): + if module_name in sys.modules: + original_modules[module_name] = sys.modules[module_name] + sys.modules[module_name] = module + + repo_root = Path(__file__).resolve().parents[3] + monitoring_path = repo_root / "sdk" / "nexent" / "monitor" / "monitoring.py" + spec = importlib.util.spec_from_file_location( + "sdk.nexent.monitor.monitoring", + monitoring_path + ) + monitoring_module = importlib.util.module_from_spec(spec) + if "sdk.nexent.monitor.monitoring" in sys.modules: + original_modules["sdk.nexent.monitor.monitoring"] = sys.modules["sdk.nexent.monitor.monitoring"] + sys.modules["sdk.nexent.monitor.monitoring"] = monitoring_module + spec.loader.exec_module(monitoring_module) + sys.modules["sdk.nexent.monitor"].monitoring = monitoring_module + # Store for cleanup in pytest_unconfigure config._mocked_otel_modules = original_modules @@ -75,4 +123,3 @@ def pytest_unconfigure(config): if hasattr(config, '_mocked_otel_modules'): for module_name, original_module in config._mocked_otel_modules.items(): sys.modules[module_name] = original_module - diff --git a/test/sdk/monitor/test_monitoring.py b/test/sdk/monitor/test_monitoring.py index 63e69ac6d..88e697ba3 100644 --- a/test/sdk/monitor/test_monitoring.py +++ b/test/sdk/monitor/test_monitoring.py @@ -22,6 +22,9 @@ import pytest import asyncio from unittest.mock import Mock, MagicMock, patch +import json +import sys +import types class TestMonitoringConfig: @@ -33,9 +36,14 @@ def test_default_config(self): assert config.enable_telemetry is False assert config.service_name == "nexent-backend" + assert config.provider == "otlp" assert config.otlp_endpoint == "http://localhost:4318" + assert config.get_trace_endpoint() == "http://localhost:4318/v1/traces" + assert config.get_metric_endpoint() == "http://localhost:4318/v1/metrics" assert config.otlp_protocol == "http" assert config.otlp_headers == {} + assert config.export_traces is True + assert config.export_metrics is True assert config.telemetry_sample_rate == 1.0 assert config.llm_slow_request_threshold_seconds == 5.0 assert config.llm_slow_token_rate_threshold == 10.0 @@ -45,9 +53,13 @@ def test_custom_config(self): config = MonitoringConfig( enable_telemetry=True, service_name="test-service", - otlp_endpoint="https://phoenix.arize.com/v1", + provider="phoenix", + otlp_endpoint="https://app.phoenix.arize.com", otlp_protocol="grpc", - otlp_headers={"x-api-key": "test-key"}, + otlp_headers={"Authorization": "Bearer test-key"}, + 
export_metrics=False, + use_platform_sdk=True, + project_name="nexent-test", telemetry_sample_rate=0.5, llm_slow_request_threshold_seconds=10.0, llm_slow_token_rate_threshold=20.0 @@ -55,9 +67,13 @@ def test_custom_config(self): assert config.enable_telemetry is True assert config.service_name == "test-service" - assert config.otlp_endpoint == "https://phoenix.arize.com/v1" - assert config.otlp_protocol == "grpc" - assert config.otlp_headers == {"x-api-key": "test-key"} + assert config.provider == "phoenix" + assert config.otlp_endpoint == "https://app.phoenix.arize.com" + assert config.otlp_protocol == "http" + assert config.otlp_headers == {"Authorization": "Bearer test-key"} + assert config.export_metrics is False + assert config.use_platform_sdk is True + assert config.project_name == "nexent-test" assert config.telemetry_sample_rate == 0.5 assert config.llm_slow_request_threshold_seconds == 10.0 assert config.llm_slow_token_rate_threshold == 20.0 @@ -71,6 +87,57 @@ def test_invalid_protocol_defaults_to_http(self): ) assert config.otlp_protocol == "http" + def test_signal_endpoint_derivation_from_base_endpoint(self): + """Test HTTP endpoints are derived from a base OTLP endpoint.""" + config = MonitoringConfig( + otlp_endpoint="https://cloud.langfuse.com/api/public/otel" + ) + + assert config.get_trace_endpoint() == "https://cloud.langfuse.com/api/public/otel/v1/traces" + assert config.get_metric_endpoint() == "https://cloud.langfuse.com/api/public/otel/v1/metrics" + + def test_signal_endpoint_derivation_from_existing_signal_endpoint(self): + """Test signal endpoints are not duplicated when already provided.""" + config = MonitoringConfig( + otlp_endpoint="https://collector.example.com/v1/traces" + ) + + assert config.get_trace_endpoint() == "https://collector.example.com/v1/traces" + assert config.get_metric_endpoint() == "https://collector.example.com/v1/metrics" + + def test_from_json_file_with_overrides(self, tmp_path): + """Test monitoring config can be loaded from a config file and env overrides.""" + config_file = tmp_path / "monitoring.json" + config_file.write_text(json.dumps({ + "monitoring": { + "enable_telemetry": True, + "service_name": "file-service", + "exporter": { + "provider": "langfuse", + "endpoint": "https://cloud.langfuse.com/api/public/otel", + "headers": {"Authorization": "Basic file-token"}, + "export_metrics": False + } + } + }), encoding="utf-8") + + config = MonitoringConfig.from_file( + str(config_file), + overrides={ + "service_name": "env-service", + "otlp_headers": {"x-langfuse-ingestion-version": "4"}, + } + ) + + assert config.service_name == "env-service" + assert config.provider == "langfuse" + assert config.get_trace_endpoint() == "https://cloud.langfuse.com/api/public/otel/v1/traces" + assert config.otlp_headers == { + "Authorization": "Basic file-token", + "x-langfuse-ingestion-version": "4", + } + assert config.export_metrics is False + class TestMonitoringManager: """Test MonitoringManager singleton and core functionality.""" @@ -196,6 +263,63 @@ def test_init_telemetry_exception_handling(self): with patch('sdk.nexent.monitor.monitoring.Resource.create', side_effect=Exception("Test error")): manager.configure(config) + @patch('sdk.nexent.monitor.monitoring.OPENTELEMETRY_AVAILABLE', True) + @patch('sdk.nexent.monitor.monitoring.trace') + @patch('sdk.nexent.monitor.monitoring.metrics') + @patch('sdk.nexent.monitor.monitoring.TracerProvider') + @patch('sdk.nexent.monitor.monitoring.MeterProvider') + 
@patch('sdk.nexent.monitor.monitoring.OTLPSpanExporterHTTP') + @patch('sdk.nexent.monitor.monitoring.Resource') + @patch('sdk.nexent.monitor.monitoring.RequestsInstrumentor') + def test_phoenix_platform_sdk_reuses_registered_tracer_provider( + self, + mock_requests_instr, + mock_resource, + mock_span_exporter_http, + mock_meter_provider, + mock_tracer_provider, + mock_metrics, + mock_trace + ): + """Test Phoenix SDK provider is reused instead of double-registering traces.""" + manager = MonitoringManager() + sdk_tracer_provider = MagicMock() + phoenix_module = types.ModuleType("phoenix") + phoenix_otel_module = types.ModuleType("phoenix.otel") + phoenix_otel_module.register = MagicMock(return_value=sdk_tracer_provider) + + mock_resource.create.return_value = MagicMock() + mock_metrics.get_meter.return_value = MagicMock() + mock_trace.get_tracer.return_value = MagicMock() + + with patch.dict(sys.modules, { + "phoenix": phoenix_module, + "phoenix.otel": phoenix_otel_module, + }): + config = MonitoringConfig( + enable_telemetry=True, + provider="phoenix", + otlp_endpoint="https://app.phoenix.arize.com/s/test-space", + otlp_headers={"Authorization": "Bearer test-key"}, + export_metrics=False, + use_platform_sdk=True, + project_name="nexent-test" + ) + manager.configure(config) + + phoenix_otel_module.register.assert_called_once_with( + project_name="nexent-test", + endpoint="https://app.phoenix.arize.com/s/test-space", + protocol="http/protobuf", + headers={"Authorization": "Bearer test-key"}, + auto_instrument=False + ) + assert manager._tracer_provider is sdk_tracer_provider + mock_tracer_provider.assert_not_called() + mock_trace.set_tracer_provider.assert_not_called() + mock_span_exporter_http.assert_not_called() + mock_requests_instr().instrument.assert_called_once() + @patch('sdk.nexent.monitor.monitoring.trace') def test_trace_llm_request_openinference_attrs(self, mock_trace): """Test LLM request tracing uses OpenInference attribute names.""" @@ -446,6 +570,44 @@ async def test_function(param1, param2="default"): result = asyncio.run(test_function("value1", param2="value2")) assert result == {"result": "success"} + def test_monitor_endpoint_decorator_async_generator(self): + """Test monitor_endpoint keeps context while async generators are consumed.""" + manager = MonitoringManager() + config = MonitoringConfig(enable_telemetry=False) + manager.configure(config) + events = [] + original_add_span_event = manager.add_span_event + + def capture_event(name, attributes=None): + events.append((name, attributes or {})) + original_add_span_event(name, attributes) + + manager.add_span_event = capture_event + + @manager.monitor_endpoint("stream_operation") + async def stream_function(): + manager.add_span_event("stream_operation.inside") + yield "chunk-1" + manager.add_span_event("stream_operation.after_yield") + yield "chunk-2" + + async def consume_stream(): + return [item async for item in stream_function()] + + try: + result = asyncio.run(consume_stream()) + finally: + manager.add_span_event = original_add_span_event + + assert result == ["chunk-1", "chunk-2"] + event_names = [name for name, _ in events] + assert event_names == [ + "stream_operation.started", + "stream_operation.inside", + "stream_operation.after_yield", + "stream_operation.completed", + ] + def test_monitor_llm_call_decorator(self): """Test monitor_llm_call decorator.""" manager = MonitoringManager() @@ -598,4 +760,4 @@ def test_exporter_error_does_not_crash(self): config = MonitoringConfig(enable_telemetry=True) 
manager.configure(config) - assert manager._tracer is None \ No newline at end of file + assert manager._tracer is None From b71d7af25957611ad9f42a51064f58835133e942 Mon Sep 17 00:00:00 2001 From: hhhhsc Date: Wed, 6 May 2026 09:43:14 +0800 Subject: [PATCH 03/17] Add local Phoenix and Langfuse monitoring deployment support --- doc/docs/zh/sdk/monitoring.md | 72 ++++++- doc/docs/zh/sdk/opentelemetry-design.md | 68 +++++- docker/docker-compose-monitoring.yml | 203 +++++++++++++++++- docker/monitoring/monitoring.env | 46 ++++ docker/monitoring/monitoring.env.example | 46 ++++ .../otel-collector-langfuse-config.yml | 51 +++++ .../otel-collector-phoenix-config.yml | 48 +++++ docker/start-monitoring.sh | 130 ++++++++++- 8 files changed, 645 insertions(+), 19 deletions(-) create mode 100644 docker/monitoring/otel-collector-langfuse-config.yml create mode 100644 docker/monitoring/otel-collector-phoenix-config.yml diff --git a/doc/docs/zh/sdk/monitoring.md b/doc/docs/zh/sdk/monitoring.md index 5b93d47b1..6e21024ef 100644 --- a/doc/docs/zh/sdk/monitoring.md +++ b/doc/docs/zh/sdk/monitoring.md @@ -16,17 +16,76 @@ NexentAgent ──► OpenTelemetry SDK ──► OTLP Collector ──► Arize ```bash cd docker -cp .env.example .env +cp monitoring/monitoring.env.example monitoring/monitoring.env -vim .env +vim monitoring/monitoring.env ENABLE_TELEMETRY=true MONITORING_PROVIDER=otlp OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 OTEL_EXPORTER_OTLP_PROTOCOL=http -docker-compose -f docker-compose-monitoring.yml up -d +./start-monitoring.sh --stack collector ``` +## 本地化部署形态 + +`docker/start-monitoring.sh` 支持三种形态,均以 OpenTelemetry Collector 作为统一入口。业务服务只需要把 OTLP 发到 Collector,不需要感知后端平台差异。 + +| 形态 | 命令 | 包含服务 | 适用场景 | +|------|------|----------|----------| +| `collector` | `./start-monitoring.sh --stack collector` | OpenTelemetry Collector | 只验证埋点、或转发到外部云端平台 | +| `phoenix` | `./start-monitoring.sh --stack phoenix` | Collector + Phoenix | 本地 trace 调试、OpenInference 属性查看、实验分析 | +| `langfuse` | `./start-monitoring.sh --stack langfuse` | Collector + Langfuse Web/Worker + Postgres + ClickHouse + MinIO + Redis | 本地完整 LLMOps 体验、会话/用户/反馈/成本分析 | + +也可以在 `docker/monitoring/monitoring.env` 中设置默认形态: + +```bash +MONITORING_STACK=phoenix +``` + +### 本地 Phoenix + +Phoenix 本地部署使用 `arizephoenix/phoenix` 镜像,默认 UI 端口为 `6006`,gRPC OTLP 端口映射为 `4319`,数据持久化到 Docker volume `phoenix-data`。 + +```bash +cd docker +./start-monitoring.sh --stack phoenix +``` + +访问地址: + +- Phoenix UI:`http://localhost:6006` +- Collector OTLP HTTP:`http://localhost:4318` +- Collector OTLP gRPC:`localhost:4317` + +Nexent 后端在 Docker 网络内运行时: + +```bash +ENABLE_TELEMETRY=true +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 +OTEL_EXPORTER_OTLP_PROTOCOL=http +OTEL_EXPORTER_OTLP_METRICS_ENABLED=false +``` + +后端直接在宿主机运行时,把 endpoint 改为 `http://localhost:4318`。 + +### 本地 Langfuse + +Langfuse 本地部署使用 v3 架构:Web、Worker、Postgres、ClickHouse、MinIO、Redis。默认 UI 端口为 `3001`,初始化项目和 API Key 来自 `monitoring.env`。 + +```bash +cd docker +./start-monitoring.sh --stack langfuse +``` + +访问地址: + +- Langfuse UI:`http://localhost:3001` +- 默认管理员:`admin@nexent.local` / `nexent-langfuse-admin` +- 默认项目 Key:`pk-lf-nexent-local` / `sk-lf-nexent-local` + +启动脚本会在 `LANGFUSE_OTLP_AUTH_HEADER` 为空时自动生成 `Basic base64(public_key:secret_key)`,并让 Collector 将 trace 转发到 `http://langfuse-web:3000/api/public/otel`。本地默认密钥只适合开发验证,生产部署必须替换 `LANGFUSE_NEXTAUTH_SECRET`、`LANGFUSE_SALT`、`LANGFUSE_ENCRYPTION_KEY`、数据库密码和对象存储密钥。 + ## AI 可观测性平台对接 ### Arize Phoenix @@ -240,7 +299,12 @@ service: exporters: [otlphttp/langfuse, 
logging] ``` -完整配置见 `docker/monitoring/otel-collector-config.yml`。 +本地 Phoenix 和 Langfuse 分别使用独立 Collector 配置: + +- `docker/monitoring/otel-collector-phoenix-config.yml` +- `docker/monitoring/otel-collector-langfuse-config.yml` + +基础 logging 配置见 `docker/monitoring/otel-collector-config.yml`。 ## 优雅降级 diff --git a/doc/docs/zh/sdk/opentelemetry-design.md b/doc/docs/zh/sdk/opentelemetry-design.md index fc78194d1..f1af77dc0 100644 --- a/doc/docs/zh/sdk/opentelemetry-design.md +++ b/doc/docs/zh/sdk/opentelemetry-design.md @@ -26,7 +26,7 @@ Nexent 的监控能力以 OpenTelemetry 为主干,SDK 和后端只负责生成 | 自动埋点 | FastAPI instrumentation、requests instrumentation | | AI 语义 | OpenInference 风格属性:`llm.*`、`agent.*`、`agent.tool.*` | | 配置 | 环境变量、`MONITORING_CONFIG_FILE` JSON/YAML | -| Collector | `otel/opentelemetry-collector-contrib`,使用 `otlphttp` 转发 HTTP 平台 | +| Collector | `otel/opentelemetry-collector-contrib`,使用 `otlphttp` 转发 HTTP 平台;本地可选择 logging、Phoenix、Langfuse 三类部署形态 | | 可选 SDK | `phoenix.otel.register`、`langfuse.get_client`,默认不启用;Phoenix SDK 成功注册时复用其 tracer provider | ## 配置模型 @@ -38,6 +38,7 @@ Nexent 的监控能力以 OpenTelemetry 为主干,SDK 和后端只负责生成 | `ENABLE_TELEMETRY` | 总开关 | | `MONITORING_CONFIG_FILE` | JSON/YAML 配置文件路径 | | `MONITORING_PROVIDER` | `otlp`、`phoenix`、`langfuse`、`jaeger`、`custom` | +| `MONITORING_STACK` | 本地部署形态:`collector`、`phoenix`、`langfuse` | | `MONITORING_USE_PLATFORM_SDK` | 是否额外初始化平台 SDK | | `MONITORING_PROJECT_NAME` | 平台项目名 | | `OTEL_SERVICE_NAME` | OpenTelemetry service name | @@ -127,6 +128,67 @@ OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION=4 OTEL_EXPORTER_OTLP_METRICS_ENABLED=false ``` +## 本地化部署设计 + +本地化部署通过 `docker/start-monitoring.sh` 选择形态。所有形态都保留 OpenTelemetry Collector 作为入口,Nexent 后端统一上报到 `http://otel-collector:4318` 或宿主机的 `http://localhost:4318`,平台差异只体现在 Collector exporter 和本地服务组合上。 + +| 形态 | Collector 配置 | 本地服务 | 数据去向 | 说明 | +|------|----------------|----------|----------|------| +| `collector` | `otel-collector-config.yml` | Collector | logging exporter | 最小形态,用于验证 span/metric 是否产生,或手动改配置转发到云端平台 | +| `phoenix` | `otel-collector-phoenix-config.yml` | Collector + Phoenix | `http://phoenix:6006/v1/traces` | Phoenix 容器同时提供 UI 和 OTLP HTTP/gRPC trace collector,适合本地 trace debug | +| `langfuse` | `otel-collector-langfuse-config.yml` | Collector + Langfuse Web/Worker + Postgres + ClickHouse + MinIO + Redis | `http://langfuse-web:3000/api/public/otel/v1/traces` | Langfuse v3 依赖多组件,适合完整 LLMOps 能力验证 | + +启动命令: + +```bash +cd docker +./start-monitoring.sh --stack collector +./start-monitoring.sh --stack phoenix +./start-monitoring.sh --stack langfuse +``` + +部署脚本职责: + +- 创建或复用 `nexent-network`。 +- 首次启动时从 `monitoring.env.example` 生成 `monitoring.env`。 +- 根据 `MONITORING_STACK` 或 `--stack` 选择 Docker Compose profile。 +- 根据部署形态设置 `OTEL_COLLECTOR_CONFIG_FILE`。 +- Langfuse 本地形态下,如果 `LANGFUSE_OTLP_AUTH_HEADER` 未显式配置,则使用初始化项目的 public/secret key 生成 Basic Auth header。 + +### Phoenix 本地形态 + +Phoenix 使用 `arizephoenix/phoenix` 镜像,默认暴露: + +| 端口 | 用途 | +|------|------| +| `6006` | Phoenix UI 和 OTLP HTTP `/v1/traces` | +| `4319` | 映射到容器内 gRPC OTLP `4317`,避免与 Collector gRPC 端口冲突 | + +Compose 中设置 `PHOENIX_WORKING_DIR=/mnt/data` 并挂载 `phoenix-data` volume,确保本地重启后 trace 数据不丢失。Collector 使用 `otlphttp/phoenix` exporter 的 base endpoint `http://phoenix:6006`,由 Collector 按 OTLP HTTP 规则追加 `/v1/traces`。 + +### Langfuse 本地形态 + +Langfuse v3 本地形态按官方自托管架构拆分为应用容器和存储组件: + +| 组件 | 用途 | +|------|------| +| `langfuse-web` | UI、API、OTLP HTTP ingestion | +| `langfuse-worker` | 异步消费和处理 trace 事件 | +| `langfuse-postgres` | 事务型元数据 | +| 
`langfuse-clickhouse` | trace/observation/score 分析数据 | +| `langfuse-minio` | S3 兼容对象存储,保存事件和大对象 | +| `langfuse-redis` | 队列和缓存 | + +初始化参数通过 `LANGFUSE_INIT_*` 配置,默认创建 `nexent-local` 项目和本地 API Key。Collector 使用 `otlphttp/langfuse` exporter,endpoint 为 `http://langfuse-web:3000/api/public/otel`,并携带: + +```yaml +headers: + Authorization: ${env:LANGFUSE_OTLP_AUTH_HEADER} + x-langfuse-ingestion-version: "4" +``` + +默认密钥仅用于本地验证。生产或共享环境必须替换认证密钥、数据库密码、对象存储密钥和 `LANGFUSE_ENCRYPTION_KEY`,并补充备份、高可用和升级策略。 + ## 埋点信息 | 埋点 | 位置 | 内容 | 目的 | @@ -251,5 +313,9 @@ flowchart TB - Phoenix Setup Tracing: https://arize.com/docs/phoenix/tracing/how-to-tracing/setup-tracing - Phoenix Setup OTEL: https://arize.com/docs/phoenix/tracing/how-to-tracing/setup-tracing/setup-using-phoenix-otel - Phoenix Authentication: https://arize.com/docs/phoenix/deployment/authentication +- Phoenix Self-Hosting: https://arize.com/docs/phoenix/self-hosting +- Phoenix Docker Deployment: https://arize.com/docs/phoenix/self-hosting/deployment-options/docker - Langfuse OpenTelemetry: https://langfuse.com/integrations/native/opentelemetry +- Langfuse Self-Hosting: https://langfuse.com/self-hosting +- Langfuse Docker Compose: https://langfuse.com/self-hosting/local - Langfuse Overview: https://langfuse.com/docs diff --git a/docker/docker-compose-monitoring.yml b/docker/docker-compose-monitoring.yml index 20cadb0a9..fb4764acf 100644 --- a/docker/docker-compose-monitoring.yml +++ b/docker/docker-compose-monitoring.yml @@ -4,14 +4,209 @@ services: container_name: nexent-otel-collector command: ["--config=/etc/otel-collector-config.yml"] volumes: - - ./monitoring/otel-collector-config.yml:/etc/otel-collector-config.yml + - ${OTEL_COLLECTOR_CONFIG_FILE:-./monitoring/otel-collector-config.yml}:/etc/otel-collector-config.yml ports: - - "4317:4317" - - "4318:4318" + - "${OTEL_COLLECTOR_GRPC_PORT:-4317}:4317" + - "${OTEL_COLLECTOR_HTTP_PORT:-4318}:4318" networks: - nexent-network restart: unless-stopped + phoenix: + image: arizephoenix/phoenix:${PHOENIX_VERSION:-latest} + container_name: nexent-phoenix + profiles: ["phoenix"] + environment: + PHOENIX_WORKING_DIR: /mnt/data + volumes: + - phoenix-data:/mnt/data + ports: + - "${PHOENIX_PORT:-6006}:6006" + - "${PHOENIX_GRPC_HOST_PORT:-4319}:4317" + networks: + - nexent-network + restart: unless-stopped + + langfuse-worker: + image: docker.io/langfuse/langfuse-worker:${LANGFUSE_VERSION:-3} + container_name: nexent-langfuse-worker + profiles: ["langfuse"] + restart: unless-stopped + depends_on: &langfuse-depends-on + langfuse-postgres: + condition: service_healthy + langfuse-minio: + condition: service_healthy + langfuse-redis: + condition: service_healthy + langfuse-clickhouse: + condition: service_healthy + environment: &langfuse-env + NEXTAUTH_URL: ${LANGFUSE_NEXTAUTH_URL:-http://localhost:3001} + NEXTAUTH_SECRET: ${LANGFUSE_NEXTAUTH_SECRET:-nexent-langfuse-secret} + DATABASE_URL: postgresql://${LANGFUSE_POSTGRES_USER:-postgres}:${LANGFUSE_POSTGRES_PASSWORD:-postgres}@langfuse-postgres:5432/${LANGFUSE_POSTGRES_DB:-postgres} + SALT: ${LANGFUSE_SALT:-nexent-langfuse-salt} + ENCRYPTION_KEY: ${LANGFUSE_ENCRYPTION_KEY:-0000000000000000000000000000000000000000000000000000000000000000} + TELEMETRY_ENABLED: ${LANGFUSE_TELEMETRY_ENABLED:-false} + LANGFUSE_ENABLE_EXPERIMENTAL_FEATURES: ${LANGFUSE_ENABLE_EXPERIMENTAL_FEATURES:-false} + CLICKHOUSE_MIGRATION_URL: clickhouse://langfuse-clickhouse:9000 + CLICKHOUSE_URL: http://langfuse-clickhouse:8123 + CLICKHOUSE_USER: ${LANGFUSE_CLICKHOUSE_USER:-clickhouse} + 
CLICKHOUSE_PASSWORD: ${LANGFUSE_CLICKHOUSE_PASSWORD:-clickhouse} + CLICKHOUSE_CLUSTER_ENABLED: ${LANGFUSE_CLICKHOUSE_CLUSTER_ENABLED:-false} + REDIS_HOST: langfuse-redis + REDIS_PORT: 6379 + REDIS_AUTH: ${LANGFUSE_REDIS_AUTH:-myredissecret} + REDIS_TLS_ENABLED: "false" + LANGFUSE_USE_AZURE_BLOB: "false" + LANGFUSE_USE_OCI_NATIVE_OBJECT_STORAGE: "false" + LANGFUSE_S3_EVENT_UPLOAD_BUCKET: ${LANGFUSE_S3_BUCKET:-langfuse} + LANGFUSE_S3_EVENT_UPLOAD_REGION: auto + LANGFUSE_S3_EVENT_UPLOAD_ACCESS_KEY_ID: ${LANGFUSE_MINIO_ROOT_USER:-minio} + LANGFUSE_S3_EVENT_UPLOAD_SECRET_ACCESS_KEY: ${LANGFUSE_MINIO_ROOT_PASSWORD:-miniosecret} + LANGFUSE_S3_EVENT_UPLOAD_ENDPOINT: http://langfuse-minio:9000 + LANGFUSE_S3_EVENT_UPLOAD_FORCE_PATH_STYLE: "true" + LANGFUSE_S3_EVENT_UPLOAD_PREFIX: events/ + LANGFUSE_S3_MEDIA_UPLOAD_BUCKET: ${LANGFUSE_S3_BUCKET:-langfuse} + LANGFUSE_S3_MEDIA_UPLOAD_REGION: auto + LANGFUSE_S3_MEDIA_UPLOAD_ACCESS_KEY_ID: ${LANGFUSE_MINIO_ROOT_USER:-minio} + LANGFUSE_S3_MEDIA_UPLOAD_SECRET_ACCESS_KEY: ${LANGFUSE_MINIO_ROOT_PASSWORD:-miniosecret} + LANGFUSE_S3_MEDIA_UPLOAD_ENDPOINT: http://localhost:${LANGFUSE_MINIO_API_PORT:-9092} + LANGFUSE_S3_MEDIA_UPLOAD_FORCE_PATH_STYLE: "true" + LANGFUSE_S3_MEDIA_UPLOAD_PREFIX: media/ + LANGFUSE_S3_BATCH_EXPORT_ENABLED: "false" + LANGFUSE_S3_BATCH_EXPORT_BUCKET: ${LANGFUSE_S3_BUCKET:-langfuse} + LANGFUSE_S3_BATCH_EXPORT_REGION: auto + LANGFUSE_S3_BATCH_EXPORT_ENDPOINT: http://langfuse-minio:9000 + LANGFUSE_S3_BATCH_EXPORT_EXTERNAL_ENDPOINT: http://localhost:${LANGFUSE_MINIO_API_PORT:-9092} + LANGFUSE_S3_BATCH_EXPORT_ACCESS_KEY_ID: ${LANGFUSE_MINIO_ROOT_USER:-minio} + LANGFUSE_S3_BATCH_EXPORT_SECRET_ACCESS_KEY: ${LANGFUSE_MINIO_ROOT_PASSWORD:-miniosecret} + LANGFUSE_S3_BATCH_EXPORT_FORCE_PATH_STYLE: "true" + networks: + - nexent-network + + langfuse-web: + image: docker.io/langfuse/langfuse:${LANGFUSE_VERSION:-3} + container_name: nexent-langfuse-web + profiles: ["langfuse"] + restart: unless-stopped + depends_on: *langfuse-depends-on + environment: + <<: *langfuse-env + LANGFUSE_INIT_ORG_ID: ${LANGFUSE_INIT_ORG_ID:-nexent} + LANGFUSE_INIT_ORG_NAME: ${LANGFUSE_INIT_ORG_NAME:-Nexent} + LANGFUSE_INIT_PROJECT_ID: ${LANGFUSE_INIT_PROJECT_ID:-nexent-local} + LANGFUSE_INIT_PROJECT_NAME: ${LANGFUSE_INIT_PROJECT_NAME:-Nexent Local} + LANGFUSE_INIT_PROJECT_PUBLIC_KEY: ${LANGFUSE_INIT_PROJECT_PUBLIC_KEY:-pk-lf-nexent-local} + LANGFUSE_INIT_PROJECT_SECRET_KEY: ${LANGFUSE_INIT_PROJECT_SECRET_KEY:-sk-lf-nexent-local} + LANGFUSE_INIT_USER_EMAIL: ${LANGFUSE_INIT_USER_EMAIL:-admin@nexent.local} + LANGFUSE_INIT_USER_NAME: ${LANGFUSE_INIT_USER_NAME:-Nexent Admin} + LANGFUSE_INIT_USER_PASSWORD: ${LANGFUSE_INIT_USER_PASSWORD:-nexent-langfuse-admin} + ports: + - "${LANGFUSE_PORT:-3001}:3000" + networks: + - nexent-network + + langfuse-clickhouse: + image: docker.io/clickhouse/clickhouse-server:${LANGFUSE_CLICKHOUSE_VERSION:-latest} + container_name: nexent-langfuse-clickhouse + profiles: ["langfuse"] + restart: unless-stopped + user: "101:101" + environment: + CLICKHOUSE_DB: default + CLICKHOUSE_USER: ${LANGFUSE_CLICKHOUSE_USER:-clickhouse} + CLICKHOUSE_PASSWORD: ${LANGFUSE_CLICKHOUSE_PASSWORD:-clickhouse} + volumes: + - langfuse-clickhouse-data:/var/lib/clickhouse + - langfuse-clickhouse-logs:/var/log/clickhouse-server + ports: + - "127.0.0.1:${LANGFUSE_CLICKHOUSE_HTTP_PORT:-8124}:8123" + - "127.0.0.1:${LANGFUSE_CLICKHOUSE_NATIVE_PORT:-9002}:9000" + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:8123/ping || exit 1"] + 
interval: 5s + timeout: 5s + retries: 10 + start_period: 1s + networks: + - nexent-network + + langfuse-minio: + image: docker.io/minio/minio:${LANGFUSE_MINIO_VERSION:-latest} + container_name: nexent-langfuse-minio + profiles: ["langfuse"] + restart: unless-stopped + entrypoint: sh + command: -c 'mkdir -p /data/${LANGFUSE_S3_BUCKET:-langfuse} && minio server --address ":9000" --console-address ":9001" /data' + environment: + MINIO_ROOT_USER: ${LANGFUSE_MINIO_ROOT_USER:-minio} + MINIO_ROOT_PASSWORD: ${LANGFUSE_MINIO_ROOT_PASSWORD:-miniosecret} + ports: + - "${LANGFUSE_MINIO_API_PORT:-9092}:9000" + - "127.0.0.1:${LANGFUSE_MINIO_CONSOLE_PORT:-9093}:9001" + volumes: + - langfuse-minio-data:/data + healthcheck: + test: ["CMD", "mc", "ready", "local"] + interval: 1s + timeout: 5s + retries: 5 + start_period: 1s + networks: + - nexent-network + + langfuse-redis: + image: docker.io/redis:7 + container_name: nexent-langfuse-redis + profiles: ["langfuse"] + restart: unless-stopped + command: > + --requirepass ${LANGFUSE_REDIS_AUTH:-myredissecret} + --maxmemory-policy noeviction + ports: + - "127.0.0.1:${LANGFUSE_REDIS_PORT:-6380}:6379" + volumes: + - langfuse-redis-data:/data + healthcheck: + test: ["CMD-SHELL", "redis-cli -a ${LANGFUSE_REDIS_AUTH:-myredissecret} ping | grep PONG"] + interval: 3s + timeout: 10s + retries: 10 + networks: + - nexent-network + + langfuse-postgres: + image: docker.io/postgres:${LANGFUSE_POSTGRES_VERSION:-17} + container_name: nexent-langfuse-postgres + profiles: ["langfuse"] + restart: unless-stopped + environment: + POSTGRES_USER: ${LANGFUSE_POSTGRES_USER:-postgres} + POSTGRES_PASSWORD: ${LANGFUSE_POSTGRES_PASSWORD:-postgres} + POSTGRES_DB: ${LANGFUSE_POSTGRES_DB:-postgres} + TZ: UTC + PGTZ: UTC + ports: + - "127.0.0.1:${LANGFUSE_POSTGRES_PORT:-5440}:5432" + volumes: + - langfuse-postgres-data:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U ${LANGFUSE_POSTGRES_USER:-postgres}"] + interval: 3s + timeout: 3s + retries: 10 + networks: + - nexent-network + networks: nexent-network: - external: true \ No newline at end of file + external: true + +volumes: + phoenix-data: + langfuse-postgres-data: + langfuse-clickhouse-data: + langfuse-clickhouse-logs: + langfuse-minio-data: + langfuse-redis-data: diff --git a/docker/monitoring/monitoring.env b/docker/monitoring/monitoring.env index b15c8f097..38c382ab1 100644 --- a/docker/monitoring/monitoring.env +++ b/docker/monitoring/monitoring.env @@ -4,6 +4,7 @@ MONITORING_PROVIDER=otlp MONITORING_CONFIG_FILE= MONITORING_USE_PLATFORM_SDK=false MONITORING_PROJECT_NAME=nexent +MONITORING_STACK=collector # Use a base OTLP HTTP endpoint. SDK code derives /v1/traces and /v1/metrics. OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 @@ -22,3 +23,48 @@ LLM_SLOW_TOKEN_RATE_THRESHOLD=10.0 OTEL_COLLECTOR_GRPC_PORT=4317 OTEL_COLLECTOR_HTTP_PORT=4318 +OTEL_COLLECTOR_CONFIG_FILE= + +# Local Phoenix stack. Used by: ./start-monitoring.sh --stack phoenix +PHOENIX_VERSION=latest +PHOENIX_PORT=6006 +PHOENIX_GRPC_HOST_PORT=4319 + +# Local Langfuse stack. Used by: ./start-monitoring.sh --stack langfuse +# Defaults are for local development only. Replace secrets before production use. 
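+# Illustrative commands for generating real secrets (run locally, then paste the values):
+#   openssl rand -hex 32     # LANGFUSE_ENCRYPTION_KEY (256-bit hex, 64 characters)
+#   openssl rand -base64 32  # LANGFUSE_NEXTAUTH_SECRET and LANGFUSE_SALT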
+LANGFUSE_VERSION=3 +LANGFUSE_PORT=3001 +LANGFUSE_NEXTAUTH_URL=http://localhost:3001 +LANGFUSE_NEXTAUTH_SECRET=nexent-langfuse-secret +LANGFUSE_SALT=nexent-langfuse-salt +LANGFUSE_ENCRYPTION_KEY=0000000000000000000000000000000000000000000000000000000000000000 +LANGFUSE_TELEMETRY_ENABLED=false +LANGFUSE_ENABLE_EXPERIMENTAL_FEATURES=false +LANGFUSE_INIT_ORG_ID=nexent +LANGFUSE_INIT_ORG_NAME=Nexent +LANGFUSE_INIT_PROJECT_ID=nexent-local +LANGFUSE_INIT_PROJECT_NAME=Nexent-Local +LANGFUSE_INIT_PROJECT_PUBLIC_KEY=pk-lf-nexent-local +LANGFUSE_INIT_PROJECT_SECRET_KEY=sk-lf-nexent-local +LANGFUSE_INIT_USER_EMAIL=admin@nexent.local +LANGFUSE_INIT_USER_NAME=Nexent-Admin +LANGFUSE_INIT_USER_PASSWORD=nexent-langfuse-admin +LANGFUSE_OTLP_AUTH_HEADER= +LANGFUSE_POSTGRES_VERSION=17 +LANGFUSE_POSTGRES_USER=postgres +LANGFUSE_POSTGRES_PASSWORD=postgres +LANGFUSE_POSTGRES_DB=postgres +LANGFUSE_POSTGRES_PORT=5440 +LANGFUSE_CLICKHOUSE_VERSION=latest +LANGFUSE_CLICKHOUSE_USER=clickhouse +LANGFUSE_CLICKHOUSE_PASSWORD=clickhouse +LANGFUSE_CLICKHOUSE_HTTP_PORT=8124 +LANGFUSE_CLICKHOUSE_NATIVE_PORT=9002 +LANGFUSE_MINIO_VERSION=latest +LANGFUSE_MINIO_ROOT_USER=minio +LANGFUSE_MINIO_ROOT_PASSWORD=miniosecret +LANGFUSE_MINIO_API_PORT=9092 +LANGFUSE_MINIO_CONSOLE_PORT=9093 +LANGFUSE_S3_BUCKET=langfuse +LANGFUSE_REDIS_AUTH=myredissecret +LANGFUSE_REDIS_PORT=6380 diff --git a/docker/monitoring/monitoring.env.example b/docker/monitoring/monitoring.env.example index b15c8f097..38c382ab1 100644 --- a/docker/monitoring/monitoring.env.example +++ b/docker/monitoring/monitoring.env.example @@ -4,6 +4,7 @@ MONITORING_PROVIDER=otlp MONITORING_CONFIG_FILE= MONITORING_USE_PLATFORM_SDK=false MONITORING_PROJECT_NAME=nexent +MONITORING_STACK=collector # Use a base OTLP HTTP endpoint. SDK code derives /v1/traces and /v1/metrics. OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 @@ -22,3 +23,48 @@ LLM_SLOW_TOKEN_RATE_THRESHOLD=10.0 OTEL_COLLECTOR_GRPC_PORT=4317 OTEL_COLLECTOR_HTTP_PORT=4318 +OTEL_COLLECTOR_CONFIG_FILE= + +# Local Phoenix stack. Used by: ./start-monitoring.sh --stack phoenix +PHOENIX_VERSION=latest +PHOENIX_PORT=6006 +PHOENIX_GRPC_HOST_PORT=4319 + +# Local Langfuse stack. Used by: ./start-monitoring.sh --stack langfuse +# Defaults are for local development only. Replace secrets before production use. 
+LANGFUSE_VERSION=3 +LANGFUSE_PORT=3001 +LANGFUSE_NEXTAUTH_URL=http://localhost:3001 +LANGFUSE_NEXTAUTH_SECRET=nexent-langfuse-secret +LANGFUSE_SALT=nexent-langfuse-salt +LANGFUSE_ENCRYPTION_KEY=0000000000000000000000000000000000000000000000000000000000000000 +LANGFUSE_TELEMETRY_ENABLED=false +LANGFUSE_ENABLE_EXPERIMENTAL_FEATURES=false +LANGFUSE_INIT_ORG_ID=nexent +LANGFUSE_INIT_ORG_NAME=Nexent +LANGFUSE_INIT_PROJECT_ID=nexent-local +LANGFUSE_INIT_PROJECT_NAME=Nexent-Local +LANGFUSE_INIT_PROJECT_PUBLIC_KEY=pk-lf-nexent-local +LANGFUSE_INIT_PROJECT_SECRET_KEY=sk-lf-nexent-local +LANGFUSE_INIT_USER_EMAIL=admin@nexent.local +LANGFUSE_INIT_USER_NAME=Nexent-Admin +LANGFUSE_INIT_USER_PASSWORD=nexent-langfuse-admin +LANGFUSE_OTLP_AUTH_HEADER= +LANGFUSE_POSTGRES_VERSION=17 +LANGFUSE_POSTGRES_USER=postgres +LANGFUSE_POSTGRES_PASSWORD=postgres +LANGFUSE_POSTGRES_DB=postgres +LANGFUSE_POSTGRES_PORT=5440 +LANGFUSE_CLICKHOUSE_VERSION=latest +LANGFUSE_CLICKHOUSE_USER=clickhouse +LANGFUSE_CLICKHOUSE_PASSWORD=clickhouse +LANGFUSE_CLICKHOUSE_HTTP_PORT=8124 +LANGFUSE_CLICKHOUSE_NATIVE_PORT=9002 +LANGFUSE_MINIO_VERSION=latest +LANGFUSE_MINIO_ROOT_USER=minio +LANGFUSE_MINIO_ROOT_PASSWORD=miniosecret +LANGFUSE_MINIO_API_PORT=9092 +LANGFUSE_MINIO_CONSOLE_PORT=9093 +LANGFUSE_S3_BUCKET=langfuse +LANGFUSE_REDIS_AUTH=myredissecret +LANGFUSE_REDIS_PORT=6380 diff --git a/docker/monitoring/otel-collector-langfuse-config.yml b/docker/monitoring/otel-collector-langfuse-config.yml new file mode 100644 index 000000000..667758882 --- /dev/null +++ b/docker/monitoring/otel-collector-langfuse-config.yml @@ -0,0 +1,51 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + batch: + timeout: 1s + send_batch_size: 512 + + memory_limiter: + limit_mib: 256 + check_interval: 1s + + resource: + attributes: + - key: service.name + value: nexent-backend + action: upsert + - key: service.version + from_attribute: version + action: insert + +exporters: + logging: + verbosity: normal + + otlphttp/langfuse: + endpoint: http://langfuse-web:3000/api/public/otel + headers: + Authorization: ${env:LANGFUSE_OTLP_AUTH_HEADER} + x-langfuse-ingestion-version: "4" + +service: + pipelines: + traces: + receivers: [otlp] + processors: [memory_limiter, resource, batch] + exporters: [otlphttp/langfuse, logging] + + metrics: + receivers: [otlp] + processors: [memory_limiter, resource, batch] + exporters: [logging] + + telemetry: + logs: + level: "info" diff --git a/docker/monitoring/otel-collector-phoenix-config.yml b/docker/monitoring/otel-collector-phoenix-config.yml new file mode 100644 index 000000000..4fa415aac --- /dev/null +++ b/docker/monitoring/otel-collector-phoenix-config.yml @@ -0,0 +1,48 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + batch: + timeout: 1s + send_batch_size: 512 + + memory_limiter: + limit_mib: 256 + check_interval: 1s + + resource: + attributes: + - key: service.name + value: nexent-backend + action: upsert + - key: service.version + from_attribute: version + action: insert + +exporters: + logging: + verbosity: normal + + otlphttp/phoenix: + endpoint: http://phoenix:6006 + +service: + pipelines: + traces: + receivers: [otlp] + processors: [memory_limiter, resource, batch] + exporters: [otlphttp/phoenix, logging] + + metrics: + receivers: [otlp] + processors: [memory_limiter, resource, batch] + exporters: [logging] + + telemetry: + logs: + level: "info" diff --git a/docker/start-monitoring.sh 
b/docker/start-monitoring.sh
index fb8304816..6ab628574 100755
--- a/docker/start-monitoring.sh
+++ b/docker/start-monitoring.sh
@@ -1,12 +1,56 @@
 #!/bin/bash
 
 # Nexent LLM Performance Monitoring Setup Script
-# This script starts the OpenTelemetry Collector used by Nexent monitoring.
+# This script starts the OpenTelemetry Collector alone, or with a local
+# Phoenix/Langfuse observability backend.
 
 set -e
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 MONITORING_DIR="$SCRIPT_DIR/monitoring"
+COMPOSE_FILE="$SCRIPT_DIR/docker-compose-monitoring.yml"
+
+usage() {
+  cat <<EOF
+Usage: $0 [--stack <collector|phoenix|langfuse>]
+
+Stacks:
+  collector   Start OpenTelemetry Collector only. This is the default.
+  phoenix     Start Collector and local Arize Phoenix.
+  langfuse    Start Collector and local Langfuse self-host stack.
+
+Set MONITORING_STACK in monitoring/monitoring.env to change the default.
+EOF
+}
+
+STACK_ARG=""
+while [ $# -gt 0 ]; do
+  case "$1" in
+    --stack)
+      if [ $# -lt 2 ]; then
+        echo "❌ Error: --stack requires a value."
+        usage
+        exit 1
+      fi
+      STACK_ARG="$2"
+      shift 2
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    collector|phoenix|langfuse)
+      STACK_ARG="$1"
+      shift
+      ;;
+    *)
+      echo "❌ Error: unknown argument '$1'."
+      usage
+      exit 1
+      ;;
+  esac
+done
 
 echo "🚀 Starting Nexent LLM Performance Monitoring Setup..."
 
@@ -31,9 +75,52 @@ if [ ! -f "$MONITORING_DIR/monitoring.env" ]; then
     echo "⚠️ Please review and update $MONITORING_DIR/monitoring.env as needed"
 fi
 
+# Load deployment options. Keep values shell-compatible in monitoring.env.
+set -a
+# shellcheck disable=SC1091
+. "$MONITORING_DIR/monitoring.env"
+set +a
+
+MONITORING_STACK="${STACK_ARG:-${MONITORING_STACK:-collector}}"
+case "$MONITORING_STACK" in
+  collector)
+    OTEL_COLLECTOR_CONFIG_FILE="${OTEL_COLLECTOR_CONFIG_FILE:-./monitoring/otel-collector-config.yml}"
+    COMPOSE_PROFILES=()
+    ;;
+  phoenix)
+    OTEL_COLLECTOR_CONFIG_FILE="${OTEL_COLLECTOR_CONFIG_FILE:-./monitoring/otel-collector-phoenix-config.yml}"
+    COMPOSE_PROFILES=(--profile phoenix)
+    ;;
+  langfuse)
+    OTEL_COLLECTOR_CONFIG_FILE="${OTEL_COLLECTOR_CONFIG_FILE:-./monitoring/otel-collector-langfuse-config.yml}"
+    COMPOSE_PROFILES=(--profile langfuse)
+    LANGFUSE_INIT_PROJECT_PUBLIC_KEY="${LANGFUSE_INIT_PROJECT_PUBLIC_KEY:-pk-lf-nexent-local}"
+    LANGFUSE_INIT_PROJECT_SECRET_KEY="${LANGFUSE_INIT_PROJECT_SECRET_KEY:-sk-lf-nexent-local}"
+    if [ -z "${LANGFUSE_OTLP_AUTH_HEADER:-}" ]; then
+      LANGFUSE_OTLP_AUTH_HEADER="Basic $(printf "%s:%s" "$LANGFUSE_INIT_PROJECT_PUBLIC_KEY" "$LANGFUSE_INIT_PROJECT_SECRET_KEY" | base64 | tr -d '\n')"
+    fi
+    export LANGFUSE_OTLP_AUTH_HEADER
+    ;;
+  *)
+    echo "❌ Error: unsupported MONITORING_STACK '$MONITORING_STACK'."
+    usage
+    exit 1
+    ;;
+esac
+export OTEL_COLLECTOR_CONFIG_FILE
+
+if docker compose version > /dev/null 2>&1; then
+  COMPOSE_CMD=(docker compose)
+elif command -v docker-compose > /dev/null 2>&1; then
+  COMPOSE_CMD=(docker-compose)
+else
+  echo "❌ Error: Docker Compose is not installed."
+  exit 1
+fi
+
 # Start monitoring services
-echo "🐳 Starting monitoring services..."
-docker-compose -f "$SCRIPT_DIR/docker-compose-monitoring.yml" --env-file "$MONITORING_DIR/monitoring.env" up -d
+echo "🐳 Starting monitoring services with stack: $MONITORING_STACK"
+"${COMPOSE_CMD[@]}" -f "$COMPOSE_FILE" --env-file "$MONITORING_DIR/monitoring.env" "${COMPOSE_PROFILES[@]}" up -d
 
 # Wait for services to be ready
 echo "⏳ Waiting for services to start..."
@@ -58,21 +145,43 @@ check_service() { } # Check OpenTelemetry Collector HTTP receiver -check_service "OpenTelemetry Collector HTTP receiver" "http://localhost:4318" "4318" || true +check_service "OpenTelemetry Collector HTTP receiver" "http://localhost:${OTEL_COLLECTOR_HTTP_PORT:-4318}" "${OTEL_COLLECTOR_HTTP_PORT:-4318}" || true + +case "$MONITORING_STACK" in + phoenix) + check_service "Phoenix UI" "http://localhost:${PHOENIX_PORT:-6006}" "${PHOENIX_PORT:-6006}" || true + ;; + langfuse) + check_service "Langfuse UI" "http://localhost:${LANGFUSE_PORT:-3001}" "${LANGFUSE_PORT:-3001}" || true + ;; +esac echo "" echo "🎉 Monitoring setup complete!" echo "" echo "📊 Access your monitoring tools:" -echo " • OTLP HTTP receiver: http://localhost:4318" -echo " • OTLP gRPC receiver: localhost:4317" -echo " • Configure Phoenix, Langfuse, Jaeger, or another OTLP backend in monitoring.env" +echo " • OTLP HTTP receiver: http://localhost:${OTEL_COLLECTOR_HTTP_PORT:-4318}" +echo " • OTLP gRPC receiver: localhost:${OTEL_COLLECTOR_GRPC_PORT:-4317}" +case "$MONITORING_STACK" in + phoenix) + echo " • Phoenix UI: http://localhost:${PHOENIX_PORT:-6006}" + ;; + langfuse) + echo " • Langfuse UI: http://localhost:${LANGFUSE_PORT:-3001}" + echo " • Langfuse admin: ${LANGFUSE_INIT_USER_EMAIL:-admin@nexent.local} / ${LANGFUSE_INIT_USER_PASSWORD:-nexent-langfuse-admin}" + ;; + collector) + echo " • Configure Phoenix, Langfuse, Jaeger, or another OTLP backend in monitoring.env" + ;; +esac echo "" echo "🔧 To enable monitoring in your Nexent backend:" echo " 1. Set ENABLE_TELEMETRY=true in your .env file" -echo " 2. Install performance dependencies:" +echo " 2. Set OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 for Docker services" +echo " or http://localhost:${OTEL_COLLECTOR_HTTP_PORT:-4318} for a backend running on the host" +echo " 3. Install performance dependencies:" echo " uv sync --extra performance" -echo " 3. Restart your Nexent backend service" +echo " 4. 
Restart your Nexent backend service" echo "" echo "📈 Key Metrics to Monitor:" echo " • Token Generation Rate (tokens/second)" @@ -80,4 +189,5 @@ echo " • Time to First Token (TTFT)" echo " • Request Duration" echo " • Error Rates" echo "" -echo "🛑 To stop monitoring services: docker-compose -f docker-compose-monitoring.yml down" +echo "🛑 To stop monitoring services:" +echo " ${COMPOSE_CMD[*]} -f $COMPOSE_FILE --env-file $MONITORING_DIR/monitoring.env --profile phoenix --profile langfuse down" From d6035af0559003ebbc978c7d7fc65927b81ab34a Mon Sep 17 00:00:00 2001 From: hhhhsc Date: Wed, 6 May 2026 15:46:26 +0800 Subject: [PATCH 04/17] =?UTF-8?q?=E2=9C=A8=20Feat:=20Enhance=20monitoring?= =?UTF-8?q?=20capabilities=20with=20FastAPI=20instrumentation=20and=20Open?= =?UTF-8?q?Telemetry=20integration?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/consts/const.py | 8 + backend/services/agent_service.py | 30 +- backend/utils/monitoring.py | 24 + docker/docker-compose-monitoring.yml | 20 +- docker/monitoring/monitoring.env.example | 4 + make/main/Dockerfile | 2 +- sdk/nexent/core/agents/core_agent.py | 23 +- sdk/nexent/core/agents/nexent_agent.py | 98 +++- sdk/nexent/core/models/openai_llm.py | 34 ++ sdk/nexent/monitor/__init__.py | 48 ++ sdk/nexent/monitor/monitoring.py | 601 ++++++++++++++++++++++- test/sdk/monitor/test_monitoring.py | 258 ++++++++++ 12 files changed, 1119 insertions(+), 31 deletions(-) diff --git a/backend/consts/const.py b/backend/consts/const.py index a22c56afd..ebae779fe 100644 --- a/backend/consts/const.py +++ b/backend/consts/const.py @@ -338,6 +338,14 @@ class VectorDatabaseType(str, Enum): OTEL_EXPORTER_OTLP_METRICS_ENABLED_RAW = os.getenv("OTEL_EXPORTER_OTLP_METRICS_ENABLED") OTEL_EXPORTER_OTLP_METRICS_ENABLED = ( OTEL_EXPORTER_OTLP_METRICS_ENABLED_RAW or "true").lower() == "true" +MONITORING_INSTRUMENT_FASTAPI_RAW = os.getenv("MONITORING_INSTRUMENT_FASTAPI") +MONITORING_INSTRUMENT_FASTAPI = ( + MONITORING_INSTRUMENT_FASTAPI_RAW or "true").lower() == "true" +MONITORING_INSTRUMENT_REQUESTS_RAW = os.getenv("MONITORING_INSTRUMENT_REQUESTS") +MONITORING_INSTRUMENT_REQUESTS = ( + MONITORING_INSTRUMENT_REQUESTS_RAW or "false").lower() == "true" +MONITORING_FASTAPI_EXCLUDED_URLS = os.getenv("MONITORING_FASTAPI_EXCLUDED_URLS", "") +MONITORING_FASTAPI_EXCLUDE_SPANS = os.getenv("MONITORING_FASTAPI_EXCLUDE_SPANS", "receive,send") MONITORING_USE_PLATFORM_SDK_RAW = os.getenv("MONITORING_USE_PLATFORM_SDK") MONITORING_USE_PLATFORM_SDK = (MONITORING_USE_PLATFORM_SDK_RAW or "false").lower() == "true" MONITORING_PROJECT_NAME = os.getenv("MONITORING_PROJECT_NAME", "") diff --git a/backend/services/agent_service.py b/backend/services/agent_service.py index 73c6a4640..6154c87a4 100644 --- a/backend/services/agent_service.py +++ b/backend/services/agent_service.py @@ -74,7 +74,7 @@ from utils.llm_utils import call_llm_for_system_prompt # Monitoring utilities: expose monitoring context for downstream observers -from nexent.monitor import set_monitoring_context +from nexent.monitor import OPENINFERENCE_SPAN_KIND_CHAIN, set_monitoring_context # Import monitoring utilities from utils.monitoring import monitoring_manager @@ -1875,6 +1875,20 @@ async def run_agent_stream( agent_id=agent_request.agent_id, conversation_id=agent_request.conversation_id, ) + monitoring_manager.set_openinference_agent_context( + agent_id=agent_request.agent_id, + conversation_id=agent_request.conversation_id, + user_id=resolved_user_id, + 
tenant_id=resolved_tenant_id, + query=agent_request.query, + is_debug=agent_request.is_debug, + extra_metadata={ + "language": language, + "history_count": len(agent_request.history) if agent_request.history else 0, + "minio_files_count": len(agent_request.minio_files) if agent_request.minio_files else 0, + }, + span_kind=OPENINFERENCE_SPAN_KIND_CHAIN, + ) # Step 2: Save user message (if needed) if not agent_request.is_debug and not skip_user_save: @@ -1912,6 +1926,20 @@ async def run_agent_stream( memory_duration = time.time() - memory_start_time memory_enabled = memory_ctx_preview.user_config.memory_switch + monitoring_manager.set_openinference_agent_context( + agent_id=agent_request.agent_id, + conversation_id=agent_request.conversation_id, + user_id=resolved_user_id, + tenant_id=resolved_tenant_id, + query=agent_request.query, + is_debug=agent_request.is_debug, + memory_enabled=memory_enabled, + extra_metadata={ + "language": language, + "agent_share_option": getattr(memory_ctx_preview.user_config, "agent_share_option", "unknown"), + }, + span_kind=OPENINFERENCE_SPAN_KIND_CHAIN, + ) monitoring_manager.add_span_event("memory_context_build.completed", { "duration": memory_duration, "memory_enabled": memory_enabled, diff --git a/backend/utils/monitoring.py b/backend/utils/monitoring.py index 254809ca6..100e5d351 100644 --- a/backend/utils/monitoring.py +++ b/backend/utils/monitoring.py @@ -40,6 +40,12 @@ async def my_function(): OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION, OTEL_EXPORTER_OTLP_METRICS_ENABLED_RAW, OTEL_EXPORTER_OTLP_METRICS_ENABLED, + MONITORING_INSTRUMENT_FASTAPI_RAW, + MONITORING_INSTRUMENT_FASTAPI, + MONITORING_INSTRUMENT_REQUESTS_RAW, + MONITORING_INSTRUMENT_REQUESTS, + MONITORING_FASTAPI_EXCLUDED_URLS, + MONITORING_FASTAPI_EXCLUDE_SPANS, OTLP_HEADERS, TELEMETRY_SAMPLE_RATE_RAW, TELEMETRY_SAMPLE_RATE, @@ -71,6 +77,12 @@ async def my_function(): OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION, OTEL_EXPORTER_OTLP_METRICS_ENABLED_RAW, OTEL_EXPORTER_OTLP_METRICS_ENABLED, + MONITORING_INSTRUMENT_FASTAPI_RAW, + MONITORING_INSTRUMENT_FASTAPI, + MONITORING_INSTRUMENT_REQUESTS_RAW, + MONITORING_INSTRUMENT_REQUESTS, + MONITORING_FASTAPI_EXCLUDED_URLS, + MONITORING_FASTAPI_EXCLUDE_SPANS, OTLP_HEADERS, TELEMETRY_SAMPLE_RATE_RAW, TELEMETRY_SAMPLE_RATE, @@ -118,6 +130,14 @@ def _build_env_overrides() -> dict: overrides["otlp_headers"] = OTLP_HEADERS if OTEL_EXPORTER_OTLP_METRICS_ENABLED_RAW is not None: overrides["export_metrics"] = OTEL_EXPORTER_OTLP_METRICS_ENABLED + if MONITORING_INSTRUMENT_FASTAPI_RAW is not None: + overrides["instrument_fastapi"] = MONITORING_INSTRUMENT_FASTAPI + if MONITORING_INSTRUMENT_REQUESTS_RAW is not None: + overrides["instrument_requests"] = MONITORING_INSTRUMENT_REQUESTS + if MONITORING_FASTAPI_EXCLUDED_URLS: + overrides["fastapi_excluded_urls"] = MONITORING_FASTAPI_EXCLUDED_URLS + if MONITORING_FASTAPI_EXCLUDE_SPANS: + overrides["fastapi_exclude_spans"] = MONITORING_FASTAPI_EXCLUDE_SPANS if MONITORING_USE_PLATFORM_SDK_RAW is not None: overrides["use_platform_sdk"] = MONITORING_USE_PLATFORM_SDK if MONITORING_PROJECT_NAME: @@ -149,6 +169,10 @@ def _initialize_monitoring(): otlp_protocol=OTEL_EXPORTER_OTLP_PROTOCOL, otlp_headers=OTLP_HEADERS, export_metrics=OTEL_EXPORTER_OTLP_METRICS_ENABLED, + instrument_fastapi=MONITORING_INSTRUMENT_FASTAPI, + instrument_requests=MONITORING_INSTRUMENT_REQUESTS, + fastapi_excluded_urls=MONITORING_FASTAPI_EXCLUDED_URLS, + fastapi_exclude_spans=MONITORING_FASTAPI_EXCLUDE_SPANS, use_platform_sdk=MONITORING_USE_PLATFORM_SDK, 
project_name=MONITORING_PROJECT_NAME or None, telemetry_sample_rate=TELEMETRY_SAMPLE_RATE, diff --git a/docker/docker-compose-monitoring.yml b/docker/docker-compose-monitoring.yml index fb4764acf..c8c1b4078 100644 --- a/docker/docker-compose-monitoring.yml +++ b/docker/docker-compose-monitoring.yml @@ -3,13 +3,15 @@ services: image: otel/opentelemetry-collector-contrib:0.89.0 container_name: nexent-otel-collector command: ["--config=/etc/otel-collector-config.yml"] + environment: + LANGFUSE_OTLP_AUTH_HEADER: ${LANGFUSE_OTLP_AUTH_HEADER:-} volumes: - ${OTEL_COLLECTOR_CONFIG_FILE:-./monitoring/otel-collector-config.yml}:/etc/otel-collector-config.yml ports: - "${OTEL_COLLECTOR_GRPC_PORT:-4317}:4317" - "${OTEL_COLLECTOR_HTTP_PORT:-4318}:4318" networks: - - nexent-network + - nexent_nexent restart: unless-stopped phoenix: @@ -24,7 +26,7 @@ services: - "${PHOENIX_PORT:-6006}:6006" - "${PHOENIX_GRPC_HOST_PORT:-4319}:4317" networks: - - nexent-network + - nexent_nexent restart: unless-stopped langfuse-worker: @@ -83,7 +85,7 @@ services: LANGFUSE_S3_BATCH_EXPORT_SECRET_ACCESS_KEY: ${LANGFUSE_MINIO_ROOT_PASSWORD:-miniosecret} LANGFUSE_S3_BATCH_EXPORT_FORCE_PATH_STYLE: "true" networks: - - nexent-network + - nexent_nexent langfuse-web: image: docker.io/langfuse/langfuse:${LANGFUSE_VERSION:-3} @@ -105,7 +107,7 @@ services: ports: - "${LANGFUSE_PORT:-3001}:3000" networks: - - nexent-network + - nexent_nexent langfuse-clickhouse: image: docker.io/clickhouse/clickhouse-server:${LANGFUSE_CLICKHOUSE_VERSION:-latest} @@ -130,7 +132,7 @@ services: retries: 10 start_period: 1s networks: - - nexent-network + - nexent_nexent langfuse-minio: image: docker.io/minio/minio:${LANGFUSE_MINIO_VERSION:-latest} @@ -154,7 +156,7 @@ services: retries: 5 start_period: 1s networks: - - nexent-network + - nexent_nexent langfuse-redis: image: docker.io/redis:7 @@ -174,7 +176,7 @@ services: timeout: 10s retries: 10 networks: - - nexent-network + - nexent_nexent langfuse-postgres: image: docker.io/postgres:${LANGFUSE_POSTGRES_VERSION:-17} @@ -197,10 +199,10 @@ services: timeout: 3s retries: 10 networks: - - nexent-network + - nexent_nexent networks: - nexent-network: + nexent_nexent: external: true volumes: diff --git a/docker/monitoring/monitoring.env.example b/docker/monitoring/monitoring.env.example index 38c382ab1..fe040a911 100644 --- a/docker/monitoring/monitoring.env.example +++ b/docker/monitoring/monitoring.env.example @@ -16,6 +16,10 @@ OTEL_EXPORTER_OTLP_AUTHORIZATION= OTEL_EXPORTER_OTLP_X_API_KEY= OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION= OTEL_EXPORTER_OTLP_METRICS_ENABLED=true +MONITORING_INSTRUMENT_FASTAPI=true +MONITORING_INSTRUMENT_REQUESTS=false +MONITORING_FASTAPI_EXCLUDED_URLS= +MONITORING_FASTAPI_EXCLUDE_SPANS=receive,send TELEMETRY_SAMPLE_RATE=1.0 LLM_SLOW_REQUEST_THRESHOLD_SECONDS=5.0 diff --git a/make/main/Dockerfile b/make/main/Dockerfile index 665ebcd85..0f4027619 100644 --- a/make/main/Dockerfile +++ b/make/main/Dockerfile @@ -26,7 +26,7 @@ RUN uv sync --no-cache-dir $(test -n "$MIRROR" && echo "-i $MIRROR") && \ uv cache clean # Layer 1: install sdk in link mode COPY sdk /opt/sdk -RUN uv pip install --no-cache-dir /opt/sdk $(test -n "$MIRROR" && echo "-i $MIRROR") && \ +RUN uv pip install --no-cache-dir "/opt/sdk[performance]" $(test -n "$MIRROR" && echo "-i $MIRROR") && \ uv cache clean # Pre-download tiktoken cl100k_base model to avoid network issues during runtime diff --git a/sdk/nexent/core/agents/core_agent.py b/sdk/nexent/core/agents/core_agent.py index 0179f8d83..7c2c2f775 100644 --- 
a/sdk/nexent/core/agents/core_agent.py +++ b/sdk/nexent/core/agents/core_agent.py @@ -17,6 +17,8 @@ from smolagents.utils import AgentExecutionError, AgentGenerationError, truncate_content, AgentMaxStepsError, \ extract_code_from_text +from nexent.monitor import get_monitoring_manager + from ..utils.observer import MessageObserver, ProcessType from jinja2 import Template, StrictUndefined @@ -390,7 +392,25 @@ def _step_stream(self, memory_step: ActionStep) -> Generator[Any]: self.logger.log_code(title="Executing parsed code:", content=code_action, level=LogLevel.INFO) try: - code_output = self.python_executor(code_action) + monitoring_manager = get_monitoring_manager() + with monitoring_manager.trace_tool_call( + "python_interpreter", + self.name, + {"code": code_action, "step_number": memory_step.step_number}, + ): + code_output = self.python_executor(code_action) + monitoring_manager.set_tool_output({ + "output": getattr(code_output, "output", None), + "is_final_answer": getattr(code_output, "is_final_answer", False), + "logs": getattr(code_output, "logs", ""), + }) + if getattr(code_output, "is_final_answer", False): + with monitoring_manager.trace_tool_call( + "FinalAnswerTool", + self.name, + {"step_number": memory_step.step_number}, + ): + monitoring_manager.set_tool_output(code_output.output) execution_outputs_console = [] if len(code_output.logs) > 0: # Record execution results @@ -772,4 +792,3 @@ def _handle_max_steps_reached(self, task: str) -> Any: self.memory.steps.append(final_memory_step) return model_output - diff --git a/sdk/nexent/core/agents/nexent_agent.py b/sdk/nexent/core/agents/nexent_agent.py index 023c8348e..49f3556ab 100644 --- a/sdk/nexent/core/agents/nexent_agent.py +++ b/sdk/nexent/core/agents/nexent_agent.py @@ -1,12 +1,16 @@ import json +import functools +import inspect import re import time from threading import Event -from typing import List +from typing import Any, Callable, Dict, List from smolagents import ActionStep, AgentText, TaskStep, Timing from smolagents.tools import Tool +from nexent.monitor import get_monitoring_manager + from ..models.openai_llm import OpenAIModel from ..tools import * # Used for tool creation, do not delete!!! 
from ..utils.constants import THINK_TAG_PATTERN, THINK_PREFIX_PATTERN @@ -16,6 +20,88 @@ from .agent_context import ContextManager +def _tool_name(tool_obj: Any) -> str: + """Return the most useful tool name for monitoring.""" + return ( + getattr(tool_obj, "name", None) + or getattr(tool_obj, "__name__", None) + or type(tool_obj).__name__ + ) + + +def _build_tool_input(callable_obj: Callable, args: tuple, kwargs: Dict[str, Any]) -> Dict[str, Any]: + """Best-effort conversion of tool call arguments into span input attributes.""" + try: + signature = inspect.signature(callable_obj) + bound = signature.bind_partial(*args, **kwargs) + return dict(bound.arguments) + except (TypeError, ValueError): + tool_input: Dict[str, Any] = {} + if args: + tool_input["args"] = list(args) + if kwargs: + tool_input.update(kwargs) + return tool_input + + +def _wrap_tool_with_monitoring(tool_obj: Any, agent_name: str) -> Any: + """Wrap smolagents tools and callables with a tool span.""" + if getattr(tool_obj, "_nexent_monitoring_wrapped", False): + return tool_obj + + monitoring_manager = get_monitoring_manager() + tool_name = _tool_name(tool_obj) + + if hasattr(tool_obj, "forward") and callable(tool_obj.forward): + original_forward = tool_obj.forward + + if inspect.iscoroutinefunction(original_forward): + @functools.wraps(original_forward) + async def monitored_forward(*args, **kwargs): + tool_input = _build_tool_input(original_forward, args, kwargs) + with monitoring_manager.trace_tool_call(tool_name, agent_name, tool_input): + result = await original_forward(*args, **kwargs) + monitoring_manager.set_tool_output(result) + return result + else: + @functools.wraps(original_forward) + def monitored_forward(*args, **kwargs): + tool_input = _build_tool_input(original_forward, args, kwargs) + with monitoring_manager.trace_tool_call(tool_name, agent_name, tool_input): + result = original_forward(*args, **kwargs) + monitoring_manager.set_tool_output(result) + return result + + tool_obj.forward = monitored_forward + setattr(tool_obj, "_nexent_monitoring_wrapped", True) + return tool_obj + + if callable(tool_obj): + original_callable = tool_obj + + if inspect.iscoroutinefunction(original_callable): + @functools.wraps(original_callable) + async def monitored_callable(*args, **kwargs): + tool_input = _build_tool_input(original_callable, args, kwargs) + with monitoring_manager.trace_tool_call(tool_name, agent_name, tool_input): + result = await original_callable(*args, **kwargs) + monitoring_manager.set_tool_output(result) + return result + else: + @functools.wraps(original_callable) + def monitored_callable(*args, **kwargs): + tool_input = _build_tool_input(original_callable, args, kwargs) + with monitoring_manager.trace_tool_call(tool_name, agent_name, tool_input): + result = original_callable(*args, **kwargs) + monitoring_manager.set_tool_output(result) + return result + + setattr(monitored_callable, "_nexent_monitoring_wrapped", True) + return monitored_callable + + return tool_obj + + class NexentAgent: def __init__(self, observer: MessageObserver, model_config_list: List[ModelConfig], @@ -239,7 +325,13 @@ def create_single_agent(self, agent_config: AgentConfig): prompt_templates = agent_config.prompt_templates try: - tool_list = [self.create_tool(tool_config) for tool_config in agent_config.tools] + tool_list = [ + _wrap_tool_with_monitoring( + self.create_tool(tool_config), + agent_config.name, + ) + for tool_config in agent_config.tools + ] except Exception as e: raise ValueError(f"Error in creating tool: {e}") 
@@ -493,4 +585,4 @@ def _val_width(vals, extra_val=None): # Optional: write to local file with open("nexent_context_metrics.log", "a", encoding="utf-8") as f: - f.write("\n".join(lines) + "\n") \ No newline at end of file + f.write("\n".join(lines) + "\n") diff --git a/sdk/nexent/core/models/openai_llm.py b/sdk/nexent/core/models/openai_llm.py index 99aa3fdcb..dda1dc041 100644 --- a/sdk/nexent/core/models/openai_llm.py +++ b/sdk/nexent/core/models/openai_llm.py @@ -4,12 +4,14 @@ _monitoring_operation, _monitoring_display_name, _detect_model_type, + OPENINFERENCE_INPUT_VALUE, ) from ..utils.token_estimation import estimate_tokens_text import logging import threading import asyncio import time +import json from typing import List, Optional, Dict, Any from openai.types.chat.chat_completion_message import ChatCompletionMessage @@ -78,6 +80,37 @@ def __call__(self, messages: List[Dict[str, Any]], stop_sequences: Optional[List response_format: dict[str, str] | None = None, tools_to_call_from: Optional[List[Tool]] = None, _token_tracker=None, **kwargs, ) -> ChatMessage: _monitoring_operation.set("chat_completion") + if _token_tracker is None: + invocation_parameters = { + "temperature": self.temperature, + "top_p": self.top_p, + **{k: v for k, v in kwargs.items() if isinstance(v, (str, int, float, bool))}, + } + trace_attributes = { + "llm.invocation_parameters": json.dumps(invocation_parameters, ensure_ascii=False), + "model_id": self.model_id, + } + try: + trace_attributes[OPENINFERENCE_INPUT_VALUE] = json.dumps(messages or [], ensure_ascii=False) + except (TypeError, ValueError): + trace_attributes[OPENINFERENCE_INPUT_VALUE] = str(messages) + + with self._monitoring.trace_llm_request( + f"{self.display_name or self.model_id}.generate", + self.model_id, + **trace_attributes, + ) as span: + token_tracker = self._monitoring.create_token_tracker( + self.model_id, span) + return self.__call__( + messages=messages, + stop_sequences=stop_sequences, + response_format=response_format, + tools_to_call_from=tools_to_call_from, + _token_tracker=token_tracker, + **kwargs, + ) + token_tracker = _token_tracker or self._monitoring.create_token_tracker( self.model_id) @@ -238,6 +271,7 @@ def __call__(self, messages: List[Dict[str, Any]], stop_sequences: Optional[List if token_tracker: total_duration = time.time() - stream_start_time + self._monitoring.set_openinference_output(model_output) self._monitoring.add_span_event("completion_finished", { "total_duration": total_duration, "output_length": len(model_output), diff --git a/sdk/nexent/monitor/__init__.py b/sdk/nexent/monitor/__init__.py index 03efe90f1..2632d911e 100644 --- a/sdk/nexent/monitor/__init__.py +++ b/sdk/nexent/monitor/__init__.py @@ -19,6 +19,30 @@ get_monitoring_context, set_monitoring_operation, record_model_call, + OPENINFERENCE_SPAN_KIND, + OPENINFERENCE_SPAN_KIND_AGENT, + OPENINFERENCE_SPAN_KIND_CHAIN, + OPENINFERENCE_SPAN_KIND_LLM, + OPENINFERENCE_SPAN_KIND_TOOL, + OPENINFERENCE_SPAN_KIND_RETRIEVER, + OPENINFERENCE_INPUT_VALUE, + OPENINFERENCE_OUTPUT_VALUE, + OPENINFERENCE_METADATA, + OPENINFERENCE_SESSION_ID, + OPENINFERENCE_USER_ID, + OPENINFERENCE_TAG_TAGS, + LANGFUSE_OBSERVATION_TYPE, + LANGFUSE_OBSERVATION_INPUT, + LANGFUSE_OBSERVATION_OUTPUT, + LANGFUSE_OBSERVATION_MODEL_NAME, + LANGFUSE_OBSERVATION_MODEL_PARAMETERS, + LANGFUSE_OBSERVATION_USAGE_DETAILS, + LANGFUSE_TRACE_NAME, + LANGFUSE_TRACE_INPUT, + LANGFUSE_TRACE_OUTPUT, + LANGFUSE_TRACE_TAGS, + LANGFUSE_SESSION_ID, + LANGFUSE_USER_ID, ) __version__ = "0.2.0" @@ -35,4 +59,28 @@ 
'get_monitoring_context', 'set_monitoring_operation', 'record_model_call', + 'OPENINFERENCE_SPAN_KIND', + 'OPENINFERENCE_SPAN_KIND_AGENT', + 'OPENINFERENCE_SPAN_KIND_CHAIN', + 'OPENINFERENCE_SPAN_KIND_LLM', + 'OPENINFERENCE_SPAN_KIND_TOOL', + 'OPENINFERENCE_SPAN_KIND_RETRIEVER', + 'OPENINFERENCE_INPUT_VALUE', + 'OPENINFERENCE_OUTPUT_VALUE', + 'OPENINFERENCE_METADATA', + 'OPENINFERENCE_SESSION_ID', + 'OPENINFERENCE_USER_ID', + 'OPENINFERENCE_TAG_TAGS', + 'LANGFUSE_OBSERVATION_TYPE', + 'LANGFUSE_OBSERVATION_INPUT', + 'LANGFUSE_OBSERVATION_OUTPUT', + 'LANGFUSE_OBSERVATION_MODEL_NAME', + 'LANGFUSE_OBSERVATION_MODEL_PARAMETERS', + 'LANGFUSE_OBSERVATION_USAGE_DETAILS', + 'LANGFUSE_TRACE_NAME', + 'LANGFUSE_TRACE_INPUT', + 'LANGFUSE_TRACE_OUTPUT', + 'LANGFUSE_TRACE_TAGS', + 'LANGFUSE_SESSION_ID', + 'LANGFUSE_USER_ID', ] diff --git a/sdk/nexent/monitor/monitoring.py b/sdk/nexent/monitor/monitoring.py index 3eed851a6..f08eff140 100644 --- a/sdk/nexent/monitor/monitoring.py +++ b/sdk/nexent/monitor/monitoring.py @@ -119,6 +119,36 @@ def get_monitoring_context() -> Dict[str, Any]: DEFAULT_OTLP_ENDPOINT = "http://localhost:4318" TRACE_PATH = "/v1/traces" METRIC_PATH = "/v1/metrics" + +OPENINFERENCE_SPAN_KIND = "openinference.span.kind" +OPENINFERENCE_SPAN_KIND_AGENT = "AGENT" +OPENINFERENCE_SPAN_KIND_CHAIN = "CHAIN" +OPENINFERENCE_SPAN_KIND_LLM = "LLM" +OPENINFERENCE_SPAN_KIND_TOOL = "TOOL" +OPENINFERENCE_SPAN_KIND_RETRIEVER = "RETRIEVER" +OPENINFERENCE_INPUT_VALUE = "input.value" +OPENINFERENCE_OUTPUT_VALUE = "output.value" +OPENINFERENCE_METADATA = "metadata" +OPENINFERENCE_SESSION_ID = "session.id" +OPENINFERENCE_USER_ID = "user.id" +OPENINFERENCE_TAG_TAGS = "tag.tags" + +LANGFUSE_OBSERVATION_TYPE = "langfuse.observation.type" +LANGFUSE_OBSERVATION_INPUT = "langfuse.observation.input" +LANGFUSE_OBSERVATION_OUTPUT = "langfuse.observation.output" +LANGFUSE_OBSERVATION_MODEL_NAME = "langfuse.observation.model.name" +LANGFUSE_OBSERVATION_MODEL_PARAMETERS = "langfuse.observation.model.parameters" +LANGFUSE_OBSERVATION_USAGE_DETAILS = "langfuse.observation.usage_details" +LANGFUSE_TRACE_NAME = "langfuse.trace.name" +LANGFUSE_TRACE_INPUT = "langfuse.trace.input" +LANGFUSE_TRACE_OUTPUT = "langfuse.trace.output" +LANGFUSE_TRACE_TAGS = "langfuse.trace.tags" +LANGFUSE_SESSION_ID = "langfuse.session.id" +LANGFUSE_USER_ID = "langfuse.user.id" + +AGENT_OPERATION_NAMES = { + "agent.run", +} SUPPORTED_PROVIDERS = {"otlp", "phoenix", "langfuse", "jaeger", "custom"} @@ -256,6 +286,10 @@ class MonitoringConfig: otlp_headers: Dict[str, str] = field(default_factory=dict) export_traces: bool = True export_metrics: bool = True + instrument_fastapi: bool = True + instrument_requests: bool = False + fastapi_excluded_urls: str = "" + fastapi_exclude_spans: List[str] = field(default_factory=lambda: ["receive", "send"]) use_platform_sdk: bool = False project_name: Optional[str] = None telemetry_sample_rate: float = 1.0 @@ -304,6 +338,10 @@ def from_file(cls, config_file: str, overrides: Optional[Dict[str, Any]] = None) "otlp_headers": headers, "export_traces": exporter.get("export_traces", data.get("export_traces")), "export_metrics": exporter.get("export_metrics", data.get("export_metrics")), + "instrument_fastapi": data.get("instrument_fastapi"), + "instrument_requests": data.get("instrument_requests"), + "fastapi_excluded_urls": data.get("fastapi_excluded_urls"), + "fastapi_exclude_spans": data.get("fastapi_exclude_spans"), "use_platform_sdk": exporter.get("use_platform_sdk", data.get("use_platform_sdk")), "project_name": 
exporter.get("project_name", data.get("project_name")), "telemetry_sample_rate": data.get("telemetry_sample_rate"), @@ -331,6 +369,20 @@ def __post_init__(self): self.enable_telemetry = _as_bool(self.enable_telemetry) self.export_traces = _as_bool(self.export_traces, True) self.export_metrics = _as_bool(self.export_metrics, True) + self.instrument_fastapi = _as_bool(self.instrument_fastapi, True) + self.instrument_requests = _as_bool(self.instrument_requests, False) + if isinstance(self.fastapi_exclude_spans, str): + self.fastapi_exclude_spans = [ + item.strip() + for item in self.fastapi_exclude_spans.split(",") + if item.strip() + ] + else: + self.fastapi_exclude_spans = [ + str(item).strip() + for item in self.fastapi_exclude_spans + if str(item).strip() + ] self.use_platform_sdk = _as_bool(self.use_platform_sdk) self.telemetry_sample_rate = _as_float(self.telemetry_sample_rate, 1.0) self.llm_slow_request_threshold_seconds = _as_float( @@ -571,8 +623,10 @@ def _init_telemetry_otlp(self) -> None: unit="errors" ) - # Auto-instrument other libraries - RequestsInstrumentor().instrument() + # Auto-instrument outbound HTTP calls only when explicitly enabled. + # AI observability UIs otherwise get noisy generic HTTP spans. + if self._config.instrument_requests: + RequestsInstrumentor().instrument() logger.info( f"OTLP telemetry initialized successfully for service: {self._config.service_name}, " @@ -632,8 +686,20 @@ def tracer(self): def setup_fastapi_app(self, app) -> bool: """Setup monitoring for a FastAPI application.""" try: - if self.is_enabled and app and OPENTELEMETRY_AVAILABLE: - FastAPIInstrumentor.instrument_app(app) + if self.is_enabled and app and OPENTELEMETRY_AVAILABLE and self._config: + if not self._config.instrument_fastapi: + logger.info("FastAPI auto instrumentation is disabled") + return False + + instrument_kwargs: Dict[str, Any] = {} + if self._config.fastapi_excluded_urls: + instrument_kwargs["excluded_urls"] = self._config.fastapi_excluded_urls + + signature = inspect.signature(FastAPIInstrumentor.instrument_app) + if "exclude_spans" in signature.parameters: + instrument_kwargs["exclude_spans"] = self._config.fastapi_exclude_spans + + FastAPIInstrumentor.instrument_app(app, **instrument_kwargs) logger.info( "FastAPI application monitoring initialized successfully" ) @@ -648,6 +714,419 @@ def setup_fastapi_app(self, app) -> bool: logger.error(f"Failed to initialize FastAPI monitoring: {e}") return False + @staticmethod + def _infer_openinference_span_kind(operation_name: str) -> str: + """Infer OpenInference span kind for Nexent service operations.""" + if operation_name in AGENT_OPERATION_NAMES: + return OPENINFERENCE_SPAN_KIND_AGENT + return OPENINFERENCE_SPAN_KIND_CHAIN + + @staticmethod + def _to_openinference_json_value(value: Any) -> str: + """Convert a value to the JSON-string form expected by OpenInference.""" + if isinstance(value, str): + return value + try: + return json.dumps(value, ensure_ascii=False) + except (TypeError, ValueError): + return str(value) + + @staticmethod + def _to_langfuse_attribute_value(value: Any) -> Any: + """Convert metadata values to Langfuse filterable attribute values.""" + if isinstance(value, (str, int, float, bool)): + return value + try: + return json.dumps(value, ensure_ascii=False) + except (TypeError, ValueError): + return str(value) + + @staticmethod + def _to_langfuse_observation_type(span_kind: str) -> str: + """Map OpenInference span kind to Langfuse observation type.""" + return { + OPENINFERENCE_SPAN_KIND_AGENT: 
"agent", + OPENINFERENCE_SPAN_KIND_CHAIN: "chain", + OPENINFERENCE_SPAN_KIND_LLM: "generation", + OPENINFERENCE_SPAN_KIND_TOOL: "tool", + OPENINFERENCE_SPAN_KIND_RETRIEVER: "retriever", + }.get(span_kind, "span") + + def build_langfuse_attributes( + self, + span_kind: str, + input_value: Any = None, + output_value: Any = None, + metadata: Optional[Dict[str, Any]] = None, + tags: Optional[List[str]] = None, + session_id: Optional[Any] = None, + user_id: Optional[Any] = None, + trace_name: Optional[str] = None, + trace_level: bool = False, + ) -> Dict[str, Any]: + """Build Langfuse OTel attributes for trace and observation mapping.""" + attrs: Dict[str, Any] = { + LANGFUSE_OBSERVATION_TYPE: self._to_langfuse_observation_type( + span_kind), + } + if input_value is not None: + input_json = self._to_openinference_json_value(input_value) + attrs[LANGFUSE_OBSERVATION_INPUT] = input_json + if trace_level: + attrs[LANGFUSE_TRACE_INPUT] = input_json + if output_value is not None: + output_json = self._to_openinference_json_value(output_value) + attrs[LANGFUSE_OBSERVATION_OUTPUT] = output_json + if trace_level: + attrs[LANGFUSE_TRACE_OUTPUT] = output_json + if metadata: + for key, value in metadata.items(): + if value is not None: + attrs[f"langfuse.observation.metadata.{key}"] = ( + self._to_langfuse_attribute_value(value) + ) + if trace_level: + attrs[f"langfuse.trace.metadata.{key}"] = ( + self._to_langfuse_attribute_value(value) + ) + if tags is not None: + attrs[LANGFUSE_TRACE_TAGS] = tags + if session_id is not None: + attrs[LANGFUSE_SESSION_ID] = str(session_id) + if user_id is not None: + attrs[LANGFUSE_USER_ID] = str(user_id) + if trace_name: + attrs[LANGFUSE_TRACE_NAME] = trace_name + return attrs + + def build_openinference_attributes( + self, + span_kind: str, + input_value: Any = None, + output_value: Any = None, + metadata: Optional[Dict[str, Any]] = None, + tags: Optional[List[str]] = None, + session_id: Optional[Any] = None, + user_id: Optional[Any] = None, + attributes: Optional[Dict[str, Any]] = None, + ) -> Dict[str, Any]: + """Build Phoenix/OpenInference attributes for a custom span.""" + attrs: Dict[str, Any] = { + OPENINFERENCE_SPAN_KIND: span_kind, + } + if input_value is not None: + attrs[OPENINFERENCE_INPUT_VALUE] = self._to_openinference_json_value( + input_value) + if output_value is not None: + attrs[OPENINFERENCE_OUTPUT_VALUE] = self._to_openinference_json_value( + output_value) + if metadata is not None: + attrs[OPENINFERENCE_METADATA] = self._to_openinference_json_value( + metadata) + if tags is not None: + attrs[OPENINFERENCE_TAG_TAGS] = self._to_openinference_json_value( + tags) + if session_id is not None: + attrs[OPENINFERENCE_SESSION_ID] = str(session_id) + if user_id is not None: + attrs[OPENINFERENCE_USER_ID] = str(user_id) + attrs.update(self.build_langfuse_attributes( + span_kind=span_kind, + input_value=input_value, + output_value=output_value, + metadata=metadata, + tags=tags, + session_id=session_id, + user_id=user_id, + trace_name=attributes.get(LANGFUSE_TRACE_NAME) if attributes else None, + trace_level=span_kind == OPENINFERENCE_SPAN_KIND_AGENT, + )) + if attributes: + attrs.update(attributes) + return attrs + + @contextmanager + def trace_operation( + self, + operation_name: str, + span_kind: str = OPENINFERENCE_SPAN_KIND_CHAIN, + **attributes: Any + ) -> Iterator[Optional[Any]]: + """Trace a non-LLM operation using OpenInference span kind semantics.""" + if not self.is_enabled or not OPENTELEMETRY_AVAILABLE or not self._tracer: + yield None + return + + 
span_attrs = { + OPENINFERENCE_SPAN_KIND: span_kind, + LANGFUSE_OBSERVATION_TYPE: self._to_langfuse_observation_type( + span_kind), + } + span_attrs.update(attributes) + + with self._tracer.start_as_current_span( + operation_name, + attributes=span_attrs + ) as span: + try: + yield span + except Exception as e: + span.set_status(Status(StatusCode.ERROR, str(e))) + span.set_attribute("error.type", type(e).__name__) + span.set_attribute("error.message", str(e)) + raise + + @contextmanager + def trace_openinference_span( + self, + operation_name: str, + span_kind: str, + input_value: Any = None, + output_value: Any = None, + metadata: Optional[Dict[str, Any]] = None, + tags: Optional[List[str]] = None, + session_id: Optional[Any] = None, + user_id: Optional[Any] = None, + attributes: Optional[Dict[str, Any]] = None, + ) -> Iterator[Optional[Any]]: + """Trace a custom Phoenix/OpenInference span.""" + span_attrs = self.build_openinference_attributes( + span_kind=span_kind, + input_value=input_value, + output_value=output_value, + metadata=metadata, + tags=tags, + session_id=session_id, + user_id=user_id, + attributes=attributes, + ) + with self.trace_operation(operation_name, span_kind, **span_attrs) as span: + yield span + + def trace_agent( + self, + operation_name: str, + input_value: Any = None, + output_value: Any = None, + metadata: Optional[Dict[str, Any]] = None, + tags: Optional[List[str]] = None, + session_id: Optional[Any] = None, + user_id: Optional[Any] = None, + attributes: Optional[Dict[str, Any]] = None, + ) -> Iterator[Optional[Any]]: + """Trace a custom agent span.""" + return self.trace_openinference_span( + operation_name=operation_name, + span_kind=OPENINFERENCE_SPAN_KIND_AGENT, + input_value=input_value, + output_value=output_value, + metadata=metadata, + tags=tags, + session_id=session_id, + user_id=user_id, + attributes=attributes, + ) + + def trace_chain( + self, + operation_name: str, + input_value: Any = None, + output_value: Any = None, + metadata: Optional[Dict[str, Any]] = None, + tags: Optional[List[str]] = None, + session_id: Optional[Any] = None, + user_id: Optional[Any] = None, + attributes: Optional[Dict[str, Any]] = None, + ) -> Iterator[Optional[Any]]: + """Trace a custom chain span.""" + return self.trace_openinference_span( + operation_name=operation_name, + span_kind=OPENINFERENCE_SPAN_KIND_CHAIN, + input_value=input_value, + output_value=output_value, + metadata=metadata, + tags=tags, + session_id=session_id, + user_id=user_id, + attributes=attributes, + ) + + def trace_retriever( + self, + operation_name: str, + input_value: Any = None, + output_value: Any = None, + metadata: Optional[Dict[str, Any]] = None, + tags: Optional[List[str]] = None, + session_id: Optional[Any] = None, + user_id: Optional[Any] = None, + attributes: Optional[Dict[str, Any]] = None, + ) -> Iterator[Optional[Any]]: + """Trace a custom retriever span.""" + return self.trace_openinference_span( + operation_name=operation_name, + span_kind=OPENINFERENCE_SPAN_KIND_RETRIEVER, + input_value=input_value, + output_value=output_value, + metadata=metadata, + tags=tags, + session_id=session_id, + user_id=user_id, + attributes=attributes, + ) + + def set_openinference_output( + self, + output_value: Any, + metadata: Optional[Dict[str, Any]] = None, + tags: Optional[List[str]] = None, + ) -> None: + """Attach OpenInference output fields to the current span.""" + attrs = self.build_openinference_attributes( + span_kind="", + output_value=output_value, + metadata=metadata, + tags=tags, + ) + 
attrs.pop(OPENINFERENCE_SPAN_KIND, None) + attrs.pop(LANGFUSE_OBSERVATION_TYPE, None) + self.set_span_attributes(**attrs) + + def set_openinference_agent_context( + self, + agent_id: Optional[int] = None, + conversation_id: Optional[int] = None, + user_id: Optional[str] = None, + tenant_id: Optional[str] = None, + agent_name: Optional[str] = None, + query: Optional[str] = None, + is_debug: Optional[bool] = None, + memory_enabled: Optional[bool] = None, + extra_metadata: Optional[Dict[str, Any]] = None, + span_kind: Optional[str] = OPENINFERENCE_SPAN_KIND_AGENT, + ) -> None: + """Attach Phoenix/OpenInference agent dimensions to the current span.""" + metadata = { + "agent_id": agent_id, + "agent_name": agent_name, + "tenant_id": tenant_id, + "conversation_id": conversation_id, + "is_debug": is_debug, + "memory_enabled": memory_enabled, + } + if extra_metadata: + metadata.update(extra_metadata) + metadata = {k: v for k, v in metadata.items() if v is not None} + + tags = ["nexent", "agent"] + if agent_id is not None: + tags.append(f"agent_id:{agent_id}") + if tenant_id: + tags.append(f"tenant_id:{tenant_id}") + if is_debug is True: + tags.append("debug") + if memory_enabled is True: + tags.append("memory_enabled") + elif memory_enabled is False: + tags.append("memory_disabled") + + effective_span_kind = span_kind or "" + attrs: Dict[str, Any] = { + OPENINFERENCE_METADATA: json.dumps(metadata, ensure_ascii=False), + OPENINFERENCE_TAG_TAGS: json.dumps(tags, ensure_ascii=False), + LANGFUSE_TRACE_TAGS: tags, + } + if span_kind: + attrs[OPENINFERENCE_SPAN_KIND] = span_kind + attrs[LANGFUSE_OBSERVATION_TYPE] = self._to_langfuse_observation_type( + effective_span_kind) + if query is not None: + attrs[OPENINFERENCE_INPUT_VALUE] = query + attrs[LANGFUSE_OBSERVATION_INPUT] = query + attrs[LANGFUSE_TRACE_INPUT] = query + if conversation_id is not None: + attrs[OPENINFERENCE_SESSION_ID] = str(conversation_id) + attrs[LANGFUSE_SESSION_ID] = str(conversation_id) + attrs["conversation.id"] = conversation_id + if user_id: + attrs[OPENINFERENCE_USER_ID] = str(user_id) + attrs[LANGFUSE_USER_ID] = str(user_id) + if tenant_id: + attrs["tenant.id"] = str(tenant_id) + if agent_id is not None: + attrs["agent.id"] = agent_id + if agent_name: + attrs["agent.name"] = agent_name + attrs[LANGFUSE_TRACE_NAME] = agent_name + + for key, value in metadata.items(): + attrs[f"langfuse.trace.metadata.{key}"] = ( + self._to_langfuse_attribute_value(value) + ) + attrs[f"langfuse.observation.metadata.{key}"] = ( + self._to_langfuse_attribute_value(value) + ) + + self.set_span_attributes(**attrs) + + def apply_openinference_context_attributes( + self, + span_kind: Optional[str] = None, + ) -> None: + """Attach request-scoped OpenInference context to the current span.""" + context = get_monitoring_context() + agent_id = context.get("agent_id") + conversation_id = context.get("conversation_id") + user_id = context.get("user_id") + tenant_id = context.get("tenant_id") + if not any([agent_id is not None, conversation_id is not None, user_id, tenant_id]): + return + + metadata = { + "agent_id": agent_id, + "tenant_id": tenant_id, + "conversation_id": conversation_id, + } + metadata = {k: v for k, v in metadata.items() if v is not None} + + tags = ["nexent"] + if span_kind == OPENINFERENCE_SPAN_KIND_AGENT: + tags.append("agent") + if agent_id is not None: + tags.append(f"agent_id:{agent_id}") + if tenant_id: + tags.append(f"tenant_id:{tenant_id}") + + attrs: Dict[str, Any] = { + OPENINFERENCE_METADATA: json.dumps(metadata, 
ensure_ascii=False), + OPENINFERENCE_TAG_TAGS: json.dumps(tags, ensure_ascii=False), + LANGFUSE_TRACE_TAGS: tags, + } + if span_kind: + attrs[OPENINFERENCE_SPAN_KIND] = span_kind + attrs[LANGFUSE_OBSERVATION_TYPE] = self._to_langfuse_observation_type( + span_kind) + if conversation_id is not None: + attrs[OPENINFERENCE_SESSION_ID] = str(conversation_id) + attrs[LANGFUSE_SESSION_ID] = str(conversation_id) + attrs["conversation.id"] = conversation_id + if user_id: + attrs[OPENINFERENCE_USER_ID] = str(user_id) + attrs[LANGFUSE_USER_ID] = str(user_id) + if tenant_id: + attrs["tenant.id"] = str(tenant_id) + if agent_id is not None: + attrs["agent.id"] = agent_id + for key, value in metadata.items(): + attrs[f"langfuse.trace.metadata.{key}"] = ( + self._to_langfuse_attribute_value(value) + ) + attrs[f"langfuse.observation.metadata.{key}"] = ( + self._to_langfuse_attribute_value(value) + ) + + self.set_span_attributes(**attrs) + @contextmanager def trace_llm_request(self, operation_name: str, model_name: str, **attributes: Any) -> Iterator[Optional[Any]]: """ @@ -660,11 +1139,39 @@ def trace_llm_request(self, operation_name: str, model_name: str, **attributes: # OpenInference semantic attributes openinference_attrs = { + OPENINFERENCE_SPAN_KIND: attributes.pop( + OPENINFERENCE_SPAN_KIND, + OPENINFERENCE_SPAN_KIND_LLM, + ), + LANGFUSE_OBSERVATION_TYPE: "generation", + LANGFUSE_OBSERVATION_MODEL_NAME: model_name, "llm.model_name": model_name, "llm.operation.name": operation_name, + "gen_ai.request.model": model_name, } # Add user-provided attributes openinference_attrs.update(attributes) + if ( + OPENINFERENCE_INPUT_VALUE in openinference_attrs + and LANGFUSE_OBSERVATION_INPUT not in openinference_attrs + ): + openinference_attrs[LANGFUSE_OBSERVATION_INPUT] = ( + openinference_attrs[OPENINFERENCE_INPUT_VALUE] + ) + if ( + OPENINFERENCE_OUTPUT_VALUE in openinference_attrs + and LANGFUSE_OBSERVATION_OUTPUT not in openinference_attrs + ): + openinference_attrs[LANGFUSE_OBSERVATION_OUTPUT] = ( + openinference_attrs[OPENINFERENCE_OUTPUT_VALUE] + ) + if ( + "llm.invocation_parameters" in openinference_attrs + and LANGFUSE_OBSERVATION_MODEL_PARAMETERS not in openinference_attrs + ): + openinference_attrs[LANGFUSE_OBSERVATION_MODEL_PARAMETERS] = ( + openinference_attrs["llm.invocation_parameters"] + ) with self._tracer.start_as_current_span( operation_name, @@ -711,10 +1218,18 @@ def trace_agent_step( # OpenInference semantic attributes for agent openinference_attrs = { + OPENINFERENCE_SPAN_KIND: attributes.pop( + OPENINFERENCE_SPAN_KIND, + OPENINFERENCE_SPAN_KIND_CHAIN, + ), "agent.name": agent_name, "agent.step.name": step_name, "agent.step.type": step_type, } + openinference_attrs[LANGFUSE_OBSERVATION_TYPE] = ( + self._to_langfuse_observation_type( + openinference_attrs[OPENINFERENCE_SPAN_KIND]) + ) openinference_attrs.update(attributes) span_name = f"agent.{step_name}" @@ -764,18 +1279,31 @@ def trace_tool_call( # OpenInference semantic attributes for tool call openinference_attrs = { + OPENINFERENCE_SPAN_KIND: attributes.pop( + OPENINFERENCE_SPAN_KIND, + OPENINFERENCE_SPAN_KIND_TOOL, + ), + LANGFUSE_OBSERVATION_TYPE: "tool", "agent.name": agent_name, "agent.step.name": tool_name, "agent.step.type": "tool_call", "agent.tool.name": tool_name, + "tool.name": tool_name, } # Add tool input as JSON string if tool_input: try: - openinference_attrs["agent.tool.input"] = json.dumps(tool_input, ensure_ascii=False) + tool_input_json = json.dumps(tool_input, ensure_ascii=False) + 
openinference_attrs["agent.tool.input"] = tool_input_json + openinference_attrs["tool.parameters"] = tool_input_json + openinference_attrs[OPENINFERENCE_INPUT_VALUE] = tool_input_json + openinference_attrs[LANGFUSE_OBSERVATION_INPUT] = tool_input_json except (TypeError, ValueError): openinference_attrs["agent.tool.input"] = str(tool_input) + openinference_attrs["tool.parameters"] = str(tool_input) + openinference_attrs[OPENINFERENCE_INPUT_VALUE] = str(tool_input) + openinference_attrs[LANGFUSE_OBSERVATION_INPUT] = str(tool_input) openinference_attrs.update(attributes) @@ -822,10 +1350,17 @@ def set_tool_output(self, output: Any) -> None: try: if isinstance(output, str): span.set_attribute("agent.tool.output", output) + span.set_attribute(OPENINFERENCE_OUTPUT_VALUE, output) + span.set_attribute(LANGFUSE_OBSERVATION_OUTPUT, output) else: - span.set_attribute("agent.tool.output", json.dumps(output, ensure_ascii=False)) + output_json = json.dumps(output, ensure_ascii=False) + span.set_attribute("agent.tool.output", output_json) + span.set_attribute(OPENINFERENCE_OUTPUT_VALUE, output_json) + span.set_attribute(LANGFUSE_OBSERVATION_OUTPUT, output_json) except (TypeError, ValueError): span.set_attribute("agent.tool.output", str(output)) + span.set_attribute(OPENINFERENCE_OUTPUT_VALUE, str(output)) + span.set_attribute(LANGFUSE_OBSERVATION_OUTPUT, str(output)) def get_current_span(self) -> Optional[Any]: """Get the current active span.""" @@ -899,7 +1434,7 @@ def decorator(func: F) -> F: op_name = operation_name or f"{func.__module__}.{func.__name__}" exclude_set = set(exclude_params or []) - def prepare_span(span, kwargs: Dict[str, Any]) -> None: + def prepare_span(span, kwargs: Dict[str, Any], span_kind: str) -> None: if span and include_params: safe_params = { k: v for k, v in kwargs.items() @@ -907,6 +1442,7 @@ def prepare_span(span, kwargs: Dict[str, Any]) -> None: } if safe_params: self.set_span_attributes(**{f"param.{k}": v for k, v in safe_params.items()}) + self.apply_openinference_context_attributes(span_kind) self.add_span_event(f"{op_name}.started") def complete_span(start_time: float) -> None: @@ -924,8 +1460,9 @@ def fail_span(start_time: float, error: Exception) -> None: @functools.wraps(func) async def async_wrapper(*args, **kwargs): # Always execute monitoring logic - internal methods handle enabled state - with self.trace_llm_request(op_name, "nexent-service") as span: - prepare_span(span, kwargs) + span_kind = self._infer_openinference_span_kind(op_name) + with self.trace_operation(op_name, span_kind) as span: + prepare_span(span, kwargs, span_kind) start_time = time.time() try: @@ -939,8 +1476,9 @@ async def async_wrapper(*args, **kwargs): @functools.wraps(func) async def async_generator_wrapper(*args, **kwargs): # Keep the span open while the streaming response is consumed. 
- with self.trace_llm_request(op_name, "nexent-service") as span: - prepare_span(span, kwargs) + span_kind = self._infer_openinference_span_kind(op_name) + with self.trace_operation(op_name, span_kind) as span: + prepare_span(span, kwargs, span_kind) start_time = time.time() try: @@ -954,8 +1492,9 @@ async def async_generator_wrapper(*args, **kwargs): @functools.wraps(func) def sync_wrapper(*args, **kwargs): # Always execute monitoring logic - internal methods handle enabled state - with self.trace_llm_request(op_name, "nexent-service") as span: - prepare_span(span, kwargs) + span_kind = self._infer_openinference_span_kind(op_name) + with self.trace_operation(op_name, span_kind) as span: + prepare_span(span, kwargs, span_kind) start_time = time.time() try: @@ -968,8 +1507,9 @@ def sync_wrapper(*args, **kwargs): @functools.wraps(func) def generator_wrapper(*args, **kwargs): - with self.trace_llm_request(op_name, "nexent-service") as span: - prepare_span(span, kwargs) + span_kind = self._infer_openinference_span_kind(op_name) + with self.trace_operation(op_name, span_kind) as span: + prepare_span(span, kwargs, span_kind) start_time = time.time() try: @@ -1203,10 +1743,17 @@ def record_completion(self, input_tokens: int = 0, output_tokens: int = 0) -> No # Add span attributes using OpenInference naming if self.span: + usage_details = { + "input": input_tokens, + "output": output_tokens, + "total": input_tokens + output_tokens, + } self.span.set_attributes({ "llm.token_count.prompt": input_tokens, "llm.token_count.completion": output_tokens, "llm.token_count.total": input_tokens + output_tokens, + LANGFUSE_OBSERVATION_USAGE_DETAILS: json.dumps( + usage_details, ensure_ascii=False), "llm.generation_rate": generation_rate, "llm.duration.total": total_duration, "llm.time_to_first_token": self.first_token_time - self.start_time if self.first_token_time else 0 @@ -1862,6 +2409,30 @@ async def my_function(): 'get_monitoring_context', 'set_monitoring_operation', 'record_model_call', + 'OPENINFERENCE_SPAN_KIND', + 'OPENINFERENCE_SPAN_KIND_AGENT', + 'OPENINFERENCE_SPAN_KIND_CHAIN', + 'OPENINFERENCE_SPAN_KIND_LLM', + 'OPENINFERENCE_SPAN_KIND_TOOL', + 'OPENINFERENCE_SPAN_KIND_RETRIEVER', + 'OPENINFERENCE_INPUT_VALUE', + 'OPENINFERENCE_OUTPUT_VALUE', + 'OPENINFERENCE_METADATA', + 'OPENINFERENCE_SESSION_ID', + 'OPENINFERENCE_USER_ID', + 'OPENINFERENCE_TAG_TAGS', + 'LANGFUSE_OBSERVATION_TYPE', + 'LANGFUSE_OBSERVATION_INPUT', + 'LANGFUSE_OBSERVATION_OUTPUT', + 'LANGFUSE_OBSERVATION_MODEL_NAME', + 'LANGFUSE_OBSERVATION_MODEL_PARAMETERS', + 'LANGFUSE_OBSERVATION_USAGE_DETAILS', + 'LANGFUSE_TRACE_NAME', + 'LANGFUSE_TRACE_INPUT', + 'LANGFUSE_TRACE_OUTPUT', + 'LANGFUSE_TRACE_TAGS', + 'LANGFUSE_SESSION_ID', + 'LANGFUSE_USER_ID', '_detect_model_type', '_MonitoredClient', '_MonitoredChatCompletions', diff --git a/test/sdk/monitor/test_monitoring.py b/test/sdk/monitor/test_monitoring.py index d737cdf44..a223b3cc5 100644 --- a/test/sdk/monitor/test_monitoring.py +++ b/test/sdk/monitor/test_monitoring.py @@ -34,6 +34,26 @@ _monitoring_display_name, set_monitoring_operation, _enqueue_client_monitoring_record, + OPENINFERENCE_SPAN_KIND, + OPENINFERENCE_SPAN_KIND_AGENT, + OPENINFERENCE_SPAN_KIND_CHAIN, + OPENINFERENCE_SPAN_KIND_LLM, + OPENINFERENCE_SPAN_KIND_TOOL, + OPENINFERENCE_SPAN_KIND_RETRIEVER, + OPENINFERENCE_SESSION_ID, + OPENINFERENCE_USER_ID, + OPENINFERENCE_METADATA, + OPENINFERENCE_TAG_TAGS, + OPENINFERENCE_INPUT_VALUE, + OPENINFERENCE_OUTPUT_VALUE, + LANGFUSE_OBSERVATION_TYPE, + LANGFUSE_OBSERVATION_INPUT, + 
LANGFUSE_OBSERVATION_OUTPUT, + LANGFUSE_OBSERVATION_MODEL_NAME, + LANGFUSE_TRACE_INPUT, + LANGFUSE_TRACE_TAGS, + LANGFUSE_SESSION_ID, + LANGFUSE_USER_ID, ) import pytest import asyncio @@ -62,6 +82,10 @@ def test_default_config(self): assert config.otlp_headers == {} assert config.export_traces is True assert config.export_metrics is True + assert config.instrument_fastapi is True + assert config.instrument_requests is False + assert config.fastapi_excluded_urls == "" + assert config.fastapi_exclude_spans == ["receive", "send"] assert config.telemetry_sample_rate == 1.0 assert config.llm_slow_request_threshold_seconds == 5.0 assert config.llm_slow_token_rate_threshold == 10.0 @@ -76,6 +100,10 @@ def test_custom_config(self): otlp_protocol="grpc", otlp_headers={"Authorization": "Bearer test-key"}, export_metrics=False, + instrument_fastapi=False, + instrument_requests=True, + fastapi_excluded_urls="/agent/run", + fastapi_exclude_spans="send", use_platform_sdk=True, project_name="nexent-test", telemetry_sample_rate=0.5, @@ -90,6 +118,10 @@ def test_custom_config(self): assert config.otlp_protocol == "http" assert config.otlp_headers == {"Authorization": "Bearer test-key"} assert config.export_metrics is False + assert config.instrument_fastapi is False + assert config.instrument_requests is True + assert config.fastapi_excluded_urls == "/agent/run" + assert config.fastapi_exclude_spans == ["send"] assert config.use_platform_sdk is True assert config.project_name == "nexent-test" assert config.telemetry_sample_rate == 0.5 @@ -130,6 +162,8 @@ def test_from_json_file_with_overrides(self, tmp_path): "monitoring": { "enable_telemetry": True, "service_name": "file-service", + "instrument_requests": True, + "fastapi_exclude_spans": ["receive"], "exporter": { "provider": "langfuse", "endpoint": "https://cloud.langfuse.com/api/public/otel", @@ -155,6 +189,8 @@ def test_from_json_file_with_overrides(self, tmp_path): "x-langfuse-ingestion-version": "4", } assert config.export_metrics is False + assert config.instrument_requests is True + assert config.fastapi_exclude_spans == ["receive"] class TestMonitoringManager: @@ -235,6 +271,44 @@ def test_init_telemetry_http(self, mock_requests_instr, mock_resource, mock_tracer_provider.assert_called_once() mock_span_exporter_http.assert_called_once() mock_batch_processor.assert_called_once() + mock_requests_instr().instrument.assert_not_called() + + @patch('sdk.nexent.monitor.monitoring.trace') + @patch('sdk.nexent.monitor.monitoring.metrics') + @patch('sdk.nexent.monitor.monitoring.TracerProvider') + @patch('sdk.nexent.monitor.monitoring.MeterProvider') + @patch('sdk.nexent.monitor.monitoring.OTLPSpanExporterHTTP') + @patch('sdk.nexent.monitor.monitoring.BatchSpanProcessor') + @patch('sdk.nexent.monitor.monitoring.Resource') + @patch('sdk.nexent.monitor.monitoring.RequestsInstrumentor') + def test_init_telemetry_requests_instrumentation_opt_in( + self, + mock_requests_instr, + mock_resource, + mock_batch_processor, + mock_span_exporter_http, + mock_meter_provider, + mock_tracer_provider, + mock_metrics, + mock_trace, + ): + """Test requests auto instrumentation is opt-in.""" + with patch('sdk.nexent.monitor.monitoring.OPENTELEMETRY_AVAILABLE', True): + manager = MonitoringManager() + config = MonitoringConfig( + enable_telemetry=True, + instrument_requests=True, + export_metrics=False, + ) + + mock_resource.create.return_value = MagicMock() + mock_tracer_provider.return_value = MagicMock() + mock_meter_provider.return_value = MagicMock() + 
mock_trace.get_tracer.return_value = MagicMock() + mock_metrics.get_meter.return_value = MagicMock() + + manager.configure(config) + mock_requests_instr().instrument.assert_called_once() @patch('sdk.nexent.monitor.monitoring.trace') @@ -281,6 +355,34 @@ def test_init_telemetry_exception_handling(self): with patch('sdk.nexent.monitor.monitoring.Resource.create', side_effect=Exception("Test error")): manager.configure(config) + def test_setup_fastapi_app_excludes_streaming_internal_spans(self): + """Test FastAPI instrumentation suppresses noisy ASGI send/receive spans.""" + with patch('sdk.nexent.monitor.monitoring.OPENTELEMETRY_AVAILABLE', True): + manager = MonitoringManager() + manager.configure(MonitoringConfig( + enable_telemetry=True, + fastapi_excluded_urls="/health", + fastapi_exclude_spans=["receive", "send"], + )) + app = MagicMock() + calls = {} + + def fake_instrument_app(app_arg, excluded_urls=None, exclude_spans=None): + calls["app"] = app_arg + calls["excluded_urls"] = excluded_urls + calls["exclude_spans"] = exclude_spans + + with patch( + 'sdk.nexent.monitor.monitoring.FastAPIInstrumentor.instrument_app', + new=fake_instrument_app, + ): + result = manager.setup_fastapi_app(app) + + assert result is True + assert calls["app"] is app + assert calls["excluded_urls"] == "/health" + assert calls["exclude_spans"] == ["receive", "send"] + @patch('sdk.nexent.monitor.monitoring.OPENTELEMETRY_AVAILABLE', True) @patch('sdk.nexent.monitor.monitoring.trace') @patch('sdk.nexent.monitor.monitoring.metrics') @@ -361,6 +463,114 @@ def test_trace_llm_request_openinference_attrs(self, mock_trace): assert attributes["llm.model_name"] == "gpt-4" assert "llm.operation.name" in attributes assert attributes["llm.operation.name"] == "test_op" + assert attributes[OPENINFERENCE_SPAN_KIND] == OPENINFERENCE_SPAN_KIND_LLM + assert attributes[LANGFUSE_OBSERVATION_TYPE] == "generation" + assert attributes[LANGFUSE_OBSERVATION_MODEL_NAME] == "gpt-4" + + @patch('sdk.nexent.monitor.monitoring.trace') + def test_set_openinference_agent_context_attrs(self, mock_trace): + """Test Phoenix/OpenInference agent context attributes are added to current span.""" + with patch('sdk.nexent.monitor.monitoring.OPENTELEMETRY_AVAILABLE', True): + manager = MonitoringManager() + config = MonitoringConfig(enable_telemetry=True) + manager.configure(config) + + mock_span = MagicMock() + mock_trace.get_current_span.return_value = mock_span + + manager.set_openinference_agent_context( + agent_id=1, + conversation_id=2, + user_id="user-1", + tenant_id="tenant-1", + query="hello", + is_debug=False, + memory_enabled=True, + ) + + attrs = mock_span.set_attributes.call_args.args[0] + assert attrs[OPENINFERENCE_SPAN_KIND] == OPENINFERENCE_SPAN_KIND_AGENT + assert attrs[OPENINFERENCE_SESSION_ID] == "2" + assert attrs[OPENINFERENCE_USER_ID] == "user-1" + assert attrs[OPENINFERENCE_INPUT_VALUE] == "hello" + assert attrs[LANGFUSE_OBSERVATION_TYPE] == "agent" + assert attrs[LANGFUSE_SESSION_ID] == "2" + assert attrs[LANGFUSE_USER_ID] == "user-1" + assert attrs[LANGFUSE_OBSERVATION_INPUT] == "hello" + assert attrs[LANGFUSE_TRACE_INPUT] == "hello" + assert "agent_id:1" in attrs[LANGFUSE_TRACE_TAGS] + assert attrs["langfuse.trace.metadata.agent_id"] == 1 + assert "agent_id:1" in json.loads(attrs[OPENINFERENCE_TAG_TAGS]) + metadata = json.loads(attrs[OPENINFERENCE_METADATA]) + assert metadata["agent_id"] == 1 + assert metadata["tenant_id"] == "tenant-1" + + manager.set_openinference_agent_context( + agent_id=1, + conversation_id=2, + 
user_id="user-1", + tenant_id="tenant-1", + span_kind=OPENINFERENCE_SPAN_KIND_CHAIN, + ) + attrs = mock_span.set_attributes.call_args.args[0] + assert attrs[OPENINFERENCE_SPAN_KIND] == OPENINFERENCE_SPAN_KIND_CHAIN + assert attrs[LANGFUSE_OBSERVATION_TYPE] == "chain" + + @patch('sdk.nexent.monitor.monitoring.trace') + def test_trace_openinference_convenience_spans(self, mock_trace): + """Test custom OpenInference span helpers build Phoenix-friendly attributes.""" + with patch('sdk.nexent.monitor.monitoring.OPENTELEMETRY_AVAILABLE', True): + manager = MonitoringManager() + config = MonitoringConfig(enable_telemetry=True) + manager.configure(config) + manager._tracer = MagicMock() + + mock_span = MagicMock() + manager._tracer.start_as_current_span.return_value.__enter__ = Mock(return_value=mock_span) + manager._tracer.start_as_current_span.return_value.__exit__ = Mock(return_value=None) + mock_trace.get_current_span.return_value = mock_span + + with manager.trace_agent( + "TestAgent.run", + input_value={"query": "hello"}, + metadata={"agent_id": 1}, + tags=["nexent", "agent"], + session_id=2, + user_id="user-1", + attributes={"agent.id": 1}, + ): + pass + + attrs = manager._tracer.start_as_current_span.call_args.kwargs["attributes"] + assert attrs[OPENINFERENCE_SPAN_KIND] == OPENINFERENCE_SPAN_KIND_AGENT + assert json.loads(attrs[OPENINFERENCE_INPUT_VALUE]) == {"query": "hello"} + assert json.loads(attrs[OPENINFERENCE_METADATA]) == {"agent_id": 1} + assert json.loads(attrs[OPENINFERENCE_TAG_TAGS]) == ["nexent", "agent"] + assert attrs[OPENINFERENCE_SESSION_ID] == "2" + assert attrs[OPENINFERENCE_USER_ID] == "user-1" + assert attrs[LANGFUSE_OBSERVATION_TYPE] == "agent" + assert attrs[LANGFUSE_SESSION_ID] == "2" + assert attrs[LANGFUSE_USER_ID] == "user-1" + assert json.loads(attrs[LANGFUSE_OBSERVATION_INPUT]) == {"query": "hello"} + assert attrs["langfuse.trace.metadata.agent_id"] == 1 + assert attrs["agent.id"] == 1 + + with manager.trace_chain("Step 0"): + pass + attrs = manager._tracer.start_as_current_span.call_args.kwargs["attributes"] + assert attrs[OPENINFERENCE_SPAN_KIND] == OPENINFERENCE_SPAN_KIND_CHAIN + assert attrs[LANGFUSE_OBSERVATION_TYPE] == "chain" + + with manager.trace_retriever("KnowledgeBase.search"): + pass + attrs = manager._tracer.start_as_current_span.call_args.kwargs["attributes"] + assert attrs[OPENINFERENCE_SPAN_KIND] == OPENINFERENCE_SPAN_KIND_RETRIEVER + assert attrs[LANGFUSE_OBSERVATION_TYPE] == "retriever" + + manager.set_openinference_output({"answer": "ok"}) + output_attrs = mock_span.set_attributes.call_args.args[0] + assert json.loads(output_attrs[OPENINFERENCE_OUTPUT_VALUE]) == {"answer": "ok"} + assert json.loads(output_attrs[LANGFUSE_OBSERVATION_OUTPUT]) == {"answer": "ok"} class TestAgentStepTracing: @@ -396,6 +606,8 @@ def test_trace_agent_step_tool_call(self, mock_trace): assert attributes["agent.step.name"] == "web_search" assert "agent.step.type" in attributes assert attributes["agent.step.type"] == "tool_call" + assert attributes[OPENINFERENCE_SPAN_KIND] == OPENINFERENCE_SPAN_KIND_CHAIN + assert attributes[LANGFUSE_OBSERVATION_TYPE] == "chain" @patch('sdk.nexent.monitor.monitoring.trace') def test_trace_agent_step_reasoning(self, mock_trace): @@ -464,8 +676,16 @@ def test_trace_tool_call_with_input_output(self, mock_trace): assert attributes["agent.tool.name"] == "web_search" assert "agent.tool.input" in attributes assert "query" in attributes["agent.tool.input"] + assert attributes[OPENINFERENCE_SPAN_KIND] == OPENINFERENCE_SPAN_KIND_TOOL + assert 
attributes["tool.name"] == "web_search" + assert "query" in attributes["tool.parameters"] + assert "query" in attributes[OPENINFERENCE_INPUT_VALUE] + assert attributes[LANGFUSE_OBSERVATION_TYPE] == "tool" + assert "query" in attributes[LANGFUSE_OBSERVATION_INPUT] mock_span.set_attribute.assert_called() + mock_span.set_attribute.assert_any_call(OPENINFERENCE_OUTPUT_VALUE, '{"results": ["item1", "item2"]}') + mock_span.set_attribute.assert_any_call(LANGFUSE_OBSERVATION_OUTPUT, '{"results": ["item1", "item2"]}') def test_trace_agent_step_disabled(self): """Test agent step tracing when disabled.""" @@ -626,6 +846,44 @@ async def consume_stream(): "stream_operation.completed", ] + @patch('sdk.nexent.monitor.monitoring.trace') + def test_monitor_endpoint_uses_openinference_span_kind(self, mock_trace): + """Test monitor_endpoint creates Phoenix-friendly chain/agent spans.""" + with patch('sdk.nexent.monitor.monitoring.OPENTELEMETRY_AVAILABLE', True): + manager = MonitoringManager() + config = MonitoringConfig(enable_telemetry=True) + manager.configure(config) + manager._tracer = MagicMock() + + mock_span = MagicMock() + manager._tracer.start_as_current_span.return_value.__enter__ = Mock(return_value=mock_span) + manager._tracer.start_as_current_span.return_value.__exit__ = Mock(return_value=None) + mock_trace.get_current_span.return_value = mock_span + + @manager.monitor_endpoint("agent.run") + def agent_func(): + return "ok" + + assert agent_func() == "ok" + attrs = manager._tracer.start_as_current_span.call_args.kwargs["attributes"] + assert attrs[OPENINFERENCE_SPAN_KIND] == OPENINFERENCE_SPAN_KIND_AGENT + + @manager.monitor_endpoint("agent_service.run_agent_stream") + def chain_func(): + return "ok" + + assert chain_func() == "ok" + attrs = manager._tracer.start_as_current_span.call_args.kwargs["attributes"] + assert attrs[OPENINFERENCE_SPAN_KIND] == OPENINFERENCE_SPAN_KIND_CHAIN + + @manager.monitor_endpoint("agent_run") + def internal_agent_func(): + return "ok" + + assert internal_agent_func() == "ok" + attrs = manager._tracer.start_as_current_span.call_args.kwargs["attributes"] + assert attrs[OPENINFERENCE_SPAN_KIND] == OPENINFERENCE_SPAN_KIND_CHAIN + def test_monitor_llm_call_decorator(self): """Test monitor_llm_call decorator.""" manager = MonitoringManager() From 6290b32b069b7d71be293cf9b2b2ffd458131e31 Mon Sep 17 00:00:00 2001 From: hhhhsc Date: Fri, 8 May 2026 10:03:55 +0800 Subject: [PATCH 05/17] =?UTF-8?q?=E2=9C=A8=20Feat:=20Add=20Grafana=20and?= =?UTF-8?q?=20Tempo=20support=20for=20enhanced=20monitoring=20capabilities?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/apps/monitoring_app.py | 56 +++ doc/docs/en/sdk/monitoring.md | 4 +- doc/docs/zh/sdk/monitoring.md | 87 ++++- doc/docs/zh/sdk/opentelemetry-design.md | 318 +++++++++++++----- docker/.env.bak | 168 --------- docker/.env.example | 2 +- docker/docker-compose-monitoring.yml | 39 +++ .../grafana/dashboards/nexent-llm-agent.json | 150 +++++++++ .../provisioning/dashboards/dashboards.yml | 12 + .../provisioning/datasources/datasources.yml | 18 + docker/monitoring/monitoring.env | 9 + docker/monitoring/monitoring.env.example | 9 + .../otel-collector-grafana-config.yml | 50 +++ .../otel-collector-langfuse-config.yml | 18 + .../otel-collector-phoenix-config.yml | 18 + docker/monitoring/tempo.yml | 42 +++ docker/start-monitoring.sh | 64 ++-- frontend/components/navigation/TopNavbar.tsx | 98 +++++- frontend/public/locales/en/common.json | 4 +- 
frontend/public/locales/zh/common.json | 1 + frontend/services/api.ts | 1 + frontend/services/monitoringService.ts | 22 +- frontend/types/monitoring.ts | 10 + sdk/nexent/monitor/monitoring.py | 2 +- test/backend/app/test_monitoring_app.py | 65 ++++ 25 files changed, 981 insertions(+), 286 deletions(-) delete mode 100644 docker/.env.bak create mode 100644 docker/monitoring/grafana/dashboards/nexent-llm-agent.json create mode 100644 docker/monitoring/grafana/provisioning/dashboards/dashboards.yml create mode 100644 docker/monitoring/grafana/provisioning/datasources/datasources.yml create mode 100644 docker/monitoring/otel-collector-grafana-config.yml create mode 100644 docker/monitoring/tempo.yml diff --git a/backend/apps/monitoring_app.py b/backend/apps/monitoring_app.py index 310365293..6de547cc6 100644 --- a/backend/apps/monitoring_app.py +++ b/backend/apps/monitoring_app.py @@ -6,6 +6,7 @@ """ import logging +import os from http import HTTPStatus from typing import Annotated, Optional @@ -21,6 +22,51 @@ router = APIRouter(prefix="/monitoring") +def _is_truthy_env(value: Optional[str]) -> bool: + """Return whether an environment value means enabled.""" + return str(value or "").strip().lower() in {"1", "true", "yes", "on"} + + +def _normalize_monitoring_provider(value: Optional[str]) -> str: + return str(value or "otlp").strip().lower() + + +def _build_monitoring_ui( + provider: str, +) -> tuple[Optional[str], Optional[str], Optional[str]]: + """Map MONITORING_PROVIDER to a monitoring UI port and path.""" + if provider == "grafana": + port = os.getenv("GRAFANA_PORT", "3002") + path = "/d/nexent-llm-agent/nexent-agent-trace-monitoring?orgId=1" + return port, path, "Grafana" + if provider == "phoenix": + port = os.getenv("PHOENIX_PORT", "6006") + return port, "/", "Phoenix" + if provider == "langfuse": + port = os.getenv("LANGFUSE_PORT", "3001") + return port, "/project/nexent-local", "Langfuse" + if provider == "jaeger": + port = os.getenv("JAEGER_UI_PORT", "16686") + return port, "/", "Jaeger" + return None, None, None + + +def get_monitoring_status() -> dict: + """Return telemetry state and the monitoring UI entrypoint for frontend use.""" + telemetry_enabled = _is_truthy_env(os.getenv("ENABLE_TELEMETRY")) + provider = _normalize_monitoring_provider(os.getenv("MONITORING_PROVIDER")) + dashboard_port, dashboard_path, provider_name = _build_monitoring_ui(provider) + + return { + "telemetry_enabled": telemetry_enabled, + "provider": provider, + "provider_name": provider_name, + "ui_enabled": telemetry_enabled and bool(dashboard_port), + "dashboard_port": dashboard_port, + "dashboard_path": dashboard_path, + } + + def _compute_time_range_filter(time_range: str) -> str: """Convert time_range parameter to SQL timestamp condition.""" hours = {"24h": 24, "7d": 168, "30d": 720}.get(time_range, 24) @@ -113,3 +159,13 @@ async def list_models_endpoint( logger.error(f"Failed to list monitoring models: {str(e)}") raise HTTPException( status_code=HTTPStatus.INTERNAL_SERVER_ERROR, detail=str(e)) + + +@router.get("/status", response_model=ConversationResponse) +async def get_monitoring_status_endpoint(): + """Return whether monitoring UI should be shown in the frontend.""" + return ConversationResponse( + code=0, + message="success", + data=get_monitoring_status(), + ) diff --git a/doc/docs/en/sdk/monitoring.md b/doc/docs/en/sdk/monitoring.md index fb3a79a08..0fcd7f9a2 100644 --- a/doc/docs/en/sdk/monitoring.md +++ b/doc/docs/en/sdk/monitoring.md @@ -5,7 +5,7 @@ Enterprise-grade observability for AI 
agents using OpenTelemetry OTLP protocol. ## Architecture ``` -NexentAgent ──► OpenTelemetry SDK ──► OTLP Collector ──► Arize Phoenix / Langfuse / Jaeger +NexentAgent ──► OpenTelemetry SDK ──► OTLP Collector ──► Arize Phoenix / Langfuse / Grafana Tempo / Jaeger │ │ │ OpenInference Semantics │ │ (llm.*, agent.* attributes) │ @@ -114,7 +114,7 @@ jaeger: |----------|---------|-------------| | `ENABLE_TELEMETRY` | `false` | Enable/disable monitoring | | `MONITORING_CONFIG_FILE` | (empty) | JSON/YAML monitoring config file path | -| `MONITORING_PROVIDER` | `otlp` | Provider profile: `otlp`, `phoenix`, `langfuse`, `jaeger`, `custom` | +| `MONITORING_PROVIDER` | `otlp` | Provider profile: `otlp`, `phoenix`, `langfuse`, `jaeger`, `grafana`, `custom` | | `MONITORING_USE_PLATFORM_SDK` | `false` | Whether to also initialize a provider SDK | | `MONITORING_PROJECT_NAME` | `nexent` | Observability platform project name | | `OTEL_SERVICE_NAME` | `nexent-backend` | Service identifier | diff --git a/doc/docs/zh/sdk/monitoring.md b/doc/docs/zh/sdk/monitoring.md index 6e21024ef..e6932152f 100644 --- a/doc/docs/zh/sdk/monitoring.md +++ b/doc/docs/zh/sdk/monitoring.md @@ -5,7 +5,7 @@ ## 系统架构 ``` -NexentAgent ──► OpenTelemetry SDK ──► OTLP Collector ──► Arize Phoenix / Langfuse / Jaeger +NexentAgent ──► OpenTelemetry SDK ──► OTLP Collector ──► Arize Phoenix / Langfuse / Grafana Tempo / Jaeger │ │ │ OpenInference 语义约定 │ │ (llm.*, agent.* 属性) │ @@ -29,13 +29,14 @@ OTEL_EXPORTER_OTLP_PROTOCOL=http ## 本地化部署形态 -`docker/start-monitoring.sh` 支持三种形态,均以 OpenTelemetry Collector 作为统一入口。业务服务只需要把 OTLP 发到 Collector,不需要感知后端平台差异。 +`docker/start-monitoring.sh` 支持四种形态,均以 OpenTelemetry Collector 作为统一入口。业务服务只需要把 OTLP 发到 Collector,不需要感知后端平台差异。 | 形态 | 命令 | 包含服务 | 适用场景 | |------|------|----------|----------| | `collector` | `./start-monitoring.sh --stack collector` | OpenTelemetry Collector | 只验证埋点、或转发到外部云端平台 | | `phoenix` | `./start-monitoring.sh --stack phoenix` | Collector + Phoenix | 本地 trace 调试、OpenInference 属性查看、实验分析 | | `langfuse` | `./start-monitoring.sh --stack langfuse` | Collector + Langfuse Web/Worker + Postgres + ClickHouse + MinIO + Redis | 本地完整 LLMOps 体验、会话/用户/反馈/成本分析 | +| `grafana` | `./start-monitoring.sh --stack grafana` | Collector + Grafana + Tempo | 本地 Tempo trace 查询 | 也可以在 `docker/monitoring/monitoring.env` 中设置默认形态: @@ -86,6 +87,31 @@ cd docker 启动脚本会在 `LANGFUSE_OTLP_AUTH_HEADER` 为空时自动生成 `Basic base64(public_key:secret_key)`,并让 Collector 将 trace 转发到 `http://langfuse-web:3000/api/public/otel`。本地默认密钥只适合开发验证,生产部署必须替换 `LANGFUSE_NEXTAUTH_SECRET`、`LANGFUSE_SALT`、`LANGFUSE_ENCRYPTION_KEY`、数据库密码和对象存储密钥。 +### 本地 Grafana + Tempo + +Grafana 本地部署使用 Grafana Tempo 存储 traces,并启用 Tempo `metrics-generator` 的 `local-blocks` processor 支持 Grafana trace breakdown 中的 TraceQL metrics 查询。Collector 接收 Nexent 后端的 OTLP traces/metrics,其中 traces 通过 OTLP gRPC 转发到 Tempo;OTLP metrics 只进入 Collector logging pipeline,不再启动 Prometheus 或暴露 Prometheus scrape 端口。 + +```bash +cd docker +./start-monitoring.sh --stack grafana +``` + +后端 `.env` 使用已有的 `MONITORING_PROVIDER` 控制前端顶栏监控入口: + +```bash +ENABLE_TELEMETRY=true +MONITORING_PROVIDER=grafana +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 +``` + +访问地址: + +- Grafana UI:`http://localhost:3002` +- 默认管理员:`admin` / `nexent-grafana-admin` +- Tempo API:`http://localhost:3200` + +Grafana 会自动预置 Tempo datasource,并加载 `Nexent Agent Trace Monitoring` dashboard。Trace 查询入口在 Grafana Explore 中选择 `Tempo` datasource,示例 TraceQL 为 `{ resource.service.name = "nexent-backend" }`。 + ## AI 可观测性平台对接 ### Arize Phoenix @@ 
-173,7 +199,7 @@ jaeger: |------|--------|------| | `ENABLE_TELEMETRY` | `false` | 启用/禁用监控 | | `MONITORING_CONFIG_FILE` | (空) | JSON/YAML 监控配置文件路径 | -| `MONITORING_PROVIDER` | `otlp` | 平台配置:`otlp`、`phoenix`、`langfuse`、`jaeger`、`custom` | +| `MONITORING_PROVIDER` | `otlp` | 平台配置:`otlp`、`phoenix`、`langfuse`、`jaeger`、`grafana`、`custom` | | `MONITORING_USE_PLATFORM_SDK` | `false` | 是否额外初始化平台 SDK | | `MONITORING_PROJECT_NAME` | `nexent` | 监控平台项目名 | | `OTEL_SERVICE_NAME` | `nexent-backend` | 服务标识 | @@ -186,6 +212,16 @@ jaeger: | `OTEL_EXPORTER_OTLP_X_API_KEY` | (空) | `x-api-key` header,用于兼容需要该 header 的平台 | | `OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION` | (空) | Langfuse 实时摄取版本,例如 `4` | | `OTEL_EXPORTER_OTLP_METRICS_ENABLED` | `true` | 是否导出 OTLP metrics | +| `MONITORING_INSTRUMENT_FASTAPI` | `true` | 是否启用 FastAPI 自动 HTTP server span | +| `MONITORING_INSTRUMENT_REQUESTS` | `false` | 是否启用 requests 自动 HTTP client span;默认关闭,避免 AI trace 被普通 HTTP 请求刷屏 | +| `MONITORING_FASTAPI_EXCLUDED_URLS` | (空) | FastAPI 自动埋点排除 URL,逗号分隔正则;例如只看 agent 业务 span 时可设为 `/agent/run` | +| `MONITORING_FASTAPI_EXCLUDE_SPANS` | `receive,send` | 排除 ASGI 内部 `receive/send` span;流式接口建议保持默认值 | +| `GRAFANA_PORT` | `3002` | 本地 Grafana UI 端口 | +| `GRAFANA_ADMIN_USER` | `admin` | 本地 Grafana 管理员用户名 | +| `GRAFANA_ADMIN_PASSWORD` | `nexent-grafana-admin` | 本地 Grafana 管理员密码 | +| `GRAFANA_DEFAULT_LANGUAGE` | `zh-Hans` | 本地 Grafana 默认界面语言 | +| `TEMPO_VERSION` | `2.10.1` | 本地 Tempo 镜像版本,避免 `latest` 配置兼容性漂移 | +| `TEMPO_PORT` | `3200` | 本地 Tempo HTTP API 端口 | ## 配置文件 @@ -196,6 +232,9 @@ monitoring: enable_telemetry: true service_name: nexent-backend project_name: nexent-production + instrument_fastapi: true + instrument_requests: false + fastapi_exclude_spans: [receive, send] exporter: provider: langfuse protocol: http @@ -242,6 +281,48 @@ with monitoring_manager.trace_tool_call("web_search", "agent_name", {"query": "t monitoring_manager.set_tool_output({"results": results}) ``` +### Phoenix 自定义层级埋点 + +如果希望 Phoenix 展示 `agent -> chain -> llm/tool` 的层级结构,使用 OpenInference span kind 封装方法: + +```python +from nexent.monitor import get_monitoring_manager + +monitoring_manager = get_monitoring_manager() + +with monitoring_manager.trace_agent( + "TestAgent.run", + input_value={"query": "你好"}, + metadata={"agent_id": 1, "tenant_id": "tenant_id"}, + tags=["nexent", "agent", "agent_id:1"], + session_id=1001, + user_id="user_id", +): + with monitoring_manager.trace_chain("Step 0"): + with monitoring_manager.trace_chain("Step 1"): + with monitoring_manager.trace_llm_request("OpenAIModel.generate", "gpt-4"): + result = call_llm() + + with monitoring_manager.trace_tool_call("FinalAnswerTool", "TestAgent", {"query": "你好"}): + monitoring_manager.set_tool_output({"answer": result}) + + monitoring_manager.set_openinference_output({"answer": result}) +``` + +Phoenix 左侧的 `agent`、`chain`、`llm`、`tool` 标签来自 `openinference.span.kind`。span 必须通过嵌套 `with` 创建,Phoenix 才会显示成树形结构。 + +同一套方法也会写入 Langfuse 识别的 OTel 属性: + +| Nexent 方法 | Phoenix 属性 | Langfuse observation type | +|-------------|--------------|---------------------------| +| `trace_agent` | `openinference.span.kind=AGENT` | `langfuse.observation.type=agent` | +| `trace_chain` | `openinference.span.kind=CHAIN` | `langfuse.observation.type=chain` | +| `trace_llm_request` | `openinference.span.kind=LLM` | `langfuse.observation.type=generation` | +| `trace_tool_call` | `openinference.span.kind=TOOL` | `langfuse.observation.type=tool` | +| `trace_retriever` | `openinference.span.kind=RETRIEVER` | 
`langfuse.observation.type=retriever` | + +`session_id`、`user_id`、`tags` 和 `metadata` 会同步写入 `langfuse.session.id`、`langfuse.user.id`、`langfuse.trace.tags`、`langfuse.trace.metadata.*`,可在 Langfuse 中按会话、用户和业务字段过滤。`input_value`、`output_value` 会同步写入 `langfuse.observation.input` 和 `langfuse.observation.output`。 + ## OpenInference 语义属性 系统使用 OpenInference 语义约定,专为 AI 可观测性设计: diff --git a/doc/docs/zh/sdk/opentelemetry-design.md b/doc/docs/zh/sdk/opentelemetry-design.md index f1af77dc0..2127fdbed 100644 --- a/doc/docs/zh/sdk/opentelemetry-design.md +++ b/doc/docs/zh/sdk/opentelemetry-design.md @@ -1,19 +1,21 @@ # Nexent OpenTelemetry 可观测性设计 -生成日期:2026-04-28 -基准分支:`dev/opentelemetry` +生成日期:2026-05-06 +基准分支:当前 OpenTelemetry 功能分支 ## 设计目标 -Nexent 的监控能力以 OpenTelemetry 为主干,SDK 和后端只负责生成标准 span、event、metric,并通过 OTLP 导出。Phoenix、Langfuse、Jaeger 等平台只作为可配置的 exporter 或可选 SDK 增强层,避免把业务代码绑定到单一平台。 +Nexent 的监控能力以 OpenTelemetry 为主干,SDK 和后端只负责生成标准 span、event、metric,并通过 OTLP 导出。Phoenix、Langfuse、Grafana Tempo、Jaeger 等平台作为可配置 exporter 或可选 SDK 增强层接入,业务代码不绑定单一平台。 -目标: +核心目标: -- Agent 流式运行期间保持 trace 上下文,完整覆盖 API、服务准备、Agent 线程、LLM 流式输出、工具调用。 -- 支持 `otlp`、`phoenix`、`langfuse`、`jaeger`、`custom` provider profile。 +- Agent 流式运行期间保持 trace 上下文,覆盖 API、服务准备、Agent 异步 generator、Agent 线程、LLM 流式输出、Python 解释器执行、真实工具调用和最终答案。 +- 通过 OpenInference 属性适配 Phoenix,通过 `langfuse.*` 属性适配 Langfuse,同一套业务埋点可同时服务多个监控平台。 +- 支持 `otlp`、`phoenix`、`langfuse`、`jaeger`、`grafana`、`custom` provider profile。 - 同时支持环境变量和 JSON/YAML 配置文件,环境变量可覆盖文件配置。 - 支持 base endpoint 和 signal-specific endpoint,避免 `/v1/traces`、`/v1/metrics` 路径重复拼接。 -- 保持 OpenTelemetry 原生实现,平台 SDK 只通过 `MONITORING_USE_PLATFORM_SDK=true` 显式启用。 +- FastAPI/requests 自动埋点可配置,默认压制流式接口中的 ASGI `receive/send` 噪声。 +- 平台 SDK 只通过 `MONITORING_USE_PLATFORM_SDK=true` 显式启用,默认保持 OpenTelemetry 原生实现。 ## 技术栈 @@ -23,34 +25,61 @@ Nexent 的监控能力以 OpenTelemetry 为主干,SDK 和后端只负责生成 | 导出协议 | OTLP HTTP、OTLP gRPC | | Trace exporter | `opentelemetry-exporter-otlp` HTTP/gRPC trace exporter | | Metric exporter | `opentelemetry-exporter-otlp` HTTP/gRPC metric exporter | -| 自动埋点 | FastAPI instrumentation、requests instrumentation | -| AI 语义 | OpenInference 风格属性:`llm.*`、`agent.*`、`agent.tool.*` | +| 自动埋点 | FastAPI instrumentation、requests instrumentation;requests 默认关闭 | +| AI 语义 | OpenInference 属性、Langfuse OTel 属性、Nexent 自定义业务属性 | +| Agent 框架 | SmolAgents `CodeAgent` 扩展、Nexent `CoreAgent`、`NexentAgent` | | 配置 | 环境变量、`MONITORING_CONFIG_FILE` JSON/YAML | -| Collector | `otel/opentelemetry-collector-contrib`,使用 `otlphttp` 转发 HTTP 平台;本地可选择 logging、Phoenix、Langfuse 三类部署形态 | -| 可选 SDK | `phoenix.otel.register`、`langfuse.get_client`,默认不启用;Phoenix SDK 成功注册时复用其 tracer provider | +| Collector | `otel/opentelemetry-collector-contrib`,支持 logging、Phoenix、Langfuse、Grafana/Tempo 四类本地部署形态 | +| 可选 SDK | `phoenix.otel.register`、`langfuse.get_client`;Phoenix SDK 成功注册时复用其 tracer provider | + +## 总体架构 + +```mermaid +flowchart LR + Backend[Nexent Backend / SDK] --> OTel[OpenTelemetry TracerProvider / MeterProvider] + OTel --> Exporter[OTLP Trace / Metric Exporter] + Exporter --> Collector[OpenTelemetry Collector] + Collector --> Phoenix[Arize Phoenix] + Collector --> Langfuse[Langfuse] + Collector --> Tempo[Grafana Tempo] + Collector --> Other[Jaeger / Custom Backend] + + Backend --> FastAPI[FastAPI Auto Instrumentation] + Backend --> Manual[Manual AI Spans] + Manual --> OI[OpenInference Attributes] + Manual --> LF[Langfuse Attributes] +``` ## 配置模型 ### 环境变量 -| 变量 | 说明 | -|------|------| -| `ENABLE_TELEMETRY` | 总开关 | -| `MONITORING_CONFIG_FILE` | JSON/YAML 
配置文件路径 | -| `MONITORING_PROVIDER` | `otlp`、`phoenix`、`langfuse`、`jaeger`、`custom` | -| `MONITORING_STACK` | 本地部署形态:`collector`、`phoenix`、`langfuse` | -| `MONITORING_USE_PLATFORM_SDK` | 是否额外初始化平台 SDK | -| `MONITORING_PROJECT_NAME` | 平台项目名 | -| `OTEL_SERVICE_NAME` | OpenTelemetry service name | -| `OTEL_EXPORTER_OTLP_ENDPOINT` | OTLP base endpoint | -| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | 可选 trace 专用 endpoint | -| `OTEL_EXPORTER_OTLP_METRICS_ENDPOINT` | 可选 metric 专用 endpoint | -| `OTEL_EXPORTER_OTLP_PROTOCOL` | `http` 或 `grpc` | -| `OTEL_EXPORTER_OTLP_HEADERS` | 通用 `key=value,key2=value2` header | -| `OTEL_EXPORTER_OTLP_AUTHORIZATION` | `Authorization` header,常用于 Phoenix bearer auth 和 Langfuse Basic Auth | -| `OTEL_EXPORTER_OTLP_X_API_KEY` | `x-api-key` header,用于兼容需要该 header 的平台 | -| `OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION` | Langfuse 摄取版本,例如 `4` | -| `OTEL_EXPORTER_OTLP_METRICS_ENABLED` | 是否导出 metric | +| 变量 | 默认值 | 说明 | +|------|--------|------| +| `ENABLE_TELEMETRY` | `false` | 监控总开关 | +| `MONITORING_CONFIG_FILE` | 空 | JSON/YAML 配置文件路径 | +| `MONITORING_PROVIDER` | `otlp` | `otlp`、`phoenix`、`langfuse`、`jaeger`、`grafana`、`custom` | +| `MONITORING_STACK` | `collector` | 本地部署形态:`collector`、`phoenix`、`langfuse`、`grafana` | +| `MONITORING_USE_PLATFORM_SDK` | `false` | 是否额外初始化平台 SDK | +| `MONITORING_PROJECT_NAME` | `nexent` | 平台项目名 | +| `OTEL_SERVICE_NAME` | `nexent-backend` | OpenTelemetry service name | +| `OTEL_EXPORTER_OTLP_ENDPOINT` | `http://localhost:4318` | OTLP base endpoint | +| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | 空 | 可选 trace 专用 endpoint | +| `OTEL_EXPORTER_OTLP_METRICS_ENDPOINT` | 空 | 可选 metric 专用 endpoint | +| `OTEL_EXPORTER_OTLP_PROTOCOL` | `http` | `http` 或 `grpc` | +| `OTEL_EXPORTER_OTLP_HEADERS` | 空 | 通用 `key=value,key2=value2` header | +| `OTEL_EXPORTER_OTLP_AUTHORIZATION` | 空 | `Authorization` header,常用于 Phoenix bearer auth 和 Langfuse Basic Auth | +| `OTEL_EXPORTER_OTLP_X_API_KEY` | 空 | `x-api-key` header,用于兼容需要该 header 的平台 | +| `OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION` | 空 | Langfuse 摄取版本,例如 `4` | +| `OTEL_EXPORTER_OTLP_METRICS_ENABLED` | `true` | 是否导出 metric | +| `MONITORING_INSTRUMENT_FASTAPI` | `true` | 是否启用 FastAPI 自动 HTTP server span | +| `MONITORING_INSTRUMENT_REQUESTS` | `false` | 是否启用 requests 自动 HTTP client span | +| `MONITORING_FASTAPI_EXCLUDED_URLS` | 空 | FastAPI 自动埋点排除 URL,逗号分隔正则 | +| `MONITORING_FASTAPI_EXCLUDE_SPANS` | `receive,send` | 排除 ASGI 内部 `receive/send` span,流式接口建议保持默认 | +| `GRAFANA_PORT` | `3002` | 本地 Grafana UI 端口 | +| `GRAFANA_DEFAULT_LANGUAGE` | `zh-Hans` | 本地 Grafana 默认界面语言 | +| `TEMPO_VERSION` | `2.10.1` | 本地 Tempo 镜像版本,避免 `latest` 配置兼容性漂移 | +| `TEMPO_PORT` | `3200` | 本地 Tempo HTTP API 端口 | ### 配置文件 @@ -59,6 +88,10 @@ monitoring: enable_telemetry: true service_name: nexent-backend project_name: nexent-production + instrument_fastapi: true + instrument_requests: false + fastapi_excluded_urls: "" + fastapi_exclude_spans: [receive, send] exporter: provider: langfuse protocol: http @@ -98,9 +131,23 @@ OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 OTEL_EXPORTER_OTLP_PROTOCOL=http ``` +前端顶栏监控入口只根据后端 `MONITORING_PROVIDER` 映射 UI 端口和路径,最终跳转地址由前端使用当前页面 URL 的 hostname 组装,避免固定写死 `localhost`: + +- `phoenix` -> `${currentHostname}:${PHOENIX_PORT:-6006}/` +- `langfuse` -> `${currentHostname}:${LANGFUSE_PORT:-3001}/project/nexent-local` +- `jaeger` -> `${currentHostname}:${JAEGER_UI_PORT:-16686}/` +- `grafana` -> `${currentHostname}:${GRAFANA_PORT:-3002}/d/nexent-llm-agent/nexent-agent-trace-monitoring?orgId=1` +- `otlp` / `custom` 
默认不显示顶栏监控入口 + +因此本地 Grafana 形态需要在后端 `.env` 中设置: + +```bash +MONITORING_PROVIDER=grafana +``` + ### Phoenix -Phoenix 支持通过 OTLP HTTP 接收 traces,也提供 `phoenix.otel` SDK 包装 OpenTelemetry。 +Phoenix 通过 OpenInference 属性识别 AI span 类型,核心字段是 `openinference.span.kind`。 ```bash MONITORING_PROVIDER=phoenix @@ -128,6 +175,8 @@ OTEL_EXPORTER_OTLP_LANGFUSE_INGESTION_VERSION=4 OTEL_EXPORTER_OTLP_METRICS_ENABLED=false ``` +当前实现会同时写入 `langfuse.observation.type`、`langfuse.session.id`、`langfuse.user.id`、`langfuse.trace.tags`、`langfuse.trace.metadata.*`、`langfuse.observation.input`、`langfuse.observation.output` 等属性,以便 Langfuse 正确展示 generation/tool/agent 并支持过滤聚合。 + ## 本地化部署设计 本地化部署通过 `docker/start-monitoring.sh` 选择形态。所有形态都保留 OpenTelemetry Collector 作为入口,Nexent 后端统一上报到 `http://otel-collector:4318` 或宿主机的 `http://localhost:4318`,平台差异只体现在 Collector exporter 和本地服务组合上。 @@ -137,6 +186,7 @@ OTEL_EXPORTER_OTLP_METRICS_ENABLED=false | `collector` | `otel-collector-config.yml` | Collector | logging exporter | 最小形态,用于验证 span/metric 是否产生,或手动改配置转发到云端平台 | | `phoenix` | `otel-collector-phoenix-config.yml` | Collector + Phoenix | `http://phoenix:6006/v1/traces` | Phoenix 容器同时提供 UI 和 OTLP HTTP/gRPC trace collector,适合本地 trace debug | | `langfuse` | `otel-collector-langfuse-config.yml` | Collector + Langfuse Web/Worker + Postgres + ClickHouse + MinIO + Redis | `http://langfuse-web:3000/api/public/otel/v1/traces` | Langfuse v3 依赖多组件,适合完整 LLMOps 能力验证 | +| `grafana` | `otel-collector-grafana-config.yml` | Collector + Grafana + Tempo | traces 转发到 `tempo:4317`,metrics 只进入 Collector logging pipeline | Grafana + Tempo trace 查询 | 启动命令: @@ -145,6 +195,7 @@ cd docker ./start-monitoring.sh --stack collector ./start-monitoring.sh --stack phoenix ./start-monitoring.sh --stack langfuse +./start-monitoring.sh --stack grafana ``` 部署脚本职责: @@ -168,7 +219,7 @@ Compose 中设置 `PHOENIX_WORKING_DIR=/mnt/data` 并挂载 `phoenix-data` volum ### Langfuse 本地形态 -Langfuse v3 本地形态按官方自托管架构拆分为应用容器和存储组件: +Langfuse v3 本地形态按自托管架构拆分为应用容器和存储组件: | 组件 | 用途 | |------|------| @@ -189,24 +240,69 @@ headers: 默认密钥仅用于本地验证。生产或共享环境必须替换认证密钥、数据库密码、对象存储密钥和 `LANGFUSE_ENCRYPTION_KEY`,并补充备份、高可用和升级策略。 +### Grafana 本地形态 + +Grafana 本地形态面向 trace 调试: + +| 组件 | 用途 | +|------|------| +| `grafana` | 展示 Nexent Agent trace dashboard,并预置 Tempo datasource | +| `tempo` | 接收 Collector 转发的 OTLP traces,并提供 Grafana Explore 查询后端 | + +Collector trace pipeline 使用 `otlp/tempo` exporter 转发到 `tempo:4317`。Tempo 启用 `metrics-generator` 的 `local-blocks` processor,用于支持 Grafana trace breakdown 中的 TraceQL metrics 查询。Collector metrics pipeline 保留为 logging exporter,用于兼容后端仍开启 OTLP metrics 的场景,但本地 Grafana 形态不再提供 Prometheus 指标存储和指标 dashboard。 + +默认访问地址: + +- Grafana:`http://localhost:3002` +- Tempo API:`http://localhost:3200` + +## Span 语义映射 + +| Nexent 场景 | Phoenix / OpenInference | Langfuse | +|-------------|-------------------------|----------| +| Agent 入口 | `openinference.span.kind=AGENT` | `langfuse.observation.type=agent` | +| 服务准备、流式生成、线程执行、普通步骤 | `openinference.span.kind=CHAIN` | `langfuse.observation.type=chain` | +| LLM 调用 | `openinference.span.kind=LLM` | `langfuse.observation.type=generation` | +| 工具调用 | `openinference.span.kind=TOOL` | `langfuse.observation.type=tool` | +| 检索类调用 | `openinference.span.kind=RETRIEVER` | `langfuse.observation.type=retriever` | + +上下文属性: + +| 属性 | 说明 | +|------|------| +| `input.value` / `output.value` | OpenInference 输入输出 | +| `metadata` | OpenInference JSON metadata | +| `session.id` / `user.id` | OpenInference 会话和用户 | +| `tag.tags` | OpenInference tags | +| 
`langfuse.observation.input` / `langfuse.observation.output` | Langfuse observation 输入输出 | +| `langfuse.session.id` / `langfuse.user.id` | Langfuse 会话和用户 | +| `langfuse.trace.tags` | Langfuse trace tags | +| `langfuse.trace.metadata.*` / `langfuse.observation.metadata.*` | Langfuse 可过滤业务 metadata | + ## 埋点信息 -| 埋点 | 位置 | 内容 | 目的 | -|------|------|------|------| -| FastAPI 自动 span | `backend/apps/app_factory.py` | route、method、status、duration | API 入口耗时和错误定位 | -| requests 自动 span | `MonitoringManager` 初始化 | 外部 HTTP 调用 | 观测模型服务、工具服务、MCP 等依赖 | -| `agent.run` | `backend/apps/agent_app.py` | `/agent/run` 请求 | Agent 运行入口追踪 | -| `agent_service.run_agent_stream` | `backend/services/agent_service.py` | `agent_id`、`conversation_id`、debug、文件数、记忆开关、策略、准备耗时 | 分析 SSE 创建前的准备阶段 | -| `user_resolution.*` | `run_agent_stream` | 用户、租户、语言和耗时 | 鉴权与租户解析定位 | -| `user_message_save.*` | `run_agent_stream` | 保存或跳过原因、耗时 | 判断会话写入是否正常 | -| `memory_context_build.*` | `run_agent_stream` | 记忆开关、共享策略、耗时 | 定位记忆上下文瓶颈 | -| `streaming_strategy.*` | `run_agent_stream` | `with_memory` 或 `no_memory` | 判断实际执行分支 | -| `generate_stream_no_memory.*` | `generate_stream_no_memory` | 准备与流式输出事件 | 追踪无记忆流式执行 | -| `agent_run` | `sdk/nexent/core/agents/run_agent.py` | 线程启动、缓存读取、消息 yield | 追踪 Agent 流式输出 | -| `agent_run_thread` | `run_agent.py` | Agent 创建、MCP 工具装载、执行错误 | 追踪实际 Agent 执行线程 | -| `chat_completion` | `openai_llm.py` | 模型、温度、top_p、消息数、token、TTFT、chunk 数、输出长度 | LLM 性能、成本和异常分析 | -| `trace_agent_step` | SDK 公共 API | `agent.name`、`agent.step.name`、`agent.step.type` | 供后续推理步骤、工具选择等细粒度埋点扩展 | -| `trace_tool_call` | SDK 公共 API | 工具名、输入、输出、耗时、错误 | 工具可用性和延迟分析 | +| 埋点 | 位置 | 类型 | 内容 | 目的 | +|------|------|------|------|------| +| FastAPI 自动 span | `MonitoringManager.setup_fastapi_app` | HTTP server | route、method、status、duration | API 入口耗时和错误定位 | +| FastAPI `receive/send` 排除 | `fastapi_exclude_spans` | 降噪配置 | 默认 `receive,send` | 避免 SSE 流式接口生成大量 `unknown POST /agent/run http ...` | +| requests 自动 span | `MonitoringConfig.instrument_requests` | HTTP client | 外部请求 URL、method、status | 默认关闭;需要分析外部 HTTP 依赖时开启 | +| `agent.run` | `backend/apps/agent_app.py` | AGENT | `/agent/run` 请求入口 | 作为一次 Agent 运行的顶层业务 trace | +| `agent_service.run_agent_stream` | `backend/services/agent_service.py` | CHAIN | `agent_id`、`conversation_id`、debug、文件数、记忆开关、策略、准备耗时 | 分析 SSE 创建前的准备阶段 | +| `set_openinference_agent_context` | `run_agent_stream` | 当前 span 上下文 | session、user、tenant、agent、metadata、tags | 给 Phoenix/Langfuse 建立 Agent、用户、会话维度 | +| `user_resolution.*` | `run_agent_stream` | event | 用户、租户、语言和耗时 | 鉴权与租户解析定位 | +| `user_message_save.*` | `run_agent_stream` | event | 保存或跳过原因、耗时 | 判断会话写入是否正常 | +| `memory_context_build.*` | `run_agent_stream` | event | 记忆开关、共享策略、耗时 | 定位记忆上下文瓶颈 | +| `streaming_strategy.*` | `run_agent_stream` | event | `with_memory` 或 `no_memory` | 判断实际执行分支 | +| `generate_stream_with_memory` | `backend/services/agent_service.py` | CHAIN | memory token、预处理任务、fallback 分支 | 追踪带记忆路径的流式执行 | +| `generate_stream_no_memory` | `backend/services/agent_service.py` | CHAIN | 准备与流式输出事件 | 追踪无记忆流式执行 | +| `agent_run` | `sdk/nexent/core/agents/run_agent.py` | CHAIN | 线程启动、缓存读取、消息 yield | 追踪 Agent 异步 generator 消费过程 | +| `agent_run_thread` | `sdk/nexent/core/agents/run_agent.py` | CHAIN | Agent 创建、MCP 工具装载、执行错误 | 追踪实际 Agent 执行线程 | +| `{display_name or model_id}.generate` | `sdk/nexent/core/models/openai_llm.py` | LLM / generation | 模型、温度、top_p、消息、输入输出、token、TTFT、chunk 数 | LLM 性能、成本、输出和异常分析 | +| `python_interpreter` | `sdk/nexent/core/agents/core_agent.py` | TOOL | 生成代码、step 
number、执行输出、日志、是否最终答案 | 观测 CodeAgent 解释器执行 | +| 真实工具名 | `sdk/nexent/core/agents/nexent_agent.py` | TOOL | local/MCP/langchain/builtin 工具输入输出 | 观测真实工具可用性、延迟、错误和输入输出 | +| `FinalAnswerTool` | `sdk/nexent/core/agents/core_agent.py` | TOOL | 最终答案输出 | 让 Phoenix/Langfuse 中能明确看到最终答案节点 | +| `trace_agent` / `trace_chain` / `trace_retriever` | SDK 公共 API | AGENT / CHAIN / RETRIEVER | 自定义输入输出、metadata、tags、session、user | SDK 用户自定义层级埋点 | +| `trace_tool_call` | SDK 公共 API | TOOL | 工具名、输入、输出、耗时、错误 | SDK 用户自定义工具埋点 | ### 事件清单 @@ -221,9 +317,8 @@ headers: | `agent_service.run_agent_stream` | `streaming_response.creating` / `streaming_response.created` / `run_agent_stream.preparation_completed` | `duration`、`media_type`、`total_preparation_time` | 观测 SSE 响应创建和整体准备耗时 | | `generate_stream_no_memory` | `generate_stream_no_memory.started` / `generate_stream_no_memory.completed` / `generate_stream_no_memory.streaming.started` / `generate_stream_no_memory.streaming.completed` | 无 | 观测无记忆路径的准备和流式消费边界 | | `agent_run` | `agent_run.started` / `agent_run.thread_started` / `agent_run.get_cached_message` / `agent_run.get_cached_message_completed` / `agent_run.yield_message` | 无 | 观测 Agent 线程启动、缓存轮询和消息 yield | -| `monitor_llm_call` | `llm_call_started` / `llm_call_completed` / `llm_call_error` | `error.*` | 统一记录 LLM 调用生命周期 | -| `openai_chat.chat_completion` | `completion_started` / `completion_finished` / `model_stopped` / `error_occurred` | `model_id`、`temperature`、`top_p`、`message_count`、`total_duration`、`output_length`、`chunk_count`、`error.*` | 分析模型参数、流式输出耗时、停止和异常 | -| `trace_tool_call` | span 属性 `agent.tool.input` / `agent.tool.output` | JSON 字符串、`agent.tool.duration_ms`、`error.*` | 分析工具输入输出、耗时和异常 | +| LLM span | `completion_started` / `first_token_received` / `token_generated` / `completion_finished` / `model_stopped` / `error_occurred` | `model_id`、`temperature`、`top_p`、`message_count`、`total_duration`、`output_length`、`chunk_count`、`error.*` | 分析模型参数、流式输出耗时、停止和异常 | +| Tool span | span 属性 `agent.tool.input` / `agent.tool.output` | JSON 字符串、`agent.tool.duration_ms`、`error.*` | 分析工具输入输出、耗时和异常 | ## 指标 @@ -245,58 +340,80 @@ headers: flowchart TD U[用户] --> FE[前端 Chat] FE --> API[POST /agent/run] - API --> S1[agent.run span] - S1 --> S2[agent_service.run_agent_stream span] - S2 --> A[user_resolution] - S2 --> B[user_message_save] - S2 --> C[memory_context_build] - C --> D{streaming_strategy} - D -->|with_memory| E[generate_stream_with_memory] - D -->|no_memory| F[generate_stream_no_memory span] - E --> G[StreamingResponse] - F --> G - G --> H[agent_run async generator span] - H --> I[agent_run_thread span] - I --> J[NexentAgent] - J --> K[Tool / MCP / HTTP spans] - J --> L[chat_completion span] - L --> M[token events and LLM metrics] - K --> OTel[OpenTelemetry Tracer/Meter Provider] - M --> OTel + API --> HTTP[FastAPI HTTP span: 可配置隐藏] + HTTP --> A0[agent.run span: AGENT] + A0 --> S1[agent_service.run_agent_stream: CHAIN] + S1 --> R[user_resolution events] + S1 --> Save[user_message_save events] + S1 --> Mem[memory_context_build events] + Mem --> Strategy{streaming_strategy} + Strategy -->|with_memory| G1[generate_stream_with_memory: CHAIN] + Strategy -->|no_memory| G2[generate_stream_no_memory: CHAIN] + G1 --> AR[agent_run async generator: CHAIN] + G2 --> AR + AR --> Thread[agent_run_thread: CHAIN] + Thread --> NX[NexentAgent / CoreAgent] + NX --> Step[Agent step / code action] + Step --> LLM[Model.generate: LLM / generation] + Step --> PY[python_interpreter: TOOL] + PY --> Tool[Real local / MCP / langchain / builtin tool: 
TOOL] + PY --> Final[FinalAnswerTool: TOOL] + LLM --> Attr1[OpenInference + Langfuse attrs] + Tool --> Attr1 + Final --> Attr1 + Attr1 --> OTel[OpenTelemetry Tracer/Meter Provider] OTel --> Collector[OTLP Collector] Collector --> Phoenix[Phoenix] Collector --> Langfuse[Langfuse] + Collector --> Tempo[Grafana Tempo] Collector --> Other[Jaeger / Custom Backend] ``` +预期平台树形结构: + +```text +agent.run agent +└─ agent_service.run_agent_stream chain + └─ agent_service.generate_* chain + └─ agent_run chain + └─ agent_run_thread chain + ├─ Model.generate llm / generation + ├─ python_interpreter tool + │ └─ RealTool tool + └─ FinalAnswerTool tool +``` + +FastAPI HTTP span 可以保留在最上层用于接口视角,也可以通过 `MONITORING_FASTAPI_EXCLUDED_URLS=/agent/run` 在 AI trace 视图中隐藏。 + ## 监控页面结构 ```mermaid flowchart TB Page[Agent 监控页] --> Filters[筛选区: 时间 / 租户 / 用户 / Agent / 会话 / 模型 / 状态] - Page --> KPIs[指标区: 成功率 / P95 / TTFT / tokens/s / token 成本 / 错误数] - Page --> List[Trace 列表] + Page --> KPIs[指标区: 成功率 / P95 / TTFT / tokens/s / token 成本 / 工具错误数] + Page --> TraceList[Trace 列表: Agent / 会话 / 用户 / 状态 / 耗时 / Token / 模型 / 最后错误] Page --> Detail[Trace 详情] - Detail --> Waterfall[Span 瀑布图] - Detail --> Timeline[Agent 时间线] - Detail --> LLM[LLM 调用面板] - Detail --> Tool[工具调用面板] - Detail --> Raw[原始 OTel 属性] - Detail --> Eval[反馈和评估] + Detail --> Waterfall[Span 瀑布图: agent / chain / llm / tool] + Detail --> Timeline[Agent 时间线: 准备 / 记忆 / LLM / 工具 / 最终答案] + Detail --> LLMPanel[LLM 面板: prompt / output / token / TTFT / generation rate] + Detail --> ToolPanel[工具面板: 工具名 / 输入 / 输出 / 耗时 / 错误] + Detail --> Session[会话和用户上下文] + Detail --> Raw[原始 OTel 属性和 events] + Detail --> Eval[反馈、评分和评估] ``` 与 Phoenix 和 Langfuse 对比: -| 方案 | 优点 | 不足 | -|------|------|------| -| Phoenix | OpenInference 生态匹配好,适合 trace debug、实验、评估;`phoenix.otel` 可降低接入成本 | Nexent 的租户、权限、Agent 配置需要额外映射 | -| Langfuse | Trace、session、user、prompt、evaluation、dashboard 能力完整,OTLP endpoint 和 SDK 都基于 OpenTelemetry | 需要补充 `langfuse.*` 属性才能获得更好的筛选聚合体验 | -| Nexent 自建页 | 可直接关联租户、会话、Agent 配置和权限,适合产品内闭环 | 需要自建 trace 存储、查询、聚合和瀑布图 | +| 方案 | 优点 | 不足 | Nexent 当前适配 | +|------|------|------|----------------| +| Phoenix | OpenInference 生态匹配好,适合 trace debug、实验、评估;`phoenix.otel` 可降低接入成本 | Nexent 的租户、权限、Agent 配置需要通过属性映射;HTTP 自动 span 容易产生 `unknown` 噪声 | 写入 `openinference.span.kind`、`input.value`、`output.value`、`metadata`、`session.id`、`user.id`,并支持 FastAPI 降噪 | +| Langfuse | Trace、session、user、prompt、evaluation、dashboard 能力完整,适合 LLMOps 闭环 | 需要 `langfuse.*` 属性才能获得更好的 observation 类型、用户、会话和 metadata 聚合 | 写入 `langfuse.observation.type`、`langfuse.session.id`、`langfuse.user.id`、`langfuse.trace.metadata.*`、`langfuse.observation.input/output` | +| Nexent 自建页 | 可直接关联租户、会话、Agent 配置、权限、版本和业务动作,适合产品内闭环 | 需要自建 trace 存储、查询、聚合、瀑布图、权限隔离和成本统计 | 当前先通过 OTLP 对接外部平台,后续可基于同一批属性构建自有页面 | 推荐路径: -1. 短期使用 OTLP 对接 Phoenix/Langfuse,先满足调试和分析。 -2. 中期在 Nexent 增加 trace 跳转、轻量指标概览。 +1. 短期使用 OTLP 对接 Phoenix/Langfuse,满足调试和分析。 +2. 中期在 Nexent 增加 trace 跳转、轻量指标概览和异常聚合。 3. 
长期按租户、会话、Agent 版本建立自有监控页,同时保留 OTLP 双写能力。 ## 已修复的设计风险 @@ -305,8 +422,39 @@ flowchart TB |------|------| | async generator span 提前结束 | `monitor_endpoint` 使用 `inspect.isasyncgenfunction`,在 `async for` 消费期间保持 span 打开 | | `/v1/traces` 路径重复拼接 | SDK 支持 base endpoint 和 signal endpoint 自动归一化 | -| Collector header 无法兼容平台 | Collector 默认只 logging;平台转发示例改用 `otlphttp/` exporter,并拆分 `Authorization`、`x-api-key`、`x-langfuse-ingestion-version` | -| 单测漏掉流式函数 | 增加 async generator 装饰器测试 | +| Collector header 无法兼容平台 | Collector 默认只 logging;平台转发配置拆分 `Authorization`、`x-api-key`、`x-langfuse-ingestion-version` | +| Phoenix 只看到接口看不到 Agent | 顶层 `agent.run` 标记为 AGENT,内部服务、线程、generator 标记为 CHAIN | +| Phoenix/Langfuse 中出现大量 `unknown POST /agent/run http ...` | 默认排除 FastAPI ASGI `receive/send` span;requests 自动埋点默认关闭;可配置隐藏 `/agent/run` HTTP span | +| Langfuse 无法识别 observation 类型 | 增加 `langfuse.observation.type` 和 trace/session/user/metadata/input/output 属性 | +| LLM span 不明显或缺输出 | LLM span 命名为 `{display_name or model_id}.generate`,并写入 `output.value` 和 `langfuse.observation.output` | +| 工具 span 缺失 | 在 `NexentAgent.create_single_agent` 统一包装 local/MCP/langchain/builtin 工具,并在 `CoreAgent` 增加 `python_interpreter` 和 `FinalAnswerTool` span | +| 单测漏掉流式函数 | 增加 async generator 装饰器测试和 OpenInference/Langfuse 属性测试 | + +## 使用建议 + +只看 Agent 业务链路时: + +```bash +MONITORING_INSTRUMENT_FASTAPI=true +MONITORING_FASTAPI_EXCLUDE_SPANS=receive,send +MONITORING_FASTAPI_EXCLUDED_URLS=/agent/run +MONITORING_INSTRUMENT_REQUESTS=false +``` + +同时看接口入口和 Agent 业务链路时: + +```bash +MONITORING_INSTRUMENT_FASTAPI=true +MONITORING_FASTAPI_EXCLUDE_SPANS=receive,send +MONITORING_FASTAPI_EXCLUDED_URLS= +MONITORING_INSTRUMENT_REQUESTS=false +``` + +需要排查外部 HTTP 依赖时: + +```bash +MONITORING_INSTRUMENT_REQUESTS=true +``` ## 参考 diff --git a/docker/.env.bak b/docker/.env.bak deleted file mode 100644 index 24b53751b..000000000 --- a/docker/.env.bak +++ /dev/null @@ -1,168 +0,0 @@ -# ===== Necessary Configs (Necessary till now, will be migrated to frontend page) ===== - -# Voice Service Config -APPID=app_id -TOKEN=token - -# ===== Non-essential Configs (Modify if you know what you are doing) ===== - -CLUSTER=volcano_tts -VOICE_TYPE=zh_male_jieshuonansheng_mars_bigtts -SPEED_RATIO=1.3 - -# ===== Proxy Configuration (Optional) ===== - -# HTTP_PROXY=http://proxy-server:port -# HTTPS_PROXY=http://proxy-server:port -# NO_PROXY=localhost,127.0.0.1 - -# ===== Backend Configuration (No need to modify at all) ===== - -# Model Path Config -CLIP_MODEL_PATH=/opt/models/clip-vit-base-patch32 -NLTK_DATA=/opt/models/nltk_data - -# Elasticsearch Service -ELASTICSEARCH_HOST=http://nexent-elasticsearch:9200 -ELASTIC_PASSWORD=nexent@2025 - -# Elasticsearch Memory Configuration -ES_JAVA_OPTS="-Xms2g -Xmx2g" - -# Elasticsearch Disk Watermark Configuration -ES_DISK_WATERMARK_LOW=85% -ES_DISK_WATERMARK_HIGH=90% -ES_DISK_WATERMARK_FLOOD_STAGE=95% - -# Main Services -# Config service (port 5010) - Main API service for config operations -CONFIG_SERVICE_URL=http://nexent-config:5010 -ELASTICSEARCH_SERVICE=http://nexent-config:5010/api - -# Runtime service (port 5014) - Runtime execution service for agent operations -RUNTIME_SERVICE_URL=http://nexent-runtime:5014 - -# MCP service (port 5011) - MCP protocol service -NEXENT_MCP_SERVER=http://nexent-mcp:5011 -MCP_MANAGEMENT_API=http://nexent-mcp:5015 - -# Data process service (port 5012) - Data processing service -DATA_PROCESS_SERVICE=http://nexent-data-process:5012/api - -# Northbound service (port 5013) - Northbound API service 
-NORTHBOUND_API_SERVER=http://nexent-northbound:5013/api - -# Postgres Config -POSTGRES_HOST=nexent-postgresql -POSTGRES_USER=root -NEXENT_POSTGRES_PASSWORD=nexent@4321 -POSTGRES_DB=nexent -POSTGRES_PORT=5432 - -# Minio Config -MINIO_ENDPOINT=http://nexent-minio:9000 -MINIO_ROOT_USER=nexent -MINIO_ROOT_PASSWORD=nexent@4321 -MINIO_REGION=cn-north-1 -MINIO_DEFAULT_BUCKET=nexent - -# Redis Config -REDIS_URL=redis://redis:6379/0 -REDIS_BACKEND_URL=redis://redis:6379/1 - -# Model Engine Config -MODEL_ENGINE_ENABLED=false - -# Supabase Config -DASHBOARD_USERNAME=supabase -DASHBOARD_PASSWORD=Huawei123 - -# Supabase db Config -SUPABASE_POSTGRES_PASSWORD=Huawei123 -SUPABASE_POSTGRES_HOST=db -SUPABASE_POSTGRES_DB=supabase -SUPABASE_POSTGRES_PORT=5436 - -# Supabase Auth Config -SITE_URL=http://localhost:3011 -SUPABASE_URL=http://supabase-kong-mini:8000 -API_EXTERNAL_URL=http://supabase-kong-mini:8000 -DISABLE_SIGNUP=false -JWT_EXPIRY=3600 -DEBUG_JWT_EXPIRE_SECONDS=0 - -# Supabase Configuration -ENABLE_EMAIL_SIGNUP=true -ENABLE_EMAIL_AUTOCONFIRM=true -ENABLE_ANONYMOUS_USERS=false - -# Supabase Phone Config -ENABLE_PHONE_SIGNUP=false -ENABLE_PHONE_AUTOCONFIRM=false - -MAILER_URLPATHS_CONFIRMATION="/auth/v1/verify" -MAILER_URLPATHS_INVITE="/auth/v1/verify" -MAILER_URLPATHS_RECOVERY="/auth/v1/verify" -MAILER_URLPATHS_EMAIL_CHANGE="/auth/v1/verify" - -INVITE_CODE=nexent2025 - -# Terminal Tool SSH Key Path -SSH_PRIVATE_KEY_PATH=/path/to/openssh-server/ssh-keys/openssh_server_key - -# ===== Data Processing Service Configuration ===== - -# Redis Port -REDIS_PORT=6379 - -# Flower Monitoring -FLOWER_PORT=5555 - -# Ray Configuration -RAY_ACTOR_NUM_CPUS=2 -RAY_DASHBOARD_PORT=8265 -RAY_DASHBOARD_HOST=0.0.0.0 -RAY_NUM_CPUS=4 -RAY_OBJECT_STORE_MEMORY_GB=0.25 -RAY_TEMP_DIR=/tmp/ray -RAY_LOG_LEVEL=INFO - -# Service Control Flags -DISABLE_RAY_DASHBOARD=true -DISABLE_CELERY_FLOWER=true -DOCKER_ENVIRONMENT=false -ENABLE_UPLOAD_IMAGE=false - -# Celery Configuration -CELERY_WORKER_PREFETCH_MULTIPLIER=1 -CELERY_TASK_TIME_LIMIT=3600 -ELASTICSEARCH_REQUEST_TIMEOUT=30 - -# Worker Configuration -QUEUES=process_q,forward_q -WORKER_NAME= -WORKER_CONCURRENCY=4 - -# Skills Configuration -SKILLS_PATH=/mnt/nexent/skills - -# Telemetry and Monitoring Configuration -ENABLE_TELEMETRY=false -SERVICE_NAME=nexent-backend -JAEGER_ENDPOINT=http://localhost:14268/api/traces -PROMETHEUS_PORT=8000 -TELEMETRY_SAMPLE_RATE=1.0 -LLM_SLOW_REQUEST_THRESHOLD_SECONDS=5.0 -LLM_SLOW_TOKEN_RATE_THRESHOLD=10.0 - -# Market Backend Address -MARKET_BACKEND=http://60.204.251.153:8010 -DEPLOYMENT_VERSION="speed" -# Root dir -ROOT_DIR="/c/Users/18270/nexent-data" -TERMINAL_MOUNT_DIR="/opt/terminal" -SSH_USERNAME="root" -SSH_PASSWORD="731215" -NEXENT_MCP_DOCKER_IMAGE="ccr.ccs.tencentyun.com/nexent-hub/nexent-mcp:v2.0.1" -MINIO_ACCESS_KEY="72c31cb5b521511cea652723" -MINIO_SECRET_KEY="m5gcSuKzZnp84CqmG7z5VKnd2C+H5U3PSr7eoJeygmI=" diff --git a/docker/.env.example b/docker/.env.example index f48cd5e55..29ffeaa33 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -160,7 +160,7 @@ ENABLE_TELEMETRY=false # Optional JSON/YAML config file. Environment variables override file values when set. 
MONITORING_CONFIG_FILE= -# Provider profile: otlp, phoenix, langfuse, jaeger, custom +# Provider profile: otlp, phoenix, langfuse, jaeger, grafana, custom MONITORING_PROVIDER=otlp MONITORING_USE_PLATFORM_SDK=false MONITORING_PROJECT_NAME=nexent diff --git a/docker/docker-compose-monitoring.yml b/docker/docker-compose-monitoring.yml index c8c1b4078..776fdf44c 100644 --- a/docker/docker-compose-monitoring.yml +++ b/docker/docker-compose-monitoring.yml @@ -1,3 +1,5 @@ +name: monitor + services: otel-collector: image: otel/opentelemetry-collector-contrib:0.89.0 @@ -29,6 +31,41 @@ services: - nexent_nexent restart: unless-stopped + tempo: + image: grafana/tempo:${TEMPO_VERSION:-2.10.1} + container_name: nexent-tempo + profiles: ["grafana"] + command: ["--config.file=/etc/tempo.yml"] + volumes: + - ./monitoring/tempo.yml:/etc/tempo.yml:ro + - tempo-data:/var/tempo + ports: + - "${TEMPO_PORT:-3200}:3200" + networks: + - nexent_nexent + restart: unless-stopped + + grafana: + image: grafana/grafana:${GRAFANA_VERSION:-latest} + container_name: nexent-grafana + profiles: ["grafana"] + environment: + GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin} + GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-nexent-grafana-admin} + GF_USERS_ALLOW_SIGN_UP: "false" + GF_USERS_DEFAULT_LANGUAGE: ${GRAFANA_DEFAULT_LANGUAGE:-zh-Hans} + volumes: + - grafana-data:/var/lib/grafana + - ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro + - ./monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro + ports: + - "${GRAFANA_PORT:-3002}:3000" + depends_on: + - tempo + networks: + - nexent_nexent + restart: unless-stopped + langfuse-worker: image: docker.io/langfuse/langfuse-worker:${LANGFUSE_VERSION:-3} container_name: nexent-langfuse-worker @@ -212,3 +249,5 @@ volumes: langfuse-clickhouse-logs: langfuse-minio-data: langfuse-redis-data: + grafana-data: + tempo-data: diff --git a/docker/monitoring/grafana/dashboards/nexent-llm-agent.json b/docker/monitoring/grafana/dashboards/nexent-llm-agent.json new file mode 100644 index 000000000..d4e2c321b --- /dev/null +++ b/docker/monitoring/grafana/dashboards/nexent-llm-agent.json @@ -0,0 +1,150 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Nexent Agent traces backed by Grafana Tempo.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [ + { + "asDropdown": false, + "icon": "external link", + "includeVars": false, + "keepTime": true, + "tags": [], + "targetBlank": false, + "title": "Open Tempo Explore", + "tooltip": "Open Grafana Explore with the Tempo datasource", + "type": "link", + "url": "/explore?left=%7B%22datasource%22:%22Tempo%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22query%22:%22%7B%20resource.service.name%20%3D%20%5C%22nexent-backend%5C%22%20%7D%22,%22queryType%22:%22traceql%22%7D%5D%7D" + } + ], + "panels": [ + { + "datasource": { + "type": "tempo", + "uid": "Tempo" + }, + "description": "Recent traces for Nexent backend. 
Open a trace row to inspect the agent, chain, LLM, and tool span waterfall.", + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 16, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "11.0.0", + "targets": [ + { + "datasource": { + "type": "tempo", + "uid": "Tempo" + }, + "limit": 100, + "query": "{ resource.service.name = \"nexent-backend\" }", + "queryType": "traceql", + "refId": "A", + "tableType": "traces" + } + ], + "title": "Recent Agent Traces", + "type": "table" + }, + { + "description": "TraceQL shortcuts for common Nexent views.", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 2, + "options": { + "code": { + "language": "plaintext", + "showLineNumbers": false, + "showMiniMap": false + }, + "content": "Service traces:\n{ resource.service.name = \"nexent-backend\" }\n\nAgent spans:\n{ resource.service.name = \"nexent-backend\" && span.openinference.span.kind = \"AGENT\" }\n\nLLM spans:\n{ resource.service.name = \"nexent-backend\" && span.openinference.span.kind = \"LLM\" }\n\nTool spans:\n{ resource.service.name = \"nexent-backend\" && span.openinference.span.kind = \"TOOL\" }\n\nError traces:\n{ resource.service.name = \"nexent-backend\" && status = error }", + "mode": "markdown" + }, + "pluginVersion": "11.0.0", + "title": "TraceQL Examples", + "type": "text" + } + ], + "preload": false, + "refresh": "30s", + "schemaVersion": 39, + "tags": [ + "nexent", + "agent", + "tempo" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Nexent Agent Trace Monitoring", + "uid": "nexent-llm-agent", + "version": 1, + "weekStart": "" +} diff --git a/docker/monitoring/grafana/provisioning/dashboards/dashboards.yml b/docker/monitoring/grafana/provisioning/dashboards/dashboards.yml new file mode 100644 index 000000000..b863e9d16 --- /dev/null +++ b/docker/monitoring/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: Nexent Monitoring + orgId: 1 + folder: Nexent + type: file + disableDeletion: false + updateIntervalSeconds: 30 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards diff --git a/docker/monitoring/grafana/provisioning/datasources/datasources.yml b/docker/monitoring/grafana/provisioning/datasources/datasources.yml new file mode 100644 index 000000000..e4eea64cf --- /dev/null +++ b/docker/monitoring/grafana/provisioning/datasources/datasources.yml @@ -0,0 +1,18 @@ +apiVersion: 1 + +deleteDatasources: + - name: Prometheus + orgId: 1 + - name: Jaeger + orgId: 1 + - name: Tempo + orgId: 1 + +datasources: + - name: Tempo + uid: Tempo + type: tempo + access: proxy + url: http://nexent-tempo:3200 + isDefault: true + editable: true diff --git a/docker/monitoring/monitoring.env b/docker/monitoring/monitoring.env index 38c382ab1..e6659bd8f 100644 --- a/docker/monitoring/monitoring.env +++ b/docker/monitoring/monitoring.env @@ -68,3 +68,12 @@ LANGFUSE_MINIO_CONSOLE_PORT=9093 LANGFUSE_S3_BUCKET=langfuse LANGFUSE_REDIS_AUTH=myredissecret 
LANGFUSE_REDIS_PORT=6380
+
+# Local Grafana stack. Used by: ./start-monitoring.sh --stack grafana
+GRAFANA_VERSION=latest
+GRAFANA_PORT=3002
+GRAFANA_ADMIN_USER=admin
+GRAFANA_ADMIN_PASSWORD=nexent-grafana-admin
+GRAFANA_DEFAULT_LANGUAGE=zh-Hans
+TEMPO_VERSION=2.10.1
+TEMPO_PORT=3200
diff --git a/docker/monitoring/monitoring.env.example b/docker/monitoring/monitoring.env.example
index fe040a911..4f78cb4f2 100644
--- a/docker/monitoring/monitoring.env.example
+++ b/docker/monitoring/monitoring.env.example
@@ -72,3 +72,12 @@ LANGFUSE_MINIO_CONSOLE_PORT=9093
 LANGFUSE_S3_BUCKET=langfuse
 LANGFUSE_REDIS_AUTH=myredissecret
 LANGFUSE_REDIS_PORT=6380
+
+# Local Grafana stack. Used by: ./start-monitoring.sh --stack grafana
+GRAFANA_VERSION=latest
+GRAFANA_PORT=3002
+GRAFANA_ADMIN_USER=admin
+GRAFANA_ADMIN_PASSWORD=nexent-grafana-admin
+GRAFANA_DEFAULT_LANGUAGE=zh-Hans
+TEMPO_VERSION=2.10.1
+TEMPO_PORT=3200
diff --git a/docker/monitoring/otel-collector-grafana-config.yml b/docker/monitoring/otel-collector-grafana-config.yml
new file mode 100644
index 000000000..85bb6092d
--- /dev/null
+++ b/docker/monitoring/otel-collector-grafana-config.yml
@@ -0,0 +1,50 @@
+receivers:
+  otlp:
+    protocols:
+      grpc:
+        endpoint: 0.0.0.0:4317
+      http:
+        endpoint: 0.0.0.0:4318
+
+processors:
+  batch:
+    timeout: 1s
+    send_batch_size: 512
+
+  memory_limiter:
+    limit_mib: 256
+    check_interval: 1s
+
+  resource:
+    attributes:
+      - key: service.name
+        value: nexent-backend
+        action: upsert
+      - key: service.version
+        from_attribute: version
+        action: insert
+
+exporters:
+  logging:
+    verbosity: normal
+
+  otlp/tempo:
+    endpoint: tempo:4317
+    tls:
+      insecure: true
+
+service:
+  pipelines:
+    traces:
+      receivers: [otlp]
+      processors: [memory_limiter, resource, batch]
+      exporters: [otlp/tempo, logging]
+
+    metrics:
+      receivers: [otlp]
+      processors: [memory_limiter, resource, batch]
+      exporters: [logging]
+
+  telemetry:
+    logs:
+      level: "info"
diff --git a/docker/monitoring/otel-collector-langfuse-config.yml b/docker/monitoring/otel-collector-langfuse-config.yml
index 667758882..c3f0a1af4 100644
--- a/docker/monitoring/otel-collector-langfuse-config.yml
+++ b/docker/monitoring/otel-collector-langfuse-config.yml
@@ -33,6 +33,24 @@ exporters:
     headers:
       Authorization: ${env:LANGFUSE_OTLP_AUTH_HEADER}
      x-langfuse-ingestion-version: "4"
+    # 1. Timeout control
+    #    Keeps the Collector from waiting too long and piling up goroutines
+    timeout: 5s
+
+    # 2. Sending queue
+    #    Buffers data in Collector memory when the backend slows down
+    sending_queue:
+      enabled: true
+      num_consumers: 10   # number of concurrent export workers (raises send throughput)
+      queue_size: 5000    # maximum number of queued batches; new data is dropped once the queue is full
+
+    # 3. Retry on failure
+    #    Retries with exponential backoff on transient errors such as network jitter or a 503 from the backend
+    retry_on_failure:
+      enabled: true
+      initial_interval: 1s    # first retry after 1s
+      max_interval: 30s       # retry interval capped at 30s
+      max_elapsed_time: 300s  # retry a batch for at most 5 minutes, then give up and drop it
 
 service:
   pipelines:
diff --git a/docker/monitoring/otel-collector-phoenix-config.yml b/docker/monitoring/otel-collector-phoenix-config.yml
index 4fa415aac..f148d0a84 100644
--- a/docker/monitoring/otel-collector-phoenix-config.yml
+++ b/docker/monitoring/otel-collector-phoenix-config.yml
@@ -30,6 +30,24 @@ exporters:
 
   otlphttp/phoenix:
     endpoint: http://phoenix:6006
+    # 1. Timeout control
+    #    Keeps the Collector from waiting too long and piling up goroutines
+    timeout: 5s
+
+    # 2. Sending queue
+    #    Buffers data in Collector memory when the backend slows down
+    sending_queue:
+      enabled: true
+      num_consumers: 10   # number of concurrent export workers (raises send throughput)
+      queue_size: 5000    # maximum number of queued batches; new data is dropped once the queue is full
+
+    # 3. Retry on failure
+    #    Retries with exponential backoff on transient errors such as network jitter or a 503 from the backend
+    retry_on_failure:
+      enabled: true
+      initial_interval: 1s    # first retry after 1s
+      max_interval: 30s       # retry interval capped at 30s
+      max_elapsed_time: 300s  # retry a batch for at most 5 minutes, then give up and drop it
 
 service:
   pipelines:
diff --git a/docker/monitoring/tempo.yml b/docker/monitoring/tempo.yml
new file mode 100644
index 000000000..429972501
--- /dev/null
+++ b/docker/monitoring/tempo.yml
@@ -0,0 +1,42 @@
+target: all
+multitenancy_enabled: false
+
+server:
+  http_listen_port: 3200
+
+distributor:
+  receivers:
+    otlp:
+      protocols:
+        grpc:
+          endpoint: 0.0.0.0:4317
+        http:
+          endpoint: 0.0.0.0:4318
+
+metrics_generator:
+  ring:
+    kvstore:
+      store: inmemory
+  storage:
+    path: /var/tempo/generator/wal
+    remote_write: []
+  traces_storage:
+    path: /var/tempo/generator/traces
+  processor:
+    local_blocks:
+      filter_server_spans: false
+      flush_to_storage: true
+
+storage:
+  trace:
+    backend: local
+    wal:
+      path: /var/tempo/wal
+    local:
+      path: /var/tempo/blocks
+
+overrides:
+  defaults:
+    metrics_generator:
+      processors:
+        - local-blocks
diff --git a/docker/start-monitoring.sh b/docker/start-monitoring.sh
index 6ab628574..28a58a7c3 100755
--- a/docker/start-monitoring.sh
+++ b/docker/start-monitoring.sh
@@ -2,7 +2,7 @@
 # Nexent LLM Performance Monitoring Setup Script
 
 # This script starts the OpenTelemetry Collector alone, or with a local
-# Phoenix/Langfuse observability backend.
+# Phoenix/Langfuse/Grafana observability backend.
 
 set -e
 
@@ -12,13 +12,14 @@ COMPOSE_FILE="$SCRIPT_DIR/docker-compose-monitoring.yml"
 
 usage() {
     cat <<EOF
+Usage: $(basename "$0") [collector|phoenix|langfuse|grafana]
+       $(basename "$0") --stack <name>
 
 Stacks:
   collector   Start OpenTelemetry Collector only. This is the default.
   phoenix     Start Collector and local Arize Phoenix.
   langfuse    Start Collector and local Langfuse self-host stack.
+  grafana     Start Collector, Grafana, and Tempo.
 
 Set MONITORING_STACK in monitoring/monitoring.env to change the default.
 EOF
@@ -40,7 +41,7 @@ while [ $# -gt 0 ]; do
       usage
       exit 0
       ;;
-    collector|phoenix|langfuse)
+    collector|phoenix|langfuse|grafana)
       STACK_ARG="$1"
       shift
       ;;
@@ -61,11 +62,11 @@ if ! docker info > /dev/null 2>&1; then
 fi
 
 # Create external network if it doesn't exist
-if ! docker network ls | grep -q nexent-network; then
-  echo "🔗 Creating nexent-network..."
-  docker network create nexent-network
+if ! docker network ls | grep -q nexent_nexent; then
+  echo "🔗 Creating nexent_nexent..."
+  docker network create nexent_nexent
 else
-  echo "✅ nexent-network already exists"
+  echo "✅ nexent_nexent already exists"
 fi
 
 # Copy environment file if it doesn't exist
@@ -101,6 +102,10 @@ case "$MONITORING_STACK" in
     fi
     export LANGFUSE_OTLP_AUTH_HEADER
     ;;
+  grafana)
+    OTEL_COLLECTOR_CONFIG_FILE="${OTEL_COLLECTOR_CONFIG_FILE:-./monitoring/otel-collector-grafana-config.yml}"
+    COMPOSE_PROFILES=(--profile grafana)
+    ;;
   *)
     echo "❌ Error: unsupported MONITORING_STACK '$MONITORING_STACK'."
     usage
@@ -120,7 +125,7 @@ fi
 
 # Start monitoring services
 echo "🐳 Starting monitoring services with stack: $MONITORING_STACK"
-"${COMPOSE_CMD[@]}" -f "$COMPOSE_FILE" --env-file "$MONITORING_DIR/monitoring.env" "${COMPOSE_PROFILES[@]}" up -d
+"${COMPOSE_CMD[@]}" -f "$COMPOSE_FILE" --env-file "$MONITORING_DIR/monitoring.env" "${COMPOSE_PROFILES[@]}" up -d --remove-orphans
 
 # Wait for services to be ready
 echo "⏳ Waiting for services to start..."
@@ -154,6 +159,10 @@ case "$MONITORING_STACK" in langfuse) check_service "Langfuse UI" "http://localhost:${LANGFUSE_PORT:-3001}" "${LANGFUSE_PORT:-3001}" || true ;; + grafana) + check_service "Grafana" "http://localhost:${GRAFANA_PORT:-3002}/api/health" "${GRAFANA_PORT:-3002}" || true + check_service "Tempo API" "http://localhost:${TEMPO_PORT:-3200}/ready" "${TEMPO_PORT:-3200}" || true + ;; esac echo "" @@ -170,24 +179,41 @@ case "$MONITORING_STACK" in echo " • Langfuse UI: http://localhost:${LANGFUSE_PORT:-3001}" echo " • Langfuse admin: ${LANGFUSE_INIT_USER_EMAIL:-admin@nexent.local} / ${LANGFUSE_INIT_USER_PASSWORD:-nexent-langfuse-admin}" ;; + grafana) + echo " • Grafana UI: http://localhost:${GRAFANA_PORT:-3002}" + echo " • Grafana admin: ${GRAFANA_ADMIN_USER:-admin} / ${GRAFANA_ADMIN_PASSWORD:-nexent-grafana-admin}" + echo " • Tempo API: http://localhost:${TEMPO_PORT:-3200}" + ;; collector) - echo " • Configure Phoenix, Langfuse, Jaeger, or another OTLP backend in monitoring.env" + echo " • Configure Phoenix, Langfuse, Tempo, Jaeger, or another OTLP backend in monitoring.env" ;; esac echo "" echo "🔧 To enable monitoring in your Nexent backend:" echo " 1. Set ENABLE_TELEMETRY=true in your .env file" -echo " 2. Set OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 for Docker services" +case "$MONITORING_STACK" in + collector) + BACKEND_MONITORING_PROVIDER="otlp" + ;; + grafana) + BACKEND_MONITORING_PROVIDER="grafana" + ;; + *) + BACKEND_MONITORING_PROVIDER="$MONITORING_STACK" + ;; +esac +echo " 2. Set MONITORING_PROVIDER=$BACKEND_MONITORING_PROVIDER in your .env file" +echo " 3. Set OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 for Docker services" echo " or http://localhost:${OTEL_COLLECTOR_HTTP_PORT:-4318} for a backend running on the host" -echo " 3. Install performance dependencies:" +echo " 4. Install performance dependencies:" echo " uv sync --extra performance" -echo " 4. Restart your Nexent backend service" +echo " 5. 
Restart your Nexent backend service"
 echo ""
-echo "📈 Key Metrics to Monitor:"
-echo "   • Token Generation Rate (tokens/second)"
-echo "   • Time to First Token (TTFT)"
-echo "   • Request Duration"
-echo "   • Error Rates"
+echo "🔎 Key Trace Data to Inspect:"
+echo "   • Agent span hierarchy"
+echo "   • LLM generation spans"
+echo "   • Tool call spans"
+echo "   • Error events"
 echo ""
 echo "🛑 To stop monitoring services:"
-echo "   ${COMPOSE_CMD[*]} -f $COMPOSE_FILE --env-file $MONITORING_DIR/monitoring.env --profile phoenix --profile langfuse down"
+echo "   ${COMPOSE_CMD[*]} -f $COMPOSE_FILE --env-file $MONITORING_DIR/monitoring.env --profile phoenix --profile langfuse --profile grafana down --remove-orphans"
diff --git a/frontend/components/navigation/TopNavbar.tsx b/frontend/components/navigation/TopNavbar.tsx
index 2fbeee744..9c4afbf16 100644
--- a/frontend/components/navigation/TopNavbar.tsx
+++ b/frontend/components/navigation/TopNavbar.tsx
@@ -1,26 +1,91 @@
 "use client";
 
-import { Button } from "antd";
+import { Button, Tooltip } from "antd";
 import { AvatarDropdown } from "@/components/auth/avatarDropdown";
 import { useTranslation } from "react-i18next";
-import { ChevronDown, Globe } from "lucide-react";
+import { Activity, ChevronDown, Globe } from "lucide-react";
 import { Dropdown } from "antd";
 import Link from "next/link";
 import { HEADER_CONFIG, SIDER_CONFIG } from "@/const/layoutConstants";
 import { languageOptions } from "@/const/constants";
 import { useLanguageSwitch } from "@/lib/language";
-import React from "react";
+import React, { useEffect, useState } from "react";
 import { Flex, Layout } from "antd";
 import { ChatTopNavContent } from "./ChatTopNavContent";
 import { useAuthorizationContext } from "../providers/AuthorizationProvider";
 import { useDeployment } from "../providers/deploymentProvider";
+import { monitoringService } from "@/services/monitoringService";
+import type { MonitoringStatus } from "@/types/monitoring";
+
 const { Header } = Layout;
 
+const MONITORING_PROVIDER_UI: Record<string, { port: string; path: string }> = {
+  phoenix: { port: "6006", path: "/" },
+  langfuse: { port: "3001", path: "/project/nexent-local" },
+  jaeger: { port: "16686", path: "/" },
+  grafana: {
+    port: "3002",
+    path: "/d/nexent-llm-agent/nexent-agent-trace-monitoring?orgId=1",
+  },
+};
+
+function buildMonitoringUrl(status: MonitoringStatus | null): string | null {
+  if (!status?.telemetry_enabled || typeof window === "undefined") return null;
+
+  const providerConfig = status.provider
+    ? MONITORING_PROVIDER_UI[status.provider.toLowerCase()]
+    : null;
+  const dashboardPort = status.dashboard_port || providerConfig?.port;
+
+  if (dashboardPort) {
+    const path = status.dashboard_path || providerConfig?.path || "/";
+    const normalizedPath = path.startsWith("/") ? path : `/${path}`;
+    return `${window.location.protocol}//${window.location.hostname}:${dashboardPort}${normalizedPath}`;
+  }
+
+  if (status.dashboard_url) {
+    try {
+      const url = new URL(status.dashboard_url);
+      if (["localhost", "127.0.0.1", "0.0.0.0"].includes(url.hostname)) {
+        url.hostname = window.location.hostname;
+      }
+      return url.toString();
+    } catch {
+      return status.dashboard_url;
+    }
+  }
+
+  return null;
+}
+
 export function TopNavbar({ isChatPage }: { isChatPage: boolean }) {
   const { t } = useTranslation("common");
   const { user, isLoading } = useAuthorizationContext();
-  const { isSpeedMode } = useDeployment()
+  const { isSpeedMode } = useDeployment();
   const { currentLanguage, handleLanguageChange } = useLanguageSwitch();
+  const [monitoringStatus, setMonitoringStatus] =
+    useState<MonitoringStatus | null>(null);
+
+  useEffect(() => {
+    let mounted = true;
+
+    monitoringService.fetchStatus().then((status) => {
+      if (mounted) {
+        setMonitoringStatus(status);
+      }
+    });
+
+    return () => {
+      mounted = false;
+    };
+  }, []);
+
+  const monitoringUrl = buildMonitoringUrl(monitoringStatus);
+
+  const openMonitoringDashboard = () => {
+    if (!monitoringUrl) return;
+    window.open(monitoringUrl, "_blank", "noopener,noreferrer");
+  };
 
   // Left content - Logo + optional additional title (aligned with sidebar width)
   const leftContent = (
@@ -61,6 +126,18 @@ export function TopNavbar({ isChatPage }: { isChatPage: boolean }) {
   // Right content - Additional content + default navigation items
   const rightContent = (
+      {monitoringUrl && (
+        