From 01fd95c0b2855fc87b1d117f063506e09b3253bf Mon Sep 17 00:00:00 2001 From: Bryan Bednarski Date: Wed, 14 Jan 2026 00:29:49 +0000 Subject: [PATCH 01/13] forward-compatible register_llm() ETCD registration Signed-off-by: Bryan Bednarski --- external/dynamo/optimized/ARCHITECTURE.md | 405 +++++ external/dynamo/optimized/PARAMETERS.md | 181 +++ external/dynamo/optimized/README.md | 273 ++++ external/dynamo/optimized/__init__.py | 28 + external/dynamo/optimized/config.yaml | 73 + external/dynamo/optimized/processor.py | 564 +++++++ external/dynamo/optimized/router.py | 1375 +++++++++++++++++ .../start_dynamo_optimized_thompson_hints.sh | 705 +++++++++ 8 files changed, 3604 insertions(+) create mode 100644 external/dynamo/optimized/ARCHITECTURE.md create mode 100644 external/dynamo/optimized/PARAMETERS.md create mode 100644 external/dynamo/optimized/README.md create mode 100644 external/dynamo/optimized/__init__.py create mode 100644 external/dynamo/optimized/config.yaml create mode 100644 external/dynamo/optimized/processor.py create mode 100644 external/dynamo/optimized/router.py create mode 100755 external/dynamo/start_dynamo_optimized_thompson_hints.sh diff --git a/external/dynamo/optimized/ARCHITECTURE.md b/external/dynamo/optimized/ARCHITECTURE.md new file mode 100644 index 0000000000..72e43e627a --- /dev/null +++ b/external/dynamo/optimized/ARCHITECTURE.md @@ -0,0 +1,405 @@ +# Optimized Thompson Sampling Router Architecture + +## Overview + +This architecture uses the **default Dynamo frontend** with custom **Processor** and **Router** components to implement Thompson Sampling-based intelligent worker selection with KV cache locality awareness. + +### Processor-as-Backend Pattern + +**Key insight**: The default Dynamo frontend has its own built-in router (`DYN_ROUTER_MODE`) and routes directly to `dynamo.backend.generate`. To intercept requests and apply custom Thompson Sampling routing: + +1. 
**Processor registers as `dynamo.backend.generate`** - The frontend discovers our processor as the "backend" +2. **SGLang Worker registers as `dynamo.worker.generate`** - Our processor forwards to actual workers after routing +3. **Frontend's built-in router becomes irrelevant** - The frontend routes to `dynamo.backend.generate` which is our processor + +``` +Frontend (built-in router: round-robin) + → routes to dynamo.backend.generate + → OUR PROCESSOR (intercepts!) + → queries Thompson Sampling router + → forwards to dynamo.worker.generate (actual SGLang workers) +``` + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ CLIENT │ +│ │ +│ POST /v1/chat/completions │ +│ { │ +│ "model": "llama-3.3-70b", │ +│ "messages": [...], │ +│ "nvext": { │ +│ "annotations": [ │ +│ "prefix_id:my-session-001", │ +│ "total_requests:10", │ +│ "osl:MEDIUM", │ +│ "iat:LOW" │ +│ ] │ +│ } │ +│ } │ +└─────────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ DEFAULT DYNAMO FRONTEND │ +│ (python -m dynamo.frontend) │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ OpenAI HTTP Server (port 8000) │ │ +│ │ • /v1/chat/completions │ │ +│ │ • /v1/models │ │ +│ │ • /health │ │ +│ │ • /metrics (Prometheus) │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Preprocessor │ │ +│ │ • Tokenization (chat template applied) │ │ +│ │ • NVExt parsing → PreprocessedRequest │ │ +│ │ • Annotations preserved: prefix_id, total_requests, osl, iat │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ │ PreprocessedRequest │ +│ │ (tokens + annotations + extra_args) │ 
+└────────────────────────────────────────┼────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ CUSTOM PROCESSOR │ +│ (registers as: dynamo.backend.generate) │ +│ (intercepts frontend requests!) │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ 1. Receive PreprocessedRequest from frontend │ │ +│ │ • Extract annotations: prefix_id, total_requests, osl, iat │ │ +│ │ • Compute reuse_budget = total_requests - processed_for_prefix │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ 2. Query Router (find_worker endpoint) │ │ +│ │ RouterRequest { │ │ +│ │ tokens: [...], │ │ +│ │ prefix_id: "my-session-001", │ │ +│ │ reuse_budget: 9, │ │ +│ │ expected_osl: "MEDIUM", │ │ +│ │ interarrival: "LOW" │ │ +│ │ } │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ 3. Route to Selected Backend Worker │ │ +│ │ • Use worker_id from router to direct request │ │ +│ │ • Stream response tokens back to frontend │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ 4. 
Send Feedback to Router │ │ +│ │ RouterFeedbackRequest { │ │ +│ │ decision_id: "abc123", │ │ +│ │ latency_ms: 245.5, │ │ +│ │ success: true, │ │ +│ │ tokens_in: 128, │ │ +│ │ tokens_out: 64 │ │ +│ │ } │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ Prometheus Metrics (port 8081): │ +│ • thompson_processor_requests_total │ +│ • thompson_processor_request_latency_seconds │ +│ • thompson_processor_tokens_processed_total │ +└─────────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ CUSTOM ROUTER │ +│ (dynamo/router component) │ +│ │ +│ Endpoints: │ +│ • find_worker: Select optimal worker for request │ +│ • feedback: Receive latency feedback to update bandits │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Thompson Sampling Algorithm │ │ +│ │ │ │ +│ │ Score(worker) = LinTS(features) + Beta_TS(worker) │ +│ │ + Affinity(prefix_sticky) │ │ +│ │ - SwitchCost(if switching) │ │ +│ │ × LoadModifier(queue, GPU, outstanding) │ │ +│ │ │ │ +│ │ Features (9-dim): │ │ +│ │ [1, inv_load, kv_overlap, affinity, outstanding, │ │ +│ │ decode_cost, prefill_cost, iat_factor, reuse_budget] │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ KV Cache Indexer │ │ +│ │ • Tracks KV cache blocks per worker │ │ +│ │ • Computes overlap scores for routing decisions │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Bandit State │ │ +│ │ • Beta bandits: (α, β) per worker │ │ +│ │ • LinTS: A matrix, b vector per worker │ │ +│ │ • Pending decisions awaiting feedback │ │ +│ │ • Latency EMA baselines (global, per-worker, per-bucket) │ │ +│ 
└─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ Prometheus Metrics (port 8081): │ +│ • thompson_router_decisions_total{worker_id} │ +│ • thompson_router_kv_overlap{worker_id} │ +│ • thompson_router_feedback_latency_seconds │ +│ • thompson_router_reward{worker_id} │ +│ • thompson_router_pending_decisions │ +└─────────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ BACKEND WORKER (Unified Mode) │ +│ (python -m dynamo.sglang) │ +│ (registers as: dynamo.worker.generate) │ +│ (NOT backend.generate - that's our processor!) │ +│ │ +│ Default Configuration (start_dynamo_optimized_thompson_hints.sh): │ +│ │ +│ ┌───────────────────────────────────────────────────────────────────────────┐ │ +│ │ Unified Worker │ │ +│ │ GPUs: 0,1,2,3 (DYNAMO_GPU_DEVICES) │ │ +│ │ TP: 4 (DYNAMO_TP_SIZE) │ │ +│ │ Endpoint: dynamo.worker.generate (--endpoint flag) │ │ +│ │ │ │ +│ │ • KV Cache (shared across TP ranks) │ │ +│ │ • SGLang Engine │ │ +│ │ • Prometheus Metrics (port 8081) │ │ +│ └───────────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ Environment Variables for GPU Configuration: │ +│ DYNAMO_GPU_DEVICES="0,1,2,3" # Which GPUs to use (default: 0,1,2,3) │ +│ DYNAMO_TP_SIZE=4 # Tensor parallelism degree (default: 4) │ +│ │ +│ Metrics exposed: │ +│ • sglang:* metrics on port 8081 │ +│ • dynamo_component_* metrics │ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +## Scaling to Multiple Workers (8-GPU Example) + +For systems with more GPUs, you can run multiple workers. The current startup script +runs a **single unified worker** by default. 
To scale to multiple workers: + +### Option A: Two Workers with TP=4 (8 GPUs total) +```bash +# Worker 1: GPUs 0-3 +export DYNAMO_GPU_DEVICES="0,1,2,3" +export DYNAMO_TP_SIZE=4 +# (start first worker) + +# Worker 2: GPUs 4-7 +export DYNAMO_GPU_DEVICES="4,5,6,7" +export DYNAMO_TP_SIZE=4 +# (start second worker) +``` + +### Option B: One Worker with TP=8 (8 GPUs, single worker) +```bash +export DYNAMO_GPU_DEVICES="0,1,2,3,4,5,6,7" +export DYNAMO_TP_SIZE=8 +``` + +> **Note**: The Thompson Sampling router benefits most from multiple workers, +> as it can learn optimal routing between them. With a single worker, the router +> still tracks KV cache overlap but cannot make routing decisions between workers. + +## Key Differences from Generalized Architecture + +| Aspect | Generalized | Optimized | +|--------|-------------|-----------| +| Frontend | Custom `frontend.py` with HTTP headers | Default `dynamo.frontend` with nvext | +| Hint Passing | HTTP headers (`x-prefix-*`) | `nvext.annotations` in request body | +| Tokenization | Custom (in frontend) | Handled by Dynamo preprocessor | +| Metrics | CSV files | Prometheus (`/metrics` endpoint) | +| Model Mapping | Custom `FRONTEND_MODEL_MAPPING` | Dynamo's `--model-name`/`--model-path` | +| **Processor Registration** | `dynamo.processor.process` | **`dynamo.backend.generate`** (intercepts frontend) | +| **Worker Registration** | `dynamo.backend.generate` | **`dynamo.worker.generate`** (processor forwards to) | + +### Why "Processor-as-Backend"? + +The default Dynamo frontend has a built-in router (`DYN_ROUTER_MODE=round-robin|random|kv`) that routes directly to `dynamo.backend.generate`. To inject our custom Thompson Sampling routing: + +1. **Processor claims `backend.generate`** - Frontend thinks it's talking to the backend +2. **Processor queries custom router** - Thompson Sampling selects best worker +3. **Processor forwards to `worker.generate`** - Actual SGLang workers +4. 
**Frontend's built-in router is irrelevant** - We've intercepted the request pipeline + +## NVExt Annotations + +The client passes routing hints via the `nvext.annotations` field in the request: + +```json +{ + "model": "llama-3.3-70b", + "messages": [{"role": "user", "content": "Hello!"}], + "nvext": { + "annotations": [ + "prefix_id:session-12345", + "total_requests:10", + "osl:MEDIUM", + "iat:LOW" + ] + } +} +``` + +### Annotation Keys + +| Key | Type | Description | Values | +|-----|------|-------------|--------| +| `prefix_id` | string | Unique identifier for request prefix/session | Any string | +| `total_requests` | int | Total expected requests for this prefix | Positive integer | +| `osl` | enum | Output Sequence Length expectation | `LOW`, `MEDIUM`, `HIGH` | +| `iat` | enum | Inter-Arrival Time (request frequency) | `LOW`, `MEDIUM`, `HIGH` | + +## Quick Start + +```bash +# Required: Set path to your model +export DYNAMO_MODEL_DIR="/path/to/Llama-3.3-70B-Instruct" + +# Optional: Configure GPU devices (default: 0,1,2,3) +export DYNAMO_GPU_DEVICES="0,1,2,3" +export DYNAMO_TP_SIZE=4 + +# Optional: Set model name (default: llama-3.3-70b) +export DYNAMO_MODEL_NAME="llama-3.3-70b" + +# Start the system +bash start_dynamo_optimized_thompson_hints.sh +``` + +## Component Startup Order + +1. **ETCD** - Service discovery and metadata +2. **NATS** - Message queue for KV events (if using kv router mode) +3. **Backend Worker** - SGLang GPU worker → registers at `dynamo.worker.generate` +4. **Router** - Thompson Sampling router → registers at `dynamo.router.{find_worker,feedback}` +5. **Processor** - Request orchestrator → **registers at `dynamo.backend.generate`** (intercepts frontend!) +6. 
**Frontend** - HTTP API server → routes to `dynamo.backend.generate` (our processor) + +> **Important**: The Processor must register as `backend.generate` before the Frontend starts, +> otherwise the Frontend might discover the SGLang worker directly (if it registered as `backend.generate`). + +## Prometheus Metrics + +All components expose metrics on port 8081 by default (`DYN_SYSTEM_PORT`): + +### Router Metrics +``` +thompson_router_decisions_total{worker_id="0"} 1234 +thompson_router_kv_overlap{worker_id="0"} 0.75 +thompson_router_feedback_latency_seconds_bucket{le="0.1"} 100 +thompson_router_reward{worker_id="0"} 0.65 +thompson_router_pending_decisions 5 +thompson_router_timeout_penalties_total 2 +``` + +### Processor Metrics +``` +thompson_processor_requests_total 5000 +thompson_processor_request_latency_seconds_bucket{le="1.0"} 4500 +thompson_processor_tokens_in_total 128000 +thompson_processor_tokens_out_total 64000 +thompson_processor_routing_decisions_total{worker_id="0"} 1234 +``` + +## Environment Variables + +### GPU and Worker Configuration + +These variables control how the backend worker uses GPUs. 
**Modify these to scale your deployment.** + +| Variable | Default | Description | +|----------|---------|-------------| +| `DYNAMO_GPU_DEVICES` | `0,1,2,3` | Comma-separated list of GPU device IDs to use | +| `DYNAMO_TP_SIZE` | `4` | Tensor parallelism degree (must match number of GPUs) | +| `DYNAMO_MODEL_DIR` | (required) | Path to the model directory on the host | +| `DYNAMO_MODEL_NAME` | `llama-3.3-70b` | Model name exposed to clients | +| `DYNAMO_SHM_SIZE` | `16g` | Shared memory size for the container | +| `DYNAMO_WORKER_INIT_TIMEOUT_S` | `600` | Timeout (seconds) for worker initialization | + +### Example GPU Configurations + +```bash +# Default: Single worker using GPUs 0-3 with TP=4 +export DYNAMO_GPU_DEVICES="0,1,2,3" +export DYNAMO_TP_SIZE=4 + +# 8-GPU system: Single worker using all 8 GPUs with TP=8 +export DYNAMO_GPU_DEVICES="0,1,2,3,4,5,6,7" +export DYNAMO_TP_SIZE=8 + +# 8-GPU system: Use only GPUs 4-7 with TP=4 +export DYNAMO_GPU_DEVICES="4,5,6,7" +export DYNAMO_TP_SIZE=4 + +# 2-GPU system: Use GPUs 0-1 with TP=2 +export DYNAMO_GPU_DEVICES="0,1" +export DYNAMO_TP_SIZE=2 +``` + +### Network and Metrics Configuration + +| Variable | Default | Description | +|----------|---------|-------------| +| `DYNAMO_HTTP_PORT` | `8000` | Frontend HTTP API port | +| `DYNAMO_METRICS_PORT` | `8081` | Prometheus metrics port | +| `DYN_HTTP_PORT` | `8000` | Dynamo frontend HTTP port (same as above) | +| `DYN_SYSTEM_PORT` | `8081` | Dynamo system/metrics port | +| `DYNAMO_ROUTER_WAIT_FOR_WORKERS_TIMEOUT_S` | `600` | Worker discovery timeout | + +> **Note on `DYN_ROUTER_MODE`**: The startup script passes `--router-mode round-robin` to the +> default frontend, but this is **irrelevant** in our architecture. The frontend's built-in +> router routes to `dynamo.backend.generate`, which is our Processor (not a real backend). +> Our Processor intercepts the request and uses our custom Thompson Sampling router instead. 

## Sample Client Request

```bash
curl -X POST http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "llama-3.3-70b",
    "messages": [{"role": "user", "content": "What is 2+2?"}],
    "max_tokens": 100,
    "stream": true,
    "nvext": {
      "annotations": [
        "prefix_id:math-session-001",
        "total_requests:5",
        "osl:LOW",
        "iat:HIGH"
      ]
    }
  }'
```

## Request Flow (Detailed)

1. **Client → Frontend**: HTTP POST with nvext annotations
2. **Frontend (Preprocessor)**: Tokenizes messages, creates `PreprocessedRequest` with annotations
3. **Frontend (Built-in Router)**: Routes to `dynamo.backend.generate` (round-robin, but only one "backend" - our processor!)
4. **Processor (as backend.generate)**: Receives request, extracts hints from annotations
5. **Processor → Router**: Queries Thompson Sampling router for worker selection
6. **Router**: Computes Thompson Sampling scores, returns worker_id
7. **Processor → Worker**: Sends request to `dynamo.worker.generate` via `engine_client.direct(worker_id)`
8. **Backend → Processor**: Streams response tokens
9. **Processor → Router**: Sends latency feedback for bandit update
10. **Processor → Frontend**: Streams response
11. **Frontend → Client**: SSE stream or JSON response

## Files

- `processor.py` - Custom processor with nvext annotation extraction
- `router.py` - Thompson Sampling router with Prometheus metrics
- `ARCHITECTURE.md` - This document

diff --git a/external/dynamo/optimized/PARAMETERS.md b/external/dynamo/optimized/PARAMETERS.md new file mode 100644 index 0000000000..0d747f9fe6 --- /dev/null +++ b/external/dynamo/optimized/PARAMETERS.md @@ -0,0 +1,181 @@ +# Thompson Sampling Router Parameters + +This document describes all configurable parameters for the `WorkloadAwareRouter` in `router.py`. + +## Configuration Methods + +Parameters can be set via: + +1. **YAML Config File** (`config.yaml`) - All 31 parameters +2. 
**CLI Flags** - 5 flags for common operations: + - `--config` - Path to YAML config file + - `--affinity-base` - Primary stickiness control + - `--temp-base` - Primary exploration control + - `--lints-v` - Exploration variance + - `--override` - Override any config value (repeatable) + +**Precedence:** CLI flags override config file values. + +## Usage Examples + +```bash +# Use config file only +python router.py --config config.yaml + +# Override specific values +python router.py --config config.yaml --affinity-base 0.5 --temp-base 1.5 + +# Override nested values +python router.py --config config.yaml --override load_balancing.gpu_penalty_weight=2.0 + +# Multiple overrides +python router.py --config config.yaml \ + --override switching_cost.base=0.3 \ + --override feedback.timeout_seconds=60 +``` + +--- + +## Parameter Reference + +### Infrastructure + +| Parameter | Config Path | Default | Type | Description | +|-----------|-------------|---------|------|-------------| +| Block Size | `infrastructure.block_size` | 64 | int | KV cache block size for overlap computation | +| Router Type | `infrastructure.router_type` | "kv" | str | Router mode: "kv" (KV-aware) or "kv_load" (load-based only) | +| Min Workers | `infrastructure.min_workers` | 1 | int | Minimum workers required before routing starts | + +### Affinity (Stickiness) + +Controls how strongly the router prefers keeping requests on the same worker for KV cache reuse. + +| Parameter | Config Path | CLI Flag | Default | Type | Description | +|-----------|-------------|----------|---------|------|-------------| +| Base | `affinity.base` | `--affinity-base` | 0.30 | float | Base bonus when staying on same worker. Higher = more sticky. 
| +| Reuse Weight | `affinity.reuse_weight` | `--override` | 0.15 | float | Additional bonus per remaining request in session | +| IAT Weight | `affinity.iat_weight` | `--override` | 0.20 | float | Bonus scaling based on inter-arrival time hint | +| Sticky Load Floor | `affinity.sticky_load_floor` | `--override` | 0.70 | float | Minimum load modifier for sticky decisions (prevents load from overriding stickiness) | + +**Tuning Guide:** +- **High affinity (0.4-0.6):** Prioritize KV cache hits, good for multi-turn conversations +- **Low affinity (0.1-0.2):** Prioritize load balancing, good for independent requests + +### Exploration (Temperature) + +Controls the explore vs exploit tradeoff in worker selection. + +| Parameter | Config Path | CLI Flag | Default | Type | Description | +|-----------|-------------|----------|---------|------|-------------| +| Base TS Weight | `exploration.base_ts_weight` | `--override` | 0.10 | float | Weight for Thompson Sampling exploration term | +| Temp Base | `exploration.temperature.base` | `--temp-base` | 1.0 | float | Base softmax temperature | +| Temp Min | `exploration.temperature.min` | `--override` | 0.15 | float | Minimum temperature (more greedy selection) | +| Temp Max | `exploration.temperature.max` | `--override` | 2.0 | float | Maximum temperature (more random selection) | + +**Tuning Guide:** +- **Low temperature (0.2-0.5):** Greedy, always pick best-scored worker +- **High temperature (1.5-2.0):** More exploration, useful early or with changing workloads + +### Switching Cost + +Penalty applied when moving a prefix session to a different worker. 
+ +| Parameter | Config Path | CLI Flag | Default | Type | Description | +|-----------|-------------|----------|---------|------|-------------| +| Base | `switching_cost.base` | `--override` | 0.20 | float | Base penalty for switching workers | +| Reuse Penalty | `switching_cost.reuse_penalty` | `--override` | 0.08 | float | Additional penalty per remaining request in session | +| IAT Penalty | `switching_cost.iat_penalty` | `--override` | 0.05 | float | Additional penalty based on inter-arrival time | + +**Tuning Guide:** +- **High switching cost (0.3-0.5):** Strongly discourage worker changes mid-session +- **Low switching cost (0.05-0.1):** Allow flexible rebalancing + +### Load Balancing + +Controls how much to penalize workers with high load. + +| Parameter | Config Path | CLI Flag | Default | Type | Description | +|-----------|-------------|----------|---------|------|-------------| +| Queue Penalty | `load_balancing.queue_penalty_weight` | `--override` | 0.50 | float | Weight for pending request queue depth | +| GPU Penalty | `load_balancing.gpu_penalty_weight` | `--override` | 1.00 | float | Weight for GPU KV cache memory usage | +| Outstanding Work | `load_balancing.outstanding_work_weight` | `--override` | 0.45 | float | Weight for outstanding work (expected future load) | +| Job-GPU Coupling | `load_balancing.job_gpu_coupling_weight` | `--override` | 0.40 | float | How much job cost amplifies GPU load penalty | +| Job-Queue Coupling | `load_balancing.job_queue_coupling_weight` | `--override` | 0.20 | float | How much job cost amplifies queue penalty | + +**Tuning Guide:** +- **High GPU penalty (1.5-2.0):** Aggressively avoid memory-constrained workers +- **High queue penalty (0.8-1.0):** Prioritize low-latency routing + +### Prefill Cost Model + +How input sequence length (ISL) contributes to job cost estimation. 
+ +| Parameter | Config Path | CLI Flag | Default | Type | Description | +|-----------|-------------|----------|---------|------|-------------| +| Token Scale | `prefill.token_scale` | `--override` | 1024.0 | float | Normalization denominator for token count | +| Weight | `prefill.weight` | `--override` | 1.0 | float | Weight of prefill cost in total job cost | + +### LinTS Learner + +Parameters controlling the Linear Thompson Sampling algorithm. + +| Parameter | Config Path | CLI Flag | Default | Type | Description | +|-----------|-------------|----------|---------|------|-------------| +| Lambda | `lints.lambda` | `--override` | 1.0 | float | Ridge regression regularization strength | +| V | `lints.v` | `--lints-v` | 0.25 | float | Exploration variance in posterior sampling | +| Forget Rate | `lints.forget_rate` | `--override` | 0.995 | float | Exponential decay for old observations | + +**Tuning Guide:** +- **High V (0.4-0.6):** More exploration, keeps trying alternatives +- **Low V (0.05-0.15):** More exploitation, trusts learned model +- **High forget rate (0.999):** Long memory, slow adaptation +- **Low forget rate (0.95):** Short memory, fast adaptation to changes + +### Feedback Handling + +Controls the delayed reward mechanism. 
+ +| Parameter | Config Path | CLI Flag | Default | Type | Description | +|-----------|-------------|----------|---------|------|-------------| +| Timeout Seconds | `feedback.timeout_seconds` | `--override` | 120.0 | float | Max wait time for feedback before applying timeout penalty | +| Sweep Interval | `feedback.sweep_interval_seconds` | `--override` | 5.0 | float | How often to check for timed-out decisions | +| Timeout Reward | `feedback.timeout_reward` | `--override` | 0.0 | float | Reward for timed-out requests (0.0 = treat as failure) | +| Latency EMA Alpha | `feedback.latency_ema_alpha` | `--override` | 0.2 | float | Smoothing factor for latency baseline EMA | + +### Debug + +| Parameter | Config Path | CLI Flag | Default | Type | Description | +|-----------|-------------|----------|---------|------|-------------| +| Traces Enabled | `debug.traces_enabled` | `--override` | false | bool | Enable detailed decision trace logging | +| Trace Dir | `debug.trace_dir` | `--override` | /tmp/dynamo_router_traces | str | Directory for trace output files | +| Buffer Size | `debug.buffer_size` | `--override` | 2000 | int | In-memory trace buffer size | + +--- + +## Feature Vector (LinTS Input) + +The router uses a 9-dimensional feature vector as input to the LinTS learner: + +| Index | Feature | Source | Description | +|-------|---------|--------|-------------| +| 0 | Bias | Constant | Always 1.0 (intercept term) | +| 1 | Inverse Load | Computed | 1/(1 + gpu_penalty + queue_penalty) | +| 2 | Overlap | KV Indexer | KV cache overlap score [0, 1] | +| 3 | Affinity | State | 1 if same worker as last request, else 0 | +| 4 | Outstanding | State | tanh(0.1 × outstanding_work) | +| 5 | Decode Cost | OSL Hint | decode_cost / 3.0 | +| 6 | Prefill Cost | Computed | tanh(prefill_cost) | +| 7 | IAT Factor | IAT Hint | iat_factor / 1.5 | +| 8 | Reuse Budget | Hint | tanh(0.25 × reuse_budget) | + +--- + +## Learned vs Fixed Parameters + +| Category | Updated At | Examples | 
+|----------|-----------|----------| +| **Learned (runtime)** | Every request | `linA`, `linb` matrices, Beta(α,β) bandits, latency EMAs | +| **Fixed (startup)** | Never | All 31 config parameters above | + +The config parameters are **hyperparameters** that control *how* the router learns, not *what* it learns. + diff --git a/external/dynamo/optimized/README.md b/external/dynamo/optimized/README.md new file mode 100644 index 0000000000..fdb1443939 --- /dev/null +++ b/external/dynamo/optimized/README.md @@ -0,0 +1,273 @@ +# Optimized Thompson Sampling Router Architecture + +This directory contains the optimized implementation of the Thompson Sampling router for Dynamo, using the "Processor-as-Backend" pattern with **Dynamic Discovery** to intercept requests from the default Dynamo frontend. + +## Architecture Overview (Dynamic Discovery Mode) + +``` +┌─────────────────────────────────────────────────────────────────────────┐ +│ Client Request (with nvext.annotations) │ +│ ↓ │ +│ Default Dynamo Frontend (port 8000) │ +│ ↓ tokenization + nvext parsing │ +│ ↓ discovers backends via ETCD ModelWatcher │ +│ ↓ finds Processor's model card! 
│ +│ │ +│ Custom Processor (dynamo.backend.generate-{instance_id}) │ +│ ↓ extracts: prefix_id, total_requests, osl, iat │ +│ ↓ queries Thompson Sampling router │ +│ │ +│ Custom Router (dynamo.router.find_worker) │ +│ ↓ KV overlap + workload-aware selection │ +│ ↓ returns worker_id │ +│ │ +│ Processor forwards to dynamo.worker.generate (with worker_id) │ +│ ↓ │ +│ SGLang Worker (actual inference) │ +│ ↓ │ +│ Response + Feedback to Router │ +└─────────────────────────────────────────────────────────────────────────┘ +``` + +## Components + +| Component | File | Endpoint | Purpose | +|-----------|------|----------|---------| +| Processor | `processor.py` | `dynamo.backend.generate` + ETCD model card | Intercepts frontend requests, extracts hints, coordinates routing | +| Router | `router.py` | `dynamo.router.find_worker` | Thompson Sampling + KV overlap worker selection | +| Config | `config.yaml` | - | Router configuration parameters | + +## Dynamic Discovery Pattern (Forward-Compatible) + +Instead of using the deprecated `--static-endpoint` flag on the frontend, this processor uses **dynamic discovery** via ETCD: + +1. **Processor** registers as `dynamo.backend.generate` (dynamic mode with instance ID) +2. **Processor** calls `register_llm()` to advertise a model card in ETCD +3. **Frontend's ModelWatcher** discovers the processor's model card +4. **Frontend** routes requests to the discovered processor endpoint +5. **SGLang Worker** registers as `dynamo.worker.generate` (also dynamic) + +### Why Dynamic Discovery? + +The `--static-endpoint` flag is **deprecated** and will be removed in future Dynamo versions. 
Dynamic discovery provides: + +- Forward compatibility with future Dynamo releases +- Support for multiple processor instances (load balancing) +- Standard Dynamo discovery patterns +- Dynamic scaling capabilities + +## Processor Registration + +The processor uses `register_llm()` to advertise itself in ETCD: + +```python +@dynamo_worker(static=False) # Dynamic mode for ETCD discovery +async def worker(runtime: DistributedRuntime): + component = runtime.namespace("dynamo").component("backend") + await component.create_service() + + endpoint = component.endpoint("generate") + + # Register model card so frontend can discover us + await register_llm( + model_input=ModelInput.Tokens, + model_type=ModelType.Chat | ModelType.Completions, + endpoint=endpoint, + model_path=args.model_path, + model_name=args.model_name, + ) + + handler = ProcessorRequestHandler(runtime, ...) + await endpoint.serve_endpoint(handler.generate) +``` + +### Required Arguments + +The processor now requires: +- `--model-path`: Path to the model directory (for tokenizer and model card) +- `--model-name`: Served model name (must match frontend's expected model) + +## Usage + +### Starting the System + +```bash +# Set required environment variable +export DYNAMO_MODEL_DIR="/path/to/Llama-3.3-70B-Instruct" + +# Start all components +bash ../start_dynamo_optimized_thompson_hints.sh +``` + +### Making Requests + +```bash +# Basic request (no routing hints) +curl http://localhost:8000/v1/chat/completions \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "llama-3.3-70b", + "messages": [{"role": "user", "content": "Hello!"}], + "max_tokens": 50 + }' + +# Request with nvext annotations (routing hints) +curl http://localhost:8000/v1/chat/completions \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "llama-3.3-70b", + "messages": [{"role": "user", "content": "Hello!"}], + "max_tokens": 50, + "nvext": { + "annotations": [ + "prefix_id:my-session-001", + "total_requests:10", + 
"osl:MEDIUM",
        "iat:LOW"
      ]
    }
  }'
```

### Routing Hint Annotations

| Annotation | Format | Description |
|------------|--------|-------------|
| `prefix_id` | `prefix_id:<string>` | Unique identifier for prefix reuse across requests |
| `total_requests` | `total_requests:<int>` | Expected total requests in this prefix group |
| `osl` | `osl:LOW\|MEDIUM\|HIGH` | Expected output sequence length |
| `iat` | `iat:LOW\|MEDIUM\|HIGH` | Inter-arrival time hint |

---

## Troubleshooting

### Verifying Processor Interception

To confirm that requests are flowing through the processor (not directly to workers), run:

```bash
docker logs dynamo-sglang-optimized 2>&1 | grep -E "(Processor|processor|Processing request|Routing decision|dynamo.backend|backend.generate|find_worker)" | tail -50
```

### Expected Output (Nominal Operation)

When the system is working correctly, you should see output similar to:

```
Step 3: Starting Custom Processor (Registers as backend.generate)...
Processor PID: 3735
Registered at: dynamo.backend.generate (intercepts frontend requests)

INFO processor._init_prometheus_metrics: Prometheus metrics initialized for processor
INFO processor.initialize: Router clients created, waiting for instances...
INFO dynamo_runtime::component::client: wait_for_instances: Found 1 instance(s) for endpoint: dynamo/router/_endpoint_/find_worker
INFO processor.initialize: Router clients initialized successfully
INFO processor.initialize: Engine client created, waiting for worker instances...
INFO processor.initialize: Processor initialized successfully (routing to dynamo.worker.generate)

INFO processor.generate: Processing request: prefix=auto-3f0519ac1cc442d2... total=1 osl=MEDIUM iat=MEDIUM tokens=37
INFO processor.generate: Routing decision: worker=7587892168930944779 decision=bcc5180740ed44c6... reuse_budget=0

INFO processor.generate: Processing request: prefix=auto-2593032a6cf843ce... 
total=1 osl=MEDIUM iat=MEDIUM tokens=37 +INFO processor.generate: Routing decision: worker=7587892168930944779 decision=ba4440fd3a144822... reuse_budget=0 +``` + +### Key Indicators of Success + +| Log Message | Meaning | +|-------------|---------| +| `Registering model card: model_name=...` | Processor registering with ETCD | +| `Model card registered successfully` | Frontend can now discover the processor | +| `Router clients initialized successfully` | Connected to Thompson Sampling router | +| `Processor initialized successfully` | Ready to process requests | +| `Processing request: prefix=... tokens=N` | Request received and being processed | +| `Routing decision: worker=... decision=...` | Router selected a worker | + +### Common Issues + +#### 1. Frontend Not Finding Processor + +**Symptom:** Requests fail or go directly to workers, bypassing processor. + +**Cause:** Model card not registered or model name mismatch. + +**Verification:** +```bash +# Check if processor registered its model card +docker logs dynamo-sglang-optimized 2>&1 | grep -i "model card" + +# Check ETCD for registered models +curl -s http://localhost:2379/v3/kv/range -X POST \ + -H "Content-Type: application/json" \ + -d '{"key":"ZHluYW1v","range_end":"ZHluYW1w"}' | jq . +``` + +**Solution:** +1. Ensure `--model-name` matches between processor and frontend +2. Ensure `--model-path` points to a valid model directory +3. Processor must start BEFORE frontend + +#### 2. "missing field `token_ids`" Error + +**Cause:** Processor couldn't reach workers. + +**Solution:** Ensure workers are registered and running: +```bash +docker logs dynamo-sglang-optimized 2>&1 | grep "worker.generate" +``` + +#### 3. Requests Bypassing Processor + +**Symptom:** No "Processing request" logs, but responses still work. + +**Cause:** Frontend is routing directly to workers instead of through the processor. 
+ +**Verification:** +```bash +# Check if processor is receiving requests +docker logs dynamo-sglang-optimized 2>&1 | grep "Processing request" +``` + +**Solution:** +1. Ensure processor's `--model-name` matches frontend's `--model-name` exactly +2. Processor must register BEFORE frontend starts +3. Check that processor's model card is in ETCD + +#### 4. Router Not Found + +**Symptom:** `Router stream ended without worker_id; falling back to engine load balancing` + +**Cause:** Router not started or not registered. + +**Solution:** Check router logs: +```bash +docker logs dynamo-sglang-optimized 2>&1 | grep -i router +``` + +--- + +## Prometheus Metrics + +| Metric | Description | +|--------|-------------| +| `thompson_processor_requests_total` | Total requests processed | +| `thompson_processor_request_latency_seconds` | Request latency histogram | +| `thompson_processor_tokens_in_total` | Total input tokens | +| `thompson_processor_tokens_out_total` | Total output tokens | +| `thompson_processor_routing_decisions_total` | Routing decisions by worker | +| `thompson_processor_router_errors_total` | Router communication errors | +| `thompson_processor_engine_errors_total` | Backend engine errors | +| `thompson_processor_active_requests` | Currently active requests | + +Access metrics: +```bash +curl http://localhost:8081/metrics | grep thompson_processor +``` + +--- + +## Configuration + +See `config.yaml` for router configuration options and `PARAMETERS.md` for detailed parameter documentation. + diff --git a/external/dynamo/optimized/__init__.py b/external/dynamo/optimized/__init__.py new file mode 100644 index 0000000000..da8420481d --- /dev/null +++ b/external/dynamo/optimized/__init__.py @@ -0,0 +1,28 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Optimized Thompson Sampling Router Architecture. + +This package contains custom Dynamo components that work with the default +Dynamo frontend, using nvext.annotations for routing hints and Prometheus +for metrics. + +Components: + - processor.py: Custom processor with nvext annotation extraction + - router.py: Thompson Sampling router with Prometheus metrics + +See ARCHITECTURE.md for detailed documentation. +""" + diff --git a/external/dynamo/optimized/config.yaml b/external/dynamo/optimized/config.yaml new file mode 100644 index 0000000000..2c6dbe8f59 --- /dev/null +++ b/external/dynamo/optimized/config.yaml @@ -0,0 +1,73 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Thompson Sampling Router Configuration +# ====================================== +# +# This file contains all configurable parameters for the WorkloadAwareRouter. +# Parameters can be overridden via CLI flags (see PARAMETERS.md for details). 
+# +# CLI Override Examples: +# python router.py --config config.yaml --affinity-base 0.5 +# python router.py --config config.yaml --override affinity.reuse_weight=0.2 +# + +# Infrastructure settings +infrastructure: + block_size: 64 # KV cache block size for overlap computation + router_type: kv # Router type: "kv" (KV-aware) or "kv_load" (load-based) + min_workers: 1 # Minimum workers required before routing starts + +# Affinity settings - controls stickiness to same worker for prefix reuse +affinity: + base: 0.30 # Base affinity bonus when staying on same worker (CLI: --affinity-base) + reuse_weight: 0.15 # Additional bonus per remaining request in session + iat_weight: 0.20 # Bonus scaling based on inter-arrival time + sticky_load_floor: 0.70 # Minimum load modifier for sticky decisions + +# Exploration settings - controls explore vs exploit tradeoff +exploration: + base_ts_weight: 0.10 # Weight for Thompson Sampling exploration term + temperature: + base: 1.0 # Base softmax temperature (CLI: --temp-base) + min: 0.15 # Minimum temperature (more greedy) + max: 2.0 # Maximum temperature (more random) + +# Switching cost - penalty for moving prefix to different worker +switching_cost: + base: 0.20 # Base penalty for switching workers + reuse_penalty: 0.08 # Additional penalty per remaining request + iat_penalty: 0.05 # Additional penalty based on inter-arrival time + +# Load balancing - how much to penalize busy workers +load_balancing: + queue_penalty_weight: 0.50 # Weight for queue depth penalty + gpu_penalty_weight: 1.00 # Weight for GPU memory usage penalty + outstanding_work_weight: 0.45 # Weight for outstanding work penalty + job_gpu_coupling_weight: 0.40 # Coupling between job cost and GPU load + job_queue_coupling_weight: 0.20 # Coupling between job cost and queue depth + +# Prefill cost model - how to weight input sequence length +prefill: + token_scale: 1024.0 # Normalization scale for token count + weight: 1.0 # Weight of prefill cost in total job cost 
+ +# LinTS (Linear Thompson Sampling) learner parameters +lints: + lambda: 1.0 # Ridge regression regularization strength + v: 0.25 # Exploration variance in posterior sampling (CLI: --lints-v) + forget_rate: 0.995 # Exponential decay for old observations (0.995 = slow forget) + +# Feedback handling - delayed reward processing +feedback: + timeout_seconds: 120.0 # Seconds to wait for feedback before timeout penalty + sweep_interval_seconds: 5.0 # How often to check for timed-out decisions + timeout_reward: 0.0 # Reward assigned to timed-out decisions (0.0 = bad) + latency_ema_alpha: 0.2 # EMA smoothing factor for latency baselines + +# Debug settings +debug: + traces_enabled: false # Enable debug trace logging + trace_dir: /tmp/dynamo_router_traces # Directory for trace files + buffer_size: 2000 # In-memory trace buffer size + diff --git a/external/dynamo/optimized/processor.py b/external/dynamo/optimized/processor.py new file mode 100644 index 0000000000..9a91759604 --- /dev/null +++ b/external/dynamo/optimized/processor.py @@ -0,0 +1,564 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Optimized Processor for Thompson Sampling Router Architecture. 
+ +This processor uses the "Processor-as-Backend" pattern with DYNAMIC DISCOVERY +to intercept requests from the default Dynamo frontend and apply custom Thompson +Sampling routing. + +## Dynamic Discovery Mode (Forward-Compatible) + +Instead of using the deprecated `--static-endpoint` flag on the frontend, this +processor registers a model card in ETCD so the frontend can discover it via +its ModelWatcher. This is the forward-compatible approach. + +### Requirements: +- Processor must be started with `--model-path` and `--model-name` arguments +- Model path must point to a valid model directory with tokenizer files +- Model name must match what the frontend expects (e.g., "llama-3.3-70b") + +### Endpoint Registration Pattern + +1. **This Processor registers as `dynamo.backend.generate`** - Dynamically with instance ID +2. **Processor calls `register_llm()`** - Advertises model card in ETCD +3. **Frontend's ModelWatcher discovers us** - Routes requests to our endpoint +4. **SGLang Worker registers as `dynamo.worker.generate`** - We forward to actual workers + +## Request Flow + +``` +Frontend (discovers backends via ETCD ModelWatcher) + → routes to dynamo.backend.generate-{instance_id} + → THIS PROCESSOR (discovered via model card!) 
+ → extracts hints from nvext annotations + → queries Thompson Sampling router → worker_id + → forwards to dynamo.worker.generate (actual SGLang workers) +``` + +Key differences from generalized/processor.py: +- Uses dynamic discovery (no --static-endpoint on frontend) +- Registers model card via register_llm() for ETCD discovery +- Registers as `dynamo.backend.generate` (not `dynamo.processor.process`) +- Forwards to `dynamo.worker.generate` (not `dynamo.backend.generate`) +- Receives PreprocessedRequest instead of ChatCompletionRequest +- Extracts hints from nvext annotations (prefix_id:value format) +- Uses Prometheus metrics instead of CSV logging +- No tokenization (handled by frontend preprocessor) +""" + +import argparse +import asyncio +import logging +import os +import time +import uuid +from collections.abc import AsyncIterator +from typing import Any + +import uvloop +from dynamo.runtime import DistributedRuntime +from dynamo.runtime import dynamo_worker +from dynamo.runtime.logging import configure_dynamo_logging +from dynamo.llm import ModelInput, ModelType, register_llm +from pydantic import BaseModel + +# Prometheus metrics - import lazily to ensure proper multiprocess setup +_prometheus_initialized = False +_metrics = {} + +configure_dynamo_logging() +logger = logging.getLogger(__name__) + + +def _init_prometheus_metrics(): + """Initialize Prometheus metrics lazily.""" + global _prometheus_initialized, _metrics + if _prometheus_initialized: + return _metrics + + try: + from prometheus_client import Counter, Histogram, Gauge, CollectorRegistry, REGISTRY + + _metrics["requests_total"] = Counter( + "thompson_processor_requests_total", + "Total requests processed by the Thompson Sampling processor", + registry=REGISTRY, + ) + _metrics["request_latency"] = Histogram( + "thompson_processor_request_latency_seconds", + "Request latency in seconds", + buckets=[0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, 120.0], + registry=REGISTRY, + ) + 
_metrics["tokens_in"] = Counter( + "thompson_processor_tokens_in_total", + "Total input tokens processed", + registry=REGISTRY, + ) + _metrics["tokens_out"] = Counter( + "thompson_processor_tokens_out_total", + "Total output tokens generated", + registry=REGISTRY, + ) + _metrics["routing_decisions"] = Counter( + "thompson_processor_routing_decisions_total", + "Routing decisions by worker", + ["worker_id"], + registry=REGISTRY, + ) + _metrics["router_errors"] = Counter( # i.e errors when picking a worker + "thompson_processor_router_errors_total", + "Router communication errors", + registry=REGISTRY, + ) + _metrics["engine_errors"] = Counter( # i.e errors when streaming from the engine + "thompson_processor_engine_errors_total", + "Backend engine errors", + registry=REGISTRY, + ) + _metrics["active_requests"] = Gauge( + "thompson_processor_active_requests", + "Currently active requests being processed", + registry=REGISTRY, + ) + _prometheus_initialized = True + logger.info("Prometheus metrics initialized for processor") + except ImportError: + logger.warning("prometheus_client not available, metrics disabled") + _prometheus_initialized = True # Don't retry + + return _metrics + + +# ----------------------- request / response models ----------------------- # +class RouterRequest(BaseModel): + """Request to the Thompson Sampling router.""" + tokens: list[int] + prefix_id: str = "" + reuse_budget: int = 0 # remaining *after this request* + expected_osl: str | None = "MEDIUM" + interarrival: str | None = "MEDIUM" + + +class RouterFeedbackRequest(BaseModel): + """Feedback to the router after request completion.""" + decision_id: str + latency_ms: float + success: bool | None = True + tokens_in: int | None = None + tokens_out: int | None = None + finish_reason: str | None = None + + +# -------------------------- processor handler -------------------------- # +class ProcessorRequestHandler: + """ + Processor that receives PreprocessedRequest from the default Dynamo 
frontend, + extracts routing hints from nvext annotations, and coordinates with the + Thompson Sampling router for intelligent worker selection. + """ + + def __init__( + self, + runtime: DistributedRuntime, + enable_router: bool = True, + ): + self.runtime = runtime + self.enable_router = enable_router + + self.router_pick_client = None + self.router_feedback_client = None + self.engine_client = None + + # Prefix-level state: {prefix_id: {"total": int, "processed": int}} + self._prefix_state: dict[str, dict[str, int]] = {} + self._prefix_lock = asyncio.Lock() + + # Prometheus metrics + self._metrics = {} + + async def initialize(self): + """Initialize processor by connecting to router and backend.""" + # Initialize Prometheus metrics + self._metrics = _init_prometheus_metrics() + + if self.enable_router: + ns = self.runtime.namespace("dynamo").component("router") + self.router_pick_client = await ns.endpoint("find_worker").client() + self.router_feedback_client = await ns.endpoint("feedback").client() + logger.info("Router clients created, waiting for instances...") + await self.router_pick_client.wait_for_instances() + logger.info("Router clients initialized successfully") + + # Engine client - connects to actual workers at dynamo.worker.generate + # (We register as "backend" to intercept frontend requests, but actual SGLang + # workers register as "worker" so we can forward to them after routing) + self.engine_client = await self.runtime.namespace("dynamo").component("worker").endpoint("generate").client() + logger.info("Engine client created, waiting for worker instances...") + await self.engine_client.wait_for_instances() + logger.info("Processor initialized successfully (routing to dynamo.worker.generate)") + + # ---- annotation extraction ---- + @staticmethod + def _extract_annotation(annotations: list[str], key: str, default: str | None = None) -> str | None: + """Extract value from annotations list (format: 'key:value').""" + prefix = f"{key}:" + for ann 
in annotations: + if ann.startswith(prefix): + return ann[len(prefix):] + return default + + def _extract_hints(self, request: dict[str, Any]) -> tuple[str, int, str, str]: + """ + Extract routing hints from PreprocessedRequest annotations. + + Returns: (prefix_id, total_requests, osl, iat) + """ + annotations = request.get("annotations", []) + if not isinstance(annotations, list): + annotations = [] + + # Extract from annotations + prefix_id = self._extract_annotation(annotations, "prefix_id") + if not prefix_id: + prefix_id = f"auto-{uuid.uuid4().hex}" + + total_str = self._extract_annotation(annotations, "total_requests", "1") + try: + total_requests = max(1, int(total_str)) + except (ValueError, TypeError): + total_requests = 1 + + osl = self._extract_annotation(annotations, "osl", "MEDIUM") + osl = osl.upper() if osl else "MEDIUM" + if osl not in ("LOW", "MEDIUM", "HIGH"): + osl = "MEDIUM" + + iat = self._extract_annotation(annotations, "iat", "MEDIUM") + iat = iat.upper() if iat else "MEDIUM" + if iat not in ("LOW", "MEDIUM", "HIGH"): + iat = "MEDIUM" + + return prefix_id, total_requests, osl, iat + + async def _update_prefix_state(self, prefix_id: str, total_requests: int) -> int: + """ + Updates prefix counters and returns remaining_after (reuse_budget). 
+ """ + async with self._prefix_lock: + s = self._prefix_state.get(prefix_id) + if s is None: + s = {"total": total_requests, "processed": 0} + self._prefix_state[prefix_id] = s + else: + s["total"] = max(s["total"], total_requests) + + s["processed"] += 1 + remaining_after = max(s["total"] - s["processed"], 0) + + if remaining_after == 0: + # Drop state immediately when finished + self._prefix_state.pop(prefix_id, None) + + return remaining_after + + async def _pick_worker( + self, + token_ids: list[int], + prefix_id: str, + reuse_budget: int, + osl: str, + iat: str, + ) -> tuple[int | None, str | None]: + """Pick a worker via the router.""" + if not self.router_pick_client: + return None, None + + req = RouterRequest( + tokens=token_ids, + prefix_id=prefix_id, + reuse_budget=max(int(reuse_budget), 0), + expected_osl=osl, + interarrival=iat, + ) + try: + stream = await self.router_pick_client.generate(req.model_dump()) + + worker_id: int | None = None + decision_id: str | None = None + async for chunk in stream: + data = chunk.data() + if "error" in data: + logger.error("Router error: %s", data["error"]) + if self._metrics.get("router_errors"): + self._metrics["router_errors"].inc() + break + wid = data.get("worker_id", -1) + if wid == -1: + break + worker_id = int(wid) + decision_id = data.get("decision_id") + break + + if worker_id is not None and self._metrics.get("routing_decisions"): + self._metrics["routing_decisions"].labels(worker_id=str(worker_id)).inc() + + if worker_id is None: + logger.warning("Router stream ended without worker_id; falling back to engine load balancing.") + + return worker_id, decision_id + + except Exception as e: + logger.error("Failed to pick worker: %s", e) + if self._metrics.get("router_errors"): + self._metrics["router_errors"].inc() + return None, None + + async def _send_feedback_safely( + self, + decision_id: str | None, + latency_ms: float, + success: bool, + tokens_in: int, + tokens_out: int, + finish_reason: str | None, + 
): + """Send feedback to router (fire-and-forget style).""" + if not decision_id or not self.router_feedback_client: + return + try: + fb = RouterFeedbackRequest( + decision_id=decision_id, + latency_ms=float(latency_ms), + success=bool(success), + tokens_in=int(tokens_in), + tokens_out=int(tokens_out), + finish_reason=finish_reason or "", + ) + stream = await self.router_feedback_client.generate(fb.model_dump()) + async for _ in stream: + pass + except Exception: + logger.exception("Failed to send router feedback") + + async def _stream_from_engine( + self, + request: dict[str, Any], + worker_id: int | None, + decision_id: str | None, + tokens_in: int, + ) -> AsyncIterator[dict[str, Any]]: + """ + Stream response from the backend engine. + Yields response chunks and sends feedback on completion. + """ + t0 = time.perf_counter() + tokens_out = 0 + finish_reason: str | None = None + + try: + # Route to specific worker or use engine's load balancing + if worker_id is not None: + stream = await self.engine_client.direct(request, worker_id) + else: + stream = await self.engine_client.generate(request) + + async for chunk in stream: + data = chunk.data() + + if "error" in data: + latency_ms = (time.perf_counter() - t0) * 1000.0 + await self._send_feedback_safely( + decision_id, latency_ms, False, tokens_in, tokens_out, "error" + ) + if self._metrics.get("engine_errors"): + self._metrics["engine_errors"].inc() + yield {"error": data["error"]} + return + + # Count output tokens + if "token_ids" in data and isinstance(data["token_ids"], list): + tokens_out += len(data["token_ids"]) + + # Pass through the chunk + yield data + + if "finish_reason" in data and data["finish_reason"] is not None: + finish_reason = data["finish_reason"] + latency_ms = (time.perf_counter() - t0) * 1000.0 + + # Send feedback + await self._send_feedback_safely( + decision_id, latency_ms, True, tokens_in, tokens_out, finish_reason + ) + + # Update metrics + if self._metrics.get("request_latency"): + 
self._metrics["request_latency"].observe(latency_ms / 1000.0) + if self._metrics.get("tokens_in"): + self._metrics["tokens_in"].inc(tokens_in) + if self._metrics.get("tokens_out"): + self._metrics["tokens_out"].inc(tokens_out) + + return + + except Exception as e: + latency_ms = (time.perf_counter() - t0) * 1000.0 + await self._send_feedback_safely( + decision_id, latency_ms, False, tokens_in, tokens_out, "exception" + ) + if self._metrics.get("engine_errors"): + self._metrics["engine_errors"].inc() + logger.exception("Engine stream exception") + yield {"error": str(e)} + return + + # ---- main generation ---- + async def generate(self, raw: dict[str, Any]): + """ + Processor endpoint: receives PreprocessedRequest from frontend. + + Expected format (from Dynamo preprocessor): + { + "token_ids": [...], + "annotations": ["prefix_id:xyz", "total_requests:10", ...], + "sampling_options": {...}, + "stop_conditions": {...}, + ... + } + """ + # Track active requests + if self._metrics.get("active_requests"): + self._metrics["active_requests"].inc() + + try: + # Increment request counter + if self._metrics.get("requests_total"): + self._metrics["requests_total"].inc() + + # Extract routing hints from annotations + prefix_id, total_requests, osl, iat = self._extract_hints(raw) + + # Get token IDs from preprocessed request + token_ids = raw.get("token_ids", []) + if not isinstance(token_ids, list): + token_ids = [] + + tokens_in = len(token_ids) + logger.info( + "Processing request: prefix=%s total=%d osl=%s iat=%s tokens=%d", + prefix_id, total_requests, osl, iat, tokens_in + ) + + # Compute reuse_budget := remaining AFTER this request + reuse_budget = await self._update_prefix_state(prefix_id, total_requests) + + # Pick worker via router + worker_id, decision_id = await self._pick_worker( + token_ids, prefix_id, reuse_budget, osl, iat + ) + + logger.info( + "Routing decision: worker=%s decision=%s reuse_budget=%d", + worker_id, decision_id, reuse_budget + ) + + # Stream 
from engine + async for resp in self._stream_from_engine(raw, worker_id, decision_id, tokens_in): + yield resp + + finally: + if self._metrics.get("active_requests"): + self._metrics["active_requests"].dec() + + +# -------------------------- worker entry point -------------------------- # +def parse_args(): + p = argparse.ArgumentParser(description="Optimized Thompson Sampling Processor") + p.add_argument( + "--enable-router", + action="store_true", + default=True, + help="Enable Thompson Sampling router integration", + ) + p.add_argument( + "--no-router", + action="store_false", + dest="enable_router", + help="Disable router (use engine load balancing only)", + ) + p.add_argument( + "--model-path", + type=str, + required=True, + help="Path to the model directory (for loading tokenizer and model card)", + ) + p.add_argument( + "--model-name", + type=str, + required=True, + help="Served model name (must match frontend's --model-name)", + ) + return p.parse_args() + + +@dynamo_worker(static=False) # Dynamic mode for ETCD discovery by frontend +async def worker(runtime: DistributedRuntime): + args = parse_args() + + # DYNAMIC DISCOVERY MODE: + # Instead of using --static-endpoint on the frontend, we register a model card + # in ETCD so the frontend can discover us via its ModelWatcher. + # + # This is the forward-compatible approach since --static-endpoint is deprecated. + # + # Flow: + # 1. We register as dynamo.backend.generate (dynamically with instance ID) + # 2. We call register_llm() to advertise ourselves in ETCD + # 3. Frontend's ModelWatcher discovers us and routes requests to us + # 4. 
We forward to actual workers at dynamo.worker.generate + + component = runtime.namespace("dynamo").component("backend") + await component.create_service() + + # Create the endpoint FIRST (needed for register_llm) + endpoint = component.endpoint("generate") + + # Register the model card with ETCD so the frontend can discover us + # We accept preprocessed tokens (ModelInput.Tokens) and serve chat/completions + logger.info(f"Registering model card: model_name={args.model_name}, model_path={args.model_path}") + await register_llm( + model_input=ModelInput.Tokens, # We accept tokenized input from frontend + model_type=ModelType.Chat | ModelType.Completions, # Chat and completions endpoints + endpoint=endpoint, + model_path=args.model_path, + model_name=args.model_name, + ) + logger.info("Model card registered successfully - frontend can now discover us via ETCD") + + # Initialize the request handler + # Note: We use the same runtime for both serving AND client connections now, + # since we're fully dynamic. The runtime will discover workers dynamically. + handler = ProcessorRequestHandler(runtime, enable_router=args.enable_router) + await handler.initialize() + + # Serve as "backend.generate" - frontend will route to us after ETCD discovery + await endpoint.serve_endpoint(handler.generate) + + +if __name__ == "__main__": + uvloop.install() + asyncio.run(worker()) # pylint: disable=no-value-for-parameter diff --git a/external/dynamo/optimized/router.py b/external/dynamo/optimized/router.py new file mode 100644 index 0000000000..8d032d4dc9 --- /dev/null +++ b/external/dynamo/optimized/router.py @@ -0,0 +1,1375 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Optimized Thompson Sampling Router with Prometheus Metrics. + +This router implements Contextual Thompson Sampling with: + - KV overlap locality + - Remaining per-prefix requests (reuse_budget) + - OSL-based decode cost, ISL/prefill cost per worker + - IAT-based stickiness/opportunity weighting + - Instant & outstanding load (no TTL decay) + - Delayed bandit update using observed latency via `feedback` endpoint + - Timeout penalty for missing feedback + - Prometheus metrics (instead of CSV) + - Debug traces for offline analysis + +Key differences from generalized/router.py: + - Uses Prometheus metrics instead of CSV logging + - Removed CSV file I/O + - Added comprehensive Prometheus gauges, counters, and histograms +""" + +import argparse +import asyncio +import json +import logging +import math +import os +import random +import threading +import time +import uuid +from collections import deque +from functools import wraps +from pathlib import Path +from typing import Any + +import numpy as np +import uvloop +import yaml +from dynamo.runtime import DistributedRuntime +from dynamo.runtime import dynamo_worker +from dynamo.runtime.logging import configure_dynamo_logging +from pydantic import BaseModel + +# Try to import KV routing classes from dynamo.llm, fallback to stubs if unavailable +try: + from dynamo.llm import KvIndexer + from dynamo.llm import OverlapScores +except ImportError: + logger_init = logging.getLogger(__name__) + logger_init.warning("dynamo.llm KV classes not available, using fallback implementations") + + class OverlapScores: + 
"""Fallback: KV cache overlap scores between a request and workers.""" + + def __init__(self, scores: dict[int, float] | None = None): + self.scores = scores if scores is not None else {} + + class KvIndexer: + """Fallback: KV cache indexer for finding overlap between requests and workers.""" + + def __init__(self, engine: Any, block_size: int): + self.engine = engine + self.block_size = block_size + + async def find_matches_for_request(self, tokens: list[int], min_overlap: int) -> OverlapScores: + """Find overlap scores for each worker. Returns empty scores (round-robin fallback).""" + return OverlapScores({}) + + +configure_dynamo_logging() +logger = logging.getLogger(__name__) + +WorkerId = int + + +# ---------------------- config loading ---------------------- # +def get_default_config_path() -> Path: + """Get path to default config.yaml in the same directory as this script.""" + return Path(__file__).parent / "config.yaml" + + +def load_config(config_path: str | Path | None = None) -> dict[str, Any]: + """Load configuration from YAML file. + + Args: + config_path: Path to YAML config file. If None, uses default config.yaml. + + Returns: + Configuration dictionary with nested structure. 
+ """ + if config_path is None: + config_path = get_default_config_path() + + config_path = Path(config_path) + if not config_path.exists(): + logger.warning("Config file not found: %s, using built-in defaults", config_path) + return get_builtin_defaults() + + with open(config_path, encoding="utf-8") as f: + config = yaml.safe_load(f) + + logger.info("Loaded config from: %s", config_path) + return config + + +def get_builtin_defaults() -> dict[str, Any]: + """Return built-in default configuration (matches config.yaml).""" + return { + "infrastructure": { + "block_size": 64, + "router_type": "kv", + "min_workers": 1, + }, + "affinity": { + "base": 0.30, + "reuse_weight": 0.15, + "iat_weight": 0.20, + "sticky_load_floor": 0.70, + }, + "exploration": { + "base_ts_weight": 0.10, + "temperature": { + "base": 1.0, + "min": 0.15, + "max": 2.0, + }, + }, + "switching_cost": { + "base": 0.20, + "reuse_penalty": 0.08, + "iat_penalty": 0.05, + }, + "load_balancing": { + "queue_penalty_weight": 0.50, + "gpu_penalty_weight": 1.00, + "outstanding_work_weight": 0.45, + "job_gpu_coupling_weight": 0.40, + "job_queue_coupling_weight": 0.20, + }, + "prefill": { + "token_scale": 1024.0, + "weight": 1.0, + }, + "lints": { + "lambda": 1.0, + "v": 0.25, + "forget_rate": 0.995, + }, + "feedback": { + "timeout_seconds": 120.0, + "sweep_interval_seconds": 5.0, + "timeout_reward": 0.0, + "latency_ema_alpha": 0.2, + }, + "debug": { + "traces_enabled": False, + "trace_dir": "/tmp/dynamo_router_traces", + "buffer_size": 2000, + }, + } + + +def get_nested(config: dict, dotted_key: str, default: Any = None) -> Any: + """Get a nested value from config using dot notation. + + Args: + config: Configuration dictionary + dotted_key: Key in dot notation, e.g., "affinity.base" + default: Default value if key not found + + Returns: + Value at the nested key, or default if not found. 
+ """ + keys = dotted_key.split(".") + obj = config + for k in keys: + if not isinstance(obj, dict) or k not in obj: + return default + obj = obj[k] + return obj + + +def set_nested(config: dict, dotted_key: str, value: Any) -> None: + """Set a nested value in config using dot notation. + + Args: + config: Configuration dictionary (modified in place) + dotted_key: Key in dot notation, e.g., "affinity.base" + value: Value to set + """ + keys = dotted_key.split(".") + obj = config + for k in keys[:-1]: + if k not in obj: + obj[k] = {} + obj = obj[k] + obj[keys[-1]] = value + + +def auto_cast(value_str: str) -> Any: + """Auto-cast a string value to appropriate type. + + Args: + value_str: String value from CLI + + Returns: + Value cast to int, float, bool, or str as appropriate. + """ + # Boolean + if value_str.lower() in ("true", "yes", "1"): + return True + if value_str.lower() in ("false", "no", "0"): + return False + + # Integer + try: + return int(value_str) + except ValueError: + pass + + # Float + try: + return float(value_str) + except ValueError: + pass + + # String + return value_str + + +def apply_cli_overrides(config: dict, args: argparse.Namespace) -> dict: + """Apply CLI argument overrides to configuration. + + Args: + config: Base configuration dictionary + args: Parsed CLI arguments + + Returns: + Configuration with CLI overrides applied. 
+ """ + # Apply explicit CLI flags + if args.affinity_base is not None: + set_nested(config, "affinity.base", args.affinity_base) + logger.info("CLI override: affinity.base = %s", args.affinity_base) + + if args.temp_base is not None: + set_nested(config, "exploration.temperature.base", args.temp_base) + logger.info("CLI override: exploration.temperature.base = %s", args.temp_base) + + if args.lints_v is not None: + set_nested(config, "lints.v", args.lints_v) + logger.info("CLI override: lints.v = %s", args.lints_v) + + # Apply generic --override flags + if args.override: + for override in args.override: + if "=" not in override: + logger.warning("Invalid override format (expected key=value): %s", override) + continue + key, value_str = override.split("=", 1) + value = auto_cast(value_str) + set_nested(config, key, value) + logger.info("CLI override: %s = %s", key, value) + + return config + + +# Prometheus metrics - initialized lazily +_prometheus_initialized = False +_metrics = {} + + +def _init_prometheus_metrics(): + """Initialize Prometheus metrics lazily.""" + global _prometheus_initialized, _metrics + if _prometheus_initialized: + return _metrics + + try: + from prometheus_client import Counter, Histogram, Gauge, REGISTRY + + _metrics["decisions_total"] = Counter( + "thompson_router_decisions_total", + "Total routing decisions by worker", + ["worker_id"], + registry=REGISTRY, + ) + _metrics["kv_overlap"] = Gauge( + "thompson_router_kv_overlap", + "KV cache overlap score for last decision by worker", + ["worker_id"], + registry=REGISTRY, + ) + _metrics["feedback_latency"] = Histogram( + "thompson_router_feedback_latency_seconds", + "Latency from feedback by worker", + ["worker_id"], + buckets=[0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, 120.0], + registry=REGISTRY, + ) + _metrics["reward"] = Gauge( + "thompson_router_reward", + "Last computed reward by worker", + ["worker_id"], + registry=REGISTRY, + ) + _metrics["pending_decisions"] = Gauge( + 
"thompson_router_pending_decisions", + "Number of pending decisions awaiting feedback", + registry=REGISTRY, + ) + _metrics["timeout_penalties"] = Counter( + "thompson_router_timeout_penalties_total", + "Total timeout penalties applied", + registry=REGISTRY, + ) + _metrics["sticky_decisions"] = Counter( + "thompson_router_sticky_decisions_total", + "Decisions that stayed on the same worker (sticky)", + registry=REGISTRY, + ) + _metrics["switch_decisions"] = Counter( + "thompson_router_switch_decisions_total", + "Decisions that switched to a different worker", + registry=REGISTRY, + ) + _metrics["beta_alpha"] = Gauge( + "thompson_router_beta_alpha", + "Beta distribution alpha parameter by worker", + ["worker_id"], + registry=REGISTRY, + ) + _metrics["beta_beta"] = Gauge( + "thompson_router_beta_beta", + "Beta distribution beta parameter by worker", + ["worker_id"], + registry=REGISTRY, + ) + _metrics["prefix_state_size"] = Gauge( + "thompson_router_prefix_state_size", + "Number of active prefix states", + registry=REGISTRY, + ) + _metrics["reuse_budget"] = Histogram( + "thompson_router_reuse_budget", + "Distribution of reuse_budget values", + buckets=[0, 1, 2, 5, 10, 20, 50, 100], + registry=REGISTRY, + ) + _metrics["tokens_per_request"] = Histogram( + "thompson_router_tokens_per_request", + "Distribution of input token counts", + buckets=[32, 64, 128, 256, 512, 1024, 2048, 4096, 8192], + registry=REGISTRY, + ) + _prometheus_initialized = True + logger.info("Prometheus metrics initialized for router") + except ImportError: + logger.warning("prometheus_client not available, metrics disabled") + _prometheus_initialized = True # Don't retry + + return _metrics + + +# ---------------------- request / response models ---------------------- # +class RouterRequest(BaseModel): + tokens: list[int] + prefix_id: str = "" + reuse_budget: int = 0 # remaining *after this request* + expected_osl: str | None = "MEDIUM" + interarrival: str | None = "MEDIUM" + + +class 
RouterResponse(BaseModel):
    """Routing decision: chosen worker plus the decision id used for feedback."""
    worker_id: int
    prefix_hit_rate: float
    decision_id: str | None = None


class FeedbackRequest(BaseModel):
    """Ex-post feedback from the processor once a request completes."""
    decision_id: str
    latency_ms: float
    success: bool | None = True
    tokens_in: int | None = None
    tokens_out: int | None = None
    finish_reason: str | None = None


class FeedbackAck(BaseModel):
    """Acknowledgement for a feedback update (reward and baseline used)."""
    ok: bool
    used_baseline: float
    reward: float
    worker_id: int | None = None
    error: str | None = None


# ---------------------- helper decorator ---------------------- #
def safe_update(lock_name: str):
    """Decorator: run the wrapped method while holding `self.<lock_name>`."""
    def decorator(fn):
        @wraps(fn)
        def wrapper(self, *args, **kwargs):
            lock = getattr(self, lock_name)
            with lock:
                return fn(self, *args, **kwargs)
        return wrapper
    return decorator


# ---------------------- router implementation ---------------------- #
class WorkloadAwareRouter:
    """
    Contextual Thompson Sampling router with Prometheus metrics.
    """

    def __init__(
        self,
        runtime: DistributedRuntime,
        block_size: int = 64,
        router_type: str = "kv",
        min_workers: int = 1,
        # Affinity / exploration
        affinity_base: float = 0.30,
        affinity_reuse_weight: float = 0.15,
        affinity_iat_weight: float = 0.20,
        base_ts_weight: float = 0.10,
        sticky_load_floor: float = 0.70,
        # Softmax temperature
        temp_base: float = 1.0,
        temp_min: float = 0.15,
        temp_max: float = 2.0,
        # Switching cost
        switch_cost_base: float = 0.20,
        switch_cost_reuse: float = 0.08,
        switch_cost_iat: float = 0.05,
        # Load / opportunity cost
        queue_penalty_weight: float = 0.50,
        gpu_penalty_weight: float = 1.00,
        outstanding_work_weight: float = 0.45,
        job_gpu_coupling_weight: float = 0.40,
        job_queue_coupling_weight: float = 0.20,
        # Prefill / ISL
        prefill_token_scale: float = 1024.0,
        prefill_weight: float = 1.0,
        # LinTS
        lints_lambda: float = 1.0,
        lints_v: float = 0.25,
        lints_forget: float = 0.995,
        # ---------- Feedback timeout / sweep ----------
        feedback_timeout_seconds: float = 120.0,
        pending_sweep_interval_seconds: float = 5.0,
        timeout_reward: float = 0.0,
        # ---------- Latency EMA (reward normalization) ----------
        latency_ema_alpha: float = 0.2,
        # ---------- Debug traces ----------
        debug_traces: bool = False,
        debug_trace_dir: str = "/tmp/dynamo_router_traces",
        debug_buffer_size: int = 2000,
    ):
        self.runtime = runtime
        self.block_size = block_size
        self.router_type = router_type
        self.min_workers = min_workers

        # clients / helpers (initialized later in initialize())
        self.engine_client = None
        self.indexer: KvIndexer | None = None

        # concurrency primitives — one lock per mutable state family
        self._init_lock = threading.Lock()
        self._bandit_lock = threading.Lock()
        self._prefix_lock = threading.Lock()
        self._lin_lock = threading.Lock()
        self._pending_lock = threading.Lock()

        # prefix state: pid -> {"worker": int|None, "reuse_remaining": int}
        self.prefix_cache_state: dict[str, dict[str, int | None]] = {}
        # pid -> {"decode_cost","prefill_cost","iat_factor"}
        self.prefix_meta: dict[str, dict[str, float]] = {}

        # Beta bandits and LinTS params
        self.worker_bandits: dict[int, tuple[float, float]] = {}
        self.feature_dim = 9
        self.lin_lambda = float(lints_lambda)
        self.lin_v = float(lints_v)
        self.lin_forget = float(lints_forget)
        # clamp forgetting rate into (0, 1) so the A/b decay stays stable
        self.lin_forget = max(1e-6, min(self.lin_forget, 0.999999))
        self.linA: dict[int, np.ndarray] = {}
        self.linb: dict[int, np.ndarray] = {}

        # knobs (cast to float defensively; config values may arrive as str/int)
        self.affinity_base = float(affinity_base)
        self.affinity_reuse_weight = float(affinity_reuse_weight)
        self.affinity_iat_weight = float(affinity_iat_weight)
        self.base_ts_weight = float(base_ts_weight)
        self.sticky_load_floor = float(sticky_load_floor)
        self.temp_base = float(temp_base)
        self.temp_min = float(temp_min)
        self.temp_max = float(temp_max)
        self.switch_cost_base = float(switch_cost_base)
        self.switch_cost_reuse = float(switch_cost_reuse)
        self.switch_cost_iat = float(switch_cost_iat)
        self.queue_penalty_weight = float(queue_penalty_weight)
        self.gpu_penalty_weight = float(gpu_penalty_weight)
        self.outstanding_work_weight = float(outstanding_work_weight)
        self.job_gpu_coupling_weight = float(job_gpu_coupling_weight)
        self.job_queue_coupling_weight = float(job_queue_coupling_weight)
        self.prefill_token_scale = float(prefill_token_scale)
        self.prefill_weight = float(prefill_weight)

        # LinTS numerics: jitter schedule for the Cholesky retry loop
        self._jt_base = 1e-9
        self._jt_mult = 10.0
        self._jt_max = 1e-3
        self._eig_floor = 1e-10

        # Feedback timeout / sweep
        self.feedback_timeout_seconds = float(feedback_timeout_seconds)
        self.pending_sweep_interval_seconds = float(pending_sweep_interval_seconds)
        # timeout reward is clamped into [0, 1] like all bandit rewards
        self.timeout_reward = float(max(0.0, min(1.0, timeout_reward)))
        self._last_pending_sweep = 0.0

        # Latency EMA baselines (two modes: raw ms, or ms/token)
        self.latency_ema_alpha = float(latency_ema_alpha)
        # Global (per-mode); key is the per_tok flag
        self.lat_ema_global: dict[bool, float | None] = {False: None, True: None}
        # Per worker (per-mode)
        self.lat_ema_worker: dict[tuple[int, bool], float] = {}
        # Per bucket (per-mode): (wid, osl, prefill_bin, per_tok) -> value
        self.lat_ema_bucket: dict[tuple[int, str, str, bool], float] = {}

        # Pending decisions waiting for feedback
        self.pending: dict[str, dict[str, Any]] = {}

        # Debug traces
        self.debug_traces = bool(debug_traces)
        self.debug_trace_dir = str(debug_trace_dir)
        self.recent_traces: deque = deque(maxlen=int(debug_buffer_size))
        if self.debug_traces:
            os.makedirs(self.debug_trace_dir, exist_ok=True)
            logger.info("Router debug traces enabled -> %s", self.debug_trace_dir)

        # Prometheus metrics (populated in initialize())
        self._metrics = {}

    # --------------------- tracing --------------------- #
    def _emit_trace(self, kind: str, payload: dict[str, Any]):
        """Append a trace record to the in-memory ring and the JSONL file."""
        if not self.debug_traces:
            return
        item = {"ts": time.time(), "kind": kind, **payload}
        self.recent_traces.append(item)
        try:
            path = os.path.join(self.debug_trace_dir, "router_traces.jsonl")
            with open(path, "a", encoding="utf-8") as f:
                f.write(json.dumps(item, separators=(",", ":")) + "\n")
        except Exception as e:
            # Tracing is best-effort; never let it break routing.
            logger.debug("Trace write failed: %s", e)

    # --------------------- level mappings --------------------- #
    @staticmethod
    def _norm_level(s: str | None, default: str = "MEDIUM") -> str:
        """Normalize a level string to LOW/MEDIUM/HIGH, falling back to `default`."""
        if not s:
            return default
        s = str(s).strip().upper()
        return s if s in ("LOW", "MEDIUM", "HIGH") else default

    @staticmethod
    def _decode_cost(osl: str) -> float:
        # Relative decode cost per output-sequence-length level.
        return {"LOW": 1.0, "MEDIUM": 2.0, "HIGH": 3.0}[osl]

    @staticmethod
    def _iat_factor(iat: str) -> float:
        # Inter-arrival factor: lower IAT (more frequent requests) -> higher factor.
        return {"LOW": 1.5, "MEDIUM": 1.0, "HIGH": 0.6}[iat]

    # --------------------- init --------------------- #
    async def initialize(self):
        """Initialize router by polling for backend workers."""
        # Initialize Prometheus metrics
        self._metrics = _init_prometheus_metrics()

        # Connect to actual SGLang workers at dynamo.worker.generate
        # (NOT backend.generate - that's where the Processor registers to intercept frontend)
        engine = self.runtime.namespace("dynamo").component("worker")
        logger.info("Getting engine client for dynamo/worker/generate")
        self.engine_client = await engine.endpoint("generate").client()

        min_workers = int(self.min_workers)
        if min_workers < 0:
            raise ValueError(f"min_workers must be >= 0, got {min_workers}")

        timeout_s = float(os.environ.get("DYNAMO_ROUTER_WAIT_FOR_WORKERS_TIMEOUT_S", "600"))
        if not math.isfinite(timeout_s) or timeout_s <= 0:
            raise ValueError("DYNAMO_ROUTER_WAIT_FOR_WORKERS_TIMEOUT_S must be a finite number > 0")

        deadline = time.monotonic() + timeout_s
        backoff_s = 0.5

        logger.info("Waiting for backend workers (min_workers=%d, timeout_s=%.1f)...", min_workers, timeout_s)

        if min_workers == 0:
            # No waiting required; just snapshot whatever is registered now.
            instance_ids_raw = list(self.engine_client.instance_ids())
            logger.info("Backend workers discovered (min_workers=0): %s", instance_ids_raw)
        else:
            # Poll with exponential backoff until enough workers register or we time out.
            while True:
                remaining = deadline - time.monotonic()
                if remaining <= 0:
                    raise TimeoutError(
                        f"Timed out after {timeout_s}s waiting for >= {min_workers} backend worker(s)"
                    )

                try:
                    await asyncio.wait_for(
                        self.engine_client.wait_for_instances(),
                        timeout=min(remaining, 10.0),
                    )
                # NOTE(review): on Python < 3.11 asyncio.wait_for raises
                # asyncio.TimeoutError, which is NOT the builtin TimeoutError —
                # confirm target runtime is 3.11+.
                except TimeoutError:
                    pass

                instance_ids_raw = list(self.engine_client.instance_ids())
                if len(instance_ids_raw) >= min_workers:
                    try:
                        instance_ids = [int(w) for w in instance_ids_raw]
                    except Exception:
                        instance_ids = instance_ids_raw
                    logger.info("Backend workers discovered: %s", instance_ids)
                    break

                await asyncio.sleep(backoff_s)
                backoff_s = min(backoff_s * 1.5, 5.0)

        self.indexer = KvIndexer(engine, self.block_size)

        self._initialize_bandits()
        self._initialize_contextual()
        logger.info("WorkloadAwareRouter initialized with %d backend worker(s)",
                    len(list(self.engine_client.instance_ids())))

    @safe_update("_init_lock")
    def _initialize_bandits(self):
        """Seed each discovered worker with a uniform Beta(1, 1) prior."""
        for wid in self.engine_client.instance_ids():
            wid = int(wid)
            self.worker_bandits.setdefault(wid, (1.0, 1.0))
            # Update Prometheus metrics
            if self._metrics.get("beta_alpha"):
                self._metrics["beta_alpha"].labels(worker_id=str(wid)).set(1.0)
            if self._metrics.get("beta_beta"):
                self._metrics["beta_beta"].labels(worker_id=str(wid)).set(1.0)

    @safe_update("_init_lock")
    def _initialize_contextual(self):
        """Seed per-worker LinTS state: A = lambda*I, b = 0."""
        for wid in self.engine_client.instance_ids():
            wid = int(wid)
            if wid not in self.linA:
                self.linA[wid] = self.lin_lambda * np.eye(self.feature_dim, dtype=np.float64)
                self.linb[wid] = np.zeros(self.feature_dim, dtype=np.float64)

    def _ensure_worker_context(self, worker_id: int):
        """Lazily create LinTS state for a worker that appeared after init."""
        if worker_id not in self.linA:
            with self._lin_lock:
                # re-check under the lock to avoid a double-create race
                if worker_id not in self.linA:
                    self.linA[worker_id] = self.lin_lambda * np.eye(self.feature_dim, dtype=np.float64)
                    self.linb[worker_id] = np.zeros(self.feature_dim, dtype=np.float64)

    # --------------------- prefix state --------------------- #
    @safe_update("_prefix_lock")
    def _get_prefix(self, pid: str) -> tuple[int | None,
int]:
        """Return (assigned_worker, reuse_remaining) for a prefix, or (None, 0)."""
        info = self.prefix_cache_state.get(pid)
        if info:
            return info.get("worker"), int(info.get("reuse_remaining") or 0)
        return None, 0

    @safe_update("_prefix_lock")
    def _set_prefix(
        self,
        pid: str,
        wid: int,
        reuse_remaining: int,
        decode_cost: float,
        prefill_cost: float,
        iat_factor: float,
    ):
        """Record/refresh prefix assignment."""
        if reuse_remaining <= 0:
            # Exhausted budget: drop the prefix entirely so state can't grow unbounded.
            self.prefix_cache_state.pop(pid, None)
            self.prefix_meta.pop(pid, None)
        else:
            self.prefix_cache_state[pid] = {"worker": wid, "reuse_remaining": max(0, int(reuse_remaining))}
            self.prefix_meta[pid] = {
                "decode_cost": float(decode_cost),
                "prefill_cost": float(max(prefill_cost, 0.0)),
                "iat_factor": float(iat_factor),
            }

        # Update prefix state size metric
        if self._metrics.get("prefix_state_size"):
            self._metrics["prefix_state_size"].set(len(self.prefix_cache_state))

    def _worker_outstanding(self, wid: int) -> tuple[int, float]:
        """Returns (reuse_total, work_total) for a worker."""
        # NOTE(review): iterates prefix_cache_state without holding _prefix_lock;
        # concurrent mutation could skew totals — confirm this is acceptable.
        reuse_total = 0
        work_total = 0.0
        for pid, info in self.prefix_cache_state.items():
            if info.get("worker") != wid:
                continue
            r = int(info.get("reuse_remaining") or 0)
            reuse_total += r
            meta = self.prefix_meta.get(pid)
            if meta:
                # expected remaining work = reuses * (decode + prefill) * arrival-rate factor
                work_total += float(r) * (float(meta.get("decode_cost", 2.0)) +
                                          float(meta.get("prefill_cost", 0.0))) * float(meta.get("iat_factor", 1.0))
        return reuse_total, work_total

    # --------------------- bandits --------------------- #
    def _linTS_sample(self, wid: int, x: np.ndarray) -> float:
        """Draw a LinTS value theta^T x, with a jittered-Cholesky sampler."""
        self._ensure_worker_context(wid)
        with self._lin_lock:
            # Copy under the lock so sampling below can run lock-free.
            A = np.array(self.linA[wid], dtype=np.float64, copy=True)
            b = np.array(self.linb[wid], dtype=np.float64, copy=True)

        # Symmetrize to absorb accumulated floating-point asymmetry.
        A = 0.5 * (A + A.T)
        eye = np.eye(self.feature_dim, dtype=np.float64)
        jitter = self._jt_base
        L = None
        while True:
            try:
                L = np.linalg.cholesky(A + jitter * eye)
                break
            except np.linalg.LinAlgError:
                # Grow the diagonal jitter geometrically until A factorizes.
                jitter = jitter * self._jt_mult if jitter > 0 else self._jt_base
                if jitter > self._jt_max:
                    # Jitter exhausted: fall back to an eigendecomposition with a
                    # floored spectrum and sample directly from that.
                    vals, vecs = np.linalg.eigh(A)
                    vals = np.maximum(vals, self._eig_floor)
                    A_inv = vecs @ (np.diag(1.0 / vals)) @ vecs.T
                    mu = A_inv @ b
                    z = np.random.normal(size=self.feature_dim)
                    noise = vecs @ (z / np.sqrt(vals))
                    theta = mu + (self.lin_v * noise)
                    return float(theta @ x)

        # Cholesky path: mu = A^-1 b via two triangular solves, noise ~ N(0, A^-1).
        y = np.linalg.solve(L, b)
        mu = np.linalg.solve(L.T, y)
        z = np.random.normal(size=self.feature_dim)
        noise = np.linalg.solve(L.T, z)
        theta = mu + (self.lin_v * noise)
        return float(theta @ x)

    def _update_contextual(self, wid: int, x: np.ndarray, reward: float):
        """Forgetting LinTS update: decay A/b, add the new observation."""
        r = float(max(0.0, min(1.0, reward)))
        with self._lin_lock:
            A = self.linA[wid]
            b = self.linb[wid]
            A *= self.lin_forget
            b *= self.lin_forget
            A += np.outer(x, x)
            # Re-inject ridge mass lost to forgetting so A stays well-conditioned.
            ridge = (1.0 - self.lin_forget) * self.lin_lambda
            if ridge > 0.0:
                A += ridge * np.eye(self.feature_dim, dtype=np.float64)
            self.linA[wid] = 0.5 * (A + A.T)
            self.linb[wid] = b + x * r

    def _ts_sample(self, worker_id: int) -> float:
        """Draw from the worker's Beta posterior (uniform prior if unseen)."""
        with self._bandit_lock:
            alpha, beta = self.worker_bandits.get(worker_id, (1.0, 1.0))
        return np.random.beta(alpha, beta)

    def _update_bandit(self, worker_id: int, reward: float):
        """Beta-Bernoulli update with a fractional reward in [0, 1]."""
        with self._bandit_lock:
            alpha, beta = self.worker_bandits.get(worker_id, (1.0, 1.0))
            r = float(max(0.0, min(1.0, reward)))
            new_alpha = alpha + r
            new_beta = beta + 1.0 - r
            self.worker_bandits[worker_id] = (new_alpha, new_beta)

        # Update Prometheus metrics
        if self._metrics.get("beta_alpha"):
            self._metrics["beta_alpha"].labels(worker_id=str(worker_id)).set(new_alpha)
        if self._metrics.get("beta_beta"):
            self._metrics["beta_beta"].labels(worker_id=str(worker_id)).set(new_beta)

    # --------------------- features / scores --------------------- #
    def _prefill_cost_for_worker(self, tokens: list[int], overlap: float) -> float:
        """Estimate prefill cost: uncached input tokens, scaled and weighted."""
        isl = max(0, len(tokens))
        frac = min(max(float(overlap), 0.0), 1.0)
        uncached = max(0.0, float(isl) * (1.0 - frac))
        return (uncached /
self.prefill_token_scale) * self.prefill_weight

    @staticmethod
    def _prefill_bin(prefill_cost: float) -> str:
        """Bucket a prefill cost into LOW/MEDIUM/HIGH for baseline lookups."""
        if prefill_cost < 0.25:
            return "LOW"
        if prefill_cost < 0.75:
            return "MEDIUM"
        return "HIGH"

    def _feature_vector(
        self,
        wid: int,
        metrics: dict[str, Any] | None,
        scores: "OverlapScores",
        last_w: int | None,
        reuse_after: int,
        decode_cost: float,
        prefill_cost: float,
        iat_factor: float,
    ) -> np.ndarray:
        """Build the 9-dim LinTS feature vector for one (request, worker) pair."""
        gpu = 0.0
        queue = 0.0
        # metrics schema assumed: {"endpoints": [{"worker_id", "gpu_cache_usage_perc",
        # "num_requests_waiting"}, ...]} — currently always None upstream.
        if metrics and isinstance(metrics, dict) and "endpoints" in metrics:
            for ep in metrics["endpoints"]:
                if ep.get("worker_id") == wid:
                    gpu = float(ep.get("gpu_cache_usage_perc", 0.0))
                    queue = float(ep.get("num_requests_waiting", 0.0))
                    break
        inv_load = 1.0 / (1.0 + self.gpu_penalty_weight * max(0.0, gpu) + self.queue_penalty_weight * max(0.0, queue))

        overlap = float(scores.scores.get(wid, 0.0))
        affinity = 1.0 if (last_w is not None and wid == last_w) else 0.0
        _, work_out = self._worker_outstanding(wid)

        # Normalize raw quantities into roughly [0, 1] ranges (tanh squashes tails).
        decode_norm = decode_cost / 3.0
        prefill_norm = math.tanh(prefill_cost)
        iat_norm = iat_factor / 1.5
        outstanding_norm = math.tanh(0.1 * work_out)
        reuse_norm = math.tanh(0.25 * float(max(reuse_after, 0)))

        return np.array([
            1.0,               # bias
            inv_load,
            overlap,
            affinity,
            outstanding_norm,
            decode_norm,
            prefill_norm,
            iat_norm,
            reuse_norm,
        ], dtype=np.float64)

    def _load_score(self, wid: int, metrics: dict[str, Any] | None, job_cost_total: float) -> float:
        """Multiplicative load modifier in (0, 1]; 1.0 means unloaded."""
        gpu = 0.0
        queue = 0.0
        if metrics and isinstance(metrics, dict) and "endpoints" in metrics:
            for ep in metrics["endpoints"]:
                if ep.get("worker_id") == wid:
                    gpu = float(ep.get("gpu_cache_usage_perc", 0.0))
                    queue = float(ep.get("num_requests_waiting", 0.0))
                    break
        _, work_out = self._worker_outstanding(wid)
        # Penalty couples this job's cost with the worker's current gpu/queue load.
        penalty = (self.gpu_penalty_weight * gpu + self.queue_penalty_weight * queue +
                   self.outstanding_work_weight * max(0.0, work_out) +
                   self.job_gpu_coupling_weight * job_cost_total * gpu +
                   self.job_queue_coupling_weight * job_cost_total * queue)
        return 1.0 / (1.0 + max(0.0, penalty))

    def _softmax(self, scores: list[float], temp: float) -> list[float]:
        """Temperature-clamped, max-shifted softmax; uniform on degenerate input."""
        t = float(min(max(temp, self.temp_min), self.temp_max))
        m = float(np.max(scores))
        exps = np.exp((np.array(scores) - m) / max(1e-6, t))
        s = float(np.sum(exps))
        if s <= 0.0 or not np.isfinite(s):
            return [1.0 / len(scores)] * len(scores)
        return list((exps / s).astype(float))

    # --------------------- selection --------------------- #
    def _select_worker(
        self,
        worker_ids,
        req: RouterRequest,
        metrics: dict[str, Any] | None,
        scores: OverlapScores,
    ) -> tuple[int, dict[str, float], dict[int, dict[str, float]], list[float], list[float]]:
        """Score every worker and sample one from a softmax over the scores."""
        osl = self._norm_level(req.expected_osl, "MEDIUM")
        iat = self._norm_level(req.interarrival, "MEDIUM")
        last_w, _ = self._get_prefix(req.prefix_id)

        reuse_after = max(int(req.reuse_budget), 0)
        decode_cost = self._decode_cost(osl)
        iat_factor = self._iat_factor(iat)

        # More remaining reuse (and tighter arrivals) -> colder softmax -> stickier.
        temp = self.temp_base / (1.0 + float(reuse_after) * iat_factor)
        temp = min(max(temp, self.temp_min), self.temp_max)

        raw_scores: list[float] = []
        worker_list: list[int] = [int(w) for w in worker_ids]
        per_worker_ctx: dict[int, dict[str, float]] = {}
        load_mods: list[float] = []
        overlaps: list[float] = []

        for wid in worker_list:
            overlap = float(scores.scores.get(wid, 0.0))
            prefill_cost = self._prefill_cost_for_worker(req.tokens, overlap)
            job_cost_total = decode_cost + prefill_cost

            x = self._feature_vector(
                wid=wid,
                metrics=metrics,
                scores=scores,
                last_w=last_w,
                reuse_after=reuse_after,
                decode_cost=decode_cost,
                prefill_cost=prefill_cost,
                iat_factor=iat_factor,
            )

            val = self._linTS_sample(wid, x)
            # Beta-bandit exploration shrinks as the session gets stickier.
            explore_w = self.base_ts_weight / (1.0 + float(reuse_after) * iat_factor)
            val += explore_w * self._ts_sample(wid)

            if last_w == wid and (reuse_after > 0):
                # Affinity bonus for staying put, amplified by KV overlap.
                val += (self.affinity_base + self.affinity_reuse_weight *
float(reuse_after) + + self.affinity_iat_weight * iat_factor) * (0.5 + 0.5 * overlap) + + if last_w is not None and wid != last_w and (reuse_after > 0): + val -= (self.switch_cost_base + self.switch_cost_reuse * float(reuse_after) + + self.switch_cost_iat * iat_factor) + + load_mod = self._load_score(wid, metrics, job_cost_total=job_cost_total) + if last_w == wid and reuse_after > 0: + load_mod = max(load_mod, self.sticky_load_floor) + val *= load_mod + + if np.isnan(val) or np.isinf(val): + val = -1e9 + + raw_scores.append(float(val)) + load_mods.append(float(load_mod)) + overlaps.append(float(overlap)) + per_worker_ctx[wid] = { + "decode_cost": decode_cost, + "prefill_cost": prefill_cost, + "iat_factor": iat_factor, + "overlap": overlap, + "reuse_after": float(reuse_after), + "load_mod": load_mod, + } + + probs = self._softmax(raw_scores, temp) + r = random.random() + cum = 0.0 + idx = 0 + for i, p in enumerate(probs): + cum += p + if r <= cum: + idx = i + break + chosen = int(worker_list[idx]) + + return chosen, per_worker_ctx[chosen], per_worker_ctx, raw_scores, probs + + # --------------------- latency baselines & reward --------------------- # + def _ema_update(self, old: float | None, new: float) -> float: + a = self.latency_ema_alpha + return new if old is None else (a * new + (1.0 - a) * old) + + def _get_latency_baseline(self, wid: int, osl: str, prefill_bin: str, per_tok: bool, fallback: float) -> float: + key_b = (wid, osl, prefill_bin, per_tok) + key_w = (wid, per_tok) + if key_b in self.lat_ema_bucket: + return self.lat_ema_bucket[key_b] + if key_w in self.lat_ema_worker: + return self.lat_ema_worker[key_w] + if self.lat_ema_global[per_tok] is not None: + return self.lat_ema_global[per_tok] # type: ignore + return max(1.0, float(fallback)) + + def _update_latency_baselines(self, wid: int, osl: str, prefill_bin: str, metric: float, per_tok: bool) -> float: + self.lat_ema_global[per_tok] = self._ema_update(self.lat_ema_global[per_tok], metric) + key_w = 
(wid, per_tok)
        self.lat_ema_worker[key_w] = self._ema_update(self.lat_ema_worker.get(key_w), metric)
        key_b = (wid, osl, prefill_bin, per_tok)
        self.lat_ema_bucket[key_b] = self._ema_update(self.lat_ema_bucket.get(key_b), metric)
        return self.lat_ema_bucket[key_b]

    @staticmethod
    def _latency_metric(latency_ms: float, tokens_out: int | None) -> tuple[float, bool]:
        """Return (metric, per_tok): ms/token when output length is known, else raw ms."""
        if tokens_out is not None and int(tokens_out) > 0:
            return float(latency_ms) / float(max(1, int(tokens_out))), True
        return float(latency_ms), False

    @staticmethod
    def _metric_to_reward(metric: float, baseline: float, success: bool) -> float:
        """Map latency vs baseline into a reward in (0, 1]; failure is 0."""
        if not success:
            return 0.0
        denom = max(1e-3, baseline)
        ratio = metric / denom
        # ratio == 1 (at baseline) -> 0.5; faster -> closer to 1, slower -> closer to 0.
        return float(1.0 / (1.0 + ratio))

    # --------------------- timeout sweep --------------------- #
    def _sweep_pending(self, now: float):
        """Expire decisions that never got feedback and punish them with timeout_reward."""
        # Rate-limit the sweep; callers invoke this on every routing request.
        if now - self._last_pending_sweep < self.pending_sweep_interval_seconds:
            return
        self._last_pending_sweep = now
        expired: list[tuple[str, dict[str, Any]]] = []
        with self._pending_lock:
            for did, rec in list(self.pending.items()):
                if now - float(rec.get("start_ts", now)) >= self.feedback_timeout_seconds:
                    expired.append((did, rec))
                    self.pending.pop(did, None)

            # Update pending count metric
            if self._metrics.get("pending_decisions"):
                self._metrics["pending_decisions"].set(len(self.pending))

        # Bandit updates happen outside the pending lock.
        for did, rec in expired:
            wid = int(rec["wid"])
            x = rec["x"]
            reward = float(self.timeout_reward)
            self._update_bandit(wid, reward)
            self._update_contextual(wid, x, reward)

            if self._metrics.get("timeout_penalties"):
                self._metrics["timeout_penalties"].inc()

            self._emit_trace("timeout", {
                "decision_id": did,
                "wid": wid,
                "reward": reward,
                "age": self.feedback_timeout_seconds,
                "prefix_id": rec.get("prefix_id"),
                "osl": rec.get("osl"),
                "prefill_bin": rec.get("prefill_bin"),
            })
            logger.warning("Timeout feedback: wid=%s decision=%s reward=%.3f", wid, did, reward)

    # --------------------- 
main endpoint: find_worker --------------------- #
    async def generate(self, request: dict):
        """Routing endpoint: pick a worker for the request and yield the decision."""
        req = RouterRequest(**request)

        worker_ids = [int(w) for w in self.engine_client.instance_ids()]
        if not worker_ids:
            # No workers registered: signal the caller with worker_id = -1.
            yield RouterResponse(worker_id=-1, prefix_hit_rate=0.0).model_dump()
            return

        now = time.time()
        self._sweep_pending(now)

        # Track tokens per request
        if self._metrics.get("tokens_per_request"):
            self._metrics["tokens_per_request"].observe(len(req.tokens))
        if self._metrics.get("reuse_budget"):
            self._metrics["reuse_budget"].observe(req.reuse_budget)

        metrics = None  # TODO: Replace with proper metrics query when API is available
        if self.router_type == "kv_load":
            # Pure load-based mode bypasses Thompson Sampling entirely.
            wid, _ = self._get_underloaded(metrics)
            yield RouterResponse(worker_id=wid, prefix_hit_rate=0.0).model_dump()
            return

        scores: OverlapScores = await self.indexer.find_matches_for_request(req.tokens, 0)
        chosen, chosen_ctx, all_ctx, raw_scores, probs = self._select_worker(worker_ids, req, metrics, scores)

        # Read the previous assignment BEFORE overwriting it below.
        last_w, _ = self._get_prefix(req.prefix_id)

        osl = self._norm_level(req.expected_osl, "MEDIUM")
        iat = self._norm_level(req.interarrival, "MEDIUM")
        decode_cost = self._decode_cost(osl)
        overlap_chosen = float(scores.scores.get(chosen, 0.0))
        prefill_cost_chosen = self._prefill_cost_for_worker(req.tokens, overlap_chosen)
        iat_factor = self._iat_factor(iat)

        # Update prefix state
        self._set_prefix(
            req.prefix_id,
            chosen,
            reuse_remaining=max(int(req.reuse_budget), 0),
            decode_cost=decode_cost,
            prefill_cost=prefill_cost_chosen,
            iat_factor=iat_factor,
        )

        # Build feature x for chosen & store pending decision
        x = self._feature_vector(
            wid=chosen,
            metrics=metrics,
            scores=scores,
            last_w=last_w,
            reuse_after=max(int(req.reuse_budget), 0),
            decode_cost=decode_cost,
            prefill_cost=prefill_cost_chosen,
            iat_factor=iat_factor,
        )
        decision_id = uuid.uuid4().hex
        with self._pending_lock:
            self.pending[decision_id] = {
                "wid": int(chosen),
                "x": x,
                "osl": osl,
                "prefill_bin": self._prefill_bin(prefill_cost_chosen),
                "start_ts": now,
                "prefix_id": req.prefix_id,
                "tokens_in": len(req.tokens),
                "reuse_after": int(req.reuse_budget),
                "overlap": overlap_chosen,
                "prefill_cost": float(prefill_cost_chosen),
                "decode_cost": float(decode_cost),
            }
            # Update pending count metric
            if self._metrics.get("pending_decisions"):
                self._metrics["pending_decisions"].set(len(self.pending))

        # Update Prometheus metrics
        if self._metrics.get("decisions_total"):
            self._metrics["decisions_total"].labels(worker_id=str(chosen)).inc()
        if self._metrics.get("kv_overlap"):
            self._metrics["kv_overlap"].labels(worker_id=str(chosen)).set(overlap_chosen)

        # Track sticky vs switch decisions
        if last_w is not None:
            if chosen == last_w:
                if self._metrics.get("sticky_decisions"):
                    self._metrics["sticky_decisions"].inc()
            else:
                if self._metrics.get("switch_decisions"):
                    self._metrics["switch_decisions"].inc()

        # Decision trace
        if self.debug_traces:
            worker_list = [int(w) for w in worker_ids]
            details = {
                wid: {
                    "score": float(raw_scores[i]),
                    "prob": float(probs[i]),
                    **all_ctx[wid],
                }
                for i, wid in enumerate(worker_list)
            }
            self._emit_trace("decision", {
                "decision_id": decision_id,
                "prefix_id": req.prefix_id,
                "chosen": int(chosen),
                "workers": details,
            })

        logger.info(
            "Router picked worker=%s decision=%s prefix=%s (last=%s reuse_after=%s osl=%s prefill_cost=%.3f iat=%s overlap=%.3f)",
            chosen, decision_id, req.prefix_id, last_w, req.reuse_budget, osl, prefill_cost_chosen, iat, overlap_chosen,
        )

        resp = RouterResponse(worker_id=chosen, prefix_hit_rate=overlap_chosen, decision_id=decision_id)
        yield resp.model_dump()
        return

    # --------------------- feedback endpoint --------------------- #
    async def feedback(self, request: dict):
        """Ex-post reward update from processor with observed latency."""
        try:
            fb = FeedbackRequest(**request)
        except Exception as e:
            # Malformed payload: reply with an error ack rather than raising.
            ack = FeedbackAck(ok=False, used_baseline=0.0, reward=0.0, error=str(e))
            yield ack.model_dump()
            return

        with self._pending_lock:
            decision = self.pending.pop(fb.decision_id, None)
            # Update pending count metric
            if self._metrics.get("pending_decisions"):
                self._metrics["pending_decisions"].set(len(self.pending))

        if not decision:
            # Unknown or already-expired decision id (e.g. timeout sweep beat us).
            ack = FeedbackAck(ok=False, used_baseline=0.0, reward=0.0, error="unknown_decision")
            yield ack.model_dump()
            return

        wid: int = int(decision["wid"])
        x: np.ndarray = decision["x"]
        osl: str = str(decision["osl"])
        prefill_bin: str = str(decision["prefill_bin"])
        tokens_out = None if fb.tokens_out is None else int(fb.tokens_out)
        metric, per_tok = self._latency_metric(float(fb.latency_ms), tokens_out)

        # Baseline lookup (hierarchical)
        baseline_before = self._get_latency_baseline(wid, osl, prefill_bin, per_tok, fallback=metric)
        reward = self._metric_to_reward(metric, baseline_before, bool(fb.success))

        # Update EMAs only on successes
        # NOTE(review): fb.success defaults to True but an explicit None is
        # treated as failure here — confirm that is intended.
        if fb.success:
            baseline_after = self._update_latency_baselines(wid, osl, prefill_bin, metric, per_tok)
        else:
            baseline_after = baseline_before

        # Update bandits with ex-post reward
        self._update_bandit(wid, reward)
        self._update_contextual(wid, x, reward)

        # Update Prometheus metrics
        if self._metrics.get("feedback_latency"):
            self._metrics["feedback_latency"].labels(worker_id=str(wid)).observe(fb.latency_ms / 1000.0)
        if self._metrics.get("reward"):
            self._metrics["reward"].labels(worker_id=str(wid)).set(reward)

        self._emit_trace("feedback", {
            "decision_id": fb.decision_id,
            "wid": wid,
            "latency_ms": float(fb.latency_ms),
            "tokens_out": tokens_out,
            "metric": metric,
            "per_tok": per_tok,
            "baseline_used": baseline_before,
            "baseline_after": baseline_after,
            "reward": reward,
            "success": bool(fb.success),
            "finish_reason": fb.finish_reason or "",
        })

        logger.info(
            "Feedback: wid=%s decision=%s metric=%.3f%s 
baseline=%.3f reward=%.3f success=%s", + wid, fb.decision_id, metric, " ms/tok" if per_tok else " ms", baseline_before, reward, fb.success, + ) + + ack = FeedbackAck(ok=True, used_baseline=float(baseline_before), reward=float(reward), worker_id=wid) + yield ack.model_dump() + return + + # --------------------- helpers --------------------- # + def _get_underloaded(self, metrics: dict[str, Any] | None): + if not metrics or not metrics.get("endpoints"): + wid = int(random.choice(list(self.engine_client.instance_ids()))) + return wid, 0.0 + loads = {ep.get("worker_id"): ep.get("gpu_cache_usage_perc", 0.0) for ep in metrics["endpoints"]} + min_val = min(loads.values()) + candidates = [wid for wid, v in loads.items() if v == min_val] + return random.choice(candidates), min_val + + +# ---------------------- worker entry point ---------------------- # +def parse_args(): + """Parse minimal CLI arguments. + + The router uses a YAML config file for most parameters. + Only frequently-tuned parameters have dedicated CLI flags. + Use --override for any other parameter. + + See PARAMETERS.md for full documentation. + """ + parser = argparse.ArgumentParser( + description="Optimized Thompson Sampling Router with Prometheus Metrics", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Use default config + python router.py + + # Use custom config file + python router.py --config /path/to/config.yaml + + # Override specific values + python router.py --config config.yaml --affinity-base 0.5 --temp-base 1.5 + + # Override any config value + python router.py --config config.yaml --override load_balancing.gpu_penalty_weight=2.0 + +See PARAMETERS.md for full parameter documentation. 
+ """, + ) + + # Config file + parser.add_argument( + "--config", + type=str, + default=None, + help="Path to YAML config file (default: config.yaml in script directory)", + ) + + # Primary tuning knobs (explicit CLI flags) + parser.add_argument( + "--affinity-base", + type=float, + default=None, + help="Primary stickiness control [0.0-1.0] (overrides config)", + ) + parser.add_argument( + "--temp-base", + type=float, + default=None, + help="Primary exploration control [0.15-2.0] (overrides config)", + ) + parser.add_argument( + "--lints-v", + type=float, + default=None, + help="LinTS exploration variance [0.0-1.0] (overrides config)", + ) + + # Generic override for any config value + parser.add_argument( + "--override", + action="append", + default=[], + metavar="KEY=VALUE", + help="Override any config value using dot notation (repeatable)", + ) + + return parser.parse_args() + + +@dynamo_worker(static=False) +async def worker(runtime: DistributedRuntime): + # Parse CLI and load config + args = parse_args() + config = load_config(args.config) + config = apply_cli_overrides(config, args) + + component = runtime.namespace("dynamo").component("router") + await component.create_service() + logger.info("Initializing Optimized Thompson Sampling Router (Prometheus metrics)") + + # Extract config values with nested access + router = WorkloadAwareRouter( + runtime, + # Infrastructure + block_size=get_nested(config, "infrastructure.block_size", 64), + router_type=str(get_nested(config, "infrastructure.router_type", "kv")).lower(), + min_workers=get_nested(config, "infrastructure.min_workers", 1), + # Affinity + affinity_base=get_nested(config, "affinity.base", 0.30), + affinity_reuse_weight=get_nested(config, "affinity.reuse_weight", 0.15), + affinity_iat_weight=get_nested(config, "affinity.iat_weight", 0.20), + sticky_load_floor=get_nested(config, "affinity.sticky_load_floor", 0.70), + # Exploration + base_ts_weight=get_nested(config, "exploration.base_ts_weight", 0.10), + 
temp_base=get_nested(config, "exploration.temperature.base", 1.0), + temp_min=get_nested(config, "exploration.temperature.min", 0.15), + temp_max=get_nested(config, "exploration.temperature.max", 2.0), + # Switching cost + switch_cost_base=get_nested(config, "switching_cost.base", 0.20), + switch_cost_reuse=get_nested(config, "switching_cost.reuse_penalty", 0.08), + switch_cost_iat=get_nested(config, "switching_cost.iat_penalty", 0.05), + # Load balancing + queue_penalty_weight=get_nested(config, "load_balancing.queue_penalty_weight", 0.50), + gpu_penalty_weight=get_nested(config, "load_balancing.gpu_penalty_weight", 1.00), + outstanding_work_weight=get_nested(config, "load_balancing.outstanding_work_weight", 0.45), + job_gpu_coupling_weight=get_nested(config, "load_balancing.job_gpu_coupling_weight", 0.40), + job_queue_coupling_weight=get_nested(config, "load_balancing.job_queue_coupling_weight", 0.20), + # Prefill + prefill_token_scale=get_nested(config, "prefill.token_scale", 1024.0), + prefill_weight=get_nested(config, "prefill.weight", 1.0), + # LinTS + lints_lambda=get_nested(config, "lints.lambda", 1.0), + lints_v=get_nested(config, "lints.v", 0.25), + lints_forget=get_nested(config, "lints.forget_rate", 0.995), + # Feedback + feedback_timeout_seconds=get_nested(config, "feedback.timeout_seconds", 120.0), + pending_sweep_interval_seconds=get_nested(config, "feedback.sweep_interval_seconds", 5.0), + timeout_reward=get_nested(config, "feedback.timeout_reward", 0.0), + latency_ema_alpha=get_nested(config, "feedback.latency_ema_alpha", 0.2), + # Debug + debug_traces=get_nested(config, "debug.traces_enabled", False), + debug_trace_dir=get_nested(config, "debug.trace_dir", "/tmp/dynamo_router_traces"), + debug_buffer_size=get_nested(config, "debug.buffer_size", 2000), + ) + await router.initialize() + + # Serve both endpoints + await asyncio.gather( + component.endpoint("find_worker").serve_endpoint(router.generate), + 
component.endpoint("feedback").serve_endpoint(router.feedback), + ) + + +if __name__ == "__main__": + uvloop.install() + asyncio.run(worker()) + diff --git a/external/dynamo/start_dynamo_optimized_thompson_hints.sh b/external/dynamo/start_dynamo_optimized_thompson_hints.sh new file mode 100755 index 0000000000..0d15a52818 --- /dev/null +++ b/external/dynamo/start_dynamo_optimized_thompson_hints.sh @@ -0,0 +1,705 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Dynamo SGLang with OPTIMIZED Thompson Sampling Router Architecture +# +# Key difference from generalized architecture: +# - Uses DEFAULT Dynamo frontend (python -m dynamo.frontend) +# - Custom Processor + Router components +# - Routing hints passed via nvext.annotations instead of HTTP headers +# - Prometheus metrics instead of CSV files +# +# Architecture: +# Client → Default Dynamo Frontend (tokenization + nvext parsing) +# ↓ PreprocessedRequest with annotations +# Custom Processor (extracts hints, queries router) +# ↓ RouterRequest +# Custom Router (Thompson Sampling + KV overlap) +# ↓ worker_id +# SGLang Backend Worker +# ↓ response tokens +# Processor sends feedback to Router +# +# Components: +# - ETCD (metadata and worker discovery) +# - NATS (message queue for KV events) +# - Default Dynamo Frontend (HTTP API on port 8000) +# - Custom Router (Thompson Sampling + KV overlap) +# - Custom Processor (hint extraction + routing) +# - SGLang Worker (unified mode, GPUs 0-3, TP=4) +# +# Prometheus Metrics: +# - Frontend: http://localhost:8000/metrics +# - Backend/Router/Processor: http://localhost:8081/metrics +# +# To stop all components: bash stop_dynamo.sh + +set -euo pipefail + +# Configuration Variables (can be overridden via environment variables) +CONTAINER_NAME="dynamo-sglang-optimized" +WORKER_GPUS="${DYNAMO_GPU_DEVICES:-0,1,2,3}" +TP_SIZE="${DYNAMO_TP_SIZE:-4}" +HTTP_PORT="${DYNAMO_HTTP_PORT:-8000}" +METRICS_PORT="${DYNAMO_METRICS_PORT:-8081}" +MODEL="/workspace/models/Llama-3.3-70B-Instruct" +SERVED_MODEL_NAME="${DYNAMO_MODEL_NAME:-llama-3.3-70b}" +IMAGE="nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.7.1" +SHM_SIZE="${DYNAMO_SHM_SIZE:-16g}" +WORKER_INIT_TIMEOUT_S="${DYNAMO_WORKER_INIT_TIMEOUT_S:-600}" + +# Local paths - DYNAMO_MODEL_DIR must be set or script will error +if [ -z "${DYNAMO_MODEL_DIR:-}" ]; then + echo "ERROR: DYNAMO_MODEL_DIR environment variable must be set" + echo "" + echo "Example:" + echo " export 
DYNAMO_MODEL_DIR=\"/path/to/your/models/Llama-3.3-70B-Instruct\"" + echo "" + echo "Then run this script again." + exit 1 +fi + +# Validate model directory +if [ -d "${DYNAMO_MODEL_DIR}" ]; then + if [ ! -f "${DYNAMO_MODEL_DIR}/config.json" ]; then + echo "ERROR: ${DYNAMO_MODEL_DIR} exists but is not a valid model directory" + echo "" + echo "Missing: config.json" + echo "" + echo "Find it: find ~/.cache/huggingface/hub -name config.json -path '*Llama-3.3-70B*'" + exit 1 + fi + + if ! grep -q '"model_type"' "${DYNAMO_MODEL_DIR}/config.json" 2>/dev/null; then + echo "ERROR: ${DYNAMO_MODEL_DIR}/config.json is missing 'model_type' field" + echo "" + echo "This usually means incomplete/corrupted download. Try:" + echo " rm -rf ${DYNAMO_MODEL_DIR}" + echo " huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir ${DYNAMO_MODEL_DIR}" + exit 1 + fi +fi +LOCAL_MODEL_DIR="${DYNAMO_MODEL_DIR}" + +# Repository directory - auto-detect from script location +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CUSTOM_DYNAMO_DIR="${SCRIPT_DIR}/optimized" + +echo "=========================================================" +echo "Dynamo SGLang with OPTIMIZED Thompson Sampling Router" +echo "=========================================================" +echo "Model: Llama-3.3-70B-Instruct" +echo "Container: $CONTAINER_NAME" +echo "HTTP Port: $HTTP_PORT (default Dynamo frontend)" +echo "Metrics Port: $METRICS_PORT (Prometheus)" +echo "" +echo "Architecture Differences (vs generalized):" +echo " - Default Dynamo frontend (not custom frontend.py)" +echo " - Hints via nvext.annotations (not HTTP headers)" +echo " - Prometheus metrics (not CSV files)" +echo "" +echo "Components:" +echo " - ETCD (metadata and discovery)" +echo " - NATS (message queue for KV events)" +echo " - Default Frontend (HTTP API on port $HTTP_PORT)" +echo " - Custom Router (Thompson Sampling + KV overlap)" +echo " - Custom Processor (hint extraction + routing)" +echo " - SGLang Worker (unified 
mode)" +echo "" +echo "Backend Worker:" +echo " Unified: GPUs $WORKER_GPUS (TP=$TP_SIZE)" +echo "" +echo "=========================================================" + +# Verify custom components exist +if [ ! -f "$CUSTOM_DYNAMO_DIR/router.py" ]; then + echo "✗ ERROR: Custom router.py not found at: $CUSTOM_DYNAMO_DIR/router.py" + exit 1 +fi +if [ ! -f "$CUSTOM_DYNAMO_DIR/processor.py" ]; then + echo "✗ ERROR: Custom processor.py not found at: $CUSTOM_DYNAMO_DIR/processor.py" + exit 1 +fi +echo "✓ Custom components found in: $CUSTOM_DYNAMO_DIR" +echo "" + +# Start ETCD if not running +if docker ps -a --format '{{.Names}}' | grep -q "^etcd-dynamo$"; then + echo "Removing existing ETCD container..." + docker rm -f etcd-dynamo +fi + +echo "Starting ETCD container..." +docker run -d \ + --name etcd-dynamo \ + --network host \ + -e ALLOW_NONE_AUTHENTICATION=yes \ + -e ETCD_LISTEN_CLIENT_URLS=http://0.0.0.0:2379 \ + -e ETCD_ADVERTISE_CLIENT_URLS=http://localhost:2379 \ + bitnamilegacy/etcd:3.6.1 + +# Wait for ETCD to be ready +echo "Waiting for ETCD to be ready..." +for i in {1..30}; do + if curl -s http://localhost:2379/health > /dev/null 2>&1; then + echo "✓ ETCD is ready" + sleep 2 + break + fi + if [ $i -eq 30 ]; then + echo "✗ ERROR: ETCD failed to start within 30 seconds" + docker logs etcd-dynamo + exit 1 + fi + sleep 1 +done + +# Start NATS if not running +if docker ps -a --format '{{.Names}}' | grep -q "^nats-dynamo$"; then + echo "Removing existing NATS container..." + docker rm -f nats-dynamo +fi + +echo "Starting NATS container..." +docker run -d \ + --name nats-dynamo \ + --network host \ + nats:2.11.4 \ + -js + +# Wait for NATS to be ready +echo "Waiting for NATS to be ready..." 
+for i in {1..30}; do + if timeout 2 bash -c 'cat < /dev/null > /dev/tcp/localhost/4222' 2>/dev/null; then + echo "✓ NATS is ready" + break + fi + if [ $i -eq 30 ]; then + echo "✗ ERROR: NATS failed to start within 30 seconds" + docker logs nats-dynamo + exit 1 + fi + sleep 1 +done +echo "" + +# Clean up existing Dynamo container if it exists +if docker ps -a --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then + echo "Removing existing Dynamo container: $CONTAINER_NAME" + docker rm -f $CONTAINER_NAME +fi + +# Verify HF_TOKEN is set +if [ -z "${HF_TOKEN:-}" ]; then + echo "" + echo "⚠ HF_TOKEN environment variable is not set." + echo "" + if [ -d "$LOCAL_MODEL_DIR" ]; then + echo "✓ Local model found - proceeding without HF_TOKEN" + HF_TOKEN="dummy" + else + echo "✗ Local model NOT found and no HF_TOKEN to download it" + echo "" + read -p "Please enter your HuggingFace token (or press Enter to skip): " HF_TOKEN + if [ -z "$HF_TOKEN" ]; then + echo "WARNING: Proceeding without HF_TOKEN." + HF_TOKEN="dummy" + else + echo "✓ HuggingFace token received" + fi + fi +else + echo "✓ HuggingFace token is set" +fi +echo "" + +# Verify model exists locally +if [ ! -d "$LOCAL_MODEL_DIR" ]; then + echo "WARNING: Model directory not found at: $LOCAL_MODEL_DIR" + echo "" + echo "To download the model, run:" + echo " huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir $LOCAL_MODEL_DIR" + echo "" + read -p "Continue anyway (model will be downloaded from HuggingFace)? [y/N] " -n 1 -r + echo + if [[ ! $REPLY =~ ^[Yy]$ ]]; then + exit 1 + fi +fi + +# Start container with optimized Thompson Sampling components +echo "" +echo "Starting Dynamo container with OPTIMIZED Thompson Sampling components..." 
+docker run -d \ + --name $CONTAINER_NAME \ + --gpus "\"device=${WORKER_GPUS}\"" \ + --network host \ + --ipc=host \ + --shm-size=$SHM_SIZE \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + -v $LOCAL_MODEL_DIR:$MODEL:ro \ + -v $CUSTOM_DYNAMO_DIR:/workspace/custom_dynamo:ro \ + -e HF_TOKEN="$HF_TOKEN" \ + -e HUGGING_FACE_HUB_TOKEN="$HF_TOKEN" \ + -e RUST_BACKTRACE=1 \ + -e PYTHONUNBUFFERED=1 \ + -e DYN_HTTP_PORT=$HTTP_PORT \ + -e DYN_SYSTEM_PORT=$METRICS_PORT \ + -e DYN_ROUTER_MODE=round-robin \ + $IMAGE \ + bash -c " + set -e + + echo '=========================================================' + echo 'Verifying external infrastructure services...' + echo '=========================================================' + + # Verify ETCD is accessible + if curl -s http://localhost:2379/health > /dev/null 2>&1; then + echo '✓ ETCD accessible at localhost:2379' + else + echo '✗ ERROR: ETCD not accessible at localhost:2379' + exit 1 + fi + + # Verify NATS is accessible + if timeout 2 bash -c '/dev/null; then + echo '✓ NATS accessible at localhost:4222' + else + echo '✗ ERROR: NATS not accessible at localhost:4222' + exit 1 + fi + + echo '' + + # Function to wait for worker initialization via ETCD registration + wait_for_worker() { + local worker_type=\$1 + local pid=\$2 + local max_wait=${WORKER_INIT_TIMEOUT_S:-600} + local elapsed=0 + local poll_interval=5 + + echo \"Waiting for \$worker_type worker (PID \$pid) to initialize...\" + echo \" Detection: ETCD worker registration\" + echo \" Timeout: \${max_wait}s\" + + while [ \$elapsed -lt \$max_wait ]; do + if ! 
kill -0 \$pid 2>/dev/null; then + echo \"ERROR: \$worker_type worker process died!\" + return 1 + fi + + local etcd_response=\$(curl -s --max-time 2 http://localhost:2379/v3/kv/range \ + -X POST \ + -H \"Content-Type: application/json\" \ + -d '{\"key\":\"AA==\",\"range_end\":\"AA==\",\"keys_only\":true}' 2>&1) + + if [ \$((elapsed % 30)) -eq 0 ] && [ \$elapsed -gt 0 ]; then + echo \" [DEBUG] ETCD count: \$(echo \"\$etcd_response\" | grep -o '\"count\":\"[^\"]*\"')\" + fi + + if echo \"\$etcd_response\" | grep -q '\"count\"' && \ + ! echo \"\$etcd_response\" | grep -q '\"count\":\"0\"'; then + echo \"✓ \$worker_type worker is ready (registered with ETCD at \${elapsed}s)\" + return 0 + fi + + sleep \$poll_interval + elapsed=\$((elapsed + poll_interval)) + if [ \$((elapsed % 30)) -eq 0 ]; then + echo \" ... \${elapsed}s / \${max_wait}s (waiting for ETCD registration)\" + fi + done + + echo \"ERROR: \$worker_type worker failed to register with ETCD within \${max_wait}s\" + return 1 + } + + echo '=========================================================' + echo 'Step 1: Starting Unified Worker (GPUs 0,1,2,3 = Host GPUs $WORKER_GPUS)...' + echo '=========================================================' + # CRITICAL: Register worker at dynamo.worker.generate (not default backend.generate) + # This allows the custom Processor to register as backend.generate and intercept + # frontend requests, then forward to these workers after Thompson Sampling routing. + CUDA_VISIBLE_DEVICES=0,1,2,3 \ + python3 -m dynamo.sglang \ + --model-path $MODEL \ + --served-model-name $SERVED_MODEL_NAME \ + --host 0.0.0.0 \ + --port 30000 \ + --tp $TP_SIZE \ + --trust-remote-code \ + --enable-metrics \ + --mem-fraction-static 0.8 \ + --endpoint dynamo.worker.generate & + WORKER_PID=\$! 
+ echo \"Unified Worker PID: \$WORKER_PID\" + echo \"Registered at: dynamo.worker.generate\" + echo \"\" + + # Wait for unified worker to initialize + wait_for_worker \"Unified\" \$WORKER_PID || exit 1 + + echo '' + echo '=========================================================' + echo 'Step 2: Starting Custom Router (Thompson Sampling + Prometheus)...' + echo '=========================================================' + # Router uses config.yaml for all parameters + # Override specific values with --affinity-base, --temp-base, --lints-v, or --override + python3 /workspace/custom_dynamo/router.py \ + --config /workspace/custom_dynamo/config.yaml & + ROUTER_PID=\$! + echo \"Router PID: \$ROUTER_PID\" + sleep 15 + echo \"\" + + echo '' + echo '=========================================================' + echo 'Step 3: Starting Custom Processor (Dynamic Discovery Mode)...' + echo '=========================================================' + # DYNAMIC DISCOVERY MODE (forward-compatible, --static-endpoint deprecated): + # Processor registers as dynamo.backend.generate AND calls register_llm() + # to advertise a model card in ETCD. The frontend's ModelWatcher discovers + # this and routes requests to us. + python3 /workspace/custom_dynamo/processor.py \ + --enable-router \ + --model-path $MODEL \ + --model-name $SERVED_MODEL_NAME & + PROCESSOR_PID=\$! + echo \"Processor PID: \$PROCESSOR_PID\" + echo \"Model: $SERVED_MODEL_NAME (from $MODEL)\" + echo \"Registered at: dynamo.backend.generate (discovered via ETCD model card)\" + echo \"Forwards to: dynamo.worker.generate (actual SGLang workers)\" + sleep 15 + echo \"\" + + echo '' + echo '=========================================================' + echo 'Step 4: Starting Default Dynamo Frontend (Dynamic Discovery)...' + echo '=========================================================' + # DYNAMIC DISCOVERY MODE (forward-compatible): + # No --static-endpoint needed! 
The frontend uses its ModelWatcher to + # discover backends registered in ETCD. Our processor registered a + # model card in Step 3, so the frontend will find and route to it. + python3 -m dynamo.frontend \ + --http-port $HTTP_PORT \ + --model-name $SERVED_MODEL_NAME \ + --model-path $MODEL & + FRONTEND_PID=\$! + echo \"Frontend PID: \$FRONTEND_PID\" + echo \"Discovery: ETCD ModelWatcher (no --static-endpoint)\" + sleep 15 + echo \"\" + + echo '' + echo '=========================================================' + echo '✓ All components started successfully!' + echo '=========================================================' + echo \"Infrastructure Services (External):\" + echo \" ETCD: localhost:2379\" + echo \" NATS: localhost:4222\" + echo \"\" + echo \"Dynamo Components (This Container):\" + echo \" Unified Worker: PID \$WORKER_PID (GPUs $WORKER_GPUS, TP=$TP_SIZE)\" + echo \" → Registered at: dynamo.worker.generate\" + echo \" Router: PID \$ROUTER_PID (Thompson Sampling + Prometheus)\" + echo \" → Registered at: dynamo.router.{find_worker,feedback}\" + echo \" Processor: PID \$PROCESSOR_PID (NVExt annotation extraction)\" + echo \" → Registered at: dynamo.backend.generate (model card in ETCD)\" + echo \" Frontend: PID \$FRONTEND_PID (Default Dynamo HTTP API on port $HTTP_PORT)\" + echo \" → Discovery: ETCD ModelWatcher (finds processor's model card)\" + echo '' + echo 'Request Flow (Dynamic Discovery Mode):' + echo ' Client → Default Frontend API (port $HTTP_PORT)' + echo ' ↓ (tokenization + nvext parsing)' + echo ' Frontend discovers backends via ETCD ModelWatcher' + echo ' ↓ (finds Processor model card!)' + echo ' Custom Processor (dynamo.backend.generate-{id})' + echo ' ↓ (extract hints from annotations)' + echo ' ↓ (query Thompson Sampling router)' + echo ' Custom Router → worker_id' + echo ' ↓ (KV overlap + workload-aware selection)' + echo ' Processor routes to → dynamo.worker.generate (with worker_id)' + echo ' ↓' + echo ' Unified Worker 
(dynamo.worker.generate)' + echo ' ↓' + echo ' Response + Feedback to Router' + echo '' + echo 'Prometheus Metrics:' + echo ' - Frontend: http://localhost:$HTTP_PORT/metrics' + echo ' - Backend: http://localhost:$METRICS_PORT/metrics' + echo ' - Router: thompson_router_* metrics' + echo ' - Processor: thompson_processor_* metrics' + echo '=========================================================' + + # Monitor all processes + while true; do + if ! kill -0 \$FRONTEND_PID 2>/dev/null; then + echo \"ERROR: Frontend died!\" + exit 1 + fi + if ! kill -0 \$PROCESSOR_PID 2>/dev/null; then + echo \"ERROR: Processor died!\" + exit 1 + fi + if ! kill -0 \$ROUTER_PID 2>/dev/null; then + echo \"ERROR: Router died!\" + exit 1 + fi + if ! kill -0 \$WORKER_PID 2>/dev/null; then + echo \"ERROR: Unified worker died!\" + exit 1 + fi + sleep 10 + done + " + +# Wait for container to start +echo "" +echo "Waiting for container to start..." +sleep 15 + +# Check if container started successfully +if docker ps --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then + echo "" + echo "=========================================================" + echo "✓ Dynamo with OPTIMIZED Thompson Sampling Router Started!" 
+ echo "=========================================================" + echo "" + echo "Architecture (Dynamic Discovery - Forward Compatible):" + echo "" + echo " Endpoint Registration:" + echo " • SGLang Worker: dynamo.worker.generate (actual inference)" + echo " • Processor: dynamo.backend.generate + ETCD model card" + echo " • Router: dynamo.router.{find_worker,feedback}" + echo "" + echo " Discovery Mode:" + echo " • Frontend uses ETCD ModelWatcher (no --static-endpoint)" + echo " • Processor registers model card via register_llm()" + echo " • Frontend discovers processor as a 'backend' automatically" + echo "" + echo " Request Flow:" + echo " Client Request (with nvext.annotations)" + echo " ↓" + echo " Default Dynamo Frontend (port $HTTP_PORT)" + echo " ↓ discovers backends via ETCD ModelWatcher" + echo " Custom Processor (discovered via model card)" + echo " ↓ extracts: prefix_id, total_requests, osl, iat" + echo " ↓ queries Thompson Sampling router" + echo " Custom Router → worker_id" + echo " ↓ KV overlap + workload-aware selection" + echo " Processor forwards to dynamo.worker.generate" + echo " ↓" + echo " Unified Worker (GPUs $WORKER_GPUS, TP=$TP_SIZE)" + echo " ↓" + echo " Response + Feedback Loop" + echo "" + echo "Infrastructure Services (Managed):" + echo " ETCD: etcd-dynamo container, localhost:2379" + echo " NATS: nats-dynamo container, localhost:4222" + echo "" + echo "Prometheus Metrics:" + echo " Frontend: http://localhost:$HTTP_PORT/metrics" + echo " Backend/Router/Processor: http://localhost:$METRICS_PORT/metrics" + echo "" + echo "API Endpoint: http://localhost:$HTTP_PORT/v1/chat/completions" + echo "Health Check: http://localhost:$HTTP_PORT/health" + echo "" + echo "NVExt Annotations (in request body):" + echo " \"nvext\": {" + echo " \"annotations\": [" + echo " \"prefix_id:\"," + echo " \"total_requests:\"," + echo " \"osl:LOW|MEDIUM|HIGH\"," + echo " \"iat:LOW|MEDIUM|HIGH\"" + echo " ]" + echo " }" + echo "" + echo "Useful Commands:" + echo 
" Interactive shell: docker exec -it $CONTAINER_NAME bash" + echo " View Dynamo logs: docker logs -f $CONTAINER_NAME" + echo " View ETCD logs: docker logs -f etcd-dynamo" + echo " View NATS logs: docker logs -f nats-dynamo" + echo " GPU usage: watch -n 2 nvidia-smi" + echo " Stop all: bash stop_dynamo.sh" + echo "" + echo "Prometheus Metrics:" + echo " curl http://localhost:$HTTP_PORT/metrics | grep dynamo" + echo " curl http://localhost:$METRICS_PORT/metrics | grep thompson" + echo "" + echo "=========================================================" + echo "Test Request (with nvext annotations):" + echo "=========================================================" + echo "" + echo "# Basic test (no hints)" + echo "curl http://localhost:$HTTP_PORT/v1/chat/completions \\" + echo " -H 'Content-Type: application/json' \\" + echo " -d '{" + echo " \"model\": \"$SERVED_MODEL_NAME\"," + echo " \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}]," + echo " \"max_tokens\": 50" + echo " }'" + echo "" + echo "# Test with nvext annotations (routing hints)" + echo "curl http://localhost:$HTTP_PORT/v1/chat/completions \\" + echo " -H 'Content-Type: application/json' \\" + echo " -d '{" + echo " \"model\": \"$SERVED_MODEL_NAME\"," + echo " \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}]," + echo " \"max_tokens\": 50," + echo " \"nvext\": {" + echo " \"annotations\": [" + echo " \"prefix_id:test-session-001\"," + echo " \"total_requests:5\"," + echo " \"osl:MEDIUM\"," + echo " \"iat:LOW\"" + echo " ]" + echo " }" + echo " }'" + echo "" + echo "# Streaming test with hints" + echo "curl http://localhost:$HTTP_PORT/v1/chat/completions \\" + echo " -H 'Content-Type: application/json' \\" + echo " -d '{" + echo " \"model\": \"$SERVED_MODEL_NAME\"," + echo " \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}]," + echo " \"max_tokens\": 50," + echo " \"stream\": true," + echo " \"nvext\": {" + echo " \"annotations\": [\"prefix_id:stream-test\", 
\"total_requests:1\"]" + echo " }" + echo " }'" + echo "" + echo "=========================================================" + echo "" + echo "Waiting for SGLang to initialize (this may take 5-10 minutes for a 70B model)..." + echo "Monitoring logs (Ctrl+C to exit, container continues)..." + echo "" + + # Wait for server to be ready + echo "Checking for API availability (timeout=15 minutes)..." + max_attempts=900 + attempt=0 + + while [ $attempt -lt $max_attempts ]; do + # Use || true to prevent curl connection failures from exiting due to set -e + # curl returns "000" for connection refused, so we just need to prevent the exit + health_response=$(curl -s --max-time 5 -o /dev/null -w "%{http_code}" http://localhost:$HTTP_PORT/health 2>/dev/null) || true + if [ "$health_response" = "200" ]; then + echo "✓ Dynamo API is ready! (health check passed)" + break + fi + attempt=$((attempt + 1)) + if [ $((attempt % 15)) -eq 0 ]; then + echo " ... still waiting ($attempt/$max_attempts) - health response: $health_response" + fi + sleep 1 + done + + if [ $attempt -ge $max_attempts ]; then + echo "" + echo "⚠ Timeout waiting for API. Check logs with: docker logs $CONTAINER_NAME" + echo "" + else + echo "" + echo "Quick test (polling every 15s for up to 5 minutes):" + echo "" + + quick_test_max_attempts=20 # 20 * 15s = 5 minutes + quick_test_attempt=0 + quick_test_success=false + + while [ $quick_test_attempt -lt $quick_test_max_attempts ]; do + quick_test_attempt=$((quick_test_attempt + 1)) + echo " Attempt $quick_test_attempt/$quick_test_max_attempts..." + + quick_test_response=$(curl -s --max-time 60 http://localhost:$HTTP_PORT/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "'$SERVED_MODEL_NAME'", + "messages": [{"role": "user", "content": "Say hello"}], + "max_tokens": 20 + }' 2>&1) || true + + # Check if response is empty/null + if [ -z "$quick_test_response" ]; then + echo " Empty response, retrying in 15s..." 
+ sleep 15 + continue + fi + + # Check if response contains an error + error_message=$(echo "$quick_test_response" | jq -r '.error.message // .error // empty' 2>/dev/null) + if [ -n "$error_message" ]; then + echo "" + echo "=========================================================" + echo "✗ Quick test failed with error:" + echo " $error_message" + echo "=========================================================" + echo "" + echo "Full response:" + echo "$quick_test_response" | jq . 2>/dev/null || echo "$quick_test_response" + echo "" + echo "Check logs with: docker logs $CONTAINER_NAME" + exit 1 + fi + + # Check if response has valid choices (success) + choices_content=$(echo "$quick_test_response" | jq -r '.choices[0].message.content // empty' 2>/dev/null) + if [ -n "$choices_content" ]; then + echo "" + echo "=========================================================" + echo "✓ Quick test successful!" + echo "=========================================================" + echo "" + echo "$quick_test_response" | jq '.choices[0].message.content, .usage' + echo "" + echo "=========================================================" + echo "Container is running. View logs with:" + echo " docker logs -f $CONTAINER_NAME" + echo "=========================================================" + quick_test_success=true + break + fi + + # Response exists but no choices - might still be loading + echo " Response received but no valid choices, retrying in 15s..." + echo " Response: $(echo "$quick_test_response" | head -c 200)..." + sleep 15 + done + + if [ "$quick_test_success" = false ]; then + echo "" + echo "=========================================================" + echo "⚠ Quick test timed out after 5 minutes" + echo "=========================================================" + echo "" + echo "Container is running but may not be fully ready." + echo "Try manually: curl http://localhost:$HTTP_PORT/v1/chat/completions ..." 
+ echo "Check logs with: docker logs $CONTAINER_NAME" + fi + fi +else + echo "" + echo "=========================================================" + echo "✗ Container failed to start!" + echo "=========================================================" + echo "" + echo "Check logs with: docker logs $CONTAINER_NAME" + exit 1 +fi From c9c9dca4c574f4806c7d759ca0e1b939ccca573e Mon Sep 17 00:00:00 2001 From: Bryan Bednarski Date: Thu, 22 Jan 2026 01:46:44 +0000 Subject: [PATCH 02/13] processor with KVES draft Signed-off-by: Bryan Bednarski --- external/dynamo/optimized/processor.py | 524 +++++++++++++----- .../start_dynamo_optimized_thompson_hints.sh | 2 +- src/nat/llm/dynamo_llm.py | 131 ++++- 3 files changed, 501 insertions(+), 156 deletions(-) diff --git a/external/dynamo/optimized/processor.py b/external/dynamo/optimized/processor.py index 9a91759604..b4d4425f75 100644 --- a/external/dynamo/optimized/processor.py +++ b/external/dynamo/optimized/processor.py @@ -55,112 +55,246 @@ - Forwards to `dynamo.worker.generate` (not `dynamo.backend.generate`) - Receives PreprocessedRequest instead of ChatCompletionRequest - Extracts hints from nvext annotations (prefix_id:value format) -- Uses Prometheus metrics instead of CSV logging +- Uses Dynamo metrics API for Prometheus integration (auto-exposed at /metrics) - No tokenization (handled by frontend preprocessor) + +## Metrics + +All metrics are exposed via Dynamo's `/metrics` endpoint (requires DYN_SYSTEM_PORT). 
+Metrics use the `dynamo_component_` prefix and include standard Dynamo labels: +- `dynamo_namespace`, `dynamo_component`, `dynamo_endpoint` + +Custom metrics for Thompson Sampling routing: +- `requests_total` - Total requests processed +- `request_latency_seconds` - End-to-end request latency histogram +- `tokens_in_total` / `tokens_out_total` - Token throughput counters +- `routing_decisions_total` - Per-worker routing decision counter +- `router_errors_total` / `engine_errors_total` - Error counters +- `active_requests` - Current in-flight request gauge + +KV Cache Efficiency (KVE) metrics: +- `kve_prompt_tokens_total` - Total prompt tokens (efficiency denominator) +- `kve_cached_tokens_total` - Total cached tokens hit (efficiency numerator) +- `kve_device_blocks_total` - Cache hits from device (GPU) memory +- `kve_host_blocks_total` - Cache hits from host (CPU) memory +- `kve_disk_blocks_total` - Cache hits from disk + +## Grafana Integration + +Metrics are exposed at `/metrics` in Prometheus format. Enable with: + DYN_SYSTEM_PORT=8081 python processor.py --model-path ... --model-name ... 
+ +Full metric names include the `dynamo_component_` prefix: + dynamo_component_requests_total{dynamo_namespace="dynamo",dynamo_component="backend",dynamo_endpoint="generate"} + +Example PromQL queries for Grafana dashboards: + # KV Cache Efficiency (%) + rate(dynamo_component_kve_cached_tokens_total[5m]) / rate(dynamo_component_kve_prompt_tokens_total[5m]) * 100 + + # Request latency p99 + histogram_quantile(0.99, rate(dynamo_component_request_latency_seconds_bucket[5m])) + +## Data Source Requirements + +KVE metrics require the underlying engine to return cache efficiency data: +- `usage.prompt_tokens_details.cached_tokens` - Standard OpenAI field (should work with prefix caching enabled) +- `nvext.cache_hit_breakdown` - Engine-specific extension (NOT standard Dynamo NvExt) """ import argparse import asyncio import logging -import os import time import uuid from collections.abc import AsyncIterator from typing import Any import uvloop -from dynamo.runtime import DistributedRuntime -from dynamo.runtime import dynamo_worker -from dynamo.runtime.logging import configure_dynamo_logging from dynamo.llm import ModelInput, ModelType, register_llm +from dynamo.runtime import DistributedRuntime, dynamo_worker +from dynamo.runtime.logging import configure_dynamo_logging from pydantic import BaseModel -# Prometheus metrics - import lazily to ensure proper multiprocess setup -_prometheus_initialized = False -_metrics = {} - configure_dynamo_logging() logger = logging.getLogger(__name__) -def _init_prometheus_metrics(): - """Initialize Prometheus metrics lazily.""" - global _prometheus_initialized, _metrics - if _prometheus_initialized: - return _metrics +# ----------------------- request / response models ----------------------- # +class RouterRequest(BaseModel): + """Request to the Thompson Sampling router.""" + + tokens: list[int] + prefix_id: str = "" + reuse_budget: int = 0 # remaining *after this request* + expected_osl: str | None = "MEDIUM" + interarrival: str | None 
= "MEDIUM" + + +class RouterFeedbackRequest(BaseModel): + """Feedback to the router after request completion.""" + + decision_id: str + latency_ms: float + success: bool | None = True + tokens_in: int | None = None + tokens_out: int | None = None + finish_reason: str | None = None + + +# ----------------------- KV efficiency data ----------------------- # +class KVEfficiencyData: + """ + Container for KV cache efficiency data extracted from worker responses. + + This data is used to compute and publish KVE metrics asynchronously, + ensuring zero impact on routing throughput. + """ + + __slots__ = ("prompt_tokens", "cached_tokens", "device_blocks", "host_blocks", "disk_blocks") + + def __init__(self): + self.prompt_tokens: int = 0 + self.cached_tokens: int = 0 + self.device_blocks: int = 0 + self.host_blocks: int = 0 + self.disk_blocks: int = 0 + + def has_data(self) -> bool: + """Check if any KVE data was collected.""" + return self.prompt_tokens > 0 + + @classmethod + def from_response(cls, data: dict[str, Any]) -> "KVEfficiencyData": + """ + Extract KVE data from a worker response chunk. + + Expected fields in response (OpenAI-compatible): + - usage.prompt_tokens: Total prompt tokens + - usage.prompt_tokens_details.cached_tokens: Cached token count - try: - from prometheus_client import Counter, Histogram, Gauge, CollectorRegistry, REGISTRY + Optional engine-specific fields (may not be present): + - nvext.cache_hit_breakdown.{device,host,disk}_blocks: Per-tier hits - _metrics["requests_total"] = Counter( - "thompson_processor_requests_total", + Note: cache_hit_breakdown is NOT a standard Dynamo NvExt field. + It must be enabled/configured in the underlying engine (vLLM/SGLang). 
+ """ + kve = cls() + + # Extract from usage field (OpenAI-compatible, should always work) + usage = data.get("usage") + if isinstance(usage, dict): + kve.prompt_tokens = usage.get("prompt_tokens", 0) or 0 + prompt_details = usage.get("prompt_tokens_details") + if isinstance(prompt_details, dict): + kve.cached_tokens = prompt_details.get("cached_tokens", 0) or 0 + + # Extract cache breakdown from nvext (engine-specific, may not be present) + # This is NOT a standard Dynamo NvExt field - requires engine configuration + nvext = data.get("nvext") + if isinstance(nvext, dict): + breakdown = nvext.get("cache_hit_breakdown") + if isinstance(breakdown, dict): + kve.device_blocks = breakdown.get("device_blocks", 0) or 0 + kve.host_blocks = breakdown.get("host_blocks", 0) or 0 + kve.disk_blocks = breakdown.get("disk_blocks", 0) or 0 + + return kve + + +# ----------------------- metrics dataclass ----------------------- # +class ProcessorMetrics: + """ + Container for Thompson Sampling processor metrics. + + All metrics are created via Dynamo's metrics API, which: + - Automatically exposes them at /metrics in Prometheus format + - Adds standard labels (dynamo_namespace, dynamo_component, dynamo_endpoint) + - Integrates with Dynamo's Grafana dashboards + """ + + def __init__(self, endpoint): + """ + Initialize metrics using Dynamo's metrics API. + + Args: + endpoint: Dynamo endpoint object providing the metrics interface. 
+ """ + # Request throughput + self.requests_total = endpoint.metrics.create_intcounter( + "requests_total", "Total requests processed by the Thompson Sampling processor", - registry=REGISTRY, ) - _metrics["request_latency"] = Histogram( - "thompson_processor_request_latency_seconds", - "Request latency in seconds", + + # Latency histogram with buckets suited for LLM inference + # Buckets: 100ms, 250ms, 500ms, 1s, 2.5s, 5s, 10s, 30s, 60s, 120s + self.request_latency_seconds = endpoint.metrics.create_histogram( + "request_latency_seconds", + "End-to-end request latency in seconds", buckets=[0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, 120.0], - registry=REGISTRY, ) - _metrics["tokens_in"] = Counter( - "thompson_processor_tokens_in_total", + + # Token throughput + self.tokens_in_total = endpoint.metrics.create_intcounter( + "tokens_in_total", "Total input tokens processed", - registry=REGISTRY, ) - _metrics["tokens_out"] = Counter( - "thompson_processor_tokens_out_total", + self.tokens_out_total = endpoint.metrics.create_intcounter( + "tokens_out_total", "Total output tokens generated", - registry=REGISTRY, ) - _metrics["routing_decisions"] = Counter( - "thompson_processor_routing_decisions_total", + + # Routing decisions by worker (for analyzing load distribution) + self.routing_decisions_total = endpoint.metrics.create_intcountervec( + "routing_decisions_total", "Routing decisions by worker", ["worker_id"], - registry=REGISTRY, ) - _metrics["router_errors"] = Counter( # i.e errors when picking a worker - "thompson_processor_router_errors_total", - "Router communication errors", - registry=REGISTRY, + + # Error tracking + self.router_errors_total = endpoint.metrics.create_intcounter( + "router_errors_total", + "Router communication errors (failed to pick worker)", ) - _metrics["engine_errors"] = Counter( # i.e errors when streaming from the engine - "thompson_processor_engine_errors_total", - "Backend engine errors", - registry=REGISTRY, + 
self.engine_errors_total = endpoint.metrics.create_intcounter( + "engine_errors_total", + "Backend engine errors (failed during streaming)", ) - _metrics["active_requests"] = Gauge( - "thompson_processor_active_requests", + + # Active request gauge + self.active_requests = endpoint.metrics.create_intgauge( + "active_requests", "Currently active requests being processed", - registry=REGISTRY, ) - _prometheus_initialized = True - logger.info("Prometheus metrics initialized for processor") - except ImportError: - logger.warning("prometheus_client not available, metrics disabled") - _prometheus_initialized = True # Don't retry - - return _metrics + # ----------------------------------------------------------------- + # KV Cache Efficiency (KVE) metrics + # These track cache hit rates for analyzing routing effectiveness. + # Efficiency = kve_cached_tokens_total / kve_prompt_tokens_total + # ----------------------------------------------------------------- + self.kve_prompt_tokens_total = endpoint.metrics.create_intcounter( + "kve_prompt_tokens_total", + "Total prompt tokens processed (KV efficiency denominator)", + ) + self.kve_cached_tokens_total = endpoint.metrics.create_intcounter( + "kve_cached_tokens_total", + "Total cached tokens hit (KV efficiency numerator)", + ) -# ----------------------- request / response models ----------------------- # -class RouterRequest(BaseModel): - """Request to the Thompson Sampling router.""" - tokens: list[int] - prefix_id: str = "" - reuse_budget: int = 0 # remaining *after this request* - expected_osl: str | None = "MEDIUM" - interarrival: str | None = "MEDIUM" - + # Cache hit breakdown by memory tier (for analyzing cache hierarchy) + self.kve_device_blocks_total = endpoint.metrics.create_intcounter( + "kve_device_blocks_total", + "KV cache blocks hit from device (GPU) memory", + ) + self.kve_host_blocks_total = endpoint.metrics.create_intcounter( + "kve_host_blocks_total", + "KV cache blocks hit from host (CPU) memory", + ) + 
self.kve_disk_blocks_total = endpoint.metrics.create_intcounter( + "kve_disk_blocks_total", + "KV cache blocks hit from disk storage", + ) -class RouterFeedbackRequest(BaseModel): - """Feedback to the router after request completion.""" - decision_id: str - latency_ms: float - success: bool | None = True - tokens_in: int | None = None - tokens_out: int | None = None - finish_reason: str | None = None + logger.info("Processor metrics initialized via Dynamo metrics API") # -------------------------- processor handler -------------------------- # @@ -174,11 +308,22 @@ class ProcessorRequestHandler: def __init__( self, runtime: DistributedRuntime, + endpoint, enable_router: bool = True, ): + """ + Initialize the processor request handler. + + Args: + runtime: Dynamo distributed runtime for client connections. + endpoint: Dynamo endpoint for metrics registration. + enable_router: Whether to use Thompson Sampling router (default: True). + """ self.runtime = runtime + self.endpoint = endpoint self.enable_router = enable_router + # Client connections (initialized in initialize()) self.router_pick_client = None self.router_feedback_client = None self.engine_client = None @@ -187,38 +332,42 @@ def __init__( self._prefix_state: dict[str, dict[str, int]] = {} self._prefix_lock = asyncio.Lock() - # Prometheus metrics - self._metrics = {} + # Metrics (initialized in initialize()) + self._metrics: ProcessorMetrics | None = None async def initialize(self): - """Initialize processor by connecting to router and backend.""" - # Initialize Prometheus metrics - self._metrics = _init_prometheus_metrics() + """Initialize processor by setting up metrics and connecting to services.""" + # Initialize metrics using Dynamo's metrics API + self._metrics = ProcessorMetrics(self.endpoint) + # Connect to Thompson Sampling router if self.enable_router: - ns = self.runtime.namespace("dynamo").component("router") - self.router_pick_client = await ns.endpoint("find_worker").client() - 
self.router_feedback_client = await ns.endpoint("feedback").client() + router_component = self.runtime.namespace("dynamo").component("router") + self.router_pick_client = await router_component.endpoint("find_worker").client() + self.router_feedback_client = await router_component.endpoint("feedback").client() logger.info("Router clients created, waiting for instances...") await self.router_pick_client.wait_for_instances() logger.info("Router clients initialized successfully") - # Engine client - connects to actual workers at dynamo.worker.generate + # Connect to actual workers at dynamo.worker.generate # (We register as "backend" to intercept frontend requests, but actual SGLang # workers register as "worker" so we can forward to them after routing) - self.engine_client = await self.runtime.namespace("dynamo").component("worker").endpoint("generate").client() + worker_component = self.runtime.namespace("dynamo").component("worker") + self.engine_client = await worker_component.endpoint("generate").client() logger.info("Engine client created, waiting for worker instances...") await self.engine_client.wait_for_instances() logger.info("Processor initialized successfully (routing to dynamo.worker.generate)") # ---- annotation extraction ---- @staticmethod - def _extract_annotation(annotations: list[str], key: str, default: str | None = None) -> str | None: + def _extract_annotation( + annotations: list[str], key: str, default: str | None = None + ) -> str | None: """Extract value from annotations list (format: 'key:value').""" prefix = f"{key}:" for ann in annotations: if ann.startswith(prefix): - return ann[len(prefix):] + return ann[len(prefix) :] return default def _extract_hints(self, request: dict[str, Any]) -> tuple[str, int, str, str]: @@ -231,22 +380,25 @@ def _extract_hints(self, request: dict[str, Any]) -> tuple[str, int, str, str]: if not isinstance(annotations, list): annotations = [] - # Extract from annotations + # Extract prefix_id (generate one if not 
provided) prefix_id = self._extract_annotation(annotations, "prefix_id") if not prefix_id: prefix_id = f"auto-{uuid.uuid4().hex}" + # Extract total_requests count total_str = self._extract_annotation(annotations, "total_requests", "1") try: total_requests = max(1, int(total_str)) except (ValueError, TypeError): total_requests = 1 + # Extract expected output sequence length category osl = self._extract_annotation(annotations, "osl", "MEDIUM") osl = osl.upper() if osl else "MEDIUM" if osl not in ("LOW", "MEDIUM", "HIGH"): osl = "MEDIUM" + # Extract interarrival time category iat = self._extract_annotation(annotations, "iat", "MEDIUM") iat = iat.upper() if iat else "MEDIUM" if iat not in ("LOW", "MEDIUM", "HIGH"): @@ -256,21 +408,25 @@ def _extract_hints(self, request: dict[str, Any]) -> tuple[str, int, str, str]: async def _update_prefix_state(self, prefix_id: str, total_requests: int) -> int: """ - Updates prefix counters and returns remaining_after (reuse_budget). + Update prefix counters and return remaining_after (reuse_budget). + + This tracks how many requests remain for a given prefix, allowing the + router to make informed decisions about KV cache placement. 
""" async with self._prefix_lock: - s = self._prefix_state.get(prefix_id) - if s is None: - s = {"total": total_requests, "processed": 0} - self._prefix_state[prefix_id] = s + state = self._prefix_state.get(prefix_id) + if state is None: + state = {"total": total_requests, "processed": 0} + self._prefix_state[prefix_id] = state else: - s["total"] = max(s["total"], total_requests) + # Update total if a higher count is reported + state["total"] = max(state["total"], total_requests) - s["processed"] += 1 - remaining_after = max(s["total"] - s["processed"], 0) + state["processed"] += 1 + remaining_after = max(state["total"] - state["processed"], 0) + # Clean up completed prefixes immediately if remaining_after == 0: - # Drop state immediately when finished self._prefix_state.pop(prefix_id, None) return remaining_after @@ -283,7 +439,11 @@ async def _pick_worker( osl: str, iat: str, ) -> tuple[int | None, str | None]: - """Pick a worker via the router.""" + """ + Pick a worker via the Thompson Sampling router. + + Returns: (worker_id, decision_id) or (None, None) if routing fails. 
+ """ if not self.router_pick_client: return None, None @@ -294,37 +454,41 @@ async def _pick_worker( expected_osl=osl, interarrival=iat, ) + try: stream = await self.router_pick_client.generate(req.model_dump()) worker_id: int | None = None decision_id: str | None = None + async for chunk in stream: data = chunk.data() if "error" in data: logger.error("Router error: %s", data["error"]) - if self._metrics.get("router_errors"): - self._metrics["router_errors"].inc() + self._metrics.router_errors_total.inc() break + wid = data.get("worker_id", -1) if wid == -1: break + worker_id = int(wid) decision_id = data.get("decision_id") break - if worker_id is not None and self._metrics.get("routing_decisions"): - self._metrics["routing_decisions"].labels(worker_id=str(worker_id)).inc() - - if worker_id is None: - logger.warning("Router stream ended without worker_id; falling back to engine load balancing.") + # Record routing decision + if worker_id is not None: + self._metrics.routing_decisions_total.inc({"worker_id": str(worker_id)}) + else: + logger.warning( + "Router stream ended without worker_id; falling back to engine load balancing." + ) return worker_id, decision_id except Exception as e: logger.error("Failed to pick worker: %s", e) - if self._metrics.get("router_errors"): - self._metrics["router_errors"].inc() + self._metrics.router_errors_total.inc() return None, None async def _send_feedback_safely( @@ -336,11 +500,17 @@ async def _send_feedback_safely( tokens_out: int, finish_reason: str | None, ): - """Send feedback to router (fire-and-forget style).""" + """ + Send feedback to router (fire-and-forget style). + + This feedback is used by the Thompson Sampling algorithm to update + its model of worker performance. 
+ """ if not decision_id or not self.router_feedback_client: return + try: - fb = RouterFeedbackRequest( + feedback = RouterFeedbackRequest( decision_id=decision_id, latency_ms=float(latency_ms), success=bool(success), @@ -348,12 +518,56 @@ async def _send_feedback_safely( tokens_out=int(tokens_out), finish_reason=finish_reason or "", ) - stream = await self.router_feedback_client.generate(fb.model_dump()) + stream = await self.router_feedback_client.generate(feedback.model_dump()) async for _ in stream: pass except Exception: logger.exception("Failed to send router feedback") + def _update_kve_metrics_sync(self, kve: KVEfficiencyData) -> None: + """ + Update KV cache efficiency metrics (synchronous, called from background task). + + This is intentionally synchronous - counter increments are atomic and + extremely fast (microseconds). The async wrapper exists only to allow + fire-and-forget scheduling via create_task(). + """ + if not kve.has_data(): + return + + # Update counters - these are atomic operations + self._metrics.kve_prompt_tokens_total.inc_by(kve.prompt_tokens) + self._metrics.kve_cached_tokens_total.inc_by(kve.cached_tokens) + self._metrics.kve_device_blocks_total.inc_by(kve.device_blocks) + self._metrics.kve_host_blocks_total.inc_by(kve.host_blocks) + self._metrics.kve_disk_blocks_total.inc_by(kve.disk_blocks) + + # Log efficiency for debugging (only if we have meaningful data) + if kve.prompt_tokens > 0: + efficiency = kve.cached_tokens / kve.prompt_tokens * 100 + logger.debug( + "KVE update: prompt=%d cached=%d eff=%.1f%% (dev=%d host=%d disk=%d)", + kve.prompt_tokens, + kve.cached_tokens, + efficiency, + kve.device_blocks, + kve.host_blocks, + kve.disk_blocks, + ) + + async def _update_kve_metrics_async(self, kve: KVEfficiencyData) -> None: + """ + Async wrapper for KVE metric updates (fire-and-forget via create_task). 
+ + This allows the main streaming path to continue without waiting for + metric updates, ensuring zero impact on routing throughput. + """ + try: + self._update_kve_metrics_sync(kve) + except Exception: + # Never let metric updates crash the system + logger.exception("Failed to update KVE metrics") + async def _stream_from_engine( self, request: dict[str, Any], @@ -363,11 +577,17 @@ async def _stream_from_engine( ) -> AsyncIterator[dict[str, Any]]: """ Stream response from the backend engine. - Yields response chunks and sends feedback on completion. + + Yields response chunks and sends feedback to the router on completion. + Also updates Prometheus metrics for latency and token throughput. + + KV cache efficiency (KVE) metrics are updated asynchronously via + create_task() to ensure zero impact on routing throughput. """ t0 = time.perf_counter() tokens_out = 0 finish_reason: str | None = None + kve_data: KVEfficiencyData | None = None # Collected from response try: # Route to specific worker or use engine's load balancing @@ -379,13 +599,13 @@ async def _stream_from_engine( async for chunk in stream: data = chunk.data() + # Handle engine errors if "error" in data: latency_ms = (time.perf_counter() - t0) * 1000.0 await self._send_feedback_safely( decision_id, latency_ms, False, tokens_in, tokens_out, "error" ) - if self._metrics.get("engine_errors"): - self._metrics["engine_errors"].inc() + self._metrics.engine_errors_total.inc() yield {"error": data["error"]} return @@ -393,25 +613,36 @@ async def _stream_from_engine( if "token_ids" in data and isinstance(data["token_ids"], list): tokens_out += len(data["token_ids"]) + # Extract KVE data if present (typically in final chunk or usage chunk) + # We check for 'usage' field which contains cache efficiency info + if "usage" in data or "nvext" in data: + extracted = KVEfficiencyData.from_response(data) + if extracted.has_data(): + kve_data = extracted + # Pass through the chunk yield data + # Handle completion if 
"finish_reason" in data and data["finish_reason"] is not None: finish_reason = data["finish_reason"] - latency_ms = (time.perf_counter() - t0) * 1000.0 + latency_seconds = time.perf_counter() - t0 + latency_ms = latency_seconds * 1000.0 - # Send feedback + # Send feedback to router (this is already fire-and-forget) await self._send_feedback_safely( decision_id, latency_ms, True, tokens_in, tokens_out, finish_reason ) - # Update metrics - if self._metrics.get("request_latency"): - self._metrics["request_latency"].observe(latency_ms / 1000.0) - if self._metrics.get("tokens_in"): - self._metrics["tokens_in"].inc(tokens_in) - if self._metrics.get("tokens_out"): - self._metrics["tokens_out"].inc(tokens_out) + # Update core Prometheus metrics (fast atomic operations) + self._metrics.request_latency_seconds.observe(latency_seconds) + self._metrics.tokens_in_total.inc_by(tokens_in) + self._metrics.tokens_out_total.inc_by(tokens_out) + + # Fire-and-forget KVE metric update (async, non-blocking) + # This ensures KVE computation has ZERO impact on routing throughput + if kve_data is not None: + asyncio.create_task(self._update_kve_metrics_async(kve_data)) return @@ -420,13 +651,12 @@ async def _stream_from_engine( await self._send_feedback_safely( decision_id, latency_ms, False, tokens_in, tokens_out, "exception" ) - if self._metrics.get("engine_errors"): - self._metrics["engine_errors"].inc() + self._metrics.engine_errors_total.inc() logger.exception("Engine stream exception") yield {"error": str(e)} return - # ---- main generation ---- + # ---- main generation endpoint ---- async def generate(self, raw: dict[str, Any]): """ Processor endpoint: receives PreprocessedRequest from frontend. 
@@ -441,13 +671,11 @@ async def generate(self, raw: dict[str, Any]): } """ # Track active requests - if self._metrics.get("active_requests"): - self._metrics["active_requests"].inc() + self._metrics.active_requests.inc() try: # Increment request counter - if self._metrics.get("requests_total"): - self._metrics["requests_total"].inc() + self._metrics.requests_total.inc() # Extract routing hints from annotations prefix_id, total_requests, osl, iat = self._extract_hints(raw) @@ -460,63 +688,75 @@ async def generate(self, raw: dict[str, Any]): tokens_in = len(token_ids) logger.info( "Processing request: prefix=%s total=%d osl=%s iat=%s tokens=%d", - prefix_id, total_requests, osl, iat, tokens_in + prefix_id, + total_requests, + osl, + iat, + tokens_in, ) # Compute reuse_budget := remaining AFTER this request reuse_budget = await self._update_prefix_state(prefix_id, total_requests) - # Pick worker via router + # Pick worker via Thompson Sampling router worker_id, decision_id = await self._pick_worker( token_ids, prefix_id, reuse_budget, osl, iat ) logger.info( "Routing decision: worker=%s decision=%s reuse_budget=%d", - worker_id, decision_id, reuse_budget + worker_id, + decision_id, + reuse_budget, ) - # Stream from engine + # Stream response from engine async for resp in self._stream_from_engine(raw, worker_id, decision_id, tokens_in): yield resp finally: - if self._metrics.get("active_requests"): - self._metrics["active_requests"].dec() + self._metrics.active_requests.dec() # -------------------------- worker entry point -------------------------- # def parse_args(): - p = argparse.ArgumentParser(description="Optimized Thompson Sampling Processor") - p.add_argument( + """Parse command-line arguments for the processor.""" + parser = argparse.ArgumentParser(description="Optimized Thompson Sampling Processor") + parser.add_argument( "--enable-router", action="store_true", default=True, help="Enable Thompson Sampling router integration", ) - p.add_argument( + 
parser.add_argument( "--no-router", action="store_false", dest="enable_router", help="Disable router (use engine load balancing only)", ) - p.add_argument( + parser.add_argument( "--model-path", type=str, required=True, help="Path to the model directory (for loading tokenizer and model card)", ) - p.add_argument( + parser.add_argument( "--model-name", type=str, required=True, help="Served model name (must match frontend's --model-name)", ) - return p.parse_args() + return parser.parse_args() @dynamo_worker(static=False) # Dynamic mode for ETCD discovery by frontend async def worker(runtime: DistributedRuntime): + """ + Main worker entry point for the Thompson Sampling processor. + + This processor registers as a backend that the frontend can discover via ETCD, + then forwards requests to actual workers after applying Thompson Sampling routing. + """ args = parse_args() # DYNAMIC DISCOVERY MODE: @@ -534,12 +774,16 @@ async def worker(runtime: DistributedRuntime): component = runtime.namespace("dynamo").component("backend") await component.create_service() - # Create the endpoint FIRST (needed for register_llm) + # Create the endpoint FIRST (needed for register_llm and metrics) endpoint = component.endpoint("generate") # Register the model card with ETCD so the frontend can discover us # We accept preprocessed tokens (ModelInput.Tokens) and serve chat/completions - logger.info(f"Registering model card: model_name={args.model_name}, model_path={args.model_path}") + logger.info( + "Registering model card: model_name=%s, model_path=%s", + args.model_name, + args.model_path, + ) await register_llm( model_input=ModelInput.Tokens, # We accept tokenized input from frontend model_type=ModelType.Chat | ModelType.Completions, # Chat and completions endpoints @@ -549,10 +793,12 @@ async def worker(runtime: DistributedRuntime): ) logger.info("Model card registered successfully - frontend can now discover us via ETCD") - # Initialize the request handler - # Note: We use the same 
runtime for both serving AND client connections now, - # since we're fully dynamic. The runtime will discover workers dynamically. - handler = ProcessorRequestHandler(runtime, enable_router=args.enable_router) + # Initialize the request handler with the endpoint for metrics + handler = ProcessorRequestHandler( + runtime=runtime, + endpoint=endpoint, + enable_router=args.enable_router, + ) await handler.initialize() # Serve as "backend.generate" - frontend will route to us after ETCD discovery diff --git a/external/dynamo/start_dynamo_optimized_thompson_hints.sh b/external/dynamo/start_dynamo_optimized_thompson_hints.sh index 0d15a52818..d46bcadc9d 100755 --- a/external/dynamo/start_dynamo_optimized_thompson_hints.sh +++ b/external/dynamo/start_dynamo_optimized_thompson_hints.sh @@ -50,7 +50,7 @@ set -euo pipefail # Configuration Variables (can be overridden via environment variables) -CONTAINER_NAME="dynamo-sglang-optimized" +CONTAINER_NAME="dynamo-sglang" WORKER_GPUS="${DYNAMO_GPU_DEVICES:-0,1,2,3}" TP_SIZE="${DYNAMO_TP_SIZE:-4}" HTTP_PORT="${DYNAMO_HTTP_PORT:-8000}" diff --git a/src/nat/llm/dynamo_llm.py b/src/nat/llm/dynamo_llm.py index 79667e106b..b3b6381b7e 100644 --- a/src/nat/llm/dynamo_llm.py +++ b/src/nat/llm/dynamo_llm.py @@ -13,15 +13,29 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Dynamo LLM provider with automatic prefix header injection for KV cache optimization. +Dynamo LLM provider with automatic prefix injection for KV cache optimization. -This module provides a specialized OpenAI-compatible LLM that sends Dynamo prefix headers +This module provides a specialized OpenAI-compatible LLM that sends Dynamo prefix hints for optimal KV cache management and request routing. The prefix parameters are optimizable via the NAT optimizer. 
-The implementation uses httpx event hooks to inject headers at the HTTP transport level, +The implementation uses httpx event hooks to inject hints at the HTTP transport level, making it framework-agnostic (works with LangChain, LlamaIndex, etc.). +Transport Mechanisms +-------------------- + +This module supports two transport mechanisms for routing hints, used simultaneously +for maximum compatibility: + +1. **HTTP Headers** (``x-prefix-*``): For the generalized Thompson Sampling setup + that uses custom ``frontend.py`` which reads headers directly. + *DEPRECATED: Will be removed when start_dynamo_unified_thompson_hints.sh is retired.* + +2. **nvext.annotations** (in request body): For the optimized Thompson Sampling setup + that uses the default Dynamo frontend with custom ``processor.py`` which reads + annotations from the preprocessed request. *This is the preferred mechanism.* + Dynamo Prefix Parameters ------------------------- @@ -46,6 +60,7 @@ - Lower values allow more load balancing """ +import json import logging import uuid from collections.abc import Callable @@ -164,14 +179,20 @@ def scope(cls, prefix_id: str) -> Iterator[None]: class DynamoModelConfig(OpenAIModelConfig, name="dynamo"): """ - A Dynamo LLM provider with automatic prefix header injection for KV cache optimization. + A Dynamo LLM provider with automatic prefix hint injection for KV cache optimization. - This is a specialized OpenAI-compatible LLM that sends Dynamo prefix headers - for optimal KV cache management and request routing. Prefix headers are enabled + This is a specialized OpenAI-compatible LLM that sends Dynamo prefix hints + for optimal KV cache management and request routing. Prefix hints are enabled by default using the template "nat-dynamo-{uuid}". The prefix routing parameters (prefix_total_requests, prefix_osl, prefix_iat) are optimizable via the NAT optimizer. - To disable prefix headers, set prefix_template to null/None in your config. 
+ Hints are sent via both HTTP headers (``x-prefix-*``) and ``nvext.annotations`` + in the request body for compatibility with different Dynamo setups: + + - **Generalized Thompson Sampling** (custom frontend.py): Reads HTTP headers + - **Optimized Thompson Sampling** (default frontend + processor.py): Reads nvext.annotations + + To disable prefix hints, set prefix_template to null/None in your config. """ # ========================================================================= @@ -257,16 +278,25 @@ def _create_dynamo_request_hook( iat: str, ) -> Callable[["httpx.Request"], Coroutine[Any, Any, None]]: """ - Create an httpx event hook that injects Dynamo prefix headers into requests. + Create an httpx event hook that injects Dynamo prefix hints into requests. This hook is called before each HTTP request is sent, allowing us to inject - headers dynamically. The prefix ID is generated ONCE when the hook is created, + hints dynamically. The prefix ID is generated ONCE when the hook is created, ensuring all requests from the same client share the same prefix ID. This enables Dynamo's KV cache optimization across multi-turn conversations. The context variable can override this for scenarios where you need different prefix IDs (e.g., per-question in batch evaluation). + Hints are injected via TWO transport mechanisms for maximum compatibility: + + 1. **HTTP Headers** (``x-prefix-*``): For the generalized Thompson Sampling setup + that uses custom ``frontend.py`` which reads headers directly. + + 2. **nvext.annotations** (in request body): For the optimized Thompson Sampling + setup that uses the default Dynamo frontend with custom ``processor.py`` + which reads annotations from the preprocessed request. 
+ Args: prefix_template: Template string with {uuid} placeholder total_requests: Expected number of requests for this prefix @@ -287,7 +317,7 @@ def _create_dynamo_request_hook( logger.debug("Created Dynamo request hook with default prefix ID: %s", default_prefix_id) async def on_request(request): - """Inject Dynamo prefix headers before each request.""" + """Inject Dynamo prefix hints into request headers AND body.""" # Check context variable first (allows per-question override in batch evaluation) context_prefix_id = DynamoPrefixContext.get() @@ -299,13 +329,76 @@ async def on_request(request): prefix_id = default_prefix_id logger.debug("Using default prefix ID: %s", prefix_id) - # Inject Dynamo headers + # ===================================================================== + # Transport 1: HTTP Headers (for generalized Thompson Sampling setup) + # The custom frontend.py reads these headers directly. + # + # DEPRECATION NOTE: This transport mechanism exists solely for backwards + # compatibility with start_dynamo_unified_thompson_hints.sh which uses + # custom frontend.py/processor.py that read x-prefix-* headers. + # Once that setup is deprecated in favor of the optimized setup + # (start_dynamo_optimized_thompson_hints.sh), this header injection + # can be removed entirely - only nvext.annotations will be needed. + # + # AI PROMPT TO REMOVE HTTP HEADERS (use when generalized setup is deprecated): + # "Remove the HTTP header injection (x-prefix-*) from dynamo_llm.py. + # Keep only the nvext.annotations transport mechanism. Update docstrings + # to remove references to HTTP headers and the generalized Thompson + # Sampling setup. The start_dynamo_unified_thompson_hints.sh script + # and its custom frontend.py/processor.py are now deprecated." 
+ # ===================================================================== request.headers["x-prefix-id"] = prefix_id request.headers["x-prefix-total-requests"] = str(total_requests) request.headers["x-prefix-osl"] = osl.upper() request.headers["x-prefix-iat"] = iat.upper() - logger.debug("Injected Dynamo headers: prefix_id=%s, total_requests=%d, osl=%s, iat=%s", + # ===================================================================== + # Transport 2: nvext.annotations (for optimized Thompson Sampling setup) + # The default Dynamo frontend passes these through to processor.py + # which extracts them from the PreprocessedRequest.annotations field. + # ===================================================================== + if request.method == "POST" and request.content: + try: + body = json.loads(request.content.decode("utf-8")) + if isinstance(body, dict): + # Build annotations list in "key:value" format + annotations = [ + f"prefix_id:{prefix_id}", + f"total_requests:{total_requests}", + f"osl:{osl.upper()}", + f"iat:{iat.upper()}", + ] + + # Add/merge nvext.annotations + if "nvext" not in body: + body["nvext"] = {} + if not isinstance(body["nvext"], dict): + body["nvext"] = {} + + # Preserve any existing annotations and add ours + existing = body["nvext"].get("annotations", []) + if not isinstance(existing, list): + existing = [] + + # Our annotations take precedence (placed first) + body["nvext"]["annotations"] = annotations + [ + a for a in existing + if not any(a.startswith(f"{key}:") for key in ["prefix_id", "total_requests", "osl", "iat"]) + ] + + # Re-encode the body + new_content = json.dumps(body).encode("utf-8") + # Update the request content (httpx allows this via _content) + request._content = new_content + request.headers["content-length"] = str(len(new_content)) + + logger.debug("Injected nvext.annotations: %s", body["nvext"]["annotations"]) + + except (json.JSONDecodeError, UnicodeDecodeError) as e: + # Not JSON or encoding issue - skip body 
injection, headers still work + logger.debug("Could not inject nvext.annotations (body not JSON): %s", e) + + logger.debug("Injected Dynamo hints: prefix_id=%s, total_requests=%d, osl=%s, iat=%s", prefix_id, total_requests, osl.upper(), @@ -322,10 +415,16 @@ def create_httpx_client_with_dynamo_hooks( timeout: float = 600.0, ) -> "httpx.AsyncClient": """ - Create an httpx.AsyncClient with Dynamo prefix header injection. + Create an httpx.AsyncClient with Dynamo prefix hint injection. + + This client can be passed to the OpenAI SDK to inject hints at the HTTP level, + making it framework-agnostic. Hints are injected via both HTTP headers and + nvext.annotations in the request body for maximum compatibility with different + Dynamo setups: - This client can be passed to the OpenAI SDK to inject headers at the HTTP level, - making it framework-agnostic. + - **Generalized setup** (custom frontend.py): Reads ``x-prefix-*`` HTTP headers + - **Optimized setup** (default frontend + custom processor.py): Reads + ``nvext.annotations`` from the request body Args: prefix_template: Template string with {uuid} placeholder @@ -335,7 +434,7 @@ def create_httpx_client_with_dynamo_hooks( timeout: HTTP request timeout in seconds Returns: - An httpx.AsyncClient configured with Dynamo header injection. + An httpx.AsyncClient configured with Dynamo hint injection. 
""" import httpx From 3f696078533c3b5ee5fb79e6e73e450b856fd498 Mon Sep 17 00:00:00 2001 From: bbednarski9 Date: Thu, 22 Jan 2026 07:11:21 +0000 Subject: [PATCH 03/13] add monitoring setup and metrics collection Signed-off-by: bbednarski9 --- external/dynamo/.env.example | 33 +- external/dynamo/collect_metrics.sh | 63 ++ external/dynamo/monitoring/README.md | 488 ++++++++++++ external/dynamo/monitoring/docker-compose.yml | 54 ++ .../provisioning/dashboards/dashboards.yml | 18 + .../dashboards/json/dynamo-overview.json | 709 ++++++++++++++++++ .../provisioning/datasources/datasources.yml | 15 + external/dynamo/monitoring/prometheus.yml | 49 ++ external/dynamo/optimized/processor.py | 34 +- .../start_dynamo_optimized_thompson_hints.sh | 55 +- 10 files changed, 1482 insertions(+), 36 deletions(-) create mode 100755 external/dynamo/collect_metrics.sh create mode 100644 external/dynamo/monitoring/README.md create mode 100644 external/dynamo/monitoring/docker-compose.yml create mode 100644 external/dynamo/monitoring/grafana/provisioning/dashboards/dashboards.yml create mode 100644 external/dynamo/monitoring/grafana/provisioning/dashboards/json/dynamo-overview.json create mode 100644 external/dynamo/monitoring/grafana/provisioning/datasources/datasources.yml create mode 100644 external/dynamo/monitoring/prometheus.yml diff --git a/external/dynamo/.env.example b/external/dynamo/.env.example index 3510684153..ad42332fb4 100644 --- a/external/dynamo/.env.example +++ b/external/dynamo/.env.example @@ -43,8 +43,8 @@ DYNAMO_GPU_DEVICES="0,1,2,3" # ============================================================================= # HTTP port for Dynamo frontend API -# Default: 8099 -# DYNAMO_HTTP_PORT="8099" +# Default: 8000 +# DYNAMO_HTTP_PORT="8000" # ETCD client port for metadata and discovery # Default: 2379 @@ -89,8 +89,33 @@ DYNAMO_GPU_DEVICES="0,1,2,3" # Path to CSV file for router decision logging # Default: router_metrics.csv -# ROUTER_METRICS_CSV = "router_metrics.csv" +# 
ROUTER_METRICS_CSV="router_metrics.csv" # timeout period for dynamo worker initialization # Default: 300 -# DYNAMO_WORKER_INIT_TIMEOUT_S = 300 +# DYNAMO_WORKER_INIT_TIMEOUT_S=300 + +# ============================================================================= +# OPTIONAL VARIABLES - Metrics Configuration +# ============================================================================= + +# Each component exposes Prometheus metrics on its own port to avoid conflicts. +# This allows collecting metrics from Worker, Router, and Processor separately. + +# Worker metrics port (KV cache stats, NATS metrics, internal stats) +# Default: 8081 +# DYNAMO_WORKER_METRICS_PORT="8081" + +# Router metrics port (Thompson Sampling routing metrics) +# Default: 8082 +# DYNAMO_ROUTER_METRICS_PORT="8082" + +# Processor metrics port (Thompson Sampling KVE metrics) +# Default: 8083 +# DYNAMO_PROCESSOR_METRICS_PORT="8083" + +# Metrics endpoints after startup: +# Frontend: http://localhost:${DYNAMO_HTTP_PORT}/metrics (latency, throughput) +# Worker: http://localhost:${DYNAMO_WORKER_METRICS_PORT}/metrics (KV cache) +# Router: http://localhost:${DYNAMO_ROUTER_METRICS_PORT}/metrics (routing) +# Processor: http://localhost:${DYNAMO_PROCESSOR_METRICS_PORT}/metrics (KVE) diff --git a/external/dynamo/collect_metrics.sh b/external/dynamo/collect_metrics.sh new file mode 100755 index 0000000000..59468523de --- /dev/null +++ b/external/dynamo/collect_metrics.sh @@ -0,0 +1,63 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# Dynamo Metrics Collector +# Saves metrics from Frontend and Worker to timestamped files + +OUTPUT_DIR="${1:-./metrics_logs}" +INTERVAL="${2:-30}" # Collection interval in seconds + +mkdir -p "$OUTPUT_DIR" + +echo "=== Dynamo Metrics Collector ===" +echo "Output directory: $OUTPUT_DIR" +echo "Collection interval: ${INTERVAL}s" +echo "Press Ctrl+C to stop" +echo "" + +collect_metrics() { + local timestamp=$(date +%Y%m%d_%H%M%S) + local frontend_file="$OUTPUT_DIR/frontend_${timestamp}.prom" + local worker_file="$OUTPUT_DIR/worker_${timestamp}.prom" + local combined_file="$OUTPUT_DIR/combined_${timestamp}.prom" + + echo "[$(date)] Collecting metrics..." + + # Collect frontend metrics + curl -s http://localhost:8000/metrics > "$frontend_file" 2>/dev/null + + # Collect worker metrics + curl -s http://localhost:8081/metrics > "$worker_file" 2>/dev/null + + # Create combined file with headers + { + echo "# Collected at: $(date -Iseconds)" + echo "# === FRONTEND METRICS ===" + cat "$frontend_file" + echo "" + echo "# === WORKER METRICS ===" + cat "$worker_file" + } > "$combined_file" + + # Also append to a rolling log (last 24 hours of key metrics) + { + echo "# Timestamp: $(date -Iseconds)" + grep -E '^dynamo_frontend_(requests_total|time_to_first_token|inter_token_latency|inflight)' "$frontend_file" 2>/dev/null + grep -E '^dynamo_component_(request_duration|inflight|kvstats)' "$worker_file" 2>/dev/null + echo "" + } >> "$OUTPUT_DIR/rolling_metrics.log" + + echo " Saved: $combined_file" +} + +# Collect once immediately +collect_metrics + +# Then collect at intervals +while true; do + sleep "$INTERVAL" + collect_metrics +done + + diff --git a/external/dynamo/monitoring/README.md b/external/dynamo/monitoring/README.md new file mode 100644 index 0000000000..1cde252864 --- /dev/null +++ b/external/dynamo/monitoring/README.md @@ -0,0 +1,488 @@ +# Dynamo Monitoring Stack + +This directory contains a Prometheus + Grafana monitoring 
setup for the Dynamo LLM inference stack with Thompson Sampling router. + +## Quick Start + +```bash +# Start the monitoring stack +cd monitoring +docker compose up -d + +# Access the dashboards +# Prometheus: http://localhost:9090 +# Grafana: http://localhost:3000 (admin/admin) +``` + +## Prerequisites + +- Docker and Docker Compose +- Dynamo stack running (see `../start_dynamo_optimized_thompson_hints.sh`) + +## Architecture + +``` +┌──────────────────────────────────────────────────────────────────────────────┐ +│ Dynamo Stack │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ +│ │ Frontend │ │ Worker │ │ Router │ │ Processor │ │ +│ │ :8000 │ │ :8081 │ │ :8082 │ │ :8083 │ │ +│ │ /metrics │ │ /metrics │ │ /metrics │ │ /metrics │ │ +│ │ (latency) │ │ (KV cache) │ │ (routing) │ │ (KVE) │ │ +│ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ │ +└─────────┼────────────────┼────────────────┼────────────────┼─────────────────┘ + │ │ │ │ + ▼ ▼ ▼ ▼ +┌──────────────────────────────────────────────────────────────────────────────┐ +│ Monitoring Stack │ +│ ┌────────────────────────────────────────────────────────────────────────┐ │ +│ │ Prometheus :9090 │ │ +│ │ Scrapes all 4 endpoints every 5 seconds: │ │ +│ │ - Frontend (:8000) - latency, throughput, tokens │ │ +│ │ - Worker (:8081) - KV cache, NATS, internal stats │ │ +│ │ - Router (:8082) - Thompson Sampling routing metrics │ │ +│ │ - Processor (:8083) - Thompson Sampling KVE metrics │ │ +│ └────────────────────────────────┬───────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌────────────────────────────────────────────────────────────────────────┐ │ +│ │ Grafana :3000 │ │ +│ │ Pre-configured dashboard: "Dynamo LLM Overview" │ │ +│ │ Login: admin / admin │ │ +│ └────────────────────────────────────────────────────────────────────────┘ │ +└──────────────────────────────────────────────────────────────────────────────┘ +``` + +## Metrics Endpoints + +| Component | Port | URL | 
Description | +|-----------|------|-----|-------------| +| Frontend | 8000 | `http://localhost:8000/metrics` | User-facing metrics (latency, throughput) | +| Worker | 8081 | `http://localhost:8081/metrics` | Internal metrics (KV cache, NATS stats) | +| Router | 8082 | `http://localhost:8082/metrics` | Thompson Sampling routing metrics | +| Processor | 8083 | `http://localhost:8083/metrics` | Thompson Sampling KVE metrics | + +## Key Metrics + +### Frontend Metrics (`:8000/metrics`) + +User-facing HTTP API metrics for latency, throughput, and token statistics. + +| Metric | Type | Description | +|--------|------|-------------| +| `dynamo_frontend_requests_total` | Counter | Total requests processed | +| `dynamo_frontend_inflight_requests` | Gauge | Currently processing requests | +| `dynamo_frontend_queued_requests` | Gauge | Requests waiting in queue | +| `dynamo_frontend_disconnected_clients` | Counter | Client disconnections | +| `dynamo_frontend_time_to_first_token_seconds` | Histogram | Time until first token generated | +| `dynamo_frontend_inter_token_latency_seconds` | Histogram | Time between consecutive tokens | +| `dynamo_frontend_request_duration_seconds` | Histogram | Total request duration | +| `dynamo_frontend_input_sequence_tokens` | Histogram | Input prompt length distribution | +| `dynamo_frontend_output_sequence_tokens` | Histogram | Output length distribution | +| `dynamo_frontend_output_tokens_total` | Counter | Total output tokens generated | +| `dynamo_frontend_model_context_length` | Gauge | Model context window size | +| `dynamo_frontend_model_kv_cache_block_size` | Gauge | KV cache block size | + +### Worker Metrics (`:8081/metrics`) + +SGLang backend worker metrics including KV cache, scheduling, and internal statistics. 
+ +#### Dynamo Component Metrics + +| Metric | Type | Description | +|--------|------|-------------| +| `dynamo_component_kvstats_gpu_cache_usage_percent` | Gauge | KV cache memory utilization (0-100) | +| `dynamo_component_kvstats_gpu_prefix_cache_hit_rate` | Gauge | Prefix cache hit rate (0-1) | +| `dynamo_component_kvstats_active_blocks` | Gauge | Active KV cache blocks | +| `dynamo_component_kvstats_total_blocks` | Gauge | Total KV cache blocks | +| `dynamo_component_request_duration_seconds` | Histogram | Backend request processing time | +| `dynamo_component_requests_total` | Counter | Total requests to worker | +| `dynamo_component_inflight_requests` | Gauge | Requests currently in worker | +| `dynamo_component_uptime_seconds` | Gauge | Worker uptime | + +#### SGLang Native Metrics + +| Metric | Type | Description | +|--------|------|-------------| +| `sglang:cache_hit_rate` | Gauge | Prefix cache hit rate | +| `sglang:token_usage` | Gauge | Current token usage | +| `sglang:num_running_reqs` | Gauge | Currently running requests | +| `sglang:num_queue_reqs` | Gauge | Queued requests | +| `sglang:num_used_tokens` | Gauge | Tokens currently in use | +| `sglang:gen_throughput` | Gauge | Generation throughput | +| `sglang:utilization` | Gauge | GPU utilization | +| `sglang:queue_time_seconds` | Histogram | Time spent in queue | +| `sglang:per_stage_req_latency_seconds` | Histogram | Per-stage request latency | +| `sglang:kv_transfer_latency_ms` | Gauge | KV transfer latency | +| `sglang:kv_transfer_speed_gb_s` | Gauge | KV transfer speed | +| `sglang:engine_startup_time` | Gauge | Engine startup duration | +| `sglang:engine_load_weights_time` | Gauge | Model weight loading time | + +### Router Metrics (`:8082/metrics`) + +Dynamo component metrics for the Thompson Sampling router (uses standard `dynamo_component_*` prefix). 
+ +| Metric | Type | Description | +|--------|------|-------------| +| `dynamo_component_requests_total` | Counter | Total routing requests (labeled by endpoint) | +| `dynamo_component_request_duration_seconds` | Histogram | Routing decision latency | +| `dynamo_component_request_bytes_total` | Counter | Request payload bytes | +| `dynamo_component_response_bytes_total` | Counter | Response payload bytes | +| `dynamo_component_inflight_requests` | Gauge | In-flight routing requests | +| `dynamo_component_uptime_seconds` | Gauge | Router uptime | +| `dynamo_component_nats_service_requests_total` | Gauge | NATS service requests | +| `dynamo_component_nats_service_processing_ms_avg` | Gauge | Average NATS processing time | +| `dynamo_component_nats_client_connection_state` | Gauge | NATS connection state (0=disconnected, 1=connected) | + +**Router Endpoints** (use `dynamo_endpoint` label to filter): +- `find_worker` - Worker selection requests +- `feedback` - Feedback from completed requests + +### Thompson Sampling Processor Metrics (`:8083/metrics`) + +Custom Thompson Sampling KV Efficiency (KVE) metrics from the processor component. 
+ +| Metric | Type | Description | +|--------|------|-------------| +| `dynamo_component_thompson_requests_total` | Counter | Total requests processed | +| `dynamo_component_thompson_request_latency_seconds` | Histogram | End-to-end request latency | +| `dynamo_component_thompson_tokens_in_total` | Counter | Total input tokens | +| `dynamo_component_thompson_tokens_out_total` | Counter | Total output tokens | +| `dynamo_component_thompson_routing_decisions_total` | Counter | Routing decisions made | +| `dynamo_component_thompson_active_requests` | Gauge | Currently processing requests | +| `dynamo_component_thompson_router_errors_total` | Counter | Router communication errors | +| `dynamo_component_thompson_engine_errors_total` | Counter | Engine/worker errors | +| `dynamo_component_thompson_kve_prompt_tokens_total` | Counter | Total prompt tokens (KVE denominator) | +| `dynamo_component_thompson_kve_cached_tokens_total` | Counter | Cached tokens hit (KVE numerator) | +| `dynamo_component_thompson_kve_device_blocks_total` | Counter | KV blocks from GPU memory | +| `dynamo_component_thompson_kve_host_blocks_total` | Counter | KV blocks from CPU memory | +| `dynamo_component_thompson_kve_disk_blocks_total` | Counter | KV blocks from disk | + +**KV Efficiency (KVE) Calculation:** +```promql +# KV Cache Efficiency percentage (using SGLang native metric - RECOMMENDED) +sglang:cache_hit_rate * 100 + +# Alternative: Using processor counters (may show 0 if SGLang doesn't return cached_tokens in API) +# rate(dynamo_component_thompson_kve_cached_tokens_total[5m]) / rate(dynamo_component_thompson_kve_prompt_tokens_total[5m]) * 100 +``` + +> **Why use SGLang's native metric?** SGLang computes cache hit rate internally but doesn't include +> `cached_tokens` in its API responses. The processor's `thompson_kve_*` counters will show 0 +> unless the underlying engine provides `usage.prompt_tokens_details.cached_tokens`. 
+ +## Grafana Dashboard + +The pre-configured dashboard "Dynamo LLM Overview" includes: + +1. **Inflight Requests** - Current load across all components +2. **Requests/min** - Throughput +3. **Time to First Token (P95)** - Latency to start generating +4. **KV Cache Usage %** - GPU memory utilization +5. **TTFT Over Time** - P50/P95/P99 latency trends +6. **ITL Over Time** - Inter-token latency trends +7. **Token Throughput** - Tokens generated per second +8. **KV Cache Stats** - Cache usage and hit rate over time + +### Thompson Sampling Panels (Included) + +The dashboard includes these Thompson Sampling and SGLang monitoring panels: + +- **KV Efficiency / Cache Hit Rate** - `sglang:cache_hit_rate * 100` (SGLang native metric) +- **Routing Decisions/sec** - `rate(dynamo_component_thompson_routing_decisions_total[5m])` +- **SGLang Queue Depth** - `sglang:num_queue_reqs` + `sglang:num_running_reqs` +- **Worker Utilization** - `sglang:utilization` + `sglang:token_usage` + +> **Note**: KV Efficiency uses SGLang's native `cache_hit_rate` metric rather than the processor's +> `thompson_kve_*` counters because SGLang doesn't include `cached_tokens` in its API responses. +> The native metric provides the same information: `(cached_tokens / prompt_tokens) * 100`. 
+ +## Files + +``` +monitoring/ +├── docker-compose.yml # Prometheus + Grafana services +├── prometheus.yml # Prometheus scrape configuration +├── README.md # This file +└── grafana/ + └── provisioning/ + ├── datasources/ + │ └── datasources.yml # Prometheus datasource config + └── dashboards/ + ├── dashboards.yml # Dashboard provider config + └── json/ + └── dynamo-overview.json # Pre-built dashboard +``` + +## Usage + +### Start Monitoring + +```bash +docker compose up -d +``` + +### Stop Monitoring + +```bash +docker compose down +``` + +### View Logs + +```bash +docker compose logs -f prometheus +docker compose logs -f grafana +``` + +### Reset Data (Start Fresh) + +```bash +docker compose down -v # Removes volumes +docker compose up -d +``` + +## Manual Metrics Queries + +### Prometheus UI (http://localhost:9090) + +Example queries: + +```promql +# Request rate (requests/second) +rate(dynamo_frontend_requests_total[1m]) + +# P95 Time to First Token +histogram_quantile(0.95, rate(dynamo_frontend_time_to_first_token_seconds_bucket[5m])) + +# P99 Inter-Token Latency +histogram_quantile(0.99, rate(dynamo_frontend_inter_token_latency_seconds_bucket[5m])) + +# Token throughput +rate(dynamo_frontend_output_tokens_total[1m]) + +# KV cache hit rate (Dynamo) +dynamo_component_kvstats_gpu_prefix_cache_hit_rate + +# KV cache hit rate (SGLang native) +sglang:cache_hit_rate + +# KV cache usage percentage +dynamo_component_kvstats_gpu_cache_usage_percent + +# Thompson routing decisions rate +rate(dynamo_component_thompson_routing_decisions_total[5m]) + +# KV Efficiency / Cache Hit Rate (using SGLang native - RECOMMENDED) +sglang:cache_hit_rate * 100 + +# Router endpoint request rate +rate(dynamo_component_requests_total{dynamo_component="router"}[5m]) + +# Worker queue depth +sglang:num_queue_reqs +``` + +### curl + +```bash +# All frontend metrics +curl -s http://localhost:8000/metrics + +# All worker metrics (Dynamo + SGLang) +curl -s http://localhost:8081/metrics + +# All 
router metrics
+curl -s http://localhost:8082/metrics
+
+# All processor metrics (Thompson Sampling)
+curl -s http://localhost:8083/metrics
+
+# Filter specific metrics
+curl -s http://localhost:8000/metrics | grep time_to_first_token
+curl -s http://localhost:8081/metrics | grep kvstats
+curl -s http://localhost:8081/metrics | grep "sglang:"
+curl -s http://localhost:8083/metrics | grep thompson
+```
+
+## Troubleshooting
+
+### Prometheus can't scrape targets
+
+Check if Dynamo is running:
+```bash
+curl http://localhost:8000/health
+curl http://localhost:8081/metrics
+```
+
+### Grafana shows "No data"
+
+1. Verify Prometheus is scraping: http://localhost:9090/targets
+2. Check if metrics exist: http://localhost:9090/graph (query a metric name)
+3. Ensure time range is correct in Grafana
+
+### Port conflicts
+
+If ports 9090 or 3000 are in use, modify `docker-compose.yml`:
+```yaml
+# Change Prometheus port
+command:
+  - '--web.listen-address=:9091'  # Different port
+
+# Change Grafana port
+environment:
+  - GF_SERVER_HTTP_PORT=3001  # Different port
+```
+
+## Alternative: File-Based Collection
+
+If you don't want to run Prometheus/Grafana, use the collection script:
+
+```bash
+cd external/dynamo  # relative to the repository root
+./collect_metrics.sh ./metrics_output 30  # Collect every 30s
+```
+
+This creates timestamped `.prom` files that can be analyzed later or imported into Prometheus.
+
+## Complete Metrics Reference
+
+### Summary by Component
+
+| Component | Port | Metric Count | Key Prefixes |
+|-----------|------|--------------|--------------|
+| Frontend | 8000 | ~22 | `dynamo_frontend_*` |
+| Worker | 8081 | ~50 | `dynamo_component_kvstats_*`, `sglang:*` |
+| Router | 8082 | ~20 | `dynamo_component_*` (labeled `router`) |
+| Processor | 8083 | ~35 | `dynamo_component_thompson_*` |
+
+### All Metric Names by Component
+
+Frontend (port 8000) - 22 metrics + +``` +dynamo_frontend_disconnected_clients +dynamo_frontend_inflight_requests +dynamo_frontend_input_sequence_tokens_{bucket,count,sum} +dynamo_frontend_inter_token_latency_seconds_{bucket,count,sum} +dynamo_frontend_model_context_length +dynamo_frontend_model_kv_cache_block_size +dynamo_frontend_model_migration_limit +dynamo_frontend_output_sequence_tokens_{bucket,count,sum} +dynamo_frontend_output_tokens_total +dynamo_frontend_queued_requests +dynamo_frontend_request_duration_seconds_{bucket,count,sum} +dynamo_frontend_requests_total +dynamo_frontend_time_to_first_token_seconds_{bucket,count,sum} +``` +
+ +
+Worker (port 8081) - 50 metrics + +**Dynamo Component Metrics:** +``` +dynamo_component_inflight_requests +dynamo_component_kvstats_active_blocks +dynamo_component_kvstats_gpu_cache_usage_percent +dynamo_component_kvstats_gpu_prefix_cache_hit_rate +dynamo_component_kvstats_total_blocks +dynamo_component_nats_client_* +dynamo_component_nats_service_* +dynamo_component_request_bytes_total +dynamo_component_request_duration_seconds_{bucket,count,sum} +dynamo_component_requests_total +dynamo_component_response_bytes_total +dynamo_component_uptime_seconds +``` + +**SGLang Native Metrics:** +``` +sglang:cache_hit_rate +sglang:engine_load_weights_time +sglang:engine_startup_time +sglang:gen_throughput +sglang:is_cuda_graph +sglang:kv_transfer_* +sglang:mamba_usage +sglang:num_decode_prealloc_queue_reqs +sglang:num_decode_transfer_queue_reqs +sglang:num_grammar_queue_reqs +sglang:num_paused_reqs +sglang:num_prefill_inflight_queue_reqs +sglang:num_prefill_prealloc_queue_reqs +sglang:num_queue_reqs +sglang:num_retracted_reqs +sglang:num_running_reqs +sglang:num_running_reqs_offline_batch +sglang:num_used_tokens +sglang:pending_prealloc_token_usage +sglang:per_stage_req_latency_seconds_{bucket,count,sum} +sglang:queue_time_seconds_{bucket,count,sum} +sglang:spec_accept_length +sglang:spec_accept_rate +sglang:swa_token_usage +sglang:token_usage +sglang:utilization +``` +
+ +
+Router (port 8082) - 20 metrics + +``` +dynamo_component_inflight_requests{dynamo_component="router"} +dynamo_component_nats_client_connection_state +dynamo_component_nats_client_current_connections +dynamo_component_nats_client_in_messages +dynamo_component_nats_client_in_total_bytes +dynamo_component_nats_client_out_messages +dynamo_component_nats_client_out_overhead_bytes +dynamo_component_nats_service_active_endpoints +dynamo_component_nats_service_active_services +dynamo_component_nats_service_errors_total +dynamo_component_nats_service_processing_ms_avg +dynamo_component_nats_service_processing_ms_total +dynamo_component_nats_service_requests_total +dynamo_component_request_bytes_total{dynamo_endpoint="find_worker|feedback"} +dynamo_component_request_duration_seconds_{bucket,count,sum} +dynamo_component_requests_total +dynamo_component_response_bytes_total +dynamo_component_uptime_seconds +``` +
+ +
+Processor (port 8083) - 35 metrics + +**Standard Dynamo Component Metrics:** +``` +dynamo_component_inflight_requests +dynamo_component_nats_client_* +dynamo_component_nats_service_* +dynamo_component_request_bytes_total +dynamo_component_request_duration_seconds_{bucket,count,sum} +dynamo_component_requests_total +dynamo_component_response_bytes_total +dynamo_component_uptime_seconds +``` + +**Thompson Sampling Custom Metrics:** +``` +dynamo_component_thompson_active_requests +dynamo_component_thompson_engine_errors_total +dynamo_component_thompson_kve_cached_tokens_total +dynamo_component_thompson_kve_device_blocks_total +dynamo_component_thompson_kve_disk_blocks_total +dynamo_component_thompson_kve_host_blocks_total +dynamo_component_thompson_kve_prompt_tokens_total +dynamo_component_thompson_request_latency_seconds_{bucket,count,sum} +dynamo_component_thompson_requests_total +dynamo_component_thompson_router_errors_total +dynamo_component_thompson_routing_decisions_total +dynamo_component_thompson_tokens_in_total +dynamo_component_thompson_tokens_out_total +``` +
+ diff --git a/external/dynamo/monitoring/docker-compose.yml b/external/dynamo/monitoring/docker-compose.yml new file mode 100644 index 0000000000..3f70780954 --- /dev/null +++ b/external/dynamo/monitoring/docker-compose.yml @@ -0,0 +1,54 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Prometheus + Grafana monitoring stack for Dynamo +# +# Usage: +# cd monitoring +# docker compose up -d +# +# Access: +# Prometheus: http://localhost:9090 +# Grafana: http://localhost:3000 (admin/admin) + +services: + prometheus: + image: prom/prometheus:v2.48.0 + container_name: dynamo-prometheus + network_mode: host + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus_data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.listen-address=:9090' + - '--storage.tsdb.retention.time=7d' + restart: unless-stopped + + grafana: + image: grafana/grafana:10.2.2 + container_name: dynamo-grafana + network_mode: host + environment: + - GF_SERVER_HTTP_PORT=3000 + # Disable authentication for local development + - GF_AUTH_ANONYMOUS_ENABLED=true + - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin + - GF_AUTH_DISABLE_LOGIN_FORM=true + # Keep these for if you re-enable login later + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=admin + - GF_USERS_ALLOW_SIGN_UP=false + volumes: + - grafana_data:/var/lib/grafana + - ./grafana/provisioning:/etc/grafana/provisioning:ro + restart: unless-stopped + depends_on: + - prometheus + +volumes: + prometheus_data: + grafana_data: + + diff --git a/external/dynamo/monitoring/grafana/provisioning/dashboards/dashboards.yml b/external/dynamo/monitoring/grafana/provisioning/dashboards/dashboards.yml new file mode 100644 index 0000000000..c348b57218 --- /dev/null +++ b/external/dynamo/monitoring/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,18 @@ +# 
SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: 1 + +providers: + - name: 'Dynamo Dashboards' + orgId: 1 + folder: 'Dynamo' + folderUid: 'dynamo' + type: file + disableDeletion: false + updateIntervalSeconds: 30 + allowUiUpdates: true + options: + path: /etc/grafana/provisioning/dashboards/json + + diff --git a/external/dynamo/monitoring/grafana/provisioning/dashboards/json/dynamo-overview.json b/external/dynamo/monitoring/grafana/provisioning/dashboards/json/dynamo-overview.json new file mode 100644 index 0000000000..9c48708d1a --- /dev/null +++ b/external/dynamo/monitoring/grafana/provisioning/dashboards/json/dynamo-overview.json @@ -0,0 +1,709 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 5}, + {"color": "red", "value": 10} + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": {"h": 4, "w": 6, "x": 0, "y": 0}, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.2", + "targets": [ + { + "expr": "dynamo_frontend_inflight_requests", + "legendFormat": "Inflight Requests", + "refId": "A" + } + ], + "title": "Inflight Requests", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + 
"thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null} + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": {"h": 4, "w": 6, "x": 6, "y": 0}, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.2", + "targets": [ + { + "expr": "increase(dynamo_frontend_requests_total[1m])", + "legendFormat": "Requests/min", + "refId": "A" + } + ], + "title": "Requests (1m)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 1}, + {"color": "red", "value": 5} + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": {"h": 4, "w": 6, "x": 12, "y": 0}, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.2", + "targets": [ + { + "expr": "histogram_quantile(0.95, rate(dynamo_frontend_time_to_first_token_seconds_bucket[5m]))", + "legendFormat": "P95 TTFT", + "refId": "A" + } + ], + "title": "Time to First Token (P95)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null} + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": {"h": 4, "w": 6, "x": 18, "y": 0}, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": 
"area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.2", + "targets": [ + { + "expr": "sglang:cache_hit_rate * 100", + "legendFormat": "Cache Hit Rate", + "refId": "A" + } + ], + "title": "Cache Hit Rate %", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": {"legend": false, "tooltip": false, "viz": false}, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": {"type": "linear"}, + "showPoints": "never", + "spanNulls": false, + "stacking": {"group": "A", "mode": "none"}, + "thresholdsStyle": {"mode": "off"} + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{"color": "green", "value": null}] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 4}, + "id": 5, + "options": { + "legend": {"calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "none"} + }, + "pluginVersion": "10.2.2", + "targets": [ + { + "expr": "histogram_quantile(0.5, rate(dynamo_frontend_time_to_first_token_seconds_bucket[1m]))", + "legendFormat": "P50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, rate(dynamo_frontend_time_to_first_token_seconds_bucket[1m]))", + "legendFormat": "P95", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, rate(dynamo_frontend_time_to_first_token_seconds_bucket[1m]))", + "legendFormat": "P99", + "refId": "C" + } + ], + "title": "Time to First Token (TTFT)", + "type": "timeseries" + }, + { 
+ "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": {"legend": false, "tooltip": false, "viz": false}, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": {"type": "linear"}, + "showPoints": "never", + "spanNulls": false, + "stacking": {"group": "A", "mode": "none"}, + "thresholdsStyle": {"mode": "off"} + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{"color": "green", "value": null}] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 4}, + "id": 6, + "options": { + "legend": {"calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "none"} + }, + "pluginVersion": "10.2.2", + "targets": [ + { + "expr": "histogram_quantile(0.5, rate(dynamo_frontend_inter_token_latency_seconds_bucket[1m]))", + "legendFormat": "P50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, rate(dynamo_frontend_inter_token_latency_seconds_bucket[1m]))", + "legendFormat": "P95", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, rate(dynamo_frontend_inter_token_latency_seconds_bucket[1m]))", + "legendFormat": "P99", + "refId": "C" + } + ], + "title": "Inter-Token Latency (ITL)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": 
{"legend": false, "tooltip": false, "viz": false}, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": {"type": "linear"}, + "showPoints": "never", + "spanNulls": false, + "stacking": {"group": "A", "mode": "none"}, + "thresholdsStyle": {"mode": "off"} + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{"color": "green", "value": null}] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 12}, + "id": 7, + "options": { + "legend": {"calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "none"} + }, + "pluginVersion": "10.2.2", + "targets": [ + { + "expr": "rate(dynamo_frontend_output_tokens_total[1m])", + "legendFormat": "Output Tokens/s", + "refId": "A" + } + ], + "title": "Token Throughput", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": {"legend": false, "tooltip": false, "viz": false}, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": {"type": "linear"}, + "showPoints": "never", + "spanNulls": false, + "stacking": {"group": "A", "mode": "none"}, + "thresholdsStyle": {"mode": "off"} + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{"color": "green", "value": null}] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 12}, + "id": 8, + "options": { + "legend": {"calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "none"} + }, + "pluginVersion": 
"10.2.2", + "targets": [ + { + "expr": "sglang:cache_hit_rate * 100", + "legendFormat": "Cache Hit Rate % (SGLang)", + "refId": "A" + }, + { + "expr": "sglang:token_usage * 100", + "legendFormat": "Token Usage % (SGLang)", + "refId": "B" + }, + { + "expr": "dynamo_component_kvstats_gpu_cache_usage_percent", + "legendFormat": "KV Cache Usage % (Dynamo)", + "refId": "C" + } + ], + "title": "KV Cache Stats", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": {"legend": false, "tooltip": false, "viz": false}, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": {"type": "linear"}, + "showPoints": "never", + "spanNulls": false, + "stacking": {"group": "A", "mode": "none"}, + "thresholdsStyle": {"mode": "off"} + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{"color": "green", "value": null}] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 20}, + "id": 9, + "options": { + "legend": {"calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "none"} + }, + "pluginVersion": "10.2.2", + "targets": [ + { + "expr": "sglang:cache_hit_rate * 100", + "legendFormat": "SGLang Cache Hit Rate %", + "refId": "A" + }, + { + "expr": "rate(dynamo_component_thompson_kve_cached_tokens_total[5m]) / rate(dynamo_component_thompson_kve_prompt_tokens_total[5m]) * 100", + "legendFormat": "Thompson KVE % (if available)", + "refId": "B" + } + ], + "title": "KV Efficiency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" 
+ }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": {"legend": false, "tooltip": false, "viz": false}, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": {"type": "linear"}, + "showPoints": "never", + "spanNulls": false, + "stacking": {"group": "A", "mode": "none"}, + "thresholdsStyle": {"mode": "off"} + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{"color": "green", "value": null}] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 20}, + "id": 10, + "options": { + "legend": {"calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "none"} + }, + "pluginVersion": "10.2.2", + "targets": [ + { + "expr": "rate(dynamo_component_thompson_routing_decisions_total[5m])", + "legendFormat": "Routing Decisions/sec", + "refId": "A" + } + ], + "title": "Routing Decisions (Thompson Sampling)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": {"legend": false, "tooltip": false, "viz": false}, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": {"type": "linear"}, + "showPoints": "never", + "spanNulls": false, + "stacking": {"group": "A", "mode": "none"}, + "thresholdsStyle": {"mode": "off"} + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + 
"steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 10}, + {"color": "red", "value": 50} + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 28}, + "id": 11, + "options": { + "legend": {"calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "none"} + }, + "pluginVersion": "10.2.2", + "targets": [ + { + "expr": "sglang:num_queue_reqs", + "legendFormat": "Queue Depth", + "refId": "A" + }, + { + "expr": "sglang:num_running_reqs", + "legendFormat": "Running Requests", + "refId": "B" + } + ], + "title": "SGLang Queue Depth", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": {"legend": false, "tooltip": false, "viz": false}, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": {"type": "linear"}, + "showPoints": "never", + "spanNulls": false, + "stacking": {"group": "A", "mode": "none"}, + "thresholdsStyle": {"mode": "off"} + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{"color": "green", "value": null}] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 28}, + "id": 12, + "options": { + "legend": {"calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "none"} + }, + "pluginVersion": "10.2.2", + "targets": [ + { + "expr": "sglang:utilization", + "legendFormat": "Worker Utilization", + "refId": "A" + }, + { + "expr": "sglang:token_usage", + "legendFormat": "Token Usage", + "refId": 
"B" + } + ], + "title": "Worker Utilization (SGLang)", + "type": "timeseries" + } + ], + "refresh": "5s", + "schemaVersion": 38, + "style": "dark", + "tags": ["dynamo", "llm", "inference"], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Dynamo LLM Overview", + "uid": "dynamo-overview", + "version": 1, + "weekStart": "" +} + + diff --git a/external/dynamo/monitoring/grafana/provisioning/datasources/datasources.yml b/external/dynamo/monitoring/grafana/provisioning/datasources/datasources.yml new file mode 100644 index 0000000000..c6a8cab8e3 --- /dev/null +++ b/external/dynamo/monitoring/grafana/provisioning/datasources/datasources.yml @@ -0,0 +1,15 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + uid: prometheus + access: proxy + url: http://localhost:9090 + isDefault: true + editable: true + + diff --git a/external/dynamo/monitoring/prometheus.yml b/external/dynamo/monitoring/prometheus.yml new file mode 100644 index 0000000000..91300240ee --- /dev/null +++ b/external/dynamo/monitoring/prometheus.yml @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# Prometheus configuration for Dynamo metrics collection +# +# Metrics Endpoints: +# - Frontend (8000): User-facing latency, throughput, tokens +# - Worker (8081): KV cache stats, NATS metrics, internal stats +# - Router (8082): Thompson Sampling routing metrics +# - Processor (8083): Thompson Sampling KVE metrics + +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + # Dynamo Frontend metrics (user-facing latency, throughput) + - job_name: 'dynamo-frontend' + static_configs: + - targets: ['localhost:8000'] + metrics_path: /metrics + scrape_interval: 5s + + # Dynamo Worker metrics (KV cache, internal stats) + - job_name: 'dynamo-worker' + static_configs: + - targets: ['localhost:8081'] + metrics_path: /metrics + scrape_interval: 5s + + # Thompson Sampling Router metrics + - job_name: 'dynamo-router' + static_configs: + - targets: ['localhost:8082'] + metrics_path: /metrics + scrape_interval: 5s + + # Thompson Sampling Processor metrics (KVE) + - job_name: 'dynamo-processor' + static_configs: + - targets: ['localhost:8083'] + metrics_path: /metrics + scrape_interval: 5s + + # Prometheus self-monitoring + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + diff --git a/external/dynamo/optimized/processor.py b/external/dynamo/optimized/processor.py index b4d4425f75..eaa891f2d7 100644 --- a/external/dynamo/optimized/processor.py +++ b/external/dynamo/optimized/processor.py @@ -219,50 +219,50 @@ def __init__(self, endpoint): Args: endpoint: Dynamo endpoint object providing the metrics interface. 
""" - # Request throughput + # Request throughput (prefixed with thompson_ to avoid conflicts with + # serve_endpoint's built-in work handler metrics) self.requests_total = endpoint.metrics.create_intcounter( - "requests_total", + "thompson_requests_total", "Total requests processed by the Thompson Sampling processor", ) - # Latency histogram with buckets suited for LLM inference - # Buckets: 100ms, 250ms, 500ms, 1s, 2.5s, 5s, 10s, 30s, 60s, 120s + # Latency histogram (uses default Prometheus buckets since Python binding + # doesn't expose custom bucket configuration in Dynamo 0.7.1) self.request_latency_seconds = endpoint.metrics.create_histogram( - "request_latency_seconds", + "thompson_request_latency_seconds", "End-to-end request latency in seconds", - buckets=[0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, 120.0], ) # Token throughput self.tokens_in_total = endpoint.metrics.create_intcounter( - "tokens_in_total", + "thompson_tokens_in_total", "Total input tokens processed", ) self.tokens_out_total = endpoint.metrics.create_intcounter( - "tokens_out_total", + "thompson_tokens_out_total", "Total output tokens generated", ) # Routing decisions by worker (for analyzing load distribution) self.routing_decisions_total = endpoint.metrics.create_intcountervec( - "routing_decisions_total", + "thompson_routing_decisions_total", "Routing decisions by worker", ["worker_id"], ) # Error tracking self.router_errors_total = endpoint.metrics.create_intcounter( - "router_errors_total", + "thompson_router_errors_total", "Router communication errors (failed to pick worker)", ) self.engine_errors_total = endpoint.metrics.create_intcounter( - "engine_errors_total", + "thompson_engine_errors_total", "Backend engine errors (failed during streaming)", ) # Active request gauge self.active_requests = endpoint.metrics.create_intgauge( - "active_requests", + "thompson_active_requests", "Currently active requests being processed", ) @@ -272,25 +272,25 @@ def __init__(self, endpoint): # 
Efficiency = kve_cached_tokens_total / kve_prompt_tokens_total # ----------------------------------------------------------------- self.kve_prompt_tokens_total = endpoint.metrics.create_intcounter( - "kve_prompt_tokens_total", + "thompson_kve_prompt_tokens_total", "Total prompt tokens processed (KV efficiency denominator)", ) self.kve_cached_tokens_total = endpoint.metrics.create_intcounter( - "kve_cached_tokens_total", + "thompson_kve_cached_tokens_total", "Total cached tokens hit (KV efficiency numerator)", ) # Cache hit breakdown by memory tier (for analyzing cache hierarchy) self.kve_device_blocks_total = endpoint.metrics.create_intcounter( - "kve_device_blocks_total", + "thompson_kve_device_blocks_total", "KV cache blocks hit from device (GPU) memory", ) self.kve_host_blocks_total = endpoint.metrics.create_intcounter( - "kve_host_blocks_total", + "thompson_kve_host_blocks_total", "KV cache blocks hit from host (CPU) memory", ) self.kve_disk_blocks_total = endpoint.metrics.create_intcounter( - "kve_disk_blocks_total", + "thompson_kve_disk_blocks_total", "KV cache blocks hit from disk storage", ) diff --git a/external/dynamo/start_dynamo_optimized_thompson_hints.sh b/external/dynamo/start_dynamo_optimized_thompson_hints.sh index d46bcadc9d..5acbe6dabc 100755 --- a/external/dynamo/start_dynamo_optimized_thompson_hints.sh +++ b/external/dynamo/start_dynamo_optimized_thompson_hints.sh @@ -54,7 +54,10 @@ CONTAINER_NAME="dynamo-sglang" WORKER_GPUS="${DYNAMO_GPU_DEVICES:-0,1,2,3}" TP_SIZE="${DYNAMO_TP_SIZE:-4}" HTTP_PORT="${DYNAMO_HTTP_PORT:-8000}" -METRICS_PORT="${DYNAMO_METRICS_PORT:-8081}" +# Metrics ports - each component gets its own port to avoid conflicts +WORKER_METRICS_PORT="${DYNAMO_WORKER_METRICS_PORT:-8081}" +ROUTER_METRICS_PORT="${DYNAMO_ROUTER_METRICS_PORT:-8082}" +PROCESSOR_METRICS_PORT="${DYNAMO_PROCESSOR_METRICS_PORT:-8083}" MODEL="/workspace/models/Llama-3.3-70B-Instruct" SERVED_MODEL_NAME="${DYNAMO_MODEL_NAME:-llama-3.3-70b}" 
IMAGE="nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.7.1" @@ -104,12 +107,15 @@ echo "=========================================================" echo "Model: Llama-3.3-70B-Instruct" echo "Container: $CONTAINER_NAME" echo "HTTP Port: $HTTP_PORT (default Dynamo frontend)" -echo "Metrics Port: $METRICS_PORT (Prometheus)" +echo "Metrics Ports:" +echo " - Worker: $WORKER_METRICS_PORT (KV cache, internal)" +echo " - Router: $ROUTER_METRICS_PORT (Thompson routing)" +echo " - Processor: $PROCESSOR_METRICS_PORT (KVE metrics)" echo "" echo "Architecture Differences (vs generalized):" echo " - Default Dynamo frontend (not custom frontend.py)" echo " - Hints via nvext.annotations (not HTTP headers)" -echo " - Prometheus metrics (not CSV files)" +echo " - Prometheus metrics on separate ports per component" echo "" echo "Components:" echo " - ETCD (metadata and discovery)" @@ -258,8 +264,10 @@ docker run -d \ -e RUST_BACKTRACE=1 \ -e PYTHONUNBUFFERED=1 \ -e DYN_HTTP_PORT=$HTTP_PORT \ - -e DYN_SYSTEM_PORT=$METRICS_PORT \ -e DYN_ROUTER_MODE=round-robin \ + -e WORKER_METRICS_PORT=$WORKER_METRICS_PORT \ + -e ROUTER_METRICS_PORT=$ROUTER_METRICS_PORT \ + -e PROCESSOR_METRICS_PORT=$PROCESSOR_METRICS_PORT \ $IMAGE \ bash -c " set -e @@ -336,7 +344,9 @@ docker run -d \ # CRITICAL: Register worker at dynamo.worker.generate (not default backend.generate) # This allows the custom Processor to register as backend.generate and intercept # frontend requests, then forward to these workers after Thompson Sampling routing. + # DYN_SYSTEM_PORT sets the Prometheus metrics port for this component CUDA_VISIBLE_DEVICES=0,1,2,3 \ + DYN_SYSTEM_PORT=\$WORKER_METRICS_PORT \ python3 -m dynamo.sglang \ --model-path $MODEL \ --served-model-name $SERVED_MODEL_NAME \ @@ -350,6 +360,7 @@ docker run -d \ WORKER_PID=\$! 
echo \"Unified Worker PID: \$WORKER_PID\" echo \"Registered at: dynamo.worker.generate\" + echo \"Metrics at: http://localhost:\$WORKER_METRICS_PORT/metrics\" echo \"\" # Wait for unified worker to initialize @@ -361,10 +372,13 @@ docker run -d \ echo '=========================================================' # Router uses config.yaml for all parameters # Override specific values with --affinity-base, --temp-base, --lints-v, or --override + # DYN_SYSTEM_PORT sets the Prometheus metrics port for this component + DYN_SYSTEM_PORT=\$ROUTER_METRICS_PORT \ python3 /workspace/custom_dynamo/router.py \ --config /workspace/custom_dynamo/config.yaml & ROUTER_PID=\$! echo \"Router PID: \$ROUTER_PID\" + echo \"Metrics at: http://localhost:\$ROUTER_METRICS_PORT/metrics\" sleep 15 echo \"\" @@ -376,6 +390,8 @@ docker run -d \ # Processor registers as dynamo.backend.generate AND calls register_llm() # to advertise a model card in ETCD. The frontend's ModelWatcher discovers # this and routes requests to us. 
+ # DYN_SYSTEM_PORT sets the Prometheus metrics port for this component + DYN_SYSTEM_PORT=\$PROCESSOR_METRICS_PORT \ python3 /workspace/custom_dynamo/processor.py \ --enable-router \ --model-path $MODEL \ @@ -385,6 +401,7 @@ docker run -d \ echo \"Model: $SERVED_MODEL_NAME (from $MODEL)\" echo \"Registered at: dynamo.backend.generate (discovered via ETCD model card)\" echo \"Forwards to: dynamo.worker.generate (actual SGLang workers)\" + echo \"Metrics at: http://localhost:\$PROCESSOR_METRICS_PORT/metrics\" sleep 15 echo \"\" @@ -417,12 +434,16 @@ docker run -d \ echo \"Dynamo Components (This Container):\" echo \" Unified Worker: PID \$WORKER_PID (GPUs $WORKER_GPUS, TP=$TP_SIZE)\" echo \" → Registered at: dynamo.worker.generate\" + echo \" → Metrics: http://localhost:\$WORKER_METRICS_PORT/metrics\" echo \" Router: PID \$ROUTER_PID (Thompson Sampling + Prometheus)\" echo \" → Registered at: dynamo.router.{find_worker,feedback}\" + echo \" → Metrics: http://localhost:\$ROUTER_METRICS_PORT/metrics\" echo \" Processor: PID \$PROCESSOR_PID (NVExt annotation extraction)\" echo \" → Registered at: dynamo.backend.generate (model card in ETCD)\" + echo \" → Metrics: http://localhost:\$PROCESSOR_METRICS_PORT/metrics\" echo \" Frontend: PID \$FRONTEND_PID (Default Dynamo HTTP API on port $HTTP_PORT)\" echo \" → Discovery: ETCD ModelWatcher (finds processor's model card)\" + echo \" → Metrics: http://localhost:$HTTP_PORT/metrics\" echo '' echo 'Request Flow (Dynamic Discovery Mode):' echo ' Client → Default Frontend API (port $HTTP_PORT)' @@ -440,11 +461,11 @@ docker run -d \ echo ' ↓' echo ' Response + Feedback to Router' echo '' - echo 'Prometheus Metrics:' - echo ' - Frontend: http://localhost:$HTTP_PORT/metrics' - echo ' - Backend: http://localhost:$METRICS_PORT/metrics' - echo ' - Router: thompson_router_* metrics' - echo ' - Processor: thompson_processor_* metrics' + echo 'Prometheus Metrics Endpoints:' + echo ' - Frontend: http://localhost:$HTTP_PORT/metrics (latency, 
throughput)' + echo ' - Worker: http://localhost:\$WORKER_METRICS_PORT/metrics (KV cache, internal)' + echo ' - Router: http://localhost:\$ROUTER_METRICS_PORT/metrics (thompson_router_*)' + echo ' - Processor: http://localhost:\$PROCESSOR_METRICS_PORT/metrics (thompson_* KVE)' echo '=========================================================' # Monitor all processes @@ -513,9 +534,11 @@ if docker ps --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then echo " ETCD: etcd-dynamo container, localhost:2379" echo " NATS: nats-dynamo container, localhost:4222" echo "" - echo "Prometheus Metrics:" - echo " Frontend: http://localhost:$HTTP_PORT/metrics" - echo " Backend/Router/Processor: http://localhost:$METRICS_PORT/metrics" + echo "Prometheus Metrics Endpoints:" + echo " Frontend: http://localhost:$HTTP_PORT/metrics (latency, throughput)" + echo " Worker: http://localhost:$WORKER_METRICS_PORT/metrics (KV cache)" + echo " Router: http://localhost:$ROUTER_METRICS_PORT/metrics (routing)" + echo " Processor: http://localhost:$PROCESSOR_METRICS_PORT/metrics (KVE)" echo "" echo "API Endpoint: http://localhost:$HTTP_PORT/v1/chat/completions" echo "Health Check: http://localhost:$HTTP_PORT/health" @@ -538,9 +561,11 @@ if docker ps --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then echo " GPU usage: watch -n 2 nvidia-smi" echo " Stop all: bash stop_dynamo.sh" echo "" - echo "Prometheus Metrics:" - echo " curl http://localhost:$HTTP_PORT/metrics | grep dynamo" - echo " curl http://localhost:$METRICS_PORT/metrics | grep thompson" + echo "Query Metrics:" + echo " curl http://localhost:$HTTP_PORT/metrics | grep dynamo_frontend" + echo " curl http://localhost:$WORKER_METRICS_PORT/metrics | grep kvstats" + echo " curl http://localhost:$ROUTER_METRICS_PORT/metrics | grep thompson_router" + echo " curl http://localhost:$PROCESSOR_METRICS_PORT/metrics | grep thompson_kve" echo "" echo "=========================================================" echo "Test Request (with nvext 
annotations):" From 2d5623d3ca2e5abc27cbdf0417d279678a63d7f6 Mon Sep 17 00:00:00 2001 From: bbednarski9 Date: Thu, 22 Jan 2026 23:05:27 +0000 Subject: [PATCH 04/13] dynamo profiler integration! Signed-off-by: bbednarski9 --- .../configs/profile_rethinking_full_test.yml | 31 + external/dynamo/collect_metrics.sh | 1 + .../provisioning/dashboards/dashboards.yml | 1 + src/nat/data_models/profiler.py | 164 ++++ .../inference_optimization/dynamo_metrics.py | 899 ++++++++++++++++++ src/nat/profiler/profile_runner.py | 18 +- 6 files changed, 1113 insertions(+), 1 deletion(-) create mode 100644 src/nat/profiler/inference_optimization/dynamo_metrics.py diff --git a/examples/dynamo_integration/react_benchmark_agent/src/react_benchmark_agent/configs/profile_rethinking_full_test.yml b/examples/dynamo_integration/react_benchmark_agent/src/react_benchmark_agent/configs/profile_rethinking_full_test.yml index 47b7e243fb..0bf1b7128a 100644 --- a/examples/dynamo_integration/react_benchmark_agent/src/react_benchmark_agent/configs/profile_rethinking_full_test.yml +++ b/examples/dynamo_integration/react_benchmark_agent/src/react_benchmark_agent/configs/profile_rethinking_full_test.yml @@ -26,12 +26,24 @@ # - Bottleneck analysis with nested call stacks # - Concurrency spike detection # - Prompt caching prefix identification +# - Dynamo inference stack metrics (KVE, TTFT, ITL from Prometheus) +# +# Core Dynamo Optimization Metrics: +# 1. KV Efficiency (KVE) = cached_tokens / prompt_tokens +# - Measures fraction of computational work saved via KV cache +# - Higher is better (0.8 = 80% of tokens from cache) +# 2. Time to First Token (TTFT) - User-perceived initial latency +# 3. 
Inter-Token Latency (ITL) - Streaming smoothness # # Combines self-evaluating agent with detailed profiler for: # - Understanding performance characteristics of rethinking # - Identifying optimization opportunities # - Generating data for throughput analysis scripts # +# Prerequisites: +# - Prometheus running at localhost:9090 (for Dynamo metrics) +# - Dynamo stack with monitoring enabled +# # Usage: # nat profile --config_file configs/profile_rethinking_full_test.yml # @@ -223,6 +235,25 @@ eval: concurrency_spike_analysis: enable: true spike_threshold: 24 # Alert when concurrent functions >= 24 + # Dynamo inference stack metrics - collect from Prometheus + # Core optimization metrics: KV Efficiency, TTFT, ITL + dynamo_metrics: + enable: true + prometheus_url: http://localhost:9090 + # Time range for rate calculations - should match experiment duration + # Minimum: 15s (Prometheus scrapes every 5s, need ≥3 points for reliable rates) + # Options: 15s (very short), 30s, 1m, 2m, 5m + # Shorter = more accurate for brief experiments, but noisier + # Longer = smoother averages, but may include pre-experiment data + query_range: 30s + # Core metrics (primary optimization targets) + collect_kv_cache: true # KVE = cached_tokens/prompt_tokens (work saved) + collect_ttft: true # Time to First Token (P50/P95/P99) + collect_itl: true # Inter-Token Latency (P50/P95/P99) + # Supplementary metrics (context and diagnostics) + collect_inflight_requests: true + collect_throughput: true + collect_token_throughput: true evaluators: tool_selection_quality: diff --git a/external/dynamo/collect_metrics.sh b/external/dynamo/collect_metrics.sh index 59468523de..16dc51bcff 100755 --- a/external/dynamo/collect_metrics.sh +++ b/external/dynamo/collect_metrics.sh @@ -61,3 +61,4 @@ while true; do done + diff --git a/external/dynamo/monitoring/grafana/provisioning/dashboards/dashboards.yml b/external/dynamo/monitoring/grafana/provisioning/dashboards/dashboards.yml index c348b57218..7c91586621 
100644 --- a/external/dynamo/monitoring/grafana/provisioning/dashboards/dashboards.yml +++ b/external/dynamo/monitoring/grafana/provisioning/dashboards/dashboards.yml @@ -16,3 +16,4 @@ providers: path: /etc/grafana/provisioning/dashboards/json + diff --git a/src/nat/data_models/profiler.py b/src/nat/data_models/profiler.py index cb0ed64544..134c4710e7 100644 --- a/src/nat/data_models/profiler.py +++ b/src/nat/data_models/profiler.py @@ -14,6 +14,7 @@ # limitations under the License. from pydantic import BaseModel +from pydantic import Field class PromptCachingConfig(BaseModel): @@ -40,6 +41,168 @@ class PrefixSpanConfig(BaseModel): chain_with_common_prefixes: bool = False +class DynamoMetricsConfig(BaseModel): + """ + Configuration for collecting Dynamo inference stack metrics. + + Core Optimization Metrics + ------------------------- + The profiler focuses on three core metrics for Dynamo LLM optimization: + + 1. **KV Efficiency (KVE)** (``collect_kv_cache``): + Token-agnostic measure of computational work saved via KV cache. + Formula: ``KVE = cached_tokens / prompt_tokens`` + A KVE of 0.8 means 80% of prompt tokens were served from cache. + Affected by prefix routing hints (prefix_id, prefix_osl, prefix_iat). + + 2. **Time to First Token - TTFT** (``collect_ttft``): + Latency from request to first token. Lower = faster initial response. + Affected by queue depth, worker selection, KV cache hits. + + 3. **Inter-Token Latency - ITL** (``collect_itl``): + Time between tokens during streaming. Lower = smoother streaming. + Affected by batch scheduling, GPU utilization. 
+ + To collect only core metrics for optimization, use:: + + config = DynamoMetricsConfig.core_metrics_only() + + Dynamo Endpoints + ---------------- + - Frontend (:8000/metrics): Latency, throughput, token stats + - Worker (:8081/metrics): KV cache, SGLang stats + - Router (:8082/metrics): Thompson Sampling routing + - Processor (:8083/metrics): Thompson Sampling KVE + + Adding New Metrics + ------------------ + To add metrics from any Dynamo endpoint: + + 1. **Identify the metric** from the endpoint:: + + curl localhost:8081/metrics | grep kv + + 2. **Add to DynamoMetricsResult** in ``src/nat/profiler/inference_optimization/dynamo_metrics.py``: + - Add a new field to the Pydantic model + - Add the Prometheus query in ``METRIC_QUERIES`` + + 3. **Example - Adding a new metric**:: + + # In dynamo_metrics.py METRIC_QUERIES dict: + "my_new_metric": "rate(dynamo_component_my_metric_total[5m])" + + # In DynamoMetricsResult model: + my_new_metric: float | None = Field(default=None, description="My new metric") + + Metric Reference by Endpoint + ---------------------------- + - **Frontend (:8000)**: ``dynamo_frontend_*`` (requests, latency, tokens) + - **Worker (:8081)**: ``dynamo_component_kvstats_*``, ``sglang:*`` (KV cache, SGLang) + - **Router (:8082)**: ``dynamo_component_*`` with ``dynamo_component="router"`` label + - **Processor (:8083)**: ``dynamo_component_thompson_*`` (Thompson Sampling) + + See ``external/dynamo/monitoring/README.md`` for the complete metrics reference. 
+ """ + + enable: bool = Field(default=False, description="Enable Dynamo metrics collection") + + prometheus_url: str = Field( + default="http://localhost:9090", + description="Prometheus server URL for querying Dynamo metrics", + ) + + # ========================================================================= + # CORE OPTIMIZATION METRICS (Primary targets) + # ========================================================================= + collect_kv_cache: bool = Field( + default=True, + description="[CORE] Collect KV Efficiency (KVE = cached_tokens/prompt_tokens) - " + "primary metric for prefix caching optimization. Measures fraction of work saved.", + ) + collect_ttft: bool = Field( + default=True, + description="[CORE] Collect Time to First Token (P50/P95/P99) - primary latency metric", + ) + collect_itl: bool = Field( + default=True, + description="[CORE] Collect Inter-Token Latency (P50/P95/P99) - primary streaming metric", + ) + + # ========================================================================= + # SUPPLEMENTARY METRICS (Context and diagnostics) + # ========================================================================= + collect_inflight_requests: bool = Field( + default=True, + description="Collect current inflight requests across components", + ) + collect_throughput: bool = Field( + default=True, + description="Collect requests per minute throughput", + ) + collect_token_throughput: bool = Field( + default=True, + description="Collect token generation throughput (tokens/sec)", + ) + + # Query time range for rate calculations + query_range: str = Field( + default="30s", + description="Time range for rate calculations in Prometheus queries. " + "Minimum: '15s' (Prometheus scrapes every 5s, need ≥3 points for reliable rates). " + "Options: '15s', '30s' (default), '1m', '2m', '5m'. " + "Should roughly match experiment duration. Too short = noisy. 
Too long = stale data included.", + ) + + @classmethod + def core_metrics_only( + cls, + prometheus_url: str = "http://localhost:9090", + query_range: str = "30s", + ) -> "DynamoMetricsConfig": + """ + Create a config that collects only the three core optimization metrics. + + This is optimized for tight optimization loops where you only need: + - KV Cache Efficiency + - TTFT (Time to First Token) + - ITL (Inter-Token Latency) + + Args: + prometheus_url: Prometheus server URL + query_range: Time range for rate calculations + + Returns: + DynamoMetricsConfig with only core metrics enabled + + Usage:: + + config = DynamoMetricsConfig.core_metrics_only() + # Equivalent to: + # DynamoMetricsConfig( + # enable=True, + # collect_kv_cache=True, + # collect_ttft=True, + # collect_itl=True, + # collect_inflight_requests=False, + # collect_throughput=False, + # collect_token_throughput=False, + # ) + """ + return cls( + enable=True, + prometheus_url=prometheus_url, + query_range=query_range, + # Core metrics + collect_kv_cache=True, + collect_ttft=True, + collect_itl=True, + # Disable supplementary metrics + collect_inflight_requests=False, + collect_throughput=False, + collect_token_throughput=False, + ) + + class ProfilerConfig(BaseModel): base_metrics: bool = False @@ -52,3 +215,4 @@ class ProfilerConfig(BaseModel): bottleneck_analysis: BottleneckConfig = BottleneckConfig() concurrency_spike_analysis: ConcurrencySpikeConfig = ConcurrencySpikeConfig() prefix_span_analysis: PrefixSpanConfig = PrefixSpanConfig() + dynamo_metrics: DynamoMetricsConfig = DynamoMetricsConfig() diff --git a/src/nat/profiler/inference_optimization/dynamo_metrics.py b/src/nat/profiler/inference_optimization/dynamo_metrics.py new file mode 100644 index 0000000000..8d0e76af66 --- /dev/null +++ b/src/nat/profiler/inference_optimization/dynamo_metrics.py @@ -0,0 +1,899 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Dynamo Metrics Collector for NAT Profiler. + +This module collects performance metrics from the Dynamo inference stack via Prometheus. +Metrics are collected from four Dynamo components: + +- **Frontend** (:8000): User-facing latency, throughput, token statistics +- **Worker** (:8081): KV cache utilization, SGLang backend metrics +- **Router** (:8082): Thompson Sampling routing decisions +- **Processor** (:8083): Thompson Sampling KVE (KV Efficiency) metrics + +Core Optimization Metrics +------------------------- + +The profiler focuses on three core metrics for Dynamo LLM optimization: + +1. **KV Efficiency (KVE)** - Token-agnostic measure of computational savings: + + - Formula: ``KVE = cached_tokens / prompt_tokens`` + - Measures the fraction of total work saved via KV cache reuse + - A KVE of 0.8 means 80% of prompt tokens were served from cache + - Source: Thompson Sampling processor (``dynamo_component_thompson_kve_*``) + - Fallback: SGLang native ``cache_hit_rate`` if KVE counters unavailable + - Affected by: prefix_id routing, prefix hints (osl, iat), request patterns + +2. **Time to First Token (TTFT)** (``ttft_p50``, ``ttft_p95``, ``ttft_p99``): + + - Latency from request arrival to first token generation + - Critical for user-perceived responsiveness + - Affected by queue depth, worker selection, KV cache hits + +3. 
**Inter-Token Latency (ITL)** (``itl_p50``, ``itl_p95``, ``itl_p99``): + + - Time between consecutive token generations during streaming + - Affects smoothness of streaming responses + - Influenced by batch scheduling and GPU utilization + +Adding New Metrics +------------------ + +To add a new metric from any Dynamo endpoint: + +1. **Find the metric name** by curling the endpoint:: + + curl -s http://localhost:8081/metrics | grep -i kv + curl -s http://localhost:8000/metrics | grep -i token + +2. **Add the Prometheus query** to ``METRIC_QUERIES``:: + + METRIC_QUERIES = { + ... + "my_new_metric": "rate(dynamo_component_my_metric_total[{range}])", + } + + Note: Use ``{range}`` placeholder for time range (replaced with config value). + +3. **Add the field** to ``DynamoMetricsResult``:: + + class DynamoMetricsResult(BaseModel): + ... + my_new_metric: float | None = Field( + default=None, + description="Description of my new metric" + ) + +4. **Update the collector** if needed (optional - for complex metrics): + + If the metric requires special handling (e.g., combining multiple queries), + add custom logic in ``DynamoMetricsCollector.collect()``. 
+ +Metric Reference by Endpoint +---------------------------- + +**Frontend (:8000/metrics)**:: + + dynamo_frontend_requests_total # Counter: Total requests + dynamo_frontend_inflight_requests # Gauge: Current inflight + dynamo_frontend_time_to_first_token_seconds_bucket # Histogram: TTFT + dynamo_frontend_inter_token_latency_seconds_bucket # Histogram: ITL + dynamo_frontend_output_tokens_total # Counter: Total output tokens + +**Worker (:8081/metrics)**:: + + dynamo_component_kvstats_gpu_cache_usage_percent # Gauge: KV cache % + dynamo_component_kvstats_gpu_prefix_cache_hit_rate # Gauge: Cache hit rate + sglang:cache_hit_rate # Gauge: SGLang native cache hit + sglang:gen_throughput # Gauge: Generation throughput + sglang:num_running_reqs # Gauge: Running requests + sglang:num_queue_reqs # Gauge: Queued requests + +**Router (:8082/metrics)**:: + + dynamo_component_requests_total{dynamo_endpoint="find_worker"} + dynamo_component_request_duration_seconds_bucket + +**Processor (:8083/metrics)**:: + + dynamo_component_thompson_requests_total + dynamo_component_thompson_kve_cached_tokens_total + dynamo_component_thompson_kve_prompt_tokens_total + dynamo_component_thompson_routing_decisions_total + +See ``external/dynamo/monitoring/README.md`` for the complete metrics reference. +""" + +import logging +from typing import Any + +import httpx +from pydantic import BaseModel +from pydantic import Field + +from nat.data_models.profiler import DynamoMetricsConfig + +logger = logging.getLogger(__name__) + +# ============================================================================= +# PROMETHEUS QUERY DEFINITIONS +# ============================================================================= + +# Metric queries using Prometheus query language (PromQL). +# Use {range} placeholder for time range substitution. +# +# To add a new metric: +# 1. Add the query here with a descriptive key +# 2. Add corresponding field to DynamoMetricsResult +# 3. 
The collector will automatically fetch and populate it +METRIC_QUERIES: dict[str, str] = { + # ------------------------------------------------------------------------- + # Inflight Requests (Gauge metrics - no rate needed) + # ------------------------------------------------------------------------- + "inflight_requests_frontend": "dynamo_frontend_inflight_requests", + "inflight_requests_worker": "dynamo_component_inflight_requests", + "queued_requests": "dynamo_frontend_queued_requests", + + # ------------------------------------------------------------------------- + # Throughput (Rate metrics) + # ------------------------------------------------------------------------- + "requests_per_minute": "rate(dynamo_frontend_requests_total[{range}]) * 60", + "token_throughput": "rate(dynamo_frontend_output_tokens_total[{range}])", + + # ------------------------------------------------------------------------- + # Time to First Token (TTFT) - Histogram quantiles + # ------------------------------------------------------------------------- + "ttft_p50": "histogram_quantile(0.50, rate(dynamo_frontend_time_to_first_token_seconds_bucket[{range}]))", + "ttft_p95": "histogram_quantile(0.95, rate(dynamo_frontend_time_to_first_token_seconds_bucket[{range}]))", + "ttft_p99": "histogram_quantile(0.99, rate(dynamo_frontend_time_to_first_token_seconds_bucket[{range}]))", + + # ------------------------------------------------------------------------- + # Inter-Token Latency (ITL) - Histogram quantiles + # ------------------------------------------------------------------------- + "itl_p50": "histogram_quantile(0.50, rate(dynamo_frontend_inter_token_latency_seconds_bucket[{range}]))", + "itl_p95": "histogram_quantile(0.95, rate(dynamo_frontend_inter_token_latency_seconds_bucket[{range}]))", + "itl_p99": "histogram_quantile(0.99, rate(dynamo_frontend_inter_token_latency_seconds_bucket[{range}]))", + + # ------------------------------------------------------------------------- + # KV 
Cache Metrics (Gauge metrics) + # ------------------------------------------------------------------------- + "kv_cache_usage_percent": "dynamo_component_kvstats_gpu_cache_usage_percent", + "kv_cache_hit_rate_sglang": "sglang:cache_hit_rate", # SGLang native (fallback) + "kv_cache_hit_rate_dynamo": "dynamo_component_kvstats_gpu_prefix_cache_hit_rate", + + # ------------------------------------------------------------------------- + # KV Efficiency (KVE) - TRUE efficiency metric from Thompson Sampling processor + # KVE = cached_tokens / prompt_tokens (fraction of work saved) + # This is token-agnostic and measures actual computational savings + # ------------------------------------------------------------------------- + "kve_cached_tokens_rate": "rate(dynamo_component_thompson_kve_cached_tokens_total[{range}])", + "kve_prompt_tokens_rate": "rate(dynamo_component_thompson_kve_prompt_tokens_total[{range}])", + # Block-level KVE metrics for deeper analysis + "kve_device_blocks_rate": "rate(dynamo_component_thompson_kve_device_blocks_total[{range}])", + "kve_host_blocks_rate": "rate(dynamo_component_thompson_kve_host_blocks_total[{range}])", + "kve_disk_blocks_rate": "rate(dynamo_component_thompson_kve_disk_blocks_total[{range}])", + + # ------------------------------------------------------------------------- + # SGLang Worker Metrics (Gauge metrics) + # ------------------------------------------------------------------------- + "sglang_running_requests": "sglang:num_running_reqs", + "sglang_queue_depth": "sglang:num_queue_reqs", + "sglang_gen_throughput": "sglang:gen_throughput", + "sglang_utilization": "sglang:utilization", + + # ------------------------------------------------------------------------- + # Thompson Sampling Metrics (Rate metrics) + # ------------------------------------------------------------------------- + "thompson_routing_decisions_rate": "rate(dynamo_component_thompson_routing_decisions_total[{range}])", + "thompson_requests_rate": 
"rate(dynamo_component_thompson_requests_total[{range}])", +} + + +# ============================================================================= +# DATA MODELS +# ============================================================================= + + +class DynamoCoreMetrics(BaseModel): + """ + Core optimization metrics for Dynamo LLM inference. + + These three metrics are the primary targets for optimization: + + 1. **KV Efficiency (KVE)**: Fraction of computational work saved via KV cache. + - Formula: ``cached_tokens / prompt_tokens`` + - Target: Maximize (closer to 1.0 = more work saved) + - Affected by: prefix_id routing, prefix hints (osl, iat), request patterns + - Token-agnostic measure of actual computational savings + + 2. **TTFT (Time to First Token)**: User-perceived initial latency. + - Target: Minimize (lower is better) + - Affected by: queue depth, worker selection, KV cache hits + + 3. **ITL (Inter-Token Latency)**: Streaming smoothness. + - Target: Minimize (lower is better) + - Affected by: batch scheduling, GPU utilization, memory bandwidth + + Usage:: + + result = await collector.collect() + core = result.get_core_metrics() + + print(f"KV Efficiency: {core.kv_efficiency:.2%}") + print(f"TTFT P95: {core.ttft_p95_seconds:.3f}s") + print(f"ITL P95: {core.itl_p95_seconds:.3f}s") + + # Check if all core metrics are available + if core.is_complete(): + print("All core metrics collected successfully") + """ + + # ------------------------------------------------------------------------- + # KV Efficiency - KVE (CORE METRIC #1) + # Goal: MAXIMIZE - Higher efficiency = more computational work saved + # Formula: cached_tokens / prompt_tokens + # ------------------------------------------------------------------------- + kv_efficiency: float | None = Field( + default=None, + description="KV Efficiency (0-1): fraction of prompt tokens served from cache. " + "Computed as cached_tokens / prompt_tokens from Thompson Sampling processor. 
" + "Higher values indicate more computational work saved via KV cache reuse. " + "This is the PRIMARY metric affected by prefix routing hints (prefix_id, prefix_osl, prefix_iat).", + ) + kv_efficiency_fallback: float | None = Field( + default=None, + description="Fallback KV efficiency from SGLang native cache_hit_rate. " + "Used when Thompson Sampling KVE counters are unavailable.", + ) + + # ------------------------------------------------------------------------- + # Time to First Token - TTFT (CORE METRIC #2) + # Goal: MINIMIZE - Lower latency = faster initial response + # ------------------------------------------------------------------------- + ttft_p50_seconds: float | None = Field( + default=None, + description="Time to First Token - 50th percentile (median) in seconds", + ) + ttft_p95_seconds: float | None = Field( + default=None, + description="Time to First Token - 95th percentile in seconds. " + "Primary latency target for optimization.", + ) + ttft_p99_seconds: float | None = Field( + default=None, + description="Time to First Token - 99th percentile in seconds (tail latency)", + ) + + # ------------------------------------------------------------------------- + # Inter-Token Latency - ITL (CORE METRIC #3) + # Goal: MINIMIZE - Lower latency = smoother streaming + # ------------------------------------------------------------------------- + itl_p50_seconds: float | None = Field( + default=None, + description="Inter-Token Latency - 50th percentile (median) in seconds", + ) + itl_p95_seconds: float | None = Field( + default=None, + description="Inter-Token Latency - 95th percentile in seconds. " + "Primary streaming smoothness target.", + ) + itl_p99_seconds: float | None = Field( + default=None, + description="Inter-Token Latency - 99th percentile in seconds (tail latency)", + ) + + def get_effective_kv_efficiency(self) -> float | None: + """ + Get the best available KV efficiency value. 
+ + Prefers the true KVE (cached_tokens/prompt_tokens) from Thompson Sampling, + falls back to SGLang native cache_hit_rate if KVE is unavailable. + + Returns: + KV efficiency (0-1) or None if neither source is available + """ + if self.kv_efficiency is not None: + return self.kv_efficiency + return self.kv_efficiency_fallback + + def is_complete(self) -> bool: + """ + Check if all core optimization metrics were successfully collected. + + Returns: + True if KV efficiency (or fallback), ttft_p95, and itl_p95 are all available + """ + return all([ + self.get_effective_kv_efficiency() is not None, + self.ttft_p95_seconds is not None, + self.itl_p95_seconds is not None, + ]) + + def get_optimization_summary(self) -> dict[str, float | None]: + """ + Get a summary dict of the primary optimization targets. + + Returns: + Dict with the three key metrics for optimization loops + """ + return { + "kv_efficiency": self.get_effective_kv_efficiency(), + "kv_efficiency_source": "kve" if self.kv_efficiency is not None else "sglang_fallback", + "ttft_p95_seconds": self.ttft_p95_seconds, + "itl_p95_seconds": self.itl_p95_seconds, + } + + def to_optimization_score( + self, + kv_weight: float = 0.4, + ttft_weight: float = 0.4, + itl_weight: float = 0.2, + ttft_target_seconds: float = 0.5, + itl_target_seconds: float = 0.05, + ) -> float | None: + """ + Compute a combined optimization score (higher is better). + + This provides a single scalar for optimization algorithms that combines + the three core metrics with configurable weights. + + Args: + kv_weight: Weight for KV efficiency (0-1) + ttft_weight: Weight for TTFT score (0-1) + itl_weight: Weight for ITL score (0-1) + ttft_target_seconds: Target TTFT for scoring (score=1.0 at target) + itl_target_seconds: Target ITL for scoring (score=1.0 at target) + + Returns: + Combined score (0-1) where higher is better, or None if metrics unavailable + + Note: + Weights should sum to 1.0. 
TTFT and ITL scores are computed as + target/actual (capped at 1.0) so lower latency = higher score. + """ + if not self.is_complete(): + return None + + # KV efficiency score is already 0-1 (higher is better) + kv_score = self.get_effective_kv_efficiency() or 0.0 + + # TTFT score: target/actual, capped at 1.0 (lower latency = higher score) + ttft_score = min(1.0, ttft_target_seconds / max(self.ttft_p95_seconds or ttft_target_seconds, 0.001)) + + # ITL score: target/actual, capped at 1.0 (lower latency = higher score) + itl_score = min(1.0, itl_target_seconds / max(self.itl_p95_seconds or itl_target_seconds, 0.001)) + + return (kv_weight * kv_score) + (ttft_weight * ttft_score) + (itl_weight * itl_score) + + +class DynamoMetricsResult(BaseModel): + """ + Results from Dynamo metrics collection. + + To add a new metric: + 1. Add a field here with appropriate type and description + 2. Add the corresponding Prometheus query to METRIC_QUERIES above + 3. The collector will automatically populate it + + All metrics are optional (None) to handle cases where: + - The metric endpoint is unavailable + - Prometheus query returns no data + - The Dynamo component is not running + + For optimization, use ``get_core_metrics()`` to extract the three primary + optimization targets (KV Cache Efficiency, TTFT, ITL). + """ + + # ========================================================================= + # CORE OPTIMIZATION METRICS (Primary targets for optimization) + # ========================================================================= + + # ------------------------------------------------------------------------- + # KV Efficiency - KVE (CORE METRIC #1) + # Dashboard panels: "KV Cache Usage %", "KV Cache Stats" + # KVE = cached_tokens / prompt_tokens (fraction of work saved) + # ------------------------------------------------------------------------- + kve_cached_tokens_rate: float | None = Field( + default=None, + description="Rate of tokens served from KV cache (tokens/sec). 
KVE numerator.", + ) + kve_prompt_tokens_rate: float | None = Field( + default=None, + description="Rate of total prompt tokens processed (tokens/sec). KVE denominator.", + ) + kve_device_blocks_rate: float | None = Field( + default=None, + description="Rate of KV blocks served from GPU memory (blocks/sec)", + ) + kve_host_blocks_rate: float | None = Field( + default=None, + description="Rate of KV blocks served from CPU/host memory (blocks/sec)", + ) + kve_disk_blocks_rate: float | None = Field( + default=None, + description="Rate of KV blocks served from disk (blocks/sec)", + ) + kv_cache_usage_percent: float | None = Field( + default=None, + description="GPU KV cache memory utilization (0-100%)", + ) + kv_cache_hit_rate_sglang: float | None = Field( + default=None, + description="[FALLBACK] KV cache hit rate from SGLang native metric (0-1). " + "Used when Thompson Sampling KVE counters are unavailable.", + ) + kv_cache_hit_rate_dynamo: float | None = Field( + default=None, + description="KV cache hit rate from Dynamo component (0-1), alternative source", + ) + + # ------------------------------------------------------------------------- + # Time to First Token - TTFT (CORE METRIC #2) + # Dashboard panels: "Time to First Token (P95)", "TTFT Over Time" + # ------------------------------------------------------------------------- + ttft_p50: float | None = Field( + default=None, + description="Time to First Token - 50th percentile (seconds)", + ) + ttft_p95: float | None = Field( + default=None, + description="[CORE] Time to First Token - 95th percentile (seconds). 
PRIMARY latency target.", + ) + ttft_p99: float | None = Field( + default=None, + description="Time to First Token - 99th percentile (seconds)", + ) + + # ------------------------------------------------------------------------- + # Inter-Token Latency - ITL (CORE METRIC #3) + # Dashboard panel: "ITL Over Time" - Inter-token latency trends + # ------------------------------------------------------------------------- + itl_p50: float | None = Field( + default=None, + description="Inter-Token Latency - 50th percentile (seconds)", + ) + itl_p95: float | None = Field( + default=None, + description="[CORE] Inter-Token Latency - 95th percentile (seconds). PRIMARY streaming target.", + ) + itl_p99: float | None = Field( + default=None, + description="Inter-Token Latency - 99th percentile (seconds)", + ) + + # ========================================================================= + # SUPPLEMENTARY METRICS (Context and diagnostics) + # ========================================================================= + + # ------------------------------------------------------------------------- + # Inflight Requests + # Dashboard panel: "Inflight Requests" - Current load across components + # ------------------------------------------------------------------------- + inflight_requests_frontend: float | None = Field( + default=None, + description="Current inflight requests at the frontend (user-facing API)", + ) + inflight_requests_worker: float | None = Field( + default=None, + description="Current inflight requests at the worker (SGLang backend)", + ) + queued_requests: float | None = Field( + default=None, + description="Requests currently queued at the frontend", + ) + + # ------------------------------------------------------------------------- + # Throughput + # Dashboard panel: "Requests/min" - Throughput + # ------------------------------------------------------------------------- + requests_per_minute: float | None = Field( + default=None, + description="Request 
throughput in requests per minute", + ) + + # ------------------------------------------------------------------------- + # Token Throughput + # Dashboard panel: "Token Throughput" - Tokens generated per second + # ------------------------------------------------------------------------- + token_throughput: float | None = Field( + default=None, + description="Output token generation rate (tokens/second)", + ) + + # ------------------------------------------------------------------------- + # SGLang Worker Metrics + # Additional worker-level metrics for deeper analysis + # ------------------------------------------------------------------------- + sglang_running_requests: float | None = Field( + default=None, + description="Number of requests currently running in SGLang", + ) + sglang_queue_depth: float | None = Field( + default=None, + description="Number of requests queued in SGLang", + ) + sglang_gen_throughput: float | None = Field( + default=None, + description="SGLang generation throughput", + ) + sglang_utilization: float | None = Field( + default=None, + description="SGLang GPU utilization", + ) + + # ------------------------------------------------------------------------- + # Thompson Sampling Metrics + # Routing efficiency and decision-making metrics + # ------------------------------------------------------------------------- + thompson_routing_decisions_rate: float | None = Field( + default=None, + description="Rate of Thompson Sampling routing decisions per second", + ) + thompson_requests_rate: float | None = Field( + default=None, + description="Rate of requests processed by Thompson Sampling processor", + ) + + # ------------------------------------------------------------------------- + # Metadata + # ------------------------------------------------------------------------- + collection_timestamp: float | None = Field( + default=None, + description="Unix timestamp when metrics were collected", + ) + prometheus_url: str | None = Field( + 
default=None, + description="Prometheus URL used for collection", + ) + errors: list[str] = Field( + default_factory=list, + description="Any errors encountered during collection", + ) + + # ========================================================================= + # CORE METRICS EXTRACTION + # ========================================================================= + + def compute_kv_efficiency(self) -> float | None: + """ + Compute KV Efficiency (KVE) from Thompson Sampling processor metrics. + + KVE = cached_tokens / prompt_tokens + + This measures the fraction of computational work saved via KV cache reuse. + A KVE of 0.8 means 80% of prompt tokens were served from cache. + + Returns: + KVE (0-1) if both metrics are available and prompt_tokens > 0, else None + """ + if self.kve_cached_tokens_rate is None or self.kve_prompt_tokens_rate is None: + return None + if self.kve_prompt_tokens_rate <= 0: + return None + return self.kve_cached_tokens_rate / self.kve_prompt_tokens_rate + + def get_core_metrics(self) -> DynamoCoreMetrics: + """ + Extract the three core optimization metrics. + + KV Efficiency is computed as cached_tokens / prompt_tokens from the + Thompson Sampling processor. Falls back to SGLang native cache_hit_rate + if KVE counters are unavailable. 
+ + Returns: + DynamoCoreMetrics with KV efficiency, TTFT, and ITL + + Usage:: + + result = await collector.collect() + core = result.get_core_metrics() + + if core.is_complete(): + score = core.to_optimization_score() + print(f"Optimization score: {score:.3f}") + """ + # Compute true KVE from Thompson Sampling processor metrics + kv_efficiency = self.compute_kv_efficiency() + + return DynamoCoreMetrics( + kv_efficiency=kv_efficiency, + kv_efficiency_fallback=self.kv_cache_hit_rate_sglang, + ttft_p50_seconds=self.ttft_p50, + ttft_p95_seconds=self.ttft_p95, + ttft_p99_seconds=self.ttft_p99, + itl_p50_seconds=self.itl_p50, + itl_p95_seconds=self.itl_p95, + itl_p99_seconds=self.itl_p99, + ) + + def has_core_metrics(self) -> bool: + """ + Check if all three core optimization metrics are available. + + Returns: + True if kv_cache_hit_rate, ttft_p95, and itl_p95 are all collected + """ + return self.get_core_metrics().is_complete() + + +# ============================================================================= +# METRICS COLLECTOR +# ============================================================================= + + +class DynamoMetricsCollector: + """ + Collects Dynamo inference stack metrics from Prometheus. + + Usage:: + + from nat.profiler.inference_optimization.dynamo_metrics import DynamoMetricsCollector + from nat.data_models.profiler import DynamoMetricsConfig + + config = DynamoMetricsConfig(enable=True, prometheus_url="http://localhost:9090") + collector = DynamoMetricsCollector(config) + result = await collector.collect() + + print(f"TTFT P95: {result.ttft_p95}") + print(f"KV Cache Usage: {result.kv_cache_usage_percent}%") + """ + + def __init__(self, config: DynamoMetricsConfig): + """ + Initialize the collector with configuration. 
+ + Args: + config: DynamoMetricsConfig with Prometheus URL and metric toggles + """ + self.config = config + self.prometheus_url = config.prometheus_url.rstrip("/") + + async def collect(self) -> DynamoMetricsResult: + """ + Collect all enabled Dynamo metrics from Prometheus. + + Returns: + DynamoMetricsResult with collected metric values + """ + import time + + result = DynamoMetricsResult( + collection_timestamp=time.time(), + prometheus_url=self.prometheus_url, + ) + + # Build list of metrics to collect based on config toggles + metrics_to_collect = self._get_enabled_metrics() + + # Collect each metric + async with httpx.AsyncClient(timeout=30.0) as client: + for metric_name, query_template in metrics_to_collect.items(): + try: + # Substitute time range placeholder + query = query_template.replace("{range}", self.config.query_range) + value = await self._query_prometheus(client, query) + + if value is not None: + setattr(result, metric_name, value) + logger.debug("Collected %s = %s", metric_name, value) + + except Exception as e: + error_msg = f"Failed to collect {metric_name}: {e}" + logger.warning(error_msg) + result.errors.append(error_msg) + + return result + + def _get_enabled_metrics(self) -> dict[str, str]: + """ + Get the subset of METRIC_QUERIES enabled by config. 
+ + Returns: + Dict mapping metric names to their Prometheus queries + """ + enabled: dict[str, str] = {} + + # Map config flags to metric prefixes/names + metric_groups = { + "collect_inflight_requests": ["inflight_requests_frontend", "inflight_requests_worker", "queued_requests"], + "collect_throughput": ["requests_per_minute"], + "collect_ttft": ["ttft_p50", "ttft_p95", "ttft_p99"], + "collect_itl": ["itl_p50", "itl_p95", "itl_p99"], + "collect_kv_cache": [ + # KVE metrics (primary - token-level efficiency) + "kve_cached_tokens_rate", + "kve_prompt_tokens_rate", + "kve_device_blocks_rate", + "kve_host_blocks_rate", + "kve_disk_blocks_rate", + # Supplementary KV cache metrics + "kv_cache_usage_percent", + "kv_cache_hit_rate_sglang", # Fallback for KVE + "kv_cache_hit_rate_dynamo", + ], + "collect_token_throughput": ["token_throughput", "sglang_gen_throughput"], + } + + for config_flag, metric_names in metric_groups.items(): + if getattr(self.config, config_flag, False): + for name in metric_names: + if name in METRIC_QUERIES: + enabled[name] = METRIC_QUERIES[name] + + # Always collect SGLang worker metrics for context + for name in ["sglang_running_requests", "sglang_queue_depth", "sglang_utilization"]: + if name in METRIC_QUERIES: + enabled[name] = METRIC_QUERIES[name] + + # Always collect Thompson Sampling metrics when available + for name in ["thompson_routing_decisions_rate", "thompson_requests_rate"]: + if name in METRIC_QUERIES: + enabled[name] = METRIC_QUERIES[name] + + return enabled + + async def _query_prometheus(self, client: httpx.AsyncClient, query: str) -> float | None: + """ + Execute a Prometheus instant query and extract the scalar result. 
+ + Args: + client: httpx AsyncClient + query: PromQL query string + + Returns: + Float value if successful, None if no data or error + """ + url = f"{self.prometheus_url}/api/v1/query" + params = {"query": query} + + response = await client.get(url, params=params) + response.raise_for_status() + + data = response.json() + + if data.get("status") != "success": + logger.warning("Prometheus query failed: %s", data.get("error", "unknown")) + return None + + results = data.get("data", {}).get("result", []) + + if not results: + logger.debug("No data for query: %s", query) + return None + + # For instant queries, extract the value from the first result + # Result format: [{"metric": {...}, "value": [timestamp, "value_string"]}] + try: + value_str = results[0]["value"][1] + value = float(value_str) + + # Handle special float values + if value != value: # NaN check + return None + + return value + except (KeyError, IndexError, ValueError) as e: + logger.debug("Failed to parse Prometheus result for query '%s': %s", query, e) + return None + + async def health_check(self) -> dict[str, Any]: + """ + Check connectivity to Prometheus and Dynamo endpoints. 
+ + Returns: + Dict with health status for each component + """ + health: dict[str, Any] = { + "prometheus": False, + "frontend": False, + "worker": False, + "errors": [], + } + + async with httpx.AsyncClient(timeout=10.0) as client: + # Check Prometheus + try: + response = await client.get(f"{self.prometheus_url}/-/healthy") + health["prometheus"] = response.status_code == 200 + except Exception as e: + health["errors"].append(f"Prometheus: {e}") + + # Check if Dynamo metrics are being scraped + try: + # Query for any frontend metric to verify scraping + url = f"{self.prometheus_url}/api/v1/query" + response = await client.get(url, params={"query": "up{job=~\".*dynamo.*\"}"}) + if response.status_code == 200: + data = response.json() + results = data.get("data", {}).get("result", []) + health["frontend"] = len(results) > 0 + health["worker"] = len(results) > 0 + except Exception as e: + health["errors"].append(f"Dynamo metrics check: {e}") + + return health + + +# ============================================================================= +# CONVENIENCE FUNCTIONS +# ============================================================================= + + +async def collect_dynamo_metrics(config: DynamoMetricsConfig) -> DynamoMetricsResult: + """ + Convenience function to collect Dynamo metrics. + + Args: + config: DynamoMetricsConfig with collection settings + + Returns: + DynamoMetricsResult with collected metrics + """ + collector = DynamoMetricsCollector(config) + return await collector.collect() + + +async def collect_core_metrics( + prometheus_url: str = "http://localhost:9090", + query_range: str = "30s", +) -> DynamoCoreMetrics: + """ + Convenience function to collect only the three core optimization metrics. 
+ + This is a simplified interface for optimization loops that only need: + - KV Cache Efficiency + - Time to First Token (TTFT) + - Inter-Token Latency (ITL) + + Args: + prometheus_url: Prometheus server URL + query_range: Time range for rate calculations (e.g., '1m', '5m') + + Returns: + DynamoCoreMetrics with the three core metrics + + Usage:: + + from nat.profiler.inference_optimization.dynamo_metrics import collect_core_metrics + + # Quick collection for optimization + core = await collect_core_metrics() + + if core.is_complete(): + print(f"KV Efficiency: {core.kv_cache_efficiency:.2%}") + print(f"TTFT P95: {core.ttft_p95_seconds:.3f}s") + print(f"ITL P95: {core.itl_p95_seconds:.3f}s") + + # Get combined optimization score + score = core.to_optimization_score() + print(f"Combined score: {score:.3f}") + """ + config = DynamoMetricsConfig( + enable=True, + prometheus_url=prometheus_url, + query_range=query_range, + # Enable only core metrics for efficiency + collect_kv_cache=True, + collect_ttft=True, + collect_itl=True, + # Disable supplementary metrics + collect_inflight_requests=False, + collect_throughput=False, + collect_token_throughput=False, + ) + result = await collect_dynamo_metrics(config) + return result.get_core_metrics() + diff --git a/src/nat/profiler/profile_runner.py b/src/nat/profiler/profile_runner.py index 0ac72f5deb..61602c881c 100644 --- a/src/nat/profiler/profile_runner.py +++ b/src/nat/profiler/profile_runner.py @@ -45,6 +45,7 @@ class InferenceOptimizationHolder(BaseModel): common_prefixes: Any token_uniqueness: Any workflow_runtimes: Any + dynamo_metrics: Any = None class ProfilerRunner: @@ -187,10 +188,25 @@ async def run(self, all_steps: list[list[IntermediateStep]]) -> ProfilerResults: workflow_runtimes = compute_workflow_runtime_metrics(all_steps) workflow_runtimes_results = workflow_runtimes + # ------------------------------------------------------------ + # Collect Dynamo inference stack metrics (if enabled) + # 
------------------------------------------------------------ + dynamo_metrics_results = None + if self.profile_config.dynamo_metrics.enable: + from nat.profiler.inference_optimization.dynamo_metrics import collect_dynamo_metrics + try: + dynamo_metrics_results = await collect_dynamo_metrics(self.profile_config.dynamo_metrics) + if dynamo_metrics_results.errors: + logger.warning("Dynamo metrics collection had errors: %s", dynamo_metrics_results.errors) + logger.info("Collected Dynamo metrics successfully") + except Exception as e: + logger.warning("Failed to collect Dynamo metrics: %s", e) + inference_optimization_results = InferenceOptimizationHolder(confidence_intervals=simple_metrics, common_prefixes=common_prefix_results, token_uniqueness=token_uniqueness_results, - workflow_runtimes=workflow_runtimes_results) + workflow_runtimes=workflow_runtimes_results, + dynamo_metrics=dynamo_metrics_results) if self.write_output and inference_optimization_results: # Save to JSON From 2fb79db9e9db18f39dc322b0ee17ba092c90dd5d Mon Sep 17 00:00:00 2001 From: bbednarski9 Date: Fri, 23 Jan 2026 01:22:46 +0000 Subject: [PATCH 05/13] grafana dash updates Signed-off-by: bbednarski9 --- .../configs/profile_rethinking_full_test.yml | 6 +- external/dynamo/monitoring/README.md | 254 +++++++++++++----- .../dashboards/json/dynamo-overview.json | 91 +++---- src/nat/llm/dynamo_llm.py | 206 ++++++-------- 4 files changed, 316 insertions(+), 241 deletions(-) diff --git a/examples/dynamo_integration/react_benchmark_agent/src/react_benchmark_agent/configs/profile_rethinking_full_test.yml b/examples/dynamo_integration/react_benchmark_agent/src/react_benchmark_agent/configs/profile_rethinking_full_test.yml index 0bf1b7128a..b2a9aa8b69 100644 --- a/examples/dynamo_integration/react_benchmark_agent/src/react_benchmark_agent/configs/profile_rethinking_full_test.yml +++ b/examples/dynamo_integration/react_benchmark_agent/src/react_benchmark_agent/configs/profile_rethinking_full_test.yml @@ -142,7 
+142,7 @@ llms: dynamo_llm: _type: dynamo model_name: llama-3.3-70b - base_url: http://localhost:8099/v1 + base_url: http://localhost:8000/v1 api_key: dummy temperature: 0.0 max_tokens: 8192 @@ -163,7 +163,7 @@ llms: eval_llm: _type: dynamo model_name: llama-3.3-70b - base_url: http://localhost:8099/v1 + base_url: http://localhost:8000/v1 api_key: dummy temperature: 0.0 max_tokens: 1024 @@ -199,7 +199,7 @@ workflow: eval: general: - max_concurrency: 36 + max_concurrency: 1 output: dir: ./examples/dynamo_integration/react_benchmark_agent/outputs/dynamo_evals/rethinking_full_test_for_profiling/ diff --git a/external/dynamo/monitoring/README.md b/external/dynamo/monitoring/README.md index 1cde252864..a30f433928 100644 --- a/external/dynamo/monitoring/README.md +++ b/external/dynamo/monitoring/README.md @@ -68,20 +68,20 @@ docker compose up -d User-facing HTTP API metrics for latency, throughput, and token statistics. -| Metric | Type | Description | -|--------|------|-------------| -| `dynamo_frontend_requests_total` | Counter | Total requests processed | -| `dynamo_frontend_inflight_requests` | Gauge | Currently processing requests | -| `dynamo_frontend_queued_requests` | Gauge | Requests waiting in queue | -| `dynamo_frontend_disconnected_clients` | Counter | Client disconnections | -| `dynamo_frontend_time_to_first_token_seconds` | Histogram | Time until first token generated | -| `dynamo_frontend_inter_token_latency_seconds` | Histogram | Time between consecutive tokens | -| `dynamo_frontend_request_duration_seconds` | Histogram | Total request duration | -| `dynamo_frontend_input_sequence_tokens` | Histogram | Input prompt length distribution | -| `dynamo_frontend_output_sequence_tokens` | Histogram | Output length distribution | -| `dynamo_frontend_output_tokens_total` | Counter | Total output tokens generated | -| `dynamo_frontend_model_context_length` | Gauge | Model context window size | -| `dynamo_frontend_model_kv_cache_block_size` | Gauge | KV cache block 
size | +| Prefix | Full Metric Name | Type | Description | +|--------|------------------|------|-------------| +| `dynamo_frontend_` | `dynamo_frontend_requests_total` | Counter | Total requests processed | +| `dynamo_frontend_` | `dynamo_frontend_inflight_requests` | Gauge | Currently processing requests | +| `dynamo_frontend_` | `dynamo_frontend_queued_requests` | Gauge | Requests waiting in queue | +| `dynamo_frontend_` | `dynamo_frontend_disconnected_clients` | Counter | Client disconnections | +| `dynamo_frontend_` | `dynamo_frontend_time_to_first_token_seconds` | Histogram | Time until first token generated | +| `dynamo_frontend_` | `dynamo_frontend_inter_token_latency_seconds` | Histogram | Time between consecutive tokens | +| `dynamo_frontend_` | `dynamo_frontend_request_duration_seconds` | Histogram | Total request duration | +| `dynamo_frontend_` | `dynamo_frontend_input_sequence_tokens` | Histogram | Input prompt length distribution | +| `dynamo_frontend_` | `dynamo_frontend_output_sequence_tokens` | Histogram | Output length distribution | +| `dynamo_frontend_` | `dynamo_frontend_output_tokens_total` | Counter | Total output tokens generated | +| `dynamo_frontend_` | `dynamo_frontend_model_context_length` | Gauge | Model context window size | +| `dynamo_frontend_` | `dynamo_frontend_model_kv_cache_block_size` | Gauge | KV cache block size | ### Worker Metrics (`:8081/metrics`) @@ -89,50 +89,50 @@ SGLang backend worker metrics including KV cache, scheduling, and internal stati #### Dynamo Component Metrics -| Metric | Type | Description | -|--------|------|-------------| -| `dynamo_component_kvstats_gpu_cache_usage_percent` | Gauge | KV cache memory utilization (0-100) | -| `dynamo_component_kvstats_gpu_prefix_cache_hit_rate` | Gauge | Prefix cache hit rate (0-1) | -| `dynamo_component_kvstats_active_blocks` | Gauge | Active KV cache blocks | -| `dynamo_component_kvstats_total_blocks` | Gauge | Total KV cache blocks | -| 
`dynamo_component_request_duration_seconds` | Histogram | Backend request processing time | -| `dynamo_component_requests_total` | Counter | Total requests to worker | -| `dynamo_component_inflight_requests` | Gauge | Requests currently in worker | -| `dynamo_component_uptime_seconds` | Gauge | Worker uptime | +| Prefix | Full Metric Name | Type | Description | +|--------|------------------|------|-------------| +| `dynamo_component_kvstats_` | `dynamo_component_kvstats_gpu_cache_usage_percent` | Gauge | KV cache memory utilization (0-100) | +| `dynamo_component_kvstats_` | `dynamo_component_kvstats_gpu_prefix_cache_hit_rate` | Gauge | Prefix cache hit rate (0-1) | +| `dynamo_component_kvstats_` | `dynamo_component_kvstats_active_blocks` | Gauge | Active KV cache blocks | +| `dynamo_component_kvstats_` | `dynamo_component_kvstats_total_blocks` | Gauge | Total KV cache blocks | +| `dynamo_component_` | `dynamo_component_request_duration_seconds` | Histogram | Backend request processing time | +| `dynamo_component_` | `dynamo_component_requests_total` | Counter | Total requests to worker | +| `dynamo_component_` | `dynamo_component_inflight_requests` | Gauge | Requests currently in worker | +| `dynamo_component_` | `dynamo_component_uptime_seconds` | Gauge | Worker uptime | #### SGLang Native Metrics -| Metric | Type | Description | -|--------|------|-------------| -| `sglang:cache_hit_rate` | Gauge | Prefix cache hit rate | -| `sglang:token_usage` | Gauge | Current token usage | -| `sglang:num_running_reqs` | Gauge | Currently running requests | -| `sglang:num_queue_reqs` | Gauge | Queued requests | -| `sglang:num_used_tokens` | Gauge | Tokens currently in use | -| `sglang:gen_throughput` | Gauge | Generation throughput | -| `sglang:utilization` | Gauge | GPU utilization | -| `sglang:queue_time_seconds` | Histogram | Time spent in queue | -| `sglang:per_stage_req_latency_seconds` | Histogram | Per-stage request latency | -| `sglang:kv_transfer_latency_ms` | Gauge | 
KV transfer latency | -| `sglang:kv_transfer_speed_gb_s` | Gauge | KV transfer speed | -| `sglang:engine_startup_time` | Gauge | Engine startup duration | -| `sglang:engine_load_weights_time` | Gauge | Model weight loading time | +| Prefix | Full Metric Name | Type | Description | +|--------|------------------|------|-------------| +| `sglang:` | `sglang:cache_hit_rate` | Gauge | Prefix cache hit rate | +| `sglang:` | `sglang:token_usage` | Gauge | Current token usage | +| `sglang:` | `sglang:num_running_reqs` | Gauge | Currently running requests | +| `sglang:` | `sglang:num_queue_reqs` | Gauge | Queued requests | +| `sglang:` | `sglang:num_used_tokens` | Gauge | Tokens currently in use | +| `sglang:` | `sglang:gen_throughput` | Gauge | Generation throughput | +| `sglang:` | `sglang:utilization` | Gauge | GPU utilization | +| `sglang:` | `sglang:queue_time_seconds` | Histogram | Time spent in queue | +| `sglang:` | `sglang:per_stage_req_latency_seconds` | Histogram | Per-stage request latency | +| `sglang:` | `sglang:kv_transfer_latency_ms` | Gauge | KV transfer latency | +| `sglang:` | `sglang:kv_transfer_speed_gb_s` | Gauge | KV transfer speed | +| `sglang:` | `sglang:engine_startup_time` | Gauge | Engine startup duration | +| `sglang:` | `sglang:engine_load_weights_time` | Gauge | Model weight loading time | ### Router Metrics (`:8082/metrics`) Dynamo component metrics for the Thompson Sampling router (uses standard `dynamo_component_*` prefix). 
-| Metric | Type | Description | -|--------|------|-------------| -| `dynamo_component_requests_total` | Counter | Total routing requests (labeled by endpoint) | -| `dynamo_component_request_duration_seconds` | Histogram | Routing decision latency | -| `dynamo_component_request_bytes_total` | Counter | Request payload bytes | -| `dynamo_component_response_bytes_total` | Counter | Response payload bytes | -| `dynamo_component_inflight_requests` | Gauge | In-flight routing requests | -| `dynamo_component_uptime_seconds` | Gauge | Router uptime | -| `dynamo_component_nats_service_requests_total` | Gauge | NATS service requests | -| `dynamo_component_nats_service_processing_ms_avg` | Gauge | Average NATS processing time | -| `dynamo_component_nats_client_connection_state` | Gauge | NATS connection state (0=disconnected, 1=connected) | +| Prefix | Full Metric Name | Type | Description | +|--------|------------------|------|-------------| +| `dynamo_component_` | `dynamo_component_requests_total` | Counter | Total routing requests (labeled by endpoint) | +| `dynamo_component_` | `dynamo_component_request_duration_seconds` | Histogram | Routing decision latency | +| `dynamo_component_` | `dynamo_component_request_bytes_total` | Counter | Request payload bytes | +| `dynamo_component_` | `dynamo_component_response_bytes_total` | Counter | Response payload bytes | +| `dynamo_component_` | `dynamo_component_inflight_requests` | Gauge | In-flight routing requests | +| `dynamo_component_` | `dynamo_component_uptime_seconds` | Gauge | Router uptime | +| `dynamo_component_nats_` | `dynamo_component_nats_service_requests_total` | Gauge | NATS service requests | +| `dynamo_component_nats_` | `dynamo_component_nats_service_processing_ms_avg` | Gauge | Average NATS processing time | +| `dynamo_component_nats_` | `dynamo_component_nats_client_connection_state` | Gauge | NATS connection state (0=disconnected, 1=connected) | **Router Endpoints** (use `dynamo_endpoint` label to filter): 
- `find_worker` - Worker selection requests @@ -142,21 +142,21 @@ Dynamo component metrics for the Thompson Sampling router (uses standard `dynamo Custom Thompson Sampling KV Efficiency (KVE) metrics from the processor component. -| Metric | Type | Description | -|--------|------|-------------| -| `dynamo_component_thompson_requests_total` | Counter | Total requests processed | -| `dynamo_component_thompson_request_latency_seconds` | Histogram | End-to-end request latency | -| `dynamo_component_thompson_tokens_in_total` | Counter | Total input tokens | -| `dynamo_component_thompson_tokens_out_total` | Counter | Total output tokens | -| `dynamo_component_thompson_routing_decisions_total` | Counter | Routing decisions made | -| `dynamo_component_thompson_active_requests` | Gauge | Currently processing requests | -| `dynamo_component_thompson_router_errors_total` | Counter | Router communication errors | -| `dynamo_component_thompson_engine_errors_total` | Counter | Engine/worker errors | -| `dynamo_component_thompson_kve_prompt_tokens_total` | Counter | Total prompt tokens (KVE denominator) | -| `dynamo_component_thompson_kve_cached_tokens_total` | Counter | Cached tokens hit (KVE numerator) | -| `dynamo_component_thompson_kve_device_blocks_total` | Counter | KV blocks from GPU memory | -| `dynamo_component_thompson_kve_host_blocks_total` | Counter | KV blocks from CPU memory | -| `dynamo_component_thompson_kve_disk_blocks_total` | Counter | KV blocks from disk | +| Prefix | Full Metric Name | Type | Description | +|--------|------------------|------|-------------| +| `dynamo_component_thompson_` | `dynamo_component_thompson_requests_total` | Counter | Total requests processed | +| `dynamo_component_thompson_` | `dynamo_component_thompson_request_latency_seconds` | Histogram | End-to-end request latency | +| `dynamo_component_thompson_` | `dynamo_component_thompson_tokens_in_total` | Counter | Total input tokens | +| `dynamo_component_thompson_` | 
`dynamo_component_thompson_tokens_out_total` | Counter | Total output tokens | +| `dynamo_component_thompson_` | `dynamo_component_thompson_routing_decisions_total` | Counter | Routing decisions made | +| `dynamo_component_thompson_` | `dynamo_component_thompson_active_requests` | Gauge | Currently processing requests | +| `dynamo_component_thompson_` | `dynamo_component_thompson_router_errors_total` | Counter | Router communication errors | +| `dynamo_component_thompson_` | `dynamo_component_thompson_engine_errors_total` | Counter | Engine/worker errors | +| `dynamo_component_thompson_kve_` | `dynamo_component_thompson_kve_prompt_tokens_total` | Counter | Total prompt tokens (KVE denominator) | +| `dynamo_component_thompson_kve_` | `dynamo_component_thompson_kve_cached_tokens_total` | Counter | Cached tokens hit (KVE numerator) | +| `dynamo_component_thompson_kve_` | `dynamo_component_thompson_kve_device_blocks_total` | Counter | KV blocks from GPU memory | +| `dynamo_component_thompson_kve_` | `dynamo_component_thompson_kve_host_blocks_total` | Counter | KV blocks from CPU memory | +| `dynamo_component_thompson_kve_` | `dynamo_component_thompson_kve_disk_blocks_total` | Counter | KV blocks from disk | **KV Efficiency (KVE) Calculation:** ```promql @@ -171,6 +171,72 @@ sglang:cache_hit_rate * 100 > `cached_tokens` in its API responses. The processor's `thompson_kve_*` counters will show 0 > unless the underlying engine provides `usage.prompt_tokens_details.cached_tokens`. +## KV Cache Metrics Status + +This section documents the working status of all KV cache related metrics across the Dynamo stack. + +### Working Metrics ✓ + +| Prefix | Full Metric Name | Status | Description | +|--------|------------------|--------|-------------| +| `sglang:` | `sglang:token_usage` | ✓ **WORKING** | KV cache memory usage as ratio (0-1). Multiply by 100 for percentage. 
| +| `sglang:` | `sglang:num_used_tokens` | ✓ **WORKING** | Absolute number of tokens currently stored in KV cache. | +| `dynamo_component_kvstats_` | `dynamo_component_kvstats_total_blocks` | ✓ **WORKING** | Total KV cache blocks available (capacity). | +| `sglang:` | `sglang:gen_throughput` | ✓ **WORKING** | Token generation throughput (tokens/sec). | + +### Conditionally Working Metrics ⚠ + +| Prefix | Full Metric Name | Status | Notes | +|--------|------------------|--------|-------| +| `sglang:` | `sglang:cache_hit_rate` | ⚠ **CONDITIONAL** | Shows prefix cache hit rate (0-1). Requires repeated queries with shared prefixes to see non-zero values. May stay at 0 if prefix caching is not effective for workload. | + +### Not Implemented / Always Zero Metrics + +| Prefix | Full Metric Name | Status | Notes | +|--------|------------------|--------|-------| +| `sglang:` | `sglang:utilization` | ✗ **ALWAYS 0** | Exported but not populated in unified engine mode. Use `sglang:num_running_reqs` and `sglang:gen_throughput` instead to gauge worker activity. | +| `sglang:` | `sglang:is_cuda_graph` | ✗ **ALWAYS 0** | CUDA graph optimization not enabled in current configuration. | +| `sglang:` | `sglang:spec_accept_*` | ✗ **ALWAYS 0** | Speculative decoding metrics - not applicable without draft model. | + +### Non-Working Metrics ✗ + +| Prefix | Full Metric Name | Status | Reason | +|--------|------------------|--------|--------| +| `dynamo_component_kvstats_` | `dynamo_component_kvstats_gpu_cache_usage_percent` | ✗ **NOT WORKING** | Dynamo's internal metric not populated by SGLang backend. Use `sglang:token_usage * 100` instead. | +| `dynamo_component_kvstats_` | `dynamo_component_kvstats_gpu_prefix_cache_hit_rate` | ✗ **NOT WORKING** | Dynamo's internal metric not populated. Use `sglang:cache_hit_rate` instead. | +| `dynamo_component_kvstats_` | `dynamo_component_kvstats_active_blocks` | ✗ **NOT WORKING** | Dynamo's internal metric not populated by SGLang backend. 
| +| `dynamo_component_thompson_kve_` | `dynamo_component_thompson_kve_cached_tokens_total` | ✗ **NOT WORKING** | SGLang API doesn't return `cached_tokens` in response. | +| `dynamo_component_thompson_kve_` | `dynamo_component_thompson_kve_prompt_tokens_total` | ✗ **NOT WORKING** | Counter stays at 0 due to API limitation. | +| `dynamo_component_thompson_kve_` | `dynamo_component_thompson_kve_*_blocks_total` | ✗ **NOT WORKING** | Block-level KVE metrics not populated. | + +### Architecture-Specific Metrics (Always Zero for Llama) + +| Prefix | Full Metric Name | Status | Reason | +|--------|------------------|--------|--------| +| `sglang:` | `sglang:swa_token_usage` | N/A | Sliding Window Attention - not used by Llama architecture. | +| `sglang:` | `sglang:mamba_usage` | N/A | Mamba architecture metric - not applicable to Llama. | +| `sglang:` | `sglang:kv_transfer_*` | N/A | KV transfer metrics only used in disaggregated prefill/decode mode. | +| `sglang:` | `sglang:pending_prealloc_token_usage` | N/A | Preallocation metric - typically 0 in standard operation. | + +### Recommended KV Cache Queries + +```promql +# KV Cache Memory Usage % (RECOMMENDED - this actually works!) +sglang:token_usage * 100 + +# Absolute tokens in KV cache +sglang:num_used_tokens + +# Total KV cache capacity (blocks) +dynamo_component_kvstats_total_blocks + +# Prefix Cache Hit Rate % (may be 0 without repeated prefix queries) +sglang:cache_hit_rate * 100 + +# Token throughput +sglang:gen_throughput +``` + ## Grafana Dashboard The pre-configured dashboard "Dynamo LLM Overview" includes: @@ -178,24 +244,25 @@ The pre-configured dashboard "Dynamo LLM Overview" includes: 1. **Inflight Requests** - Current load across all components 2. **Requests/min** - Throughput 3. **Time to First Token (P95)** - Latency to start generating -4. **KV Cache Usage %** - GPU memory utilization +4. **Cache Hit Rate %** - Prefix cache hit rate (may be 0 without repeated prefix queries) 5. 
**TTFT Over Time** - P50/P95/P99 latency trends 6. **ITL Over Time** - Inter-token latency trends 7. **Token Throughput** - Tokens generated per second -8. **KV Cache Stats** - Cache usage and hit rate over time +8. **KV Cache Usage** - Memory usage % and prefix cache hit rate % over time +9. **KV Cache Tokens & Throughput** - Absolute token count and generation throughput ### Thompson Sampling Panels (Included) The dashboard includes these Thompson Sampling and SGLang monitoring panels: -- **KV Efficiency / Cache Hit Rate** - `sglang:cache_hit_rate * 100` (SGLang native metric) - **Routing Decisions/sec** - `rate(dynamo_component_thompson_routing_decisions_total[5m])` - **SGLang Queue Depth** - `sglang:num_queue_reqs` + `sglang:num_running_reqs` - **Worker Utilization** - `sglang:utilization` + `sglang:token_usage` -> **Note**: KV Efficiency uses SGLang's native `cache_hit_rate` metric rather than the processor's -> `thompson_kve_*` counters because SGLang doesn't include `cached_tokens` in its API responses. -> The native metric provides the same information: `(cached_tokens / prompt_tokens) * 100`. +> **Note on KV Cache Metrics**: The dashboard uses SGLang's native metrics (`sglang:token_usage`, +> `sglang:cache_hit_rate`, `sglang:num_used_tokens`) which are reliably populated. The Dynamo-specific +> `dynamo_component_kvstats_*` metrics are not populated by the SGLang backend. See the +> "KV Cache Metrics Status" section above for detailed metric availability. ## Files @@ -242,6 +309,51 @@ docker compose down -v # Removes volumes docker compose up -d ``` +## Remote Access via SSH Port Forwarding + +If the monitoring stack is running on a remote server, use SSH port forwarding to access Grafana and Prometheus locally. + +### General Syntax + +```bash +ssh -L :localhost: @ +``` + +### Access Grafana (Port 3000) + +```bash +ssh -L 3000:localhost:3000 @ +``` + +Then open http://localhost:3000 in your browser. 
+ +### Access Prometheus (Port 9090) + +```bash +ssh -L 9090:localhost:9090 @ +``` + +Then open http://localhost:9090 in your browser. + +### Forward Multiple Ports + +To access both Grafana and Prometheus simultaneously: + +```bash +ssh -L 3000:localhost:3000 -L 9090:localhost:9090 @ +``` + +### Background SSH Tunnel + +To run the tunnel in the background: + +```bash +ssh -f -N -L 3000:localhost:3000 -L 9090:localhost:9090 @ +``` + +- `-f`: Run in background after authentication +- `-N`: Don't execute remote commands (tunnel only) + ## Manual Metrics Queries ### Prometheus UI (http://localhost:9090) diff --git a/external/dynamo/monitoring/grafana/provisioning/dashboards/json/dynamo-overview.json b/external/dynamo/monitoring/grafana/provisioning/dashboards/json/dynamo-overview.json index 9c48708d1a..901957a911 100644 --- a/external/dynamo/monitoring/grafana/provisioning/dashboards/json/dynamo-overview.json +++ b/external/dynamo/monitoring/grafana/provisioning/dashboards/json/dynamo-overview.json @@ -28,7 +28,7 @@ {"color": "red", "value": 10} ] }, - "unit": "short" + "unit": "none" }, "overrides": [] }, @@ -50,7 +50,7 @@ "targets": [ { "expr": "dynamo_frontend_inflight_requests", - "legendFormat": "Inflight Requests", + "legendFormat": "Inflight Requests (dynamo_frontend_inflight_requests)", "refId": "A" } ], @@ -74,7 +74,7 @@ {"color": "green", "value": null} ] }, - "unit": "short" + "unit": "reqpm" }, "overrides": [] }, @@ -95,8 +95,8 @@ "pluginVersion": "10.2.2", "targets": [ { - "expr": "increase(dynamo_frontend_requests_total[1m])", - "legendFormat": "Requests/min", + "expr": "sum(increase(dynamo_frontend_requests_total[1m]))", + "legendFormat": "Total Requests/min (dynamo_frontend_requests_total)", "refId": "A" } ], @@ -144,7 +144,7 @@ "targets": [ { "expr": "histogram_quantile(0.95, rate(dynamo_frontend_time_to_first_token_seconds_bucket[5m]))", - "legendFormat": "P95 TTFT", + "legendFormat": "P95 TTFT (dynamo_frontend_time_to_first_token_seconds)", "refId": 
"A" } ], @@ -190,7 +190,7 @@ "targets": [ { "expr": "sglang:cache_hit_rate * 100", - "legendFormat": "Cache Hit Rate", + "legendFormat": "Cache Hit Rate (sglang:cache_hit_rate)", "refId": "A" } ], @@ -245,17 +245,17 @@ "targets": [ { "expr": "histogram_quantile(0.5, rate(dynamo_frontend_time_to_first_token_seconds_bucket[1m]))", - "legendFormat": "P50", + "legendFormat": "P50 (dynamo_frontend_time_to_first_token_seconds)", "refId": "A" }, { "expr": "histogram_quantile(0.95, rate(dynamo_frontend_time_to_first_token_seconds_bucket[1m]))", - "legendFormat": "P95", + "legendFormat": "P95 (dynamo_frontend_time_to_first_token_seconds)", "refId": "B" }, { "expr": "histogram_quantile(0.99, rate(dynamo_frontend_time_to_first_token_seconds_bucket[1m]))", - "legendFormat": "P99", + "legendFormat": "P99 (dynamo_frontend_time_to_first_token_seconds)", "refId": "C" } ], @@ -310,17 +310,17 @@ "targets": [ { "expr": "histogram_quantile(0.5, rate(dynamo_frontend_inter_token_latency_seconds_bucket[1m]))", - "legendFormat": "P50", + "legendFormat": "P50 (dynamo_frontend_inter_token_latency_seconds)", "refId": "A" }, { "expr": "histogram_quantile(0.95, rate(dynamo_frontend_inter_token_latency_seconds_bucket[1m]))", - "legendFormat": "P95", + "legendFormat": "P95 (dynamo_frontend_inter_token_latency_seconds)", "refId": "B" }, { "expr": "histogram_quantile(0.99, rate(dynamo_frontend_inter_token_latency_seconds_bucket[1m]))", - "legendFormat": "P99", + "legendFormat": "P99 (dynamo_frontend_inter_token_latency_seconds)", "refId": "C" } ], @@ -361,7 +361,7 @@ "mode": "absolute", "steps": [{"color": "green", "value": null}] }, - "unit": "short" + "unit": "tps" }, "overrides": [] }, @@ -374,9 +374,14 @@ "pluginVersion": "10.2.2", "targets": [ { - "expr": "rate(dynamo_frontend_output_tokens_total[1m])", - "legendFormat": "Output Tokens/s", + "expr": "sglang:gen_throughput", + "legendFormat": "Backend Throughput tok/s (sglang:gen_throughput)", "refId": "A" + }, + { + "expr": 
"rate(dynamo_frontend_output_tokens_total[1m])", + "legendFormat": "Frontend Throughput tok/s (dynamo_frontend_output_tokens_total)", + "refId": "B" } ], "title": "Token Throughput", @@ -429,22 +434,17 @@ "pluginVersion": "10.2.2", "targets": [ { - "expr": "sglang:cache_hit_rate * 100", - "legendFormat": "Cache Hit Rate % (SGLang)", + "expr": "sglang:token_usage * 100", + "legendFormat": "KV Cache Memory Usage % (sglang:token_usage)", "refId": "A" }, { - "expr": "sglang:token_usage * 100", - "legendFormat": "Token Usage % (SGLang)", + "expr": "sglang:cache_hit_rate * 100", + "legendFormat": "Prefix Cache Hit Rate % (sglang:cache_hit_rate)", "refId": "B" - }, - { - "expr": "dynamo_component_kvstats_gpu_cache_usage_percent", - "legendFormat": "KV Cache Usage % (Dynamo)", - "refId": "C" } ], - "title": "KV Cache Stats", + "title": "KV Cache Usage", "type": "timeseries" }, { @@ -481,11 +481,11 @@ "mode": "absolute", "steps": [{"color": "green", "value": null}] }, - "unit": "percent" + "unit": "none" }, "overrides": [] }, - "gridPos": {"h": 8, "w": 12, "x": 0, "y": 20}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 20}, "id": 9, "options": { "legend": {"calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true}, @@ -494,17 +494,12 @@ "pluginVersion": "10.2.2", "targets": [ { - "expr": "sglang:cache_hit_rate * 100", - "legendFormat": "SGLang Cache Hit Rate %", + "expr": "sglang:num_used_tokens", + "legendFormat": "Tokens in KV Cache (sglang:num_used_tokens)", "refId": "A" - }, - { - "expr": "rate(dynamo_component_thompson_kve_cached_tokens_total[5m]) / rate(dynamo_component_thompson_kve_prompt_tokens_total[5m]) * 100", - "legendFormat": "Thompson KVE % (if available)", - "refId": "B" } ], - "title": "KV Efficiency", + "title": "KV Cache Tokens", "type": "timeseries" }, { @@ -541,11 +536,11 @@ "mode": "absolute", "steps": [{"color": "green", "value": null}] }, - "unit": "short" + "unit": "ops" }, "overrides": [] }, - "gridPos": {"h": 8, 
"w": 12, "x": 12, "y": 20}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 20}, "id": 10, "options": { "legend": {"calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true}, @@ -555,7 +550,7 @@ "targets": [ { "expr": "rate(dynamo_component_thompson_routing_decisions_total[5m])", - "legendFormat": "Routing Decisions/sec", + "legendFormat": "Routing Decisions/sec (dynamo_component_thompson_routing_decisions_total)", "refId": "A" } ], @@ -600,7 +595,7 @@ {"color": "red", "value": 50} ] }, - "unit": "short" + "unit": "none" }, "overrides": [] }, @@ -614,12 +609,12 @@ "targets": [ { "expr": "sglang:num_queue_reqs", - "legendFormat": "Queue Depth", + "legendFormat": "Queue Depth (sglang:num_queue_reqs)", "refId": "A" }, { "expr": "sglang:num_running_reqs", - "legendFormat": "Running Requests", + "legendFormat": "Running Requests (sglang:num_running_reqs)", "refId": "B" } ], @@ -660,7 +655,7 @@ "mode": "absolute", "steps": [{"color": "green", "value": null}] }, - "unit": "percentunit" + "unit": "none" }, "overrides": [] }, @@ -673,17 +668,17 @@ "pluginVersion": "10.2.2", "targets": [ { - "expr": "sglang:utilization", - "legendFormat": "Worker Utilization", + "expr": "sglang:num_running_reqs", + "legendFormat": "Running Requests (sglang:num_running_reqs)", "refId": "A" }, { - "expr": "sglang:token_usage", - "legendFormat": "Token Usage", + "expr": "sglang:num_queue_reqs", + "legendFormat": "Queued Requests (sglang:num_queue_reqs)", "refId": "B" } ], - "title": "Worker Utilization (SGLang)", + "title": "Worker Activity (SGLang)", "type": "timeseries" } ], diff --git a/src/nat/llm/dynamo_llm.py b/src/nat/llm/dynamo_llm.py index b3b6381b7e..d6e848f81d 100644 --- a/src/nat/llm/dynamo_llm.py +++ b/src/nat/llm/dynamo_llm.py @@ -63,13 +63,10 @@ import json import logging import uuid -from collections.abc import Callable -from collections.abc import Coroutine from collections.abc import Iterator from contextlib import contextmanager from contextvars 
import ContextVar from typing import TYPE_CHECKING -from typing import Any from typing import Literal if TYPE_CHECKING: @@ -267,144 +264,98 @@ def get_dynamo_field_names() -> frozenset[str]: # ============================================================================= -# HTTPX EVENT HOOK FOR HEADER INJECTION +# CUSTOM TRANSPORT FOR DYNAMO HINT INJECTION # ============================================================================= -def _create_dynamo_request_hook( - prefix_template: str | None, - total_requests: int, - osl: str, - iat: str, -) -> Callable[["httpx.Request"], Coroutine[Any, Any, None]]: +class _DynamoTransport: """ - Create an httpx event hook that injects Dynamo prefix hints into requests. - - This hook is called before each HTTP request is sent, allowing us to inject - hints dynamically. The prefix ID is generated ONCE when the hook is created, - ensuring all requests from the same client share the same prefix ID. This enables - Dynamo's KV cache optimization across multi-turn conversations. - - The context variable can override this for scenarios where you need different - prefix IDs (e.g., per-question in batch evaluation). - - Hints are injected via TWO transport mechanisms for maximum compatibility: - - 1. **HTTP Headers** (``x-prefix-*``): For the generalized Thompson Sampling setup - that uses custom ``frontend.py`` which reads headers directly. - - 2. **nvext.annotations** (in request body): For the optimized Thompson Sampling - setup that uses the default Dynamo frontend with custom ``processor.py`` - which reads annotations from the preprocessed request. - - Args: - prefix_template: Template string with {uuid} placeholder - total_requests: Expected number of requests for this prefix - osl: Output sequence length hint (LOW/MEDIUM/HIGH) - iat: Inter-arrival time hint (LOW/MEDIUM/HIGH) - - Returns: - An async function suitable for use as an httpx event hook. + Custom transport wrapper that injects nvext.annotations into request bodies. 
+ + This approach is more reliable than using event hooks because it modifies + the request BEFORE httpx's internal state machine processes it. """ - # Generate the default prefix ID ONCE when the hook is created - # This ensures all requests from this client share the same prefix ID - unique_id = uuid.uuid4().hex[:16] - if prefix_template: - default_prefix_id = prefix_template.format(uuid=unique_id) - else: - default_prefix_id = f"nat-dynamo-{unique_id}" - - logger.debug("Created Dynamo request hook with default prefix ID: %s", default_prefix_id) - - async def on_request(request): - """Inject Dynamo prefix hints into request headers AND body.""" + + def __init__( + self, + transport: "httpx.AsyncBaseTransport", + prefix_id: str, + total_requests: int, + osl: str, + iat: str, + ): + self._transport = transport + self._prefix_id = prefix_id + self._total_requests = total_requests + self._osl = osl.upper() + self._iat = iat.upper() + + async def handle_async_request(self, request: "httpx.Request") -> "httpx.Response": + import httpx + # Check context variable first (allows per-question override in batch evaluation) context_prefix_id = DynamoPrefixContext.get() - - if context_prefix_id: - prefix_id = context_prefix_id - logger.debug("Using context prefix ID: %s", prefix_id) - else: - # Use the pre-generated prefix ID (same for all requests from this client) - prefix_id = default_prefix_id - logger.debug("Using default prefix ID: %s", prefix_id) - - # ===================================================================== - # Transport 1: HTTP Headers (for generalized Thompson Sampling setup) - # The custom frontend.py reads these headers directly. - # - # DEPRECATION NOTE: This transport mechanism exists solely for backwards - # compatibility with start_dynamo_unified_thompson_hints.sh which uses - # custom frontend.py/processor.py that read x-prefix-* headers. 
- # Once that setup is deprecated in favor of the optimized setup - # (start_dynamo_optimized_thompson_hints.sh), this header injection - # can be removed entirely - only nvext.annotations will be needed. - # - # AI PROMPT TO REMOVE HTTP HEADERS (use when generalized setup is deprecated): - # "Remove the HTTP header injection (x-prefix-*) from dynamo_llm.py. - # Keep only the nvext.annotations transport mechanism. Update docstrings - # to remove references to HTTP headers and the generalized Thompson - # Sampling setup. The start_dynamo_unified_thompson_hints.sh script - # and its custom frontend.py/processor.py are now deprecated." - # ===================================================================== - request.headers["x-prefix-id"] = prefix_id - request.headers["x-prefix-total-requests"] = str(total_requests) - request.headers["x-prefix-osl"] = osl.upper() - request.headers["x-prefix-iat"] = iat.upper() - - # ===================================================================== - # Transport 2: nvext.annotations (for optimized Thompson Sampling setup) - # The default Dynamo frontend passes these through to processor.py - # which extracts them from the PreprocessedRequest.annotations field. 
- # ===================================================================== - if request.method == "POST" and request.content: + prefix_id = context_prefix_id if context_prefix_id else self._prefix_id + + # Add HTTP headers (for generalized setup compatibility) + headers = dict(request.headers) + headers["x-prefix-id"] = prefix_id + headers["x-prefix-total-requests"] = str(self._total_requests) + headers["x-prefix-osl"] = self._osl + headers["x-prefix-iat"] = self._iat + + # Modify body if it's a POST request with JSON content + content = request.content + if request.method == "POST" and content: try: - body = json.loads(request.content.decode("utf-8")) + body = json.loads(content.decode("utf-8")) if isinstance(body, dict): - # Build annotations list in "key:value" format + # Build annotations list annotations = [ f"prefix_id:{prefix_id}", - f"total_requests:{total_requests}", - f"osl:{osl.upper()}", - f"iat:{iat.upper()}", + f"total_requests:{self._total_requests}", + f"osl:{self._osl}", + f"iat:{self._iat}", ] - + # Add/merge nvext.annotations if "nvext" not in body: body["nvext"] = {} if not isinstance(body["nvext"], dict): body["nvext"] = {} - - # Preserve any existing annotations and add ours + existing = body["nvext"].get("annotations", []) if not isinstance(existing, list): existing = [] - - # Our annotations take precedence (placed first) + + # Our annotations take precedence body["nvext"]["annotations"] = annotations + [ a for a in existing if not any(a.startswith(f"{key}:") for key in ["prefix_id", "total_requests", "osl", "iat"]) ] - - # Re-encode the body - new_content = json.dumps(body).encode("utf-8") - # Update the request content (httpx allows this via _content) - request._content = new_content - request.headers["content-length"] = str(len(new_content)) - - logger.debug("Injected nvext.annotations: %s", body["nvext"]["annotations"]) - + + # Re-encode + content = json.dumps(body).encode("utf-8") + headers["content-length"] = str(len(content)) + + 
logger.debug("Injected nvext.annotations: %s (body size: %d bytes)", + body["nvext"]["annotations"], len(content)) except (json.JSONDecodeError, UnicodeDecodeError) as e: - # Not JSON or encoding issue - skip body injection, headers still work - logger.debug("Could not inject nvext.annotations (body not JSON): %s", e) - - logger.debug("Injected Dynamo hints: prefix_id=%s, total_requests=%d, osl=%s, iat=%s", - prefix_id, - total_requests, - osl.upper(), - iat.upper()) - - return on_request + logger.debug("Could not inject nvext.annotations: %s", e) + + # Create a new request with modified headers and content + new_request = httpx.Request( + method=request.method, + url=request.url, + headers=headers, + content=content, + extensions=request.extensions, + ) + + return await self._transport.handle_async_request(new_request) + + async def aclose(self): + await self._transport.aclose() def create_httpx_client_with_dynamo_hooks( @@ -438,10 +389,27 @@ def create_httpx_client_with_dynamo_hooks( """ import httpx - request_hook = _create_dynamo_request_hook(prefix_template, total_requests, osl, iat) - + # Generate the prefix ID once + unique_id = uuid.uuid4().hex[:16] + if prefix_template: + prefix_id = prefix_template.format(uuid=unique_id) + else: + prefix_id = f"nat-dynamo-{unique_id}" + + logger.debug("Created Dynamo client with prefix ID: %s", prefix_id) + + # Create a base transport and wrap it with our custom transport + base_transport = httpx.AsyncHTTPTransport() + dynamo_transport = _DynamoTransport( + transport=base_transport, + prefix_id=prefix_id, + total_requests=total_requests, + osl=osl, + iat=iat, + ) + return httpx.AsyncClient( - event_hooks={"request": [request_hook]}, + transport=dynamo_transport, timeout=httpx.Timeout(timeout), ) From 6c0f2b514cb349309e5671ec6a79890726010c4a Mon Sep 17 00:00:00 2001 From: bbednarski9 Date: Sun, 25 Jan 2026 20:40:49 +0000 Subject: [PATCH 06/13] profiler and grafana patches Signed-off-by: bbednarski9 --- 
.../components/integrations/frameworks.md | 2 +- .../configs/profile_rethinking_full_test.yml | 2 +- external/dynamo/.env.example | 92 +++-- external/dynamo/collect_metrics.sh | 5 + external/dynamo/monitoring/README.md | 75 +++- .../provisioning/dashboards/dashboards.yml | 5 + .../dashboards/json/dynamo-overview.json | 321 ++++++++++++++++-- .../provisioning/datasources/datasources.yml | 2 + external/dynamo/monitoring/prometheus.yml | 40 ++- external/dynamo/optimized/__init__.py | 1 - external/dynamo/optimized/processor.py | 61 ++-- external/dynamo/optimized/router.py | 285 +++++++++------- .../start_dynamo_optimized_thompson_hints.sh | 301 +++++++++++----- external/dynamo/start_dynamo_unified.sh | 101 ++++-- external/dynamo/stop_dynamo.sh | 76 ++++- src/nat/data_models/profiler.py | 20 ++ src/nat/llm/dynamo_llm.py | 37 +- .../inference_optimization/dynamo_metrics.py | 205 ++++++++++- src/nat/profiler/profile_runner.py | 48 +++ 19 files changed, 1264 insertions(+), 415 deletions(-) diff --git a/docs/source/components/integrations/frameworks.md b/docs/source/components/integrations/frameworks.md index 094065b0bc..ab27845314 100644 --- a/docs/source/components/integrations/frameworks.md +++ b/docs/source/components/integrations/frameworks.md @@ -148,7 +148,7 @@ uv pip install "nvidia-nat[crewai]" LangChain is a framework for building applications that utilize large language models (LLMs) to interact with data. It provides a set of tools for creating chains of LLM calls, allowing for complex workflows powered by LLMs. LangChain focuses on modularity and extensibility, making it suitable for integrating custom data pipelines and enhancing intelligent applications. -For more information, visit the [LangChain website](https://www.langchain.com/). +For more information, visit the [LangChain documentation](https://docs.langchain.com/oss/python/langchain/overview). 
| Capability | Providers / Details | diff --git a/examples/dynamo_integration/react_benchmark_agent/src/react_benchmark_agent/configs/profile_rethinking_full_test.yml b/examples/dynamo_integration/react_benchmark_agent/src/react_benchmark_agent/configs/profile_rethinking_full_test.yml index b2a9aa8b69..f2d25d58f0 100644 --- a/examples/dynamo_integration/react_benchmark_agent/src/react_benchmark_agent/configs/profile_rethinking_full_test.yml +++ b/examples/dynamo_integration/react_benchmark_agent/src/react_benchmark_agent/configs/profile_rethinking_full_test.yml @@ -199,7 +199,7 @@ workflow: eval: general: - max_concurrency: 1 + max_concurrency: 8 output: dir: ./examples/dynamo_integration/react_benchmark_agent/outputs/dynamo_evals/rethinking_full_test_for_profiling/ diff --git a/external/dynamo/.env.example b/external/dynamo/.env.example index ad42332fb4..c55c6e523c 100644 --- a/external/dynamo/.env.example +++ b/external/dynamo/.env.example @@ -13,10 +13,24 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# Required: Set your model directory path -export DYNAMO_MODEL_DIR="/path/to/your/models/Llama-3.3-70B-Instruct" -# Optional: Set repository directory (for Thompson Sampling router) -export DYNAMO_REPO_DIR="/path/to/NeMo-Agent-Toolkit" +export HF_HOME=/path/to/local/storage/.cache/huggingface + +export HF_TOKEN=my_huggingface_read_token + +# Required: Set your model directory path with model weights +# EXAMPLE ls from properly configured directory +# ~/models/Llama-3.3-70B-Instruct$ ls +# LICENSE model-00003-of-00030.safetensors model-00010-of-00030.safetensors model-00017-of-00030.safetensors model-00024-of-00030.safetensors model.safetensors.index.json +# README.md model-00004-of-00030.safetensors model-00011-of-00030.safetensors model-00018-of-00030.safetensors model-00025-of-00030.safetensors original +# USE_POLICY.md model-00005-of-00030.safetensors model-00012-of-00030.safetensors model-00019-of-00030.safetensors model-00026-of-00030.safetensors special_tokens_map.json +# config.json model-00006-of-00030.safetensors model-00013-of-00030.safetensors model-00020-of-00030.safetensors model-00027-of-00030.safetensors tokenizer.json +# generation_config.json model-00007-of-00030.safetensors model-00014-of-00030.safetensors model-00021-of-00030.safetensors model-00028-of-00030.safetensors tokenizer_config.json +# model-00001-of-00030.safetensors model-00008-of-00030.safetensors model-00015-of-00030.safetensors model-00022-of-00030.safetensors model-00029-of-00030.safetensors +# model-00002-of-00030.safetensors model-00009-of-00030.safetensors model-00016-of-00030.safetensors model-00023-of-00030.safetensors model-00030-of-00030.safetensors +export DYNAMO_MODEL_DIR=/path/to/your/models/Llama-3.3-70B-Instruct + +# Set repository directory (for Thompson Sampling router) +export DYNAMO_REPO_DIR=/path/to/NeMo-Agent-Toolkit/external/dynamo # ============================================================================= # OPTIONAL VARIABLES - GPU Configuration @@ -24,39 
+38,39 @@ export DYNAMO_REPO_DIR="/path/to/NeMo-Agent-Toolkit" # GPU device IDs for unified mode (comma-separated) # Default: 0,1,2,3 -DYNAMO_GPU_DEVICES="0,1,2,3" +export DYNAMO_GPU_DEVICES=0,1,2,3 # GPU device IDs for disaggregated mode prefill workers # Default: 0,1 -# DYNAMO_PREFILL_GPUS="0,1" +# DYNAMO_PREFILL_GPUS=0,1 # GPU device IDs for disaggregated mode decode workers # Default: 2,3 -# DYNAMO_DECODE_GPUS="2,3" +# DYNAMO_DECODE_GPUS=2,3 # Tensor parallelism size (number of GPUs per worker) # Default: 4 for unified mode, 2 for disaggregated mode -# DYNAMO_TP_SIZE="4" +# DYNAMO_TP_SIZE=4 # ============================================================================= # OPTIONAL VARIABLES - Network Configuration # ============================================================================= # HTTP port for Dynamo frontend API -# Default: 8000 -# DYNAMO_HTTP_PORT="8000" +# Default: 8099 +# DYNAMO_HTTP_PORT=8099 # ETCD client port for metadata and discovery # Default: 2379 -# DYNAMO_ETCD_PORT="2379" +# DYNAMO_ETCD_PORT=2379 # ETCD peer port # Default: 2390 -# DYNAMO_ETCD_PEER_PORT="2390" +# DYNAMO_ETCD_PEER_PORT=2390 # NATS messaging port # Default: 4222 -# DYNAMO_NATS_PORT="4222" +# DYNAMO_NATS_PORT=4222 # ============================================================================= # OPTIONAL VARIABLES - Model Configuration @@ -64,11 +78,11 @@ DYNAMO_GPU_DEVICES="0,1,2,3" # Model name as exposed by the API # Default: llama-3.3-70b -# DYNAMO_MODEL_NAME="llama-3.3-70b" +# DYNAMO_MODEL_NAME=llama-3.3-70b # Shared memory size for Docker container # Default: 16g -# DYNAMO_SHM_SIZE="16g" +# DYNAMO_SHM_SIZE=16g # ============================================================================= # OPTIONAL VARIABLES - Disaggregated Mode @@ -76,46 +90,26 @@ DYNAMO_GPU_DEVICES="0,1,2,3" # Bootstrap port for disaggregated mode communication # Default: 12345 -# DYNAMO_DISAGG_BOOTSTRAP_PORT="12345" +# DYNAMO_DISAGG_BOOTSTRAP_PORT=12345 # Transfer backend for KV cache (nixl, 
nccl, or gloo) # Default: nixl -# DYNAMO_DISAGG_TRANSFER_BACKEND="nixl" - +# DYNAMO_DISAGG_TRANSFER_BACKEND=nixl # ============================================================================= -# OPTIONAL VARIABLES - Custom Thompson Sampler +# OPTIONAL VARIABLES - Performance Tuning # ============================================================================= -# Path to CSV file for router decision logging -# Default: router_metrics.csv -# ROUTER_METRICS_CSV="router_metrics.csv" - -# timeout period for dynamo worker initialization -# Default: 300 -# DYNAMO_WORKER_INIT_TIMEOUT_S=300 - -# ============================================================================= -# OPTIONAL VARIABLES - Metrics Configuration -# ============================================================================= - -# Each component exposes Prometheus metrics on its own port to avoid conflicts. -# This allows collecting metrics from Worker, Router, and Processor separately. - -# Worker metrics port (KV cache stats, NATS metrics, internal stats) -# Default: 8081 -# DYNAMO_WORKER_METRICS_PORT="8081" - -# Router metrics port (Thompson Sampling routing metrics) -# Default: 8082 -# DYNAMO_ROUTER_METRICS_PORT="8082" +# Worker initialization timeout (seconds) +# Increase for large models (70B+) or cold starts +# Default: 1800 (30 minutes) +# DYNAMO_WORKER_INIT_TIMEOUT_S=1800 -# Processor metrics port (Thompson Sampling KVE metrics) -# Default: 8083 -# DYNAMO_PROCESSOR_METRICS_PORT="8083" +# Block size in tokens - must match between SGLang (--page-size) and Frontend (--kv-cache-block-size) +# Default: 64 tokens per block +DYNAMO_KV_BLOCK_SIZE=64 -# Metrics endpoints after startup: -# Frontend: http://localhost:${DYNAMO_HTTP_PORT}/metrics (latency, throughput) -# Worker: http://localhost:${DYNAMO_WORKER_METRICS_PORT}/metrics (KV cache) -# Router: http://localhost:${DYNAMO_ROUTER_METRICS_PORT}/metrics (routing) -# Processor: http://localhost:${DYNAMO_PROCESSOR_METRICS_PORT}/metrics (KVE) +# 
Fraction of GPU memory for KV cache (0.0-1.0) +# Reduce to test cache pressure/degradation scenarios +# Default: 0.9 (90% of GPU memory for KV cache) +DYNAMO_MEM_FRACTION_STATIC=0.9 diff --git a/external/dynamo/collect_metrics.sh b/external/dynamo/collect_metrics.sh index 16dc51bcff..aeeb3dc0ea 100755 --- a/external/dynamo/collect_metrics.sh +++ b/external/dynamo/collect_metrics.sh @@ -62,3 +62,8 @@ done + + + + + diff --git a/external/dynamo/monitoring/README.md b/external/dynamo/monitoring/README.md index a30f433928..6f2ae2e7bb 100644 --- a/external/dynamo/monitoring/README.md +++ b/external/dynamo/monitoring/README.md @@ -58,9 +58,11 @@ docker compose up -d | Component | Port | URL | Description | |-----------|------|-----|-------------| | Frontend | 8000 | `http://localhost:8000/metrics` | User-facing metrics (latency, throughput) | -| Worker | 8081 | `http://localhost:8081/metrics` | Internal metrics (KV cache, NATS stats) | -| Router | 8082 | `http://localhost:8082/metrics` | Thompson Sampling routing metrics | -| Processor | 8083 | `http://localhost:8083/metrics` | Thompson Sampling KVE metrics | +| Workers | 18081-180xx | `http://localhost:18081/metrics` | Internal metrics (KV cache, NATS stats) - one port per worker | +| Router | 18090 | `http://localhost:18090/metrics` | Thompson Sampling routing metrics | +| Processor | 18091 | `http://localhost:18091/metrics` | Thompson Sampling KVE metrics | + +**Note**: Worker metrics ports are sequential starting at 18081. With 2 workers: 18081, 18082. With 4 workers: 18081-18084. 
## Key Metrics @@ -158,19 +160,35 @@ Custom Thompson Sampling KV Efficiency (KVE) metrics from the processor componen | `dynamo_component_thompson_kve_` | `dynamo_component_thompson_kve_host_blocks_total` | Counter | KV blocks from CPU memory | | `dynamo_component_thompson_kve_` | `dynamo_component_thompson_kve_disk_blocks_total` | Counter | KV blocks from disk | -**KV Efficiency (KVE) Calculation:** +**KV Cache Efficiency Score (KVES) Calculation:** + +The full KVES formula is: +``` +KVES = (TotalWork - ActualWork) / TotalWork ∈ [0,1] + where 0 = no cache benefit, 1 = full reuse + +ActualWork = + w_compute * recomputed_prefill_blocks * block_size +TotalWork = cached_prompt_blocks * block_size +w_hit = (w_gpu_hit, w_cpu_hit, w_disk_hit) # weights per hit source +``` + +Since CPU/disk hit metrics are not available in SGLang (KVBM not yet supported), we use a **simplified KVES proxy**: + ```promql -# KV Cache Efficiency percentage (using SGLang native metric - RECOMMENDED) -sglang:cache_hit_rate * 100 +# KVES Proxy (using SGLang native metric - RECOMMENDED) +sglang:cache_hit_rate -# Alternative: Using processor counters (may show 0 if SGLang doesn't return cached_tokens in API) -# rate(dynamo_component_thompson_kve_cached_tokens_total[5m]) / rate(dynamo_component_thompson_kve_prompt_tokens_total[5m]) * 100 +# As percentage +sglang:cache_hit_rate * 100 ``` > **Why use SGLang's native metric?** SGLang computes cache hit rate internally but doesn't include > `cached_tokens` in its API responses. The processor's `thompson_kve_*` counters will show 0 > unless the underlying engine provides `usage.prompt_tokens_details.cached_tokens`. +> **Note on Full KVES**: To implement the full KVES equation with CPU/disk hit weights, you would need +> to switch to vLLM with KVBM enabled, which provides GPU→CPU→Disk tiered caching with proper metrics. + ## KV Cache Metrics Status This section documents the working status of all KV cache related metrics across the Dynamo stack. 
@@ -244,12 +262,20 @@ The pre-configured dashboard "Dynamo LLM Overview" includes: 1. **Inflight Requests** - Current load across all components 2. **Requests/min** - Throughput 3. **Time to First Token (P95)** - Latency to start generating -4. **Cache Hit Rate %** - Prefix cache hit rate (may be 0 without repeated prefix queries) +4. **KVES Proxy (Cache Hit Rate %)** - KV Efficiency Score proxy using prefix cache hit rate 5. **TTFT Over Time** - P50/P95/P99 latency trends 6. **ITL Over Time** - Inter-token latency trends 7. **Token Throughput** - Tokens generated per second 8. **KV Cache Usage** - Memory usage % and prefix cache hit rate % over time 9. **KV Cache Tokens & Throughput** - Absolute token count and generation throughput +10. **KV Cache Details (Per-Worker)** - Detailed per-worker metrics including: + - KVES: Prefix hit rate (%) - `avg_over_time(sglang:cache_hit_rate[1m]) * 100` + - KV Usage (%) - `avg_over_time(sglang:token_usage[1m]) * 100` + - KV Tokens Used - `last_over_time(sglang:num_used_tokens[1m])` + - KV Capacity (blocks) - `last_over_time(dynamo_component_kvstats_total_blocks[1m])` + - Frontend Block Size - `last_over_time(dynamo_frontend_model_kv_cache_block_size[5m])` +11. **KVES Proxy by Worker** - Color-coded efficiency score per worker (0-1 scale) +12. 
**KV Cache Memory Usage % by Worker** - Per-worker memory utilization ### Thompson Sampling Panels (Included) @@ -305,10 +331,39 @@ docker compose logs -f grafana ### Reset Data (Start Fresh) ```bash -docker compose down -v # Removes volumes +docker compose down -v # Removes ALL volumes (Prometheus + Grafana data) docker compose up -d ``` +### Clear Prometheus Data Only + +If you're seeing duplicate labels in Grafana (for example, after restarting workers with new IDs), you can clear just the Prometheus data while keeping Grafana settings: + +```bash +# Stop the monitoring containers +docker stop dynamo-prometheus dynamo-grafana +docker rm dynamo-prometheus dynamo-grafana + +# Remove just the Prometheus data volume (clears all historical metrics) +docker volume rm monitoring_prometheus_data && echo "Prometheus data volume removed (old metrics cleared)" + +# Restart the monitoring stack with fresh data +docker compose up -d +``` + +Alternatively, use the stop script with the `--kill-metrics` flag: + +```bash +# From the dynamo directory +bash stop_dynamo.sh --kill-metrics + +# Then remove the Prometheus volume +docker volume rm monitoring_prometheus_data + +# Restart everything (monitoring will start automatically) +bash start_dynamo_optimized_thompson_hints.sh +``` + ## Remote Access via SSH Port Forwarding If the monitoring stack is running on a remote server, use SSH port forwarding to access Grafana and Prometheus locally. 
diff --git a/external/dynamo/monitoring/grafana/provisioning/dashboards/dashboards.yml b/external/dynamo/monitoring/grafana/provisioning/dashboards/dashboards.yml index 7c91586621..08c8673e8d 100644 --- a/external/dynamo/monitoring/grafana/provisioning/dashboards/dashboards.yml +++ b/external/dynamo/monitoring/grafana/provisioning/dashboards/dashboards.yml @@ -17,3 +17,8 @@ providers: + + + + + diff --git a/external/dynamo/monitoring/grafana/provisioning/dashboards/json/dynamo-overview.json b/external/dynamo/monitoring/grafana/provisioning/dashboards/json/dynamo-overview.json index 901957a911..801b2a7d0d 100644 --- a/external/dynamo/monitoring/grafana/provisioning/dashboards/json/dynamo-overview.json +++ b/external/dynamo/monitoring/grafana/provisioning/dashboards/json/dynamo-overview.json @@ -95,7 +95,7 @@ "pluginVersion": "10.2.2", "targets": [ { - "expr": "sum(increase(dynamo_frontend_requests_total[1m]))", + "expr": "sum(increase(dynamo_frontend_requests_total[5s]))", "legendFormat": "Total Requests/min (dynamo_frontend_requests_total)", "refId": "A" } @@ -143,7 +143,7 @@ "pluginVersion": "10.2.2", "targets": [ { - "expr": "histogram_quantile(0.95, rate(dynamo_frontend_time_to_first_token_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, rate(dynamo_frontend_time_to_first_token_seconds_bucket[5s]))", "legendFormat": "P95 TTFT (dynamo_frontend_time_to_first_token_seconds)", "refId": "A" } @@ -180,7 +180,7 @@ "justifyMode": "auto", "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": ["mean"], "fields": "", "values": false }, @@ -190,11 +190,11 @@ "targets": [ { "expr": "sglang:cache_hit_rate * 100", - "legendFormat": "Cache Hit Rate (sglang:cache_hit_rate)", + "legendFormat": "Cache Hit Rate ({{instance}}) [sglang:cache_hit_rate * 100]", "refId": "A" } ], - "title": "Cache Hit Rate %", + "title": "KVES Proxy (Cache Hit Rate %)", "type": "stat" }, { @@ -244,17 +244,17 @@ "pluginVersion": "10.2.2", "targets": [ { - "expr": 
"histogram_quantile(0.5, rate(dynamo_frontend_time_to_first_token_seconds_bucket[1m]))", + "expr": "histogram_quantile(0.5, rate(dynamo_frontend_time_to_first_token_seconds_bucket[5s]))", "legendFormat": "P50 (dynamo_frontend_time_to_first_token_seconds)", "refId": "A" }, { - "expr": "histogram_quantile(0.95, rate(dynamo_frontend_time_to_first_token_seconds_bucket[1m]))", + "expr": "histogram_quantile(0.95, rate(dynamo_frontend_time_to_first_token_seconds_bucket[5s]))", "legendFormat": "P95 (dynamo_frontend_time_to_first_token_seconds)", "refId": "B" }, { - "expr": "histogram_quantile(0.99, rate(dynamo_frontend_time_to_first_token_seconds_bucket[1m]))", + "expr": "histogram_quantile(0.99, rate(dynamo_frontend_time_to_first_token_seconds_bucket[5s]))", "legendFormat": "P99 (dynamo_frontend_time_to_first_token_seconds)", "refId": "C" } @@ -309,17 +309,17 @@ "pluginVersion": "10.2.2", "targets": [ { - "expr": "histogram_quantile(0.5, rate(dynamo_frontend_inter_token_latency_seconds_bucket[1m]))", + "expr": "histogram_quantile(0.5, rate(dynamo_frontend_inter_token_latency_seconds_bucket[5s]))", "legendFormat": "P50 (dynamo_frontend_inter_token_latency_seconds)", "refId": "A" }, { - "expr": "histogram_quantile(0.95, rate(dynamo_frontend_inter_token_latency_seconds_bucket[1m]))", + "expr": "histogram_quantile(0.95, rate(dynamo_frontend_inter_token_latency_seconds_bucket[5s]))", "legendFormat": "P95 (dynamo_frontend_inter_token_latency_seconds)", "refId": "B" }, { - "expr": "histogram_quantile(0.99, rate(dynamo_frontend_inter_token_latency_seconds_bucket[1m]))", + "expr": "histogram_quantile(0.99, rate(dynamo_frontend_inter_token_latency_seconds_bucket[5s]))", "legendFormat": "P99 (dynamo_frontend_inter_token_latency_seconds)", "refId": "C" } @@ -375,12 +375,17 @@ "targets": [ { "expr": "sglang:gen_throughput", - "legendFormat": "Backend Throughput tok/s (sglang:gen_throughput)", + "legendFormat": "Worker ({{instance}}) [sglang:gen_throughput]", "refId": "A" }, { - 
"expr": "rate(dynamo_frontend_output_tokens_total[1m])", - "legendFormat": "Frontend Throughput tok/s (dynamo_frontend_output_tokens_total)", + "expr": "sum(sglang:gen_throughput)", + "legendFormat": "Total Workers (sum) [sum(sglang:gen_throughput)]", + "refId": "C" + }, + { + "expr": "rate(dynamo_frontend_output_tokens_total{job=\"dynamo-frontend\"}[5s])", + "legendFormat": "Frontend Output (delivered) [rate(dynamo_frontend_output_tokens_total[5s])]", "refId": "B" } ], @@ -435,12 +440,12 @@ "targets": [ { "expr": "sglang:token_usage * 100", - "legendFormat": "KV Cache Memory Usage % (sglang:token_usage)", + "legendFormat": "KV Cache % ({{instance}}) [sglang:token_usage * 100]", "refId": "A" }, { "expr": "sglang:cache_hit_rate * 100", - "legendFormat": "Prefix Cache Hit Rate % (sglang:cache_hit_rate)", + "legendFormat": "Prefix Cache % ({{instance}}) [sglang:cache_hit_rate * 100]", "refId": "B" } ], @@ -495,7 +500,7 @@ "targets": [ { "expr": "sglang:num_used_tokens", - "legendFormat": "Tokens in KV Cache (sglang:num_used_tokens)", + "legendFormat": "Tokens in KV Cache ({{instance}}) [sglang:num_used_tokens]", "refId": "A" } ], @@ -549,12 +554,37 @@ "pluginVersion": "10.2.2", "targets": [ { - "expr": "rate(dynamo_component_thompson_routing_decisions_total[5m])", - "legendFormat": "Routing Decisions/sec (dynamo_component_thompson_routing_decisions_total)", + "expr": "sum(rate(dynamo_frontend_requests_total[5s]))", + "legendFormat": "1. Frontend (total) [sum(rate(dynamo_frontend_requests_total[5s]))]", "refId": "A" + }, + { + "expr": "sum(rate(dynamo_component_requests_total{dynamo_namespace=\"dynamo\",dynamo_component=\"backend\"}[5s]))", + "legendFormat": "2. Processor (backend) [sum(rate(dynamo_component_requests_total{backend}[5s]))]", + "refId": "B" + }, + { + "expr": "sum(rate(dynamo_component_requests_total{dynamo_namespace=\"dynamo\",dynamo_component=\"router\",dynamo_endpoint=\"find_worker\"}[5s]))", + "legendFormat": "3. 
Router (find_worker) [sum(rate(dynamo_component_requests_total{router}[5s]))]", + "refId": "C" + }, + { + "expr": "rate(dynamo_component_requests_total{dynamo_namespace=\"workers\",dynamo_component=\"worker\",instance=\"localhost:18081\"}[5s])", + "legendFormat": "4. Worker 0 (18081) [rate(dynamo_component_requests_total{worker:18081}[5s])]", + "refId": "D" + }, + { + "expr": "rate(dynamo_component_requests_total{dynamo_namespace=\"workers\",dynamo_component=\"worker\",instance=\"localhost:18082\"}[5s])", + "legendFormat": "4. Worker 1 (18082) [rate(dynamo_component_requests_total{worker:18082}[5s])]", + "refId": "E" + }, + { + "expr": "sum(rate(dynamo_component_requests_total{dynamo_namespace=\"workers\",dynamo_component=\"worker\"}[5s]))", + "legendFormat": "4. Workers (total) [sum(rate(dynamo_component_requests_total{workers}[5s]))]", + "refId": "F" } ], - "title": "Routing Decisions (Thompson Sampling)", + "title": "Request Flow (Frontend → Processor → Router → Workers)", "type": "timeseries" }, { @@ -609,13 +639,8 @@ "targets": [ { "expr": "sglang:num_queue_reqs", - "legendFormat": "Queue Depth (sglang:num_queue_reqs)", + "legendFormat": "Queue Depth ({{instance}}) [sglang:num_queue_reqs]", "refId": "A" - }, - { - "expr": "sglang:num_running_reqs", - "legendFormat": "Running Requests (sglang:num_running_reqs)", - "refId": "B" } ], "title": "SGLang Queue Depth", @@ -669,20 +694,254 @@ "targets": [ { "expr": "sglang:num_running_reqs", - "legendFormat": "Running Requests (sglang:num_running_reqs)", + "legendFormat": "Running Requests ({{instance}}) [sglang:num_running_reqs]", + "refId": "A" + } + ], + "title": "Worker Activity (SGLang)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Detailed KV cache metrics per worker including KVES proxy (prefix hit rate), memory usage, token counts, and capacity.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + 
"custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": {"legend": false, "tooltip": false, "viz": false}, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": {"type": "linear"}, + "showPoints": "never", + "spanNulls": false, + "stacking": {"group": "A", "mode": "none"}, + "thresholdsStyle": {"mode": "off"} + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{"color": "green", "value": null}] + }, + "unit": "percent" + }, + "overrides": [ + { + "matcher": {"id": "byRegexp", "options": ".*Tokens.*"}, + "properties": [ + {"id": "unit", "value": "none"}, + {"id": "custom.axisPlacement", "value": "right"} + ] + }, + { + "matcher": {"id": "byRegexp", "options": ".*Blocks.*"}, + "properties": [ + {"id": "unit", "value": "none"}, + {"id": "custom.axisPlacement", "value": "right"}, + {"id": "custom.drawStyle", "value": "bars"}, + {"id": "custom.fillOpacity", "value": 30} + ] + }, + { + "matcher": {"id": "byRegexp", "options": ".*Block Size.*"}, + "properties": [ + {"id": "unit", "value": "none"}, + {"id": "custom.axisPlacement", "value": "hidden"}, + {"id": "custom.drawStyle", "value": "points"}, + {"id": "custom.pointSize", "value": 8} + ] + } + ] + }, + "gridPos": {"h": 10, "w": 24, "x": 0, "y": 36}, + "id": 13, + "options": { + "legend": {"calcs": ["mean", "last", "max"], "displayMode": "table", "placement": "right", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "pluginVersion": "10.2.2", + "targets": [ + { + "expr": "avg_over_time(sglang:cache_hit_rate[1m]) * 100", + "legendFormat": "KVES: Prefix Hit Rate % ({{instance}})", "refId": "A" }, { - "expr": "sglang:num_queue_reqs", - "legendFormat": "Queued Requests (sglang:num_queue_reqs)", + "expr": "avg_over_time(sglang:token_usage[1m]) * 100", + "legendFormat": "KV Usage 
% ({{instance}})", "refId": "B" + }, + { + "expr": "last_over_time(sglang:num_used_tokens[1m])", + "legendFormat": "KV Tokens Used ({{instance}})", + "refId": "C" + }, + { + "expr": "last_over_time(dynamo_component_kvstats_total_blocks[1m])", + "legendFormat": "KV Capacity Blocks ({{instance}})", + "refId": "D" + }, + { + "expr": "max(dynamo_frontend_model_kv_cache_block_size{job=\"dynamo-frontend\"})", + "legendFormat": "Frontend Block Size (tokens)", + "refId": "E" } ], - "title": "Worker Activity (SGLang)", + "title": "KV Cache Details (Per-Worker)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "KV Cache Efficiency Score (KVES) proxy using SGLang's native prefix cache hit rate. KVES ∈ [0,1]: 0 = no cache benefit, 1 = full reuse. This is a simplified proxy for the full KVES equation (which requires CPU/disk hit metrics not currently available in SGLang).", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": {"legend": false, "tooltip": false, "viz": false}, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": {"type": "linear"}, + "showPoints": "never", + "spanNulls": false, + "stacking": {"group": "A", "mode": "none"}, + "thresholdsStyle": {"mode": "off"} + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "red", "value": null}, + {"color": "orange", "value": 0.2}, + {"color": "yellow", "value": 0.4}, + {"color": "light-green", "value": 0.6}, + {"color": "green", "value": 0.8} + ] + }, + "unit": "percentunit", + "min": 0, + "max": 1 + }, + "overrides": [] + }, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 46}, + "id": 14, + "options": { + "legend": { + "calcs": ["mean", 
"lastNotNull"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": {"mode": "multi", "sort": "none"} + }, + "pluginVersion": "10.2.2", + "targets": [ + { + "expr": "sglang:cache_hit_rate", + "legendFormat": "Worker ({{instance}})", + "refId": "A" + } + ], + "title": "KVES Proxy by Worker", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "KV cache memory utilization per worker. Shows how much of the allocated KV cache memory is currently in use.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": {"legend": false, "tooltip": false, "viz": false}, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": {"type": "linear"}, + "showPoints": "never", + "spanNulls": false, + "stacking": {"group": "A", "mode": "none"}, + "thresholdsStyle": {"mode": "off"} + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 70}, + {"color": "orange", "value": 85}, + {"color": "red", "value": 95} + ] + }, + "unit": "percent", + "min": 0, + "max": 100 + }, + "overrides": [] + }, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 46}, + "id": 15, + "options": { + "legend": { + "calcs": ["mean", "lastNotNull"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": {"mode": "multi", "sort": "none"} + }, + "pluginVersion": "10.2.2", + "targets": [ + { + "expr": "sglang:token_usage * 100", + "legendFormat": "Worker ({{instance}})", + "refId": "A" + } + ], + "title": "KV Cache Memory Usage % by Worker", "type": "timeseries" } ], - "refresh": "5s", + "refresh": "2s", "schemaVersion": 38, 
"style": "dark", "tags": ["dynamo", "llm", "inference"], diff --git a/external/dynamo/monitoring/grafana/provisioning/datasources/datasources.yml b/external/dynamo/monitoring/grafana/provisioning/datasources/datasources.yml index c6a8cab8e3..0c065f282c 100644 --- a/external/dynamo/monitoring/grafana/provisioning/datasources/datasources.yml +++ b/external/dynamo/monitoring/grafana/provisioning/datasources/datasources.yml @@ -11,5 +11,7 @@ datasources: url: http://localhost:9090 isDefault: true editable: true + jsonData: + timeInterval: 2s diff --git a/external/dynamo/monitoring/prometheus.yml b/external/dynamo/monitoring/prometheus.yml index 91300240ee..abeaa3187b 100644 --- a/external/dynamo/monitoring/prometheus.yml +++ b/external/dynamo/monitoring/prometheus.yml @@ -3,15 +3,19 @@ # Prometheus configuration for Dynamo metrics collection # -# Metrics Endpoints: +# Metrics Endpoints (using 18xxx range to avoid conflicts): # - Frontend (8000): User-facing latency, throughput, tokens -# - Worker (8081): KV cache stats, NATS metrics, internal stats -# - Router (8082): Thompson Sampling routing metrics -# - Processor (8083): Thompson Sampling KVE metrics +# - Workers (18081-180xx): KV cache stats, NATS metrics, internal stats (one per worker) +# - Router (18090): Thompson Sampling routing metrics +# - Processor (18091): Thompson Sampling KVE metrics +# +# Note: Worker ports are sequential starting at 18081. With 2 workers: 18081, 18082. +# With 4 workers: 18081, 18082, 18083, 18084. With 8 workers: 18081-18088. +# Add more targets below if you run more than 8 workers. 
global: - scrape_interval: 15s - evaluation_interval: 15s + scrape_interval: 2s + evaluation_interval: 2s scrape_configs: # Dynamo Frontend metrics (user-facing latency, throughput) @@ -19,28 +23,38 @@ scrape_configs: static_configs: - targets: ['localhost:8000'] metrics_path: /metrics - scrape_interval: 5s + scrape_interval: 2s # Dynamo Worker metrics (KV cache, internal stats) + # Multiple workers use sequential ports starting at 18081 + # Add/remove targets based on your NUM_WORKERS setting - job_name: 'dynamo-worker' static_configs: - - targets: ['localhost:8081'] + - targets: + - 'localhost:18081' + - 'localhost:18082' + - 'localhost:18083' + - 'localhost:18084' + - 'localhost:18085' + - 'localhost:18086' + - 'localhost:18087' + - 'localhost:18088' metrics_path: /metrics - scrape_interval: 5s + scrape_interval: 2s # Thompson Sampling Router metrics - job_name: 'dynamo-router' static_configs: - - targets: ['localhost:8082'] + - targets: ['localhost:18090'] metrics_path: /metrics - scrape_interval: 5s + scrape_interval: 2s # Thompson Sampling Processor metrics (KVE) - job_name: 'dynamo-processor' static_configs: - - targets: ['localhost:8083'] + - targets: ['localhost:18091'] metrics_path: /metrics - scrape_interval: 5s + scrape_interval: 2s # Prometheus self-monitoring - job_name: 'prometheus' diff --git a/external/dynamo/optimized/__init__.py b/external/dynamo/optimized/__init__.py index da8420481d..98c3816c75 100644 --- a/external/dynamo/optimized/__init__.py +++ b/external/dynamo/optimized/__init__.py @@ -25,4 +25,3 @@ See ARCHITECTURE.md for detailed documentation. """ - diff --git a/external/dynamo/optimized/processor.py b/external/dynamo/optimized/processor.py index eaa891f2d7..5c1f4871c8 100644 --- a/external/dynamo/optimized/processor.py +++ b/external/dynamo/optimized/processor.py @@ -35,7 +35,7 @@ 1. **This Processor registers as `dynamo.backend.generate`** - Dynamically with instance ID 2. 
**Processor calls `register_llm()`** - Advertises model card in ETCD 3. **Frontend's ModelWatcher discovers us** - Routes requests to our endpoint -4. **SGLang Worker registers as `dynamo.worker.generate`** - We forward to actual workers +4. **SGLang Worker registers as `workers.worker.generate`** - We forward to actual workers ## Request Flow @@ -45,14 +45,14 @@ → THIS PROCESSOR (discovered via model card!) → extracts hints from nvext annotations → queries Thompson Sampling router → worker_id - → forwards to dynamo.worker.generate (actual SGLang workers) + → forwards to workers.worker.generate (actual SGLang workers) ``` Key differences from generalized/processor.py: - Uses dynamic discovery (no --static-endpoint on frontend) - Registers model card via register_llm() for ETCD discovery - Registers as `dynamo.backend.generate` (not `dynamo.processor.process`) -- Forwards to `dynamo.worker.generate` (not `dynamo.backend.generate`) +- Forwards to `workers.worker.generate` (workers in separate namespace) - Receives PreprocessedRequest instead of ChatCompletionRequest - Extracts hints from nvext annotations (prefix_id:value format) - Uses Dynamo metrics API for Prometheus integration (auto-exposed at /metrics) @@ -110,8 +110,11 @@ from typing import Any import uvloop -from dynamo.llm import ModelInput, ModelType, register_llm -from dynamo.runtime import DistributedRuntime, dynamo_worker +from dynamo.llm import ModelInput +from dynamo.llm import ModelType +from dynamo.llm import register_llm +from dynamo.runtime import DistributedRuntime +from dynamo.runtime import dynamo_worker from dynamo.runtime.logging import configure_dynamo_logging from pydantic import BaseModel @@ -349,25 +352,23 @@ async def initialize(self): await self.router_pick_client.wait_for_instances() logger.info("Router clients initialized successfully") - # Connect to actual workers at dynamo.worker.generate - # (We register as "backend" to intercept frontend requests, but actual SGLang - # workers 
register as "worker" so we can forward to them after routing) - worker_component = self.runtime.namespace("dynamo").component("worker") + # Connect to actual workers at workers.worker.generate + # Workers are in the "workers" namespace (hidden from frontend discovery) + # while this processor is in "dynamo" namespace (frontend discovers us) + worker_component = self.runtime.namespace("workers").component("worker") self.engine_client = await worker_component.endpoint("generate").client() logger.info("Engine client created, waiting for worker instances...") await self.engine_client.wait_for_instances() - logger.info("Processor initialized successfully (routing to dynamo.worker.generate)") + logger.info("Processor initialized successfully (routing to workers.worker.generate)") # ---- annotation extraction ---- @staticmethod - def _extract_annotation( - annotations: list[str], key: str, default: str | None = None - ) -> str | None: + def _extract_annotation(annotations: list[str], key: str, default: str | None = None) -> str | None: """Extract value from annotations list (format: 'key:value').""" prefix = f"{key}:" for ann in annotations: if ann.startswith(prefix): - return ann[len(prefix) :] + return ann[len(prefix):] return default def _extract_hints(self, request: dict[str, Any]) -> tuple[str, int, str, str]: @@ -480,9 +481,7 @@ async def _pick_worker( if worker_id is not None: self._metrics.routing_decisions_total.inc({"worker_id": str(worker_id)}) else: - logger.warning( - "Router stream ended without worker_id; falling back to engine load balancing." 
- ) + logger.warning("Router stream ended without worker_id; falling back to engine load balancing.") return worker_id, decision_id @@ -602,9 +601,7 @@ async def _stream_from_engine( # Handle engine errors if "error" in data: latency_ms = (time.perf_counter() - t0) * 1000.0 - await self._send_feedback_safely( - decision_id, latency_ms, False, tokens_in, tokens_out, "error" - ) + await self._send_feedback_safely(decision_id, latency_ms, False, tokens_in, tokens_out, "error") self._metrics.engine_errors_total.inc() yield {"error": data["error"]} return @@ -630,9 +627,12 @@ async def _stream_from_engine( latency_ms = latency_seconds * 1000.0 # Send feedback to router (this is already fire-and-forget) - await self._send_feedback_safely( - decision_id, latency_ms, True, tokens_in, tokens_out, finish_reason - ) + await self._send_feedback_safely(decision_id, + latency_ms, + True, + tokens_in, + tokens_out, + finish_reason) # Update core Prometheus metrics (fast atomic operations) self._metrics.request_latency_seconds.observe(latency_seconds) @@ -648,9 +648,7 @@ async def _stream_from_engine( except Exception as e: latency_ms = (time.perf_counter() - t0) * 1000.0 - await self._send_feedback_safely( - decision_id, latency_ms, False, tokens_in, tokens_out, "exception" - ) + await self._send_feedback_safely(decision_id, latency_ms, False, tokens_in, tokens_out, "exception") self._metrics.engine_errors_total.inc() logger.exception("Engine stream exception") yield {"error": str(e)} @@ -699,9 +697,7 @@ async def generate(self, raw: dict[str, Any]): reuse_budget = await self._update_prefix_state(prefix_id, total_requests) # Pick worker via Thompson Sampling router - worker_id, decision_id = await self._pick_worker( - token_ids, prefix_id, reuse_budget, osl, iat - ) + worker_id, decision_id = await self._pick_worker(token_ids, prefix_id, reuse_budget, osl, iat) logger.info( "Routing decision: worker=%s decision=%s reuse_budget=%d", @@ -749,7 +745,7 @@ def parse_args(): return 
parser.parse_args() -@dynamo_worker(static=False) # Dynamic mode for ETCD discovery by frontend +@dynamo_worker(static=False) # Dynamic mode - required to call router/workers which are also dynamic async def worker(runtime: DistributedRuntime): """ Main worker entry point for the Thompson Sampling processor. @@ -769,7 +765,7 @@ async def worker(runtime: DistributedRuntime): # 1. We register as dynamo.backend.generate (dynamically with instance ID) # 2. We call register_llm() to advertise ourselves in ETCD # 3. Frontend's ModelWatcher discovers us and routes requests to us - # 4. We forward to actual workers at dynamo.worker.generate + # 4. We forward to actual workers at workers.worker.generate component = runtime.namespace("dynamo").component("backend") await component.create_service() @@ -784,12 +780,15 @@ async def worker(runtime: DistributedRuntime): args.model_name, args.model_path, ) + # IMPORTANT: kv_cache_block_size must match what workers use (default page_size=1) + # Otherwise checksums will differ and frontend will reject the processor's model card await register_llm( model_input=ModelInput.Tokens, # We accept tokenized input from frontend model_type=ModelType.Chat | ModelType.Completions, # Chat and completions endpoints endpoint=endpoint, model_path=args.model_path, model_name=args.model_name, + kv_cache_block_size=1, # Must match worker page_size to ensure same checksum ) logger.info("Model card registered successfully - frontend can now discover us via ETCD") diff --git a/external/dynamo/optimized/router.py b/external/dynamo/optimized/router.py index 8d032d4dc9..e1c02f6a05 100644 --- a/external/dynamo/optimized/router.py +++ b/external/dynamo/optimized/router.py @@ -279,101 +279,100 @@ def apply_cli_overrides(config: dict, args: argparse.Namespace) -> dict: return config -# Prometheus metrics - initialized lazily -_prometheus_initialized = False -_metrics = {} - - def _init_prometheus_metrics(): """Initialize Prometheus metrics lazily.""" - global 
_prometheus_initialized, _metrics - if _prometheus_initialized: - return _metrics + import functools - try: - from prometheus_client import Counter, Histogram, Gauge, REGISTRY + @functools.lru_cache(maxsize=1) + def _init() -> dict: + metrics: dict = {} + try: + from prometheus_client import REGISTRY + from prometheus_client import Counter + from prometheus_client import Gauge + from prometheus_client import Histogram + + metrics["decisions_total"] = Counter( + "thompson_router_decisions_total", + "Total routing decisions by worker", + ["worker_id"], + registry=REGISTRY, + ) + metrics["kv_overlap"] = Gauge( + "thompson_router_kv_overlap", + "KV cache overlap score for last decision by worker", + ["worker_id"], + registry=REGISTRY, + ) + metrics["feedback_latency"] = Histogram( + "thompson_router_feedback_latency_seconds", + "Latency from feedback by worker", + ["worker_id"], + buckets=[0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, 120.0], + registry=REGISTRY, + ) + metrics["reward"] = Gauge( + "thompson_router_reward", + "Last computed reward by worker", + ["worker_id"], + registry=REGISTRY, + ) + metrics["pending_decisions"] = Gauge( + "thompson_router_pending_decisions", + "Number of pending decisions awaiting feedback", + registry=REGISTRY, + ) + metrics["timeout_penalties"] = Counter( + "thompson_router_timeout_penalties_total", + "Total timeout penalties applied", + registry=REGISTRY, + ) + metrics["sticky_decisions"] = Counter( + "thompson_router_sticky_decisions_total", + "Decisions that stayed on the same worker (sticky)", + registry=REGISTRY, + ) + metrics["switch_decisions"] = Counter( + "thompson_router_switch_decisions_total", + "Decisions that switched to a different worker", + registry=REGISTRY, + ) + metrics["beta_alpha"] = Gauge( + "thompson_router_beta_alpha", + "Beta distribution alpha parameter by worker", + ["worker_id"], + registry=REGISTRY, + ) + metrics["beta_beta"] = Gauge( + "thompson_router_beta_beta", + "Beta distribution beta 
parameter by worker", + ["worker_id"], + registry=REGISTRY, + ) + metrics["prefix_state_size"] = Gauge( + "thompson_router_prefix_state_size", + "Number of active prefix states", + registry=REGISTRY, + ) + metrics["reuse_budget"] = Histogram( + "thompson_router_reuse_budget", + "Distribution of reuse_budget values", + buckets=[0, 1, 2, 5, 10, 20, 50, 100], + registry=REGISTRY, + ) + metrics["tokens_per_request"] = Histogram( + "thompson_router_tokens_per_request", + "Distribution of input token counts", + buckets=[32, 64, 128, 256, 512, 1024, 2048, 4096, 8192], + registry=REGISTRY, + ) + logger.info("Prometheus metrics initialized for router") + except ImportError: + logger.warning("prometheus_client not available, metrics disabled") - _metrics["decisions_total"] = Counter( - "thompson_router_decisions_total", - "Total routing decisions by worker", - ["worker_id"], - registry=REGISTRY, - ) - _metrics["kv_overlap"] = Gauge( - "thompson_router_kv_overlap", - "KV cache overlap score for last decision by worker", - ["worker_id"], - registry=REGISTRY, - ) - _metrics["feedback_latency"] = Histogram( - "thompson_router_feedback_latency_seconds", - "Latency from feedback by worker", - ["worker_id"], - buckets=[0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, 120.0], - registry=REGISTRY, - ) - _metrics["reward"] = Gauge( - "thompson_router_reward", - "Last computed reward by worker", - ["worker_id"], - registry=REGISTRY, - ) - _metrics["pending_decisions"] = Gauge( - "thompson_router_pending_decisions", - "Number of pending decisions awaiting feedback", - registry=REGISTRY, - ) - _metrics["timeout_penalties"] = Counter( - "thompson_router_timeout_penalties_total", - "Total timeout penalties applied", - registry=REGISTRY, - ) - _metrics["sticky_decisions"] = Counter( - "thompson_router_sticky_decisions_total", - "Decisions that stayed on the same worker (sticky)", - registry=REGISTRY, - ) - _metrics["switch_decisions"] = Counter( - "thompson_router_switch_decisions_total", - 
"Decisions that switched to a different worker", - registry=REGISTRY, - ) - _metrics["beta_alpha"] = Gauge( - "thompson_router_beta_alpha", - "Beta distribution alpha parameter by worker", - ["worker_id"], - registry=REGISTRY, - ) - _metrics["beta_beta"] = Gauge( - "thompson_router_beta_beta", - "Beta distribution beta parameter by worker", - ["worker_id"], - registry=REGISTRY, - ) - _metrics["prefix_state_size"] = Gauge( - "thompson_router_prefix_state_size", - "Number of active prefix states", - registry=REGISTRY, - ) - _metrics["reuse_budget"] = Histogram( - "thompson_router_reuse_budget", - "Distribution of reuse_budget values", - buckets=[0, 1, 2, 5, 10, 20, 50, 100], - registry=REGISTRY, - ) - _metrics["tokens_per_request"] = Histogram( - "thompson_router_tokens_per_request", - "Distribution of input token counts", - buckets=[32, 64, 128, 256, 512, 1024, 2048, 4096, 8192], - registry=REGISTRY, - ) - _prometheus_initialized = True - logger.info("Prometheus metrics initialized for router") - except ImportError: - logger.warning("prometheus_client not available, metrics disabled") - _prometheus_initialized = True # Don't retry + return metrics - return _metrics + return _init() # ---------------------- request / response models ---------------------- # @@ -410,13 +409,17 @@ class FeedbackAck(BaseModel): # ---------------------- helper decorator ---------------------- # def safe_update(lock_name: str): + def decorator(fn): + @wraps(fn) def wrapper(self, *args, **kwargs): lock = getattr(self, lock_name) with lock: return fn(self, *args, **kwargs) + return wrapper + return decorator @@ -591,10 +594,11 @@ async def initialize(self): # Initialize Prometheus metrics self._metrics = _init_prometheus_metrics() - # Connect to actual SGLang workers at dynamo.worker.generate + # Connect to actual SGLang workers at workers.worker.generate + # Workers are in the "workers" namespace (hidden from frontend discovery) # (NOT backend.generate - that's where the Processor 
registers to intercept frontend) - engine = self.runtime.namespace("dynamo").component("worker") - logger.info("Getting engine client for dynamo/worker/generate") + engine = self.runtime.namespace("workers").component("worker") + logger.info("Getting engine client for workers/worker/generate") self.engine_client = await engine.endpoint("generate").client() min_workers = int(self.min_workers) @@ -617,9 +621,7 @@ async def initialize(self): while True: remaining = deadline - time.monotonic() if remaining <= 0: - raise TimeoutError( - f"Timed out after {timeout_s}s waiting for >= {min_workers} backend worker(s)" - ) + raise TimeoutError(f"Timed out after {timeout_s}s waiting for >= {min_workers} backend worker(s)") try: await asyncio.wait_for( @@ -846,7 +848,8 @@ def _feature_vector( prefill_norm, iat_norm, reuse_norm, - ], dtype=np.float64) + ], + dtype=np.float64) def _load_score(self, wid: int, metrics: dict[str, Any] | None, job_cost_total: float) -> float: gpu = 0.0 @@ -1023,15 +1026,17 @@ def _sweep_pending(self, now: float): if self._metrics.get("timeout_penalties"): self._metrics["timeout_penalties"].inc() - self._emit_trace("timeout", { - "decision_id": did, - "wid": wid, - "reward": reward, - "age": self.feedback_timeout_seconds, - "prefix_id": rec.get("prefix_id"), - "osl": rec.get("osl"), - "prefill_bin": rec.get("prefill_bin"), - }) + self._emit_trace( + "timeout", + { + "decision_id": did, + "wid": wid, + "reward": reward, + "age": self.feedback_timeout_seconds, + "prefix_id": rec.get("prefix_id"), + "osl": rec.get("osl"), + "prefill_bin": rec.get("prefill_bin"), + }) logger.warning("Timeout feedback: wid=%s decision=%s reward=%.3f", wid, did, reward) # --------------------- main endpoint: find_worker --------------------- # @@ -1121,9 +1126,8 @@ async def generate(self, request: dict): if chosen == last_w: if self._metrics.get("sticky_decisions"): self._metrics["sticky_decisions"].inc() - else: - if self._metrics.get("switch_decisions"): - 
self._metrics["switch_decisions"].inc() + elif self._metrics.get("switch_decisions"): + self._metrics["switch_decisions"].inc() # Decision trace if self.debug_traces: @@ -1136,16 +1140,26 @@ async def generate(self, request: dict): } for i, wid in enumerate(worker_list) } - self._emit_trace("decision", { - "decision_id": decision_id, - "prefix_id": req.prefix_id, - "chosen": int(chosen), - "workers": details, - }) + self._emit_trace("decision", + { + "decision_id": decision_id, + "prefix_id": req.prefix_id, + "chosen": int(chosen), + "workers": details, + }) logger.info( - "Router picked worker=%s decision=%s prefix=%s (last=%s reuse_after=%s osl=%s prefill_cost=%.3f iat=%s overlap=%.3f)", - chosen, decision_id, req.prefix_id, last_w, req.reuse_budget, osl, prefill_cost_chosen, iat, overlap_chosen, + "Router picked worker=%s decision=%s prefix=%s (last=%s reuse_after=%s osl=%s " + "prefill_cost=%.3f iat=%s overlap=%.3f)", + chosen, + decision_id, + req.prefix_id, + last_w, + req.reuse_budget, + osl, + prefill_cost_chosen, + iat, + overlap_chosen, ) resp = RouterResponse(worker_id=chosen, prefix_hit_rate=overlap_chosen, decision_id=decision_id) @@ -1200,23 +1214,31 @@ async def feedback(self, request: dict): if self._metrics.get("reward"): self._metrics["reward"].labels(worker_id=str(wid)).set(reward) - self._emit_trace("feedback", { - "decision_id": fb.decision_id, - "wid": wid, - "latency_ms": float(fb.latency_ms), - "tokens_out": tokens_out, - "metric": metric, - "per_tok": per_tok, - "baseline_used": baseline_before, - "baseline_after": baseline_after, - "reward": reward, - "success": bool(fb.success), - "finish_reason": fb.finish_reason or "", - }) + self._emit_trace( + "feedback", + { + "decision_id": fb.decision_id, + "wid": wid, + "latency_ms": float(fb.latency_ms), + "tokens_out": tokens_out, + "metric": metric, + "per_tok": per_tok, + "baseline_used": baseline_before, + "baseline_after": baseline_after, + "reward": reward, + "success": bool(fb.success), + 
"finish_reason": fb.finish_reason or "", + }) logger.info( "Feedback: wid=%s decision=%s metric=%.3f%s baseline=%.3f reward=%.3f success=%s", - wid, fb.decision_id, metric, " ms/tok" if per_tok else " ms", baseline_before, reward, fb.success, + wid, + fb.decision_id, + metric, + " ms/tok" if per_tok else " ms", + baseline_before, + reward, + fb.success, ) ack = FeedbackAck(ok=True, used_baseline=float(baseline_before), reward=float(reward), worker_id=wid) @@ -1372,4 +1394,3 @@ async def worker(runtime: DistributedRuntime): if __name__ == "__main__": uvloop.install() asyncio.run(worker()) - diff --git a/external/dynamo/start_dynamo_optimized_thompson_hints.sh b/external/dynamo/start_dynamo_optimized_thompson_hints.sh index 5acbe6dabc..d9e546c977 100755 --- a/external/dynamo/start_dynamo_optimized_thompson_hints.sh +++ b/external/dynamo/start_dynamo_optimized_thompson_hints.sh @@ -39,7 +39,7 @@ # - Default Dynamo Frontend (HTTP API on port 8000) # - Custom Router (Thompson Sampling + KV overlap) # - Custom Processor (hint extraction + routing) -# - SGLang Worker (unified mode, GPUs 0-3, TP=4) +# - SGLang Workers (unified mode, multiple workers with TP=2 each) # # Prometheus Metrics: # - Frontend: http://localhost:8000/metrics @@ -50,19 +50,36 @@ set -euo pipefail # Configuration Variables (can be overridden via environment variables) +# See env.example for documentation on each variable CONTAINER_NAME="dynamo-sglang" -WORKER_GPUS="${DYNAMO_GPU_DEVICES:-0,1,2,3}" -TP_SIZE="${DYNAMO_TP_SIZE:-4}" +WORKER_GPUS="${DYNAMO_GPU_DEVICES:-0,1,2,3,4,5,6,7}" +TP_SIZE="${DYNAMO_TP_SIZE:-2}" HTTP_PORT="${DYNAMO_HTTP_PORT:-8000}" # Metrics ports - each component gets its own port to avoid conflicts -WORKER_METRICS_PORT="${DYNAMO_WORKER_METRICS_PORT:-8081}" -ROUTER_METRICS_PORT="${DYNAMO_ROUTER_METRICS_PORT:-8082}" -PROCESSOR_METRICS_PORT="${DYNAMO_PROCESSOR_METRICS_PORT:-8083}" +# Using 18xxx range to avoid conflicts with common services +# Workers use sequential ports starting at 
WORKER_METRICS_PORT (18081, 18082, ...) +# Router and Processor are offset to allow for many workers +WORKER_METRICS_PORT="${DYNAMO_WORKER_METRICS_PORT:-18081}" +ROUTER_METRICS_PORT="${DYNAMO_ROUTER_METRICS_PORT:-18090}" +PROCESSOR_METRICS_PORT="${DYNAMO_PROCESSOR_METRICS_PORT:-18091}" MODEL="/workspace/models/Llama-3.3-70B-Instruct" SERVED_MODEL_NAME="${DYNAMO_MODEL_NAME:-llama-3.3-70b}" IMAGE="nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.7.1" SHM_SIZE="${DYNAMO_SHM_SIZE:-16g}" -WORKER_INIT_TIMEOUT_S="${DYNAMO_WORKER_INIT_TIMEOUT_S:-600}" +WORKER_INIT_TIMEOUT_S="${DYNAMO_WORKER_INIT_TIMEOUT_S:-1800}" + +# KV Cache Configuration +# Block size in tokens - must match between SGLang (--page-size) and Frontend (--kv-cache-block-size) +KV_BLOCK_SIZE="${DYNAMO_KV_BLOCK_SIZE:-64}" +# Fraction of GPU memory for KV cache (0.0-1.0). Reduce to test cache pressure/degradation. +MEM_FRACTION_STATIC="${DYNAMO_MEM_FRACTION_STATIC:-0.9}" + +# Compute container-internal GPU indices (GPUs are renumbered 0,1,2,... inside the container) +NUM_GPUS=$(echo "$WORKER_GPUS" | tr ',' '\n' | wc -l) +CONTAINER_GPU_INDICES=$(seq -s, 0 $((NUM_GPUS - 1))) + +# Calculate number of workers based on available GPUs and TP size +NUM_WORKERS=$((NUM_GPUS / TP_SIZE)) # Local paths - DYNAMO_MODEL_DIR must be set or script will error if [ -z "${DYNAMO_MODEL_DIR:-}" ]; then @@ -91,7 +108,7 @@ if [ -d "${DYNAMO_MODEL_DIR}" ]; then echo "" echo "This usually means incomplete/corrupted download. 
Try:" echo " rm -rf ${DYNAMO_MODEL_DIR}" - echo " huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir ${DYNAMO_MODEL_DIR}" + echo " hf download meta-llama/Llama-3.3-70B-Instruct --local-dir ${DYNAMO_MODEL_DIR}" exit 1 fi fi @@ -125,8 +142,14 @@ echo " - Custom Router (Thompson Sampling + KV overlap)" echo " - Custom Processor (hint extraction + routing)" echo " - SGLang Worker (unified mode)" echo "" -echo "Backend Worker:" -echo " Unified: GPUs $WORKER_GPUS (TP=$TP_SIZE)" +echo "Backend Workers:" +echo " Workers: $NUM_WORKERS (GPUs: $NUM_GPUS, TP=$TP_SIZE per worker)" +echo " GPUs: $WORKER_GPUS" +echo " Mode: UNIFIED (no prefill/decode disaggregation)" +echo "" +echo "KV Cache Configuration:" +echo " Block Size: $KV_BLOCK_SIZE tokens (--page-size / --kv-cache-block-size)" +echo " GPU Mem Fraction: $MEM_FRACTION_STATIC (--mem-fraction-static)" echo "" echo "=========================================================" @@ -202,6 +225,56 @@ for i in {1..30}; do done echo "" +# Start monitoring stack (Prometheus + Grafana) if not running +MONITORING_DIR="${SCRIPT_DIR}/monitoring" +if [ -f "$MONITORING_DIR/docker-compose.yml" ]; then + PROMETHEUS_RUNNING=$(docker ps --format '{{.Names}}' | grep -q "^dynamo-prometheus$" && echo "true" || echo "false") + GRAFANA_RUNNING=$(docker ps --format '{{.Names}}' | grep -q "^dynamo-grafana$" && echo "true" || echo "false") + + if [ "$PROMETHEUS_RUNNING" = "false" ] || [ "$GRAFANA_RUNNING" = "false" ]; then + echo "Starting monitoring stack (Prometheus + Grafana)..." + cd "$MONITORING_DIR" + docker compose up -d + cd "$SCRIPT_DIR" + + # Wait for Prometheus to be ready + echo "Waiting for Prometheus to be ready..." 
+ for i in {1..30}; do + if curl -s http://localhost:9090/-/ready > /dev/null 2>&1; then + echo "✓ Prometheus is ready (http://localhost:9090)" + break + fi + if [ $i -eq 30 ]; then + echo "⚠ WARNING: Prometheus may not be fully ready yet" + fi + sleep 1 + done + + # Wait for Grafana to be ready + echo "Waiting for Grafana to be ready..." + for i in {1..30}; do + if curl -s http://localhost:3000/api/health > /dev/null 2>&1; then + echo "✓ Grafana is ready (http://localhost:3000)" + break + fi + if [ $i -eq 30 ]; then + echo "⚠ WARNING: Grafana may not be fully ready yet" + fi + sleep 1 + done + echo "" + else + echo "✓ Monitoring stack already running" + echo " Prometheus: http://localhost:9090" + echo " Grafana: http://localhost:3000" + echo "" + fi +else + echo "⚠ Monitoring docker-compose.yml not found at: $MONITORING_DIR" + echo " Skipping monitoring stack startup" + echo "" +fi + # Clean up existing Dynamo container if it exists if docker ps -a --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then echo "Removing existing Dynamo container: $CONTAINER_NAME" @@ -237,7 +310,7 @@ if [ ! -d "$LOCAL_MODEL_DIR" ]; then echo "WARNING: Model directory not found at: $LOCAL_MODEL_DIR" echo "" echo "To download the model, run:" - echo " huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir $LOCAL_MODEL_DIR" + echo " hf download meta-llama/Llama-3.3-70B-Instruct --local-dir $LOCAL_MODEL_DIR" echo "" read -p "Continue anyway (model will be downloaded from HuggingFace)? 
[y/N] " -n 1 -r echo @@ -268,6 +341,8 @@ docker run -d \ -e WORKER_METRICS_PORT=$WORKER_METRICS_PORT \ -e ROUTER_METRICS_PORT=$ROUTER_METRICS_PORT \ -e PROCESSOR_METRICS_PORT=$PROCESSOR_METRICS_PORT \ + -e KV_BLOCK_SIZE=$KV_BLOCK_SIZE \ + -e MEM_FRACTION_STATIC=$MEM_FRACTION_STATIC \ $IMAGE \ bash -c " set -e @@ -298,7 +373,8 @@ docker run -d \ wait_for_worker() { local worker_type=\$1 local pid=\$2 - local max_wait=${WORKER_INIT_TIMEOUT_S:-600} + # Use WORKER_INIT_TIMEOUT_S (defaults to 1800s / 30 min) + local max_wait=$WORKER_INIT_TIMEOUT_S local elapsed=0 local poll_interval=5 @@ -338,40 +414,78 @@ docker run -d \ return 1 } + # ========================================================================= + # STARTUP ORDER WITH MODEL NAME ISOLATION + # ========================================================================= + # Using different model names to force ALL traffic through the processor. + # Workers register with internal model name (${SERVED_MODEL_NAME}-internal), + # while processor registers with public model name (${SERVED_MODEL_NAME}). + # Frontend only routes to backends matching the requested model name. + # + # Order: + # 1. Workers (model=${SERVED_MODEL_NAME}-internal, not discovered for public model) + # 2. Router (needs workers to be present) + # 3. Processor (model=${SERVED_MODEL_NAME}, frontend discovers this) + # 4. Frontend (routes ${SERVED_MODEL_NAME} requests to processor ONLY) + # ========================================================================= + echo '=========================================================' - echo 'Step 1: Starting Unified Worker (GPUs 0,1,2,3 = Host GPUs $WORKER_GPUS)...' + echo 'Step 1: Starting $NUM_WORKERS Unified Worker(s) (Host GPUs $WORKER_GPUS -> Container GPUs $CONTAINER_GPU_INDICES)...' 
echo '=========================================================' - # CRITICAL: Register worker at dynamo.worker.generate (not default backend.generate) - # This allows the custom Processor to register as backend.generate and intercept - # frontend requests, then forward to these workers after Thompson Sampling routing. + # Workers register at workers.worker.generate (in 'workers' namespace) + # They start first so the router can discover them during initialization # DYN_SYSTEM_PORT sets the Prometheus metrics port for this component - CUDA_VISIBLE_DEVICES=0,1,2,3 \ - DYN_SYSTEM_PORT=\$WORKER_METRICS_PORT \ - python3 -m dynamo.sglang \ - --model-path $MODEL \ - --served-model-name $SERVED_MODEL_NAME \ - --host 0.0.0.0 \ - --port 30000 \ - --tp $TP_SIZE \ - --trust-remote-code \ - --enable-metrics \ - --mem-fraction-static 0.8 \ - --endpoint dynamo.worker.generate & - WORKER_PID=\$! - echo \"Unified Worker PID: \$WORKER_PID\" - echo \"Registered at: dynamo.worker.generate\" - echo \"Metrics at: http://localhost:\$WORKER_METRICS_PORT/metrics\" + + # Start multiple workers, each using TP_SIZE GPUs + WORKER_PIDS=() + for i in \$(seq 0 \$(($NUM_WORKERS - 1))); do + # Calculate GPU range for this worker (e.g., worker 0: 0,1; worker 1: 2,3; etc.) 
+ START_GPU=\$((i * $TP_SIZE)) + END_GPU=\$(((i + 1) * $TP_SIZE - 1)) + WORKER_GPU_LIST=\$(seq -s, \$START_GPU \$END_GPU) + WORKER_PORT=\$((30000 + i)) + + echo \"Starting Worker \$i: GPUs \$WORKER_GPU_LIST, Port \$WORKER_PORT (internal model name)\" + echo \" KV Block Size: $KV_BLOCK_SIZE tokens, Mem Fraction: $MEM_FRACTION_STATIC\" + CUDA_VISIBLE_DEVICES=\$WORKER_GPU_LIST \ + DYN_SYSTEM_PORT=\$((WORKER_METRICS_PORT + i)) \ + DYN_NAMESPACE=workers \ + python3 -m dynamo.sglang \ + --model-path $MODEL \ + --served-model-name ${SERVED_MODEL_NAME}-internal \ + --host 0.0.0.0 \ + --port \$WORKER_PORT \ + --tp $TP_SIZE \ + --trust-remote-code \ + --enable-metrics \ + --page-size $KV_BLOCK_SIZE \ + --mem-fraction-static $MEM_FRACTION_STATIC \ + --endpoint workers.worker.generate & + WORKER_PIDS+=(\$!) + echo \" Worker \$i PID: \${WORKER_PIDS[\$i]}\" + done + echo \"\" + echo \"Total workers started: \${#WORKER_PIDS[@]}\" + echo \"Worker PIDs: \${WORKER_PIDS[*]}\" + echo \"Registered at: workers.worker.generate (model: ${SERVED_MODEL_NAME}-internal)\" + echo \"NOTE: Workers use internal model name so frontend only discovers processor\" echo \"\" - # Wait for unified worker to initialize - wait_for_worker \"Unified\" \$WORKER_PID || exit 1 + # Wait for first worker to initialize (checks ETCD registration) + wait_for_worker \"Unified\" \${WORKER_PIDS[0]} || exit 1 + + # Give additional workers time to initialize + if [ \${#WORKER_PIDS[@]} -gt 1 ]; then + echo \"Waiting additional 30s for remaining workers to initialize...\" + sleep 30 + fi echo '' echo '=========================================================' echo 'Step 2: Starting Custom Router (Thompson Sampling + Prometheus)...' 
echo '=========================================================' # Router uses config.yaml for all parameters - # Override specific values with --affinity-base, --temp-base, --lints-v, or --override + # It needs workers to be present (started in Step 1) # DYN_SYSTEM_PORT sets the Prometheus metrics port for this component DYN_SYSTEM_PORT=\$ROUTER_METRICS_PORT \ python3 /workspace/custom_dynamo/router.py \ @@ -384,12 +498,11 @@ docker run -d \ echo '' echo '=========================================================' - echo 'Step 3: Starting Custom Processor (Dynamic Discovery Mode)...' + echo 'Step 3: Starting Custom Processor (Static Mode)...' echo '=========================================================' - # DYNAMIC DISCOVERY MODE (forward-compatible, --static-endpoint deprecated): - # Processor registers as dynamo.backend.generate AND calls register_llm() - # to advertise a model card in ETCD. The frontend's ModelWatcher discovers - # this and routes requests to us. + # STATIC MODE: Processor uses @dynamo_worker(static=True) so it registers + # at dynamo.backend.generate WITHOUT an instance ID. The frontend discovers + # it via its namespace-scoped ModelWatcher (Step 4); --static-endpoint is not used. # DYN_SYSTEM_PORT sets the Prometheus metrics port for this component DYN_SYSTEM_PORT=\$PROCESSOR_METRICS_PORT \ python3 /workspace/custom_dynamo/processor.py \ @@ -399,27 +512,30 @@ docker run -d \ PROCESSOR_PID=\$!
echo \"Processor PID: \$PROCESSOR_PID\" echo \"Model: $SERVED_MODEL_NAME (from $MODEL)\" - echo \"Registered at: dynamo.backend.generate (discovered via ETCD model card)\" - echo \"Forwards to: dynamo.worker.generate (actual SGLang workers)\" + echo \"Registered at: dynamo.backend.generate (namespace=dynamo)\" + echo \"Forwards to: workers.worker.generate (actual SGLang workers)\" echo \"Metrics at: http://localhost:\$PROCESSOR_METRICS_PORT/metrics\" sleep 15 echo \"\" echo '' echo '=========================================================' - echo 'Step 4: Starting Default Dynamo Frontend (Dynamic Discovery)...' + echo 'Step 4: Starting Default Dynamo Frontend (Namespace-Scoped Discovery)...' echo '=========================================================' - # DYNAMIC DISCOVERY MODE (forward-compatible): - # No --static-endpoint needed! The frontend uses its ModelWatcher to - # discover backends registered in ETCD. Our processor registered a - # model card in Step 3, so the frontend will find and route to it. + # NAMESPACE-SCOPED DISCOVERY: Frontend discovers backends via ETCD ModelWatcher, + # but only from the 'dynamo' namespace. Workers are in the 'workers' namespace, + # so the frontend will ONLY discover the processor (in 'dynamo' namespace). + # This ensures ALL requests go through the Thompson Sampling router. + echo \"Frontend KV Block Size: $KV_BLOCK_SIZE tokens (must match worker --page-size)\" python3 -m dynamo.frontend \ --http-port $HTTP_PORT \ --model-name $SERVED_MODEL_NAME \ - --model-path $MODEL & + --model-path $MODEL \ + --kv-cache-block-size $KV_BLOCK_SIZE \ + --namespace dynamo & FRONTEND_PID=\$! 
echo \"Frontend PID: \$FRONTEND_PID\" - echo \"Discovery: ETCD ModelWatcher (no --static-endpoint)\" + echo \"Discovery: ETCD ModelWatcher (namespace=dynamo, discovers processor ONLY)\" sleep 15 echo \"\" @@ -432,38 +548,42 @@ docker run -d \ echo \" NATS: localhost:4222\" echo \"\" echo \"Dynamo Components (This Container):\" - echo \" Unified Worker: PID \$WORKER_PID (GPUs $WORKER_GPUS, TP=$TP_SIZE)\" - echo \" → Registered at: dynamo.worker.generate\" - echo \" → Metrics: http://localhost:\$WORKER_METRICS_PORT/metrics\" + echo \" Unified Workers: \${#WORKER_PIDS[@]} workers (GPUs $WORKER_GPUS, TP=$TP_SIZE each)\" + for i in \$(seq 0 \$((\${#WORKER_PIDS[@]} - 1))); do + START_GPU=\$((i * $TP_SIZE)) + END_GPU=\$(((i + 1) * $TP_SIZE - 1)) + echo \" Worker \$i: PID \${WORKER_PIDS[\$i]}, GPUs \$START_GPU-\$END_GPU, port \$((30000 + i))\" + done + echo \" → Registered at: workers.worker.generate (hidden from frontend)\" echo \" Router: PID \$ROUTER_PID (Thompson Sampling + Prometheus)\" echo \" → Registered at: dynamo.router.{find_worker,feedback}\" echo \" → Metrics: http://localhost:\$ROUTER_METRICS_PORT/metrics\" echo \" Processor: PID \$PROCESSOR_PID (NVExt annotation extraction)\" - echo \" → Registered at: dynamo.backend.generate (model card in ETCD)\" + echo \" → Registered at: dynamo.backend.generate (STATIC mode)\" echo \" → Metrics: http://localhost:\$PROCESSOR_METRICS_PORT/metrics\" echo \" Frontend: PID \$FRONTEND_PID (Default Dynamo HTTP API on port $HTTP_PORT)\" - echo \" → Discovery: ETCD ModelWatcher (finds processor's model card)\" + echo \" → Discovery: ETCD ModelWatcher\" echo \" → Metrics: http://localhost:$HTTP_PORT/metrics\" echo '' - echo 'Request Flow (Dynamic Discovery Mode):' + echo 'Request Flow (Dynamic Discovery - Thompson Sampling when routed to processor):' echo ' Client → Default Frontend API (port $HTTP_PORT)' echo ' ↓ (tokenization + nvext parsing)' - echo ' Frontend discovers backends via ETCD ModelWatcher' - echo ' ↓ (finds 
Processor model card!)' - echo ' Custom Processor (dynamo.backend.generate-{id})' + echo ' Frontend routes via ETCD ModelWatcher (processor OR workers)' + echo ' ↓' + echo ' IF routed to Processor (dynamo.backend.generate):' echo ' ↓ (extract hints from annotations)' echo ' ↓ (query Thompson Sampling router)' echo ' Custom Router → worker_id' echo ' ↓ (KV overlap + workload-aware selection)' - echo ' Processor routes to → dynamo.worker.generate (with worker_id)' + echo ' Processor routes to → workers.worker.generate (with worker_id)' echo ' ↓' - echo ' Unified Worker (dynamo.worker.generate)' + echo ' Unified Worker (workers.worker.generate)' echo ' ↓' echo ' Response + Feedback to Router' echo '' echo 'Prometheus Metrics Endpoints:' echo ' - Frontend: http://localhost:$HTTP_PORT/metrics (latency, throughput)' - echo ' - Worker: http://localhost:\$WORKER_METRICS_PORT/metrics (KV cache, internal)' + echo ' - Workers: http://localhost:\$WORKER_METRICS_PORT/metrics - \$((WORKER_METRICS_PORT + \${#WORKER_PIDS[@]} - 1))/metrics (KV cache)' echo ' - Router: http://localhost:\$ROUTER_METRICS_PORT/metrics (thompson_router_*)' echo ' - Processor: http://localhost:\$PROCESSOR_METRICS_PORT/metrics (thompson_* KVE)' echo '=========================================================' @@ -482,10 +602,12 @@ docker run -d \ echo \"ERROR: Router died!\" exit 1 fi - if ! kill -0 \$WORKER_PID 2>/dev/null; then - echo \"ERROR: Unified worker died!\" - exit 1 - fi + for i in \$(seq 0 \$((\${#WORKER_PIDS[@]} - 1))); do + if ! kill -0 \${WORKER_PIDS[\$i]} 2>/dev/null; then + echo \"ERROR: Worker \$i (PID \${WORKER_PIDS[\$i]}) died!\" + exit 1 + fi + done sleep 10 done " @@ -502,31 +624,33 @@ if docker ps --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then echo "✓ Dynamo with OPTIMIZED Thompson Sampling Router Started!" 
echo "=========================================================" echo "" - echo "Architecture (Dynamic Discovery - Forward Compatible):" + echo "Architecture (Model Name Isolation - Thompson Sampling):" echo "" - echo " Endpoint Registration:" - echo " • SGLang Worker: dynamo.worker.generate (actual inference)" - echo " • Processor: dynamo.backend.generate + ETCD model card" - echo " • Router: dynamo.router.{find_worker,feedback}" + echo " Model Name Isolation Mode:" + echo " - Workers register with internal model name (${SERVED_MODEL_NAME}-internal)" + echo " - Processor registers with public model name (${SERVED_MODEL_NAME})" + echo " - Frontend routes ${SERVED_MODEL_NAME} requests to processor ONLY" + echo " - ALL requests go through Thompson Sampling router" echo "" - echo " Discovery Mode:" - echo " • Frontend uses ETCD ModelWatcher (no --static-endpoint)" - echo " • Processor registers model card via register_llm()" - echo " • Frontend discovers processor as a 'backend' automatically" + echo " Startup Order:" + echo " 1. Workers → model=${SERVED_MODEL_NAME}-internal (not matched by frontend)" + echo " 2. Router → dynamo.router.{find_worker,feedback}" + echo " 3. Processor → model=${SERVED_MODEL_NAME} (matched by frontend)" + echo " 4. 
Frontend → routes to processor for ${SERVED_MODEL_NAME} requests" echo "" - echo " Request Flow:" + echo " Request Flow (ALL requests go through processor):" echo " Client Request (with nvext.annotations)" echo " ↓" echo " Default Dynamo Frontend (port $HTTP_PORT)" - echo " ↓ discovers backends via ETCD ModelWatcher" - echo " Custom Processor (discovered via model card)" + echo " ↓ ETCD ModelWatcher (namespace=dynamo) routes to processor" + echo " Custom Processor (dynamo.backend.generate)" echo " ↓ extracts: prefix_id, total_requests, osl, iat" echo " ↓ queries Thompson Sampling router" echo " Custom Router → worker_id" echo " ↓ KV overlap + workload-aware selection" - echo " Processor forwards to dynamo.worker.generate" + echo " Processor forwards to workers.worker.generate" echo " ↓" - echo " Unified Worker (GPUs $WORKER_GPUS, TP=$TP_SIZE)" + echo " Unified Workers ($NUM_WORKERS x TP=$TP_SIZE = $NUM_GPUS GPUs total)" echo " ↓" echo " Response + Feedback Loop" echo "" @@ -536,10 +660,18 @@ if docker ps --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then echo "" echo "Prometheus Metrics Endpoints:" echo " Frontend: http://localhost:$HTTP_PORT/metrics (latency, throughput)" - echo " Worker: http://localhost:$WORKER_METRICS_PORT/metrics (KV cache)" + echo " Workers: http://localhost:$WORKER_METRICS_PORT/metrics - $((WORKER_METRICS_PORT + NUM_WORKERS - 1))/metrics (KV cache)" echo " Router: http://localhost:$ROUTER_METRICS_PORT/metrics (routing)" echo " Processor: http://localhost:$PROCESSOR_METRICS_PORT/metrics (KVE)" echo "" + echo "Dynamo Components:" + echo " Frontend: HTTP API on port $HTTP_PORT" + echo " Unified Workers: $NUM_WORKERS workers (TP=$TP_SIZE each, ports 30000-$((30000 + NUM_WORKERS - 1)))" + echo "" + echo "KV Cache Settings:" + echo " Block Size: $KV_BLOCK_SIZE tokens (DYNAMO_KV_BLOCK_SIZE)" + echo " GPU Mem Fraction: $MEM_FRACTION_STATIC (DYNAMO_MEM_FRACTION_STATIC)" + echo "" echo "API Endpoint: 
http://localhost:$HTTP_PORT/v1/chat/completions" echo "Health Check: http://localhost:$HTTP_PORT/health" echo "" @@ -553,6 +685,10 @@ if docker ps --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then echo " ]" echo " }" echo "" + echo "Monitoring Dashboards:" + echo " Grafana: http://localhost:3000 (no login required)" + echo " Prometheus: http://localhost:9090" + echo "" echo "Useful Commands:" echo " Interactive shell: docker exec -it $CONTAINER_NAME bash" echo " View Dynamo logs: docker logs -f $CONTAINER_NAME" @@ -560,6 +696,7 @@ if docker ps --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then echo " View NATS logs: docker logs -f nats-dynamo" echo " GPU usage: watch -n 2 nvidia-smi" echo " Stop all: bash stop_dynamo.sh" + echo " Stop all + metrics: bash stop_dynamo.sh --kill-metrics" echo "" echo "Query Metrics:" echo " curl http://localhost:$HTTP_PORT/metrics | grep dynamo_frontend" @@ -617,8 +754,8 @@ if docker ps --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then echo "" # Wait for server to be ready - echo "Checking for API availability (timeout=15 minutes)..." - max_attempts=900 + echo "Checking for API availability (timeout=${WORKER_INIT_TIMEOUT_S}s)..." 
+ max_attempts=$WORKER_INIT_TIMEOUT_S attempt=0 while [ $attempt -lt $max_attempts ]; do diff --git a/external/dynamo/start_dynamo_unified.sh b/external/dynamo/start_dynamo_unified.sh index c99a3114a9..81c47410c6 100755 --- a/external/dynamo/start_dynamo_unified.sh +++ b/external/dynamo/start_dynamo_unified.sh @@ -32,8 +32,8 @@ # Configuration Variables (can be overridden via environment variables) CONTAINER_NAME="dynamo-sglang" -WORKER_GPUS="${DYNAMO_GPU_DEVICES:-0,1,2,3}" -TP_SIZE="${DYNAMO_TP_SIZE:-4}" +WORKER_GPUS="${DYNAMO_GPU_DEVICES:-0,1,2,3,4,5,6,7}" +TP_SIZE="${DYNAMO_TP_SIZE:-2}" HTTP_PORT="${DYNAMO_HTTP_PORT:-8099}" MODEL="/workspace/models/Llama-3.3-70B-Instruct" SERVED_MODEL_NAME="${DYNAMO_MODEL_NAME:-llama-3.3-70b}" @@ -44,12 +44,15 @@ SHM_SIZE="${DYNAMO_SHM_SIZE:-16g}" ETCD_CLIENT_PORT="${DYNAMO_ETCD_PORT:-2379}" ETCD_PEER_PORT="${DYNAMO_ETCD_PEER_PORT:-2390}" NATS_PORT="${DYNAMO_NATS_PORT:-4222}" -WORKER_INIT_TIMEOUT_S="${DYNAMO_WORKER_INIT_TIMEOUT_S:-600}" +WORKER_INIT_TIMEOUT_S="${DYNAMO_WORKER_INIT_TIMEOUT_S:-1800}" # Compute container-internal GPU indices (GPUs are renumbered 0,1,2,... inside the container) NUM_GPUS=$(echo "$WORKER_GPUS" | tr ',' '\n' | wc -l) CONTAINER_GPU_INDICES=$(seq -s, 0 $((NUM_GPUS - 1))) +# Calculate number of workers based on available GPUs and TP size +NUM_WORKERS=$((NUM_GPUS / TP_SIZE)) + # Local paths - DYNAMO_MODEL_DIR must be set or script will error if [ -z "${DYNAMO_MODEL_DIR}" ]; then echo "ERROR: DYNAMO_MODEL_DIR environment variable must be set" @@ -82,7 +85,7 @@ if [ -d "${DYNAMO_MODEL_DIR}" ]; then echo "" echo "This usually means incomplete/corrupted download. 
Try:" echo " rm -rf ${DYNAMO_MODEL_DIR}" - echo " huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir ${DYNAMO_MODEL_DIR}" + echo " hf download meta-llama/Llama-3.3-70B-Instruct --local-dir ${DYNAMO_MODEL_DIR}" exit 1 fi fi @@ -101,8 +104,9 @@ echo " - NATS (message queue for requests)" echo " - Dynamo Frontend (HTTP API on port $HTTP_PORT)" echo " - SGLang Worker (unified mode)" echo "" -echo "Backend Worker:" -echo " Unified: GPUs $WORKER_GPUS (TP=$TP_SIZE)" +echo "Backend Workers:" +echo " Workers: $NUM_WORKERS (GPUs: $NUM_GPUS, TP=$TP_SIZE per worker)" +echo " GPUs: $WORKER_GPUS" echo " Mode: UNIFIED (no prefill/decode disaggregation)" echo "" echo "=========================================================" @@ -217,7 +221,7 @@ if [ ! -d "$LOCAL_MODEL_DIR" ]; then echo "WARNING: Model directory not found at: $LOCAL_MODEL_DIR" echo "" echo "To download the model, run:" - echo " huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir $LOCAL_MODEL_DIR" + echo " hf download meta-llama/Llama-3.3-70B-Instruct --local-dir $LOCAL_MODEL_DIR" echo "" read -p "Continue anyway (model will be downloaded from HuggingFace)? [y/N] " -n 1 -r echo @@ -277,8 +281,8 @@ docker run -d \ wait_for_worker() { local worker_type=\$1 local pid=\$2 - # local max_wait=300 - local max_wait=${DYNAMO_WORKER_INIT_TIMEOUT_S:-600} + # Use WORKER_INIT_TIMEOUT_S (defaults to 1800s / 30 min) + local max_wait=$WORKER_INIT_TIMEOUT_S local elapsed=0 local poll_interval=5 @@ -329,23 +333,45 @@ docker run -d \ } echo '=========================================================' - echo 'Step 1: Starting Unified Worker (Host GPUs $WORKER_GPUS -> Container GPUs $CONTAINER_GPU_INDICES)...' + echo 'Step 1: Starting $NUM_WORKERS Unified Worker(s) (Host GPUs $WORKER_GPUS -> Container GPUs $CONTAINER_GPU_INDICES)...' 
echo '=========================================================' - CUDA_VISIBLE_DEVICES=$CONTAINER_GPU_INDICES \ - python3 -m dynamo.sglang \ - --model-path $MODEL \ - --served-model-name $SERVED_MODEL_NAME \ - --host 0.0.0.0 \ - --port 30000 \ - --tp $TP_SIZE \ - --trust-remote-code \ - --mem-fraction-static 0.8 & - WORKER_PID=\$! - echo \"Unified Worker PID: \$WORKER_PID\" + + # Start multiple workers, each using TP_SIZE GPUs + WORKER_PIDS=() + for i in \$(seq 0 \$(($NUM_WORKERS - 1))); do + # Calculate GPU range for this worker (e.g., worker 0: 0,1; worker 1: 2,3; etc.) + START_GPU=\$((i * $TP_SIZE)) + END_GPU=\$(((i + 1) * $TP_SIZE - 1)) + WORKER_GPU_LIST=\$(seq -s, \$START_GPU \$END_GPU) + WORKER_PORT=\$((30000 + i)) + + echo \"Starting Worker \$i: GPUs \$WORKER_GPU_LIST, Port \$WORKER_PORT\" + CUDA_VISIBLE_DEVICES=\$WORKER_GPU_LIST \ + python3 -m dynamo.sglang \ + --model-path $MODEL \ + --served-model-name $SERVED_MODEL_NAME \ + --host 0.0.0.0 \ + --port \$WORKER_PORT \ + --tp $TP_SIZE \ + --trust-remote-code \ + --mem-fraction-static 0.9 & + WORKER_PIDS+=(\$!) 
+ echo \" Worker \$i PID: \${WORKER_PIDS[\$i]}\" + done echo \"\" + echo \"Total workers started: \${#WORKER_PIDS[@]}\" + echo \"Worker PIDs: \${WORKER_PIDS[*]}\" + echo \"\" + + # Wait for first worker to initialize (checks ETCD registration) + # Once one worker is registered, the frontend can start discovering workers + wait_for_worker \"Unified\" \${WORKER_PIDS[0]} || exit 1 - # Wait for unified worker to initialize (checks ETCD registration) - wait_for_worker \"Unified\" \$WORKER_PID || exit 1 + # Give additional workers time to initialize + if [ \${#WORKER_PIDS[@]} -gt 1 ]; then + echo \"Waiting additional 30s for remaining workers to initialize...\" + sleep 30 + fi echo '' echo '=========================================================' @@ -370,7 +396,12 @@ docker run -d \ echo \" NATS: localhost:$NATS_PORT\" echo \"\" echo \"Dynamo Components (This Container):\" - echo \" Unified Worker: PID \$WORKER_PID (GPUs $WORKER_GPUS, TP=$TP_SIZE, internal port 30000)\" + echo \" Unified Workers: \${#WORKER_PIDS[@]} workers (GPUs $WORKER_GPUS, TP=$TP_SIZE each)\" + for i in \$(seq 0 \$((\${#WORKER_PIDS[@]} - 1))); do + START_GPU=\$((i * $TP_SIZE)) + END_GPU=\$(((i + 1) * $TP_SIZE - 1)) + echo \" Worker \$i: PID \${WORKER_PIDS[\$i]}, GPUs \$START_GPU-\$END_GPU, port \$((30000 + i))\" + done echo \" Frontend: PID \$FRONTEND_PID (HTTP API on port $HTTP_PORT)\" echo '' echo 'Request Flow:' @@ -390,10 +421,12 @@ docker run -d \ echo \"ERROR: Frontend died!\" exit 1 fi - if ! kill -0 \$WORKER_PID 2>/dev/null; then - echo \"ERROR: Unified worker died!\" - exit 1 - fi + for i in \$(seq 0 \$((\${#WORKER_PIDS[@]} - 1))); do + if ! 
kill -0 \${WORKER_PIDS[\$i]} 2>/dev/null; then + echo \"ERROR: Worker \$i (PID \${WORKER_PIDS[\$i]}) died!\" + exit 1 + fi + done sleep 10 done " @@ -417,9 +450,9 @@ if docker ps --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then echo " ↓" echo " Frontend discovers workers via ETCD" echo " ↓" - echo " Frontend routes to Unified Worker" + echo " Frontend routes to one of $NUM_WORKERS Unified Workers" echo " ↓ (localhost:$ETCD_CLIENT_PORT - worker discovery)" - echo " Unified Worker (GPUs $WORKER_GPUS, TP=$TP_SIZE)" + echo " Unified Workers ($NUM_WORKERS x TP=$TP_SIZE = $NUM_GPUS GPUs total)" echo " ↓" echo " Response" echo "" @@ -429,7 +462,7 @@ if docker ps --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then echo "" echo "Dynamo Components (This Container):" echo " Frontend: HTTP API on port $HTTP_PORT" - echo " Unified Worker: GPUs $WORKER_GPUS (TP=$TP_SIZE, internal port 30000)" + echo " Unified Workers: $NUM_WORKERS workers (TP=$TP_SIZE each, ports 30000-$((30000 + NUM_WORKERS - 1)))" echo "" echo "API Endpoint: http://localhost:$HTTP_PORT/v1/chat/completions" echo "Health Check: http://localhost:$HTTP_PORT/health" @@ -445,7 +478,7 @@ if docker ps --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then echo "" echo "=========================================================" echo "Test Request:" - echo "=========================================================" + echo "=========================================================" echo "" echo "# Basic test" echo "curl http://localhost:$HTTP_PORT/v1/chat/completions \\" @@ -484,8 +517,8 @@ if docker ps --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then echo "" # Wait for server to be ready (check /v1/models which only works when workers are discovered) - echo "Checking for API availability (timeout=15 minutes)..." - max_attempts=900 + echo "Checking for API availability (timeout=${WORKER_INIT_TIMEOUT_S}s)..." 
+ max_attempts=$WORKER_INIT_TIMEOUT_S attempt=0 while [ $attempt -lt $max_attempts ]; do diff --git a/external/dynamo/stop_dynamo.sh b/external/dynamo/stop_dynamo.sh index 231eeb8c90..b0c3f976f8 100755 --- a/external/dynamo/stop_dynamo.sh +++ b/external/dynamo/stop_dynamo.sh @@ -17,6 +17,37 @@ # Dynamo SGLang Shutdown Script # Stops all components: Dynamo worker container, ETCD, and NATS # Works for: UNIFIED, THOMPSON SAMPLING, and DISAGGREGATED modes +# +# Usage: +# bash stop_dynamo.sh # Stop Dynamo, ETCD, NATS only +# bash stop_dynamo.sh --kill-metrics # Also stop Prometheus and Grafana +# bash stop_dynamo.sh --clear-metrics # Stop monitoring stack AND remove Prometheus data volume + +# Parse command line arguments +KILL_METRICS=false +CLEAR_METRICS=false +for arg in "$@"; do + case $arg in + --kill-metrics) + KILL_METRICS=true + shift + ;; + --clear-metrics) + KILL_METRICS=true + CLEAR_METRICS=true + shift + ;; + -h|--help) + echo "Usage: bash stop_dynamo.sh [OPTIONS]" + echo "" + echo "Options:" + echo " --kill-metrics Also stop Prometheus and Grafana containers" + echo " --clear-metrics Stop monitoring stack AND remove Prometheus data volume (clears old metrics)" + echo " -h, --help Show this help message" + exit 0 + ;; + esac +done echo "=========================================================" echo "Stopping Dynamo SGLang FULL STACK" @@ -68,13 +99,56 @@ else echo " (NATS container not running)" fi +# Stop monitoring stack if --kill-metrics flag is set +if [ "$KILL_METRICS" = true ]; then + echo "" + echo "=========================================================" + echo "Stopping Monitoring Stack (--kill-metrics)" + echo "=========================================================" + + # Stop Prometheus + if docker ps --format '{{.Names}}' | grep -q "^dynamo-prometheus$"; then + echo "" + echo "Stopping Prometheus container..." 
+ docker stop dynamo-prometheus + docker rm dynamo-prometheus + echo "✓ Prometheus container stopped and removed" + else + echo " (Prometheus container not running)" + fi + + # Stop Grafana + if docker ps --format '{{.Names}}' | grep -q "^dynamo-grafana$"; then + echo "" + echo "Stopping Grafana container..." + docker stop dynamo-grafana + docker rm dynamo-grafana + echo "✓ Grafana container stopped and removed" + else + echo " (Grafana container not running)" + fi + + # Clear Prometheus data volume if --clear-metrics flag is set + if [ "$CLEAR_METRICS" = true ]; then + echo "" + echo "Clearing Prometheus data volume..." + docker volume rm monitoring_prometheus_data && echo "✓ Prometheus data volume removed (old metrics cleared)" + fi +fi + echo "" echo "=========================================================" echo "✓ All components stopped!" +if [ "$KILL_METRICS" = true ]; then + echo " (including monitoring stack)" +fi +if [ "$CLEAR_METRICS" = true ]; then + echo " (Prometheus data volume cleared)" +fi echo "=========================================================" echo "" echo "To restart:" echo " Standard Unified: bash start_dynamo_unified.sh" -echo " Thompson Sampling: bash start_dynamo_unified_thompson_hints.sh" +echo " Thompson Sampling: bash start_dynamo_optimized_thompson_hints.sh" echo "" diff --git a/src/nat/data_models/profiler.py b/src/nat/data_models/profiler.py index 134c4710e7..3c65db8f12 100644 --- a/src/nat/data_models/profiler.py +++ b/src/nat/data_models/profiler.py @@ -153,6 +153,26 @@ class DynamoMetricsConfig(BaseModel): "Should roughly match experiment duration. Too short = noisy. Too long = stale data included.", ) + # Historical lookback for range queries (set automatically from workflow duration if 0) + lookback_seconds: float = Field( + default=0.0, + description="Lookback time in seconds for Prometheus range queries when instant queries return no data. " + "If 0 (default), will be set automatically to the workflow duration + buffer. 
" + "This allows capturing TTFT/ITL metrics from the entire eval run, even after the workflow completes.", + ) + + # Workflow time window (set automatically by profiler) + workflow_start_timestamp: float | None = Field( + default=None, + description="Unix timestamp when the workflow started (set automatically by profiler). " + "Used for precise range query time windows.", + ) + workflow_end_timestamp: float | None = Field( + default=None, + description="Unix timestamp when the workflow ended (set automatically by profiler). " + "Used for precise range query time windows to isolate metrics to this eval run.", + ) + @classmethod def core_metrics_only( cls, diff --git a/src/nat/llm/dynamo_llm.py b/src/nat/llm/dynamo_llm.py index d6e848f81d..dc0c4a3058 100644 --- a/src/nat/llm/dynamo_llm.py +++ b/src/nat/llm/dynamo_llm.py @@ -271,11 +271,11 @@ def get_dynamo_field_names() -> frozenset[str]: class _DynamoTransport: """ Custom transport wrapper that injects nvext.annotations into request bodies. - + This approach is more reliable than using event hooks because it modifies the request BEFORE httpx's internal state machine processes it. 
""" - + def __init__( self, transport: "httpx.AsyncBaseTransport", @@ -289,21 +289,21 @@ def __init__( self._total_requests = total_requests self._osl = osl.upper() self._iat = iat.upper() - + async def handle_async_request(self, request: "httpx.Request") -> "httpx.Response": import httpx - + # Check context variable first (allows per-question override in batch evaluation) context_prefix_id = DynamoPrefixContext.get() prefix_id = context_prefix_id if context_prefix_id else self._prefix_id - + # Add HTTP headers (for generalized setup compatibility) headers = dict(request.headers) headers["x-prefix-id"] = prefix_id headers["x-prefix-total-requests"] = str(self._total_requests) headers["x-prefix-osl"] = self._osl headers["x-prefix-iat"] = self._iat - + # Modify body if it's a POST request with JSON content content = request.content if request.method == "POST" and content: @@ -317,32 +317,33 @@ async def handle_async_request(self, request: "httpx.Request") -> "httpx.Respons f"osl:{self._osl}", f"iat:{self._iat}", ] - + # Add/merge nvext.annotations if "nvext" not in body: body["nvext"] = {} if not isinstance(body["nvext"], dict): body["nvext"] = {} - + existing = body["nvext"].get("annotations", []) if not isinstance(existing, list): existing = [] - + # Our annotations take precedence body["nvext"]["annotations"] = annotations + [ a for a in existing if not any(a.startswith(f"{key}:") for key in ["prefix_id", "total_requests", "osl", "iat"]) ] - + # Re-encode content = json.dumps(body).encode("utf-8") headers["content-length"] = str(len(content)) - + logger.debug("Injected nvext.annotations: %s (body size: %d bytes)", - body["nvext"]["annotations"], len(content)) + body["nvext"]["annotations"], + len(content)) except (json.JSONDecodeError, UnicodeDecodeError) as e: logger.debug("Could not inject nvext.annotations: %s", e) - + # Create a new request with modified headers and content new_request = httpx.Request( method=request.method, @@ -351,9 +352,9 @@ async def 
handle_async_request(self, request: "httpx.Request") -> "httpx.Respons content=content, extensions=request.extensions, ) - + return await self._transport.handle_async_request(new_request) - + async def aclose(self): await self._transport.aclose() @@ -395,9 +396,9 @@ def create_httpx_client_with_dynamo_hooks( prefix_id = prefix_template.format(uuid=unique_id) else: prefix_id = f"nat-dynamo-{unique_id}" - + logger.debug("Created Dynamo client with prefix ID: %s", prefix_id) - + # Create a base transport and wrap it with our custom transport base_transport = httpx.AsyncHTTPTransport() dynamo_transport = _DynamoTransport( @@ -407,7 +408,7 @@ def create_httpx_client_with_dynamo_hooks( osl=osl, iat=iat, ) - + return httpx.AsyncClient( transport=dynamo_transport, timeout=httpx.Timeout(timeout), diff --git a/src/nat/profiler/inference_optimization/dynamo_metrics.py b/src/nat/profiler/inference_optimization/dynamo_metrics.py index 8d0e76af66..d0f6514b5d 100644 --- a/src/nat/profiler/inference_optimization/dynamo_metrics.py +++ b/src/nat/profiler/inference_optimization/dynamo_metrics.py @@ -118,6 +118,8 @@ class DynamoMetricsResult(BaseModel): """ import logging +import math +import time from typing import Any import httpx @@ -201,7 +203,6 @@ class DynamoMetricsResult(BaseModel): "thompson_requests_rate": "rate(dynamo_component_thompson_requests_total[{range}])", } - # ============================================================================= # DATA MODELS # ============================================================================= @@ -668,8 +669,6 @@ async def collect(self) -> DynamoMetricsResult: Returns: DynamoMetricsResult with collected metric values """ - import time - result = DynamoMetricsResult( collection_timestamp=time.time(), prometheus_url=self.prometheus_url, @@ -678,6 +677,27 @@ async def collect(self) -> DynamoMetricsResult: # Build list of metrics to collect based on config toggles metrics_to_collect = self._get_enabled_metrics() + # Log collection 
parameters + if self.config.workflow_start_timestamp is not None: + if self.config.workflow_end_timestamp is not None: + duration = self.config.workflow_end_timestamp - self.config.workflow_start_timestamp + lookback_info = f"isolated_window={duration:.1f}s" + else: + lookback_info = f"workflow_start={self.config.workflow_start_timestamp:.2f}" + elif self.config.lookback_seconds > 0: + lookback_info = f"lookback={self.config.lookback_seconds}s" + else: + lookback_info = "lookback=600s (default)" + + logger.info("Collecting %d Dynamo metrics from %s (query_range=%s, %s)", + len(metrics_to_collect), + self.prometheus_url, + self.config.query_range, + lookback_info) + + collected_count = 0 + null_count = 0 + # Collect each metric async with httpx.AsyncClient(timeout=30.0) as client: for metric_name, query_template in metrics_to_collect.items(): @@ -689,12 +709,31 @@ async def collect(self) -> DynamoMetricsResult: if value is not None: setattr(result, metric_name, value) logger.debug("Collected %s = %s", metric_name, value) + collected_count += 1 + else: + logger.debug("No data for metric %s", metric_name) + null_count += 1 except Exception as e: error_msg = f"Failed to collect {metric_name}: {e}" logger.warning(error_msg) result.errors.append(error_msg) + logger.info("Dynamo metrics collection complete: %d collected, %d null, %d errors", + collected_count, + null_count, + len(result.errors)) + + # Log summary of key metrics for debugging + core = result.get_core_metrics() + if core.ttft_p95_seconds is not None or core.itl_p95_seconds is not None: + logger.info("Core metrics - TTFT P95: %s, ITL P95: %s, KV Efficiency: %s", + core.ttft_p95_seconds, + core.itl_p95_seconds, + core.kv_efficiency) + else: + logger.warning("Core metrics (TTFT, ITL) not available - check Prometheus connectivity and metric names") + return result def _get_enabled_metrics(self) -> dict[str, str]: @@ -718,8 +757,7 @@ def _get_enabled_metrics(self) -> dict[str, str]: "kve_prompt_tokens_rate", 
"kve_device_blocks_rate", "kve_host_blocks_rate", - "kve_disk_blocks_rate", - # Supplementary KV cache metrics + "kve_disk_blocks_rate", # Supplementary KV cache metrics "kv_cache_usage_percent", "kv_cache_hit_rate_sglang", # Fallback for KVE "kv_cache_hit_rate_dynamo", @@ -747,7 +785,32 @@ def _get_enabled_metrics(self) -> dict[str, str]: async def _query_prometheus(self, client: httpx.AsyncClient, query: str) -> float | None: """ - Execute a Prometheus instant query and extract the scalar result. + Execute a Prometheus query and extract the scalar result. + + First attempts an instant query. If no data is returned (e.g., because + rate() returns 0 after workflow completion), falls back to a range query + with historical lookback to capture the most recent non-zero value. + + Args: + client: httpx AsyncClient + query: PromQL query string + + Returns: + Float value if successful, None if no data or error + """ + # First try instant query + value = await self._query_prometheus_instant(client, query) + if value is not None: + return value + + # If instant query failed, try range query with lookback + # This captures historical data when rate() returns 0 after workflow completes + logger.debug("Instant query returned no data, trying range query with lookback: %s", query) + return await self._query_prometheus_range(client, query) + + async def _query_prometheus_instant(self, client: httpx.AsyncClient, query: str) -> float | None: + """ + Execute a Prometheus instant query. 
Args: client: httpx AsyncClient @@ -765,13 +828,13 @@ async def _query_prometheus(self, client: httpx.AsyncClient, query: str) -> floa data = response.json() if data.get("status") != "success": - logger.warning("Prometheus query failed: %s", data.get("error", "unknown")) + logger.warning("Prometheus instant query failed: %s", data.get("error", "unknown")) return None results = data.get("data", {}).get("result", []) if not results: - logger.debug("No data for query: %s", query) + logger.debug("No data for instant query: %s", query) return None # For instant queries, extract the value from the first result @@ -781,12 +844,133 @@ async def _query_prometheus(self, client: httpx.AsyncClient, query: str) -> floa value = float(value_str) # Handle special float values - if value != value: # NaN check + if math.isnan(value): + logger.debug("Instant query returned NaN for: %s", query) + return None + + # Zero values from rate() after activity stops are not useful + if value == 0.0: + logger.debug("Instant query returned 0.0 for rate-based query: %s", query) return None return value except (KeyError, IndexError, ValueError) as e: - logger.debug("Failed to parse Prometheus result for query '%s': %s", query, e) + logger.debug("Failed to parse Prometheus instant result for query '%s': %s", query, e) + return None + + async def _query_prometheus_range(self, client: httpx.AsyncClient, query: str) -> float | None: + """ + Execute a Prometheus range query with historical lookback. + + This captures metrics that were recorded during the workflow execution + but are no longer updating (rate() would return 0 for instant queries). + + The time window is determined by: + 1. If workflow timestamps are set: query from workflow start to workflow end (isolated to this eval) + 2. If lookback_seconds is set: query that many seconds back from now + 3. 
Otherwise: default to 10 minutes (600 seconds) + + Args: + client: httpx AsyncClient + query: PromQL query string + + Returns: + The most recent non-NaN, non-zero value if found, None otherwise + """ + url = f"{self.prometheus_url}/api/v1/query_range" + + # Determine time window based on config + # Priority: workflow timestamps > lookback_seconds > default 600s + if self.config.workflow_start_timestamp is not None: + # Use exact workflow time window (no buffer before, small buffer after for scrape delay) + # No buffer before: avoids any risk of including pre-workflow empty data + # Small buffer after (15s): accounts for Prometheus scrape interval + start_time = self.config.workflow_start_timestamp + + if self.config.workflow_end_timestamp is not None: + # Use actual workflow end time + small buffer for scrape delay + end_time = self.config.workflow_end_timestamp + 15.0 + logger.debug("Using isolated workflow time window: %.2f to %.2f (%.1f seconds)", + start_time, + end_time, + end_time - start_time) + else: + # Fall back to current time if end timestamp not set + end_time = time.time() + logger.debug("Using workflow start with current time: %.2f to %.2f (%.1f seconds)", + start_time, + end_time, + end_time - start_time) + elif self.config.lookback_seconds > 0: + end_time = time.time() + start_time = end_time - self.config.lookback_seconds + logger.debug("Using configured lookback for range query: %.1f seconds", self.config.lookback_seconds) + else: + # Default to 10 minutes (600 seconds) for backward compatibility + end_time = time.time() + start_time = end_time - 600 + logger.debug("Using default 10-minute lookback for range query") + + # Use 15s step to get reasonable granularity + step = "15s" + + params = { + "query": query, + "start": start_time, + "end": end_time, + "step": step, + } + + try: + response = await client.get(url, params=params) + response.raise_for_status() + + data = response.json() + + if data.get("status") != "success": + 
logger.warning("Prometheus range query failed: %s", data.get("error", "unknown")) + return None + + results = data.get("data", {}).get("result", []) + + if not results: + logger.debug("No data for range query: %s", query) + return None + + # Range query result format: + # [{"metric": {...}, "values": [[timestamp, "value_string"], ...]}] + # Collect all valid (non-NaN, non-zero) values and compute the average + # This gives a representative measurement across the entire workflow + valid_values: list[float] = [] + + for series in results: + values = series.get("values", []) + for timestamp_val, value_str in values: + try: + value = float(value_str) + if not math.isnan(value) and value != 0.0: + valid_values.append(value) + except (ValueError, TypeError): + continue + + if valid_values: + # Use average for a representative measurement across the workflow + avg_value = sum(valid_values) / len(valid_values) + min_value = min(valid_values) + max_value = max(valid_values) + logger.debug("Range query found %d valid samples for %s: avg=%.4f, min=%.4f, max=%.4f", + len(valid_values), + query, + avg_value, + min_value, + max_value) + return avg_value + + logger.debug("Range query found no valid values for: %s", query) + return None + + except Exception as e: + logger.debug("Range query failed for '%s': %s", query, e) return None async def health_check(self) -> dict[str, Any]: @@ -896,4 +1080,3 @@ async def collect_core_metrics( ) result = await collect_dynamo_metrics(config) return result.get_core_metrics() - diff --git a/src/nat/profiler/profile_runner.py b/src/nat/profiler/profile_runner.py index 61602c881c..f7afdd8b90 100644 --- a/src/nat/profiler/profile_runner.py +++ b/src/nat/profiler/profile_runner.py @@ -83,6 +83,41 @@ def __init__(self, profiler_config: ProfilerConfig, output_dir: Path, write_outp # Ensure output directory os.makedirs(output_dir, exist_ok=True) + def _get_workflow_time_window( + self, + all_steps: list[list[IntermediateStep]], + ) -> tuple[float | 
None, float | None]: + """ + Extract the workflow time window from intermediate steps. + + Finds the earliest and latest event timestamps across all workflow executions + to determine the time range for Prometheus queries. + + Args: + all_steps: List of workflow executions, each containing intermediate steps + + Returns: + Tuple of (start_timestamp, end_timestamp) in Unix seconds, or (None, None) if no data + """ + min_timestamp = float('inf') + max_timestamp = float('-inf') + + for workflow_steps in all_steps: + for step in workflow_steps: + ts = step.event_timestamp + min_timestamp = min(min_timestamp, ts) + max_timestamp = max(max_timestamp, ts) + # Also check span_event_timestamp for start times of END events + span_ts = step.span_event_timestamp + if span_ts is not None: + min_timestamp = min(min_timestamp, span_ts) + + if min_timestamp == float('inf') or max_timestamp == float('-inf'): + logger.warning("Could not determine workflow time window from intermediate steps") + return None, None + + return min_timestamp, max_timestamp + async def run(self, all_steps: list[list[IntermediateStep]]) -> ProfilerResults: """ Main entrypoint: Works on Input DataFrame generated from eval to fit forecasting model, @@ -195,6 +230,19 @@ async def run(self, all_steps: list[list[IntermediateStep]]) -> ProfilerResults: if self.profile_config.dynamo_metrics.enable: from nat.profiler.inference_optimization.dynamo_metrics import collect_dynamo_metrics try: + # Calculate workflow time window from intermediate steps + workflow_start, workflow_end = self._get_workflow_time_window(all_steps) + if workflow_start is not None and workflow_end is not None: + # Set both start and end timestamps so Prometheus range queries + # are isolated to THIS eval run (not picking up data from other runs) + self.profile_config.dynamo_metrics.workflow_start_timestamp = workflow_start + self.profile_config.dynamo_metrics.workflow_end_timestamp = workflow_end + workflow_duration = workflow_end - 
workflow_start + logger.info("Workflow time window: %.1f seconds (%.2f to %.2f) - metrics isolated to this eval run", + workflow_duration, + workflow_start, + workflow_end) + dynamo_metrics_results = await collect_dynamo_metrics(self.profile_config.dynamo_metrics) if dynamo_metrics_results.errors: logger.warning("Dynamo metrics collection had errors: %s", dynamo_metrics_results.errors) From 2585adcf77db8a9ef6763048b3280d40fa4b66a8 Mon Sep 17 00:00:00 2001 From: bbednarski9 Date: Tue, 27 Jan 2026 21:25:54 +0000 Subject: [PATCH 07/13] vllm support and minimal KC cache control test case Signed-off-by: bbednarski9 --- .pre-commit-config.yaml | 3 +- external/dynamo/.env.example | 4 +- external/dynamo/monitor_dynamo.sh | 8 +- external/dynamo/monitoring/README.md | 116 +- external/dynamo/monitoring/docker-compose.yml | 1 + .../dashboards/json/dynamo-overview.json | 117 +- external/dynamo/monitoring/prometheus.yml | 11 + .../dynamo/monitoring/rules/vllm-aliases.yml | 70 ++ external/dynamo/monitoring/scripts/README.md | 189 +++ .../monitoring/scripts/cache_experiment.sh | 218 ++++ .../monitoring/scripts/kv_event_observer.py | 511 ++++++++ external/dynamo/optimized/ARCHITECTURE.md | 9 + external/dynamo/optimized/config.yaml | 5 + external/dynamo/optimized/processor.py | 18 +- external/dynamo/optimized/router.py | 16 +- external/dynamo/start_dynamo_disagg.sh | 2 +- ...dynamo_optimized_thompson_hints_sglang.sh} | 1 + ...rt_dynamo_optimized_thompson_hints_vllm.sh | 1023 +++++++++++++++++ external/dynamo/start_dynamo_unified.sh | 2 +- .../start_dynamo_unified_thompson_hints.sh | 2 +- external/dynamo/stop_dynamo.sh | 30 +- 21 files changed, 2255 insertions(+), 101 deletions(-) create mode 100644 external/dynamo/monitoring/rules/vllm-aliases.yml create mode 100644 external/dynamo/monitoring/scripts/README.md create mode 100755 external/dynamo/monitoring/scripts/cache_experiment.sh create mode 100755 external/dynamo/monitoring/scripts/kv_event_observer.py rename 
external/dynamo/{start_dynamo_optimized_thompson_hints.sh => start_dynamo_optimized_thompson_hints_sglang.sh} (99%) create mode 100755 external/dynamo/start_dynamo_optimized_thompson_hints_vllm.sh diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fb2bc8368e..ac451bc0fa 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,7 +38,8 @@ repos: name: Clear Jupyter Notebook Output Cells entry: ci/scripts/clear_notebook_output_cells.sh files: "\\.ipynb$" - language: unsupported_script + language: python + additional_dependencies: ["nbconvert"] - repo: https://github.com/tcort/markdown-link-check rev: v3.14.1 diff --git a/external/dynamo/.env.example b/external/dynamo/.env.example index c55c6e523c..a9370b2d25 100644 --- a/external/dynamo/.env.example +++ b/external/dynamo/.env.example @@ -57,8 +57,8 @@ export DYNAMO_GPU_DEVICES=0,1,2,3 # ============================================================================= # HTTP port for Dynamo frontend API -# Default: 8099 -# DYNAMO_HTTP_PORT=8099 +# Default: 8000 +DYNAMO_HTTP_PORT=8000 # ETCD client port for metadata and discovery # Default: 2379 diff --git a/external/dynamo/monitor_dynamo.sh b/external/dynamo/monitor_dynamo.sh index 156323f698..f3dffbfba2 100755 --- a/external/dynamo/monitor_dynamo.sh +++ b/external/dynamo/monitor_dynamo.sh @@ -124,9 +124,9 @@ case $option in ;; 6) print_header "Health Check" - echo "Testing: http://localhost:8099/health" + echo "Testing: http://localhost:8000/health" echo "" - http_code=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:8099/health 2>&1) + http_code=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:8000/health 2>&1) if [ "$http_code" == "200" ]; then print_status "ok" "Health check passed (HTTP $http_code)" else @@ -135,9 +135,9 @@ case $option in ;; 7) print_header "Test Basic Inference" - echo "Sending test request to http://localhost:8099/v1/chat/completions" + echo "Sending test request to 
http://localhost:8000/v1/chat/completions" echo "" - response=$(curl -s http://localhost:8099/v1/chat/completions \ + response=$(curl -s http://localhost:8000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "llama-3.1-8b", diff --git a/external/dynamo/monitoring/README.md b/external/dynamo/monitoring/README.md index 6f2ae2e7bb..910b02115f 100644 --- a/external/dynamo/monitoring/README.md +++ b/external/dynamo/monitoring/README.md @@ -2,6 +2,17 @@ This directory contains a Prometheus + Grafana monitoring setup for the Dynamo LLM inference stack with Thompson Sampling router. +## Supported Backends + +The monitoring stack supports both **SGLang** and **vLLM** backends: + +| Backend | Metric Prefix | Startup Script | Features | +|---------|---------------|----------------|----------| +| SGLang | `sglang:` | `start_dynamo_optimized_thompson_hints_sglang.sh` | Fast inference | +| vLLM | `vllm:` | `start_dynamo_optimized_thompson_hints_vllm.sh` | Native KVBM support | + +The Grafana dashboard includes a **Backend** dropdown selector to switch between SGLang and vLLM metrics dynamically. + ## Quick Start ```bash @@ -12,12 +23,14 @@ docker compose up -d # Access the dashboards # Prometheus: http://localhost:9090 # Grafana: http://localhost:3000 (admin/admin) + +# In Grafana, use the "Backend" dropdown to select sglang or vllm ``` ## Prerequisites - Docker and Docker Compose -- Dynamo stack running (see `../start_dynamo_optimized_thompson_hints.sh`) +- Dynamo stack running (see `../start_dynamo_optimized_thompson_hints_sglang.sh` or `../start_dynamo_optimized_thompson_hints_vllm.sh`) ## Architecture @@ -87,7 +100,9 @@ User-facing HTTP API metrics for latency, throughput, and token statistics. ### Worker Metrics (`:8081/metrics`) -SGLang backend worker metrics including KV cache, scheduling, and internal statistics. +Backend worker metrics including KV cache, scheduling, and internal statistics. 
Both SGLang and vLLM expose similar metrics with different prefixes: +- **SGLang**: Metrics prefixed with `sglang:` (e.g., `sglang:cache_hit_rate`) +- **vLLM**: Metrics prefixed with `vllm:` (e.g., `vllm:cache_hit_rate`) #### Dynamo Component Metrics @@ -102,16 +117,25 @@ SGLang backend worker metrics including KV cache, scheduling, and internal stati | `dynamo_component_` | `dynamo_component_inflight_requests` | Gauge | Requests currently in worker | | `dynamo_component_` | `dynamo_component_uptime_seconds` | Gauge | Worker uptime | -#### SGLang Native Metrics +#### Backend Native Metrics + +Both SGLang and vLLM expose similar native metrics with their respective prefixes. Use the `${backend}` variable in the Grafana dashboard to switch between them. + +**Common metrics across both backends:** + +| Metric (use `${backend}:` prefix) | Type | Description | +|-----------------------------------|------|-------------| +| `cache_hit_rate` | Gauge | Prefix cache hit rate | +| `token_usage` | Gauge | Current token usage | +| `num_running_reqs` | Gauge | Currently running requests | +| `num_queue_reqs` | Gauge | Queued requests | +| `num_used_tokens` | Gauge | Tokens currently in use | +| `gen_throughput` | Gauge | Generation throughput | + +**SGLang-specific metrics:** | Prefix | Full Metric Name | Type | Description | |--------|------------------|------|-------------| -| `sglang:` | `sglang:cache_hit_rate` | Gauge | Prefix cache hit rate | -| `sglang:` | `sglang:token_usage` | Gauge | Current token usage | -| `sglang:` | `sglang:num_running_reqs` | Gauge | Currently running requests | -| `sglang:` | `sglang:num_queue_reqs` | Gauge | Queued requests | -| `sglang:` | `sglang:num_used_tokens` | Gauge | Tokens currently in use | -| `sglang:` | `sglang:gen_throughput` | Gauge | Generation throughput | | `sglang:` | `sglang:utilization` | Gauge | GPU utilization | | `sglang:` | `sglang:queue_time_seconds` | Histogram | Time spent in queue | | `sglang:` | 
`sglang:per_stage_req_latency_seconds` | Histogram | Per-stage request latency | @@ -120,6 +144,17 @@ SGLang backend worker metrics including KV cache, scheduling, and internal stati | `sglang:` | `sglang:engine_startup_time` | Gauge | Engine startup duration | | `sglang:` | `sglang:engine_load_weights_time` | Gauge | Model weight loading time | +**vLLM-specific metrics:** + +| Prefix | Full Metric Name | Type | Description | +|--------|------------------|------|-------------| +| `vllm:` | `vllm:gpu_cache_usage_perc` | Gauge | GPU KV cache usage percentage | +| `vllm:` | `vllm:cpu_cache_usage_perc` | Gauge | CPU KV cache usage percentage | +| `vllm:` | `vllm:num_requests_running` | Gauge | Currently running requests | +| `vllm:` | `vllm:num_requests_waiting` | Gauge | Waiting requests in queue | +| `vllm:` | `vllm:generation_tokens_total` | Counter | Total generation tokens | +| `vllm:` | `vllm:prompt_tokens_total` | Counter | Total prompt tokens | + ### Router Metrics (`:8082/metrics`) Dynamo component metrics for the Thompson Sampling router (uses standard `dynamo_component_*` prefix). @@ -172,7 +207,9 @@ TotalWork = cached_prompt_blocks * block_size w_hit = (w_gpu_hit, w_cpu_hit, w_disk_hit) # weights per hit source ``` -Since CPU/disk hit metrics are not available in SGLang (KVBM not yet supported), we use a **simplified KVES proxy**: +Since full KVES requires GPU/CPU/disk hit breakdowns, we use a **simplified KVES proxy** based on cache hit rate: + +**Note**: vLLM with KVBM enabled provides richer KV cache metrics than SGLang. ```promql # KVES Proxy (using SGLang native metric - RECOMMENDED) @@ -186,13 +223,15 @@ sglang:cache_hit_rate * 100 > `cached_tokens` in its API responses. The processor's `thompson_kve_*` counters will show 0 > unless the underlying engine provides `usage.prompt_tokens_details.cached_tokens`. 
-> **Note on Full KVES**: To implement the full KVES equation with CPU/disk hit weights, you would need -> to switch to vLLM with KVBM enabled, which provides GPU→CPU→Disk tiered caching with proper metrics. +> **Note on Full KVES**: To implement the full KVES equation with CPU/disk hit weights, use +> vLLM with KVBM enabled, which provides GPU→CPU→Disk tiered caching with proper metrics. ## KV Cache Metrics Status This section documents the working status of all KV cache related metrics across the Dynamo stack. +**Backend Selection**: The Grafana dashboard uses a `${backend}` template variable. Select `sglang` or `vllm` from the dropdown to switch all backend-specific queries. + ### Working Metrics ✓ | Prefix | Full Metric Name | Status | Description | @@ -238,27 +277,50 @@ This section documents the working status of all KV cache related metrics across ### Recommended KV Cache Queries +The following queries use `${backend}` variable (set to `sglang` or `vllm` in Grafana): + ```promql -# KV Cache Memory Usage % (RECOMMENDED - this actually works!) -sglang:token_usage * 100 +# KV Cache Memory Usage % (RECOMMENDED - works with both backends!) +${backend}:token_usage * 100 # Absolute tokens in KV cache -sglang:num_used_tokens +${backend}:num_used_tokens # Total KV cache capacity (blocks) dynamo_component_kvstats_total_blocks # Prefix Cache Hit Rate % (may be 0 without repeated prefix queries) -sglang:cache_hit_rate * 100 +${backend}:cache_hit_rate * 100 # Token throughput -sglang:gen_throughput +${backend}:gen_throughput +``` + +**Direct queries** (without variable): +```promql +# SGLang specific +sglang:token_usage * 100 +sglang:cache_hit_rate * 100 + +# vLLM specific +vllm:token_usage * 100 +vllm:cache_hit_rate * 100 ``` ## Grafana Dashboard The pre-configured dashboard "Dynamo LLM Overview" includes: +### Backend Selector + +The dashboard includes a **Backend** dropdown variable at the top. 
Select: +- **sglang** - For SGLang workers (metrics prefixed with `sglang:`) +- **vllm** - For vLLM workers (metrics prefixed with `vllm:`) + +All backend-specific panels automatically update based on your selection. + +### Dashboard Panels + 1. **Inflight Requests** - Current load across all components 2. **Requests/min** - Throughput 3. **Time to First Token (P95)** - Latency to start generating @@ -269,9 +331,9 @@ The pre-configured dashboard "Dynamo LLM Overview" includes: 8. **KV Cache Usage** - Memory usage % and prefix cache hit rate % over time 9. **KV Cache Tokens & Throughput** - Absolute token count and generation throughput 10. **KV Cache Details (Per-Worker)** - Detailed per-worker metrics including: - - KVES: Prefix hit rate (%) - `avg_over_time(sglang:cache_hit_rate[1m]) * 100` - - KV Usage (%) - `avg_over_time(sglang:token_usage[1m]) * 100` - - KV Tokens Used - `last_over_time(sglang:num_used_tokens[1m])` + - KVES: Prefix hit rate (%) - `avg_over_time(${backend}:cache_hit_rate[1m]) * 100` + - KV Usage (%) - `avg_over_time(${backend}:token_usage[1m]) * 100` + - KV Tokens Used - `last_over_time(${backend}:num_used_tokens[1m])` - KV Capacity (blocks) - `last_over_time(dynamo_component_kvstats_total_blocks[1m])` - Frontend Block Size - `last_over_time(dynamo_frontend_model_kv_cache_block_size[5m])` 11. 
**KVES Proxy by Worker** - Color-coded efficiency score per worker (0-1 scale) @@ -279,16 +341,16 @@ The pre-configured dashboard "Dynamo LLM Overview" includes: ### Thompson Sampling Panels (Included) -The dashboard includes these Thompson Sampling and SGLang monitoring panels: +The dashboard includes these Thompson Sampling and worker monitoring panels: - **Routing Decisions/sec** - `rate(dynamo_component_thompson_routing_decisions_total[5m])` -- **SGLang Queue Depth** - `sglang:num_queue_reqs` + `sglang:num_running_reqs` -- **Worker Utilization** - `sglang:utilization` + `sglang:token_usage` +- **Worker Queue Depth** - `${backend}:num_queue_reqs` +- **Worker Activity** - `${backend}:num_running_reqs` -> **Note on KV Cache Metrics**: The dashboard uses SGLang's native metrics (`sglang:token_usage`, -> `sglang:cache_hit_rate`, `sglang:num_used_tokens`) which are reliably populated. The Dynamo-specific -> `dynamo_component_kvstats_*` metrics are not populated by the SGLang backend. See the -> "KV Cache Metrics Status" section above for detailed metric availability. +> **Note on KV Cache Metrics**: The dashboard uses backend-native metrics (`${backend}:token_usage`, +> `${backend}:cache_hit_rate`, `${backend}:num_used_tokens`) which are reliably populated by both +> SGLang and vLLM. The Dynamo-specific `dynamo_component_kvstats_*` metrics may not be populated +> depending on your backend configuration. See the "KV Cache Metrics Status" section above for details. 
## Files diff --git a/external/dynamo/monitoring/docker-compose.yml b/external/dynamo/monitoring/docker-compose.yml index 3f70780954..c66355919b 100644 --- a/external/dynamo/monitoring/docker-compose.yml +++ b/external/dynamo/monitoring/docker-compose.yml @@ -18,6 +18,7 @@ services: network_mode: host volumes: - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ./rules:/etc/prometheus/rules:ro - prometheus_data:/prometheus command: - '--config.file=/etc/prometheus/prometheus.yml' diff --git a/external/dynamo/monitoring/grafana/provisioning/dashboards/json/dynamo-overview.json b/external/dynamo/monitoring/grafana/provisioning/dashboards/json/dynamo-overview.json index 801b2a7d0d..8852d819f1 100644 --- a/external/dynamo/monitoring/grafana/provisioning/dashboards/json/dynamo-overview.json +++ b/external/dynamo/monitoring/grafana/provisioning/dashboards/json/dynamo-overview.json @@ -50,7 +50,7 @@ "targets": [ { "expr": "dynamo_frontend_inflight_requests", - "legendFormat": "Inflight Requests (dynamo_frontend_inflight_requests)", + "legendFormat": "Inflight Requests", "refId": "A" } ], @@ -96,7 +96,7 @@ "targets": [ { "expr": "sum(increase(dynamo_frontend_requests_total[5s]))", - "legendFormat": "Total Requests/min (dynamo_frontend_requests_total)", + "legendFormat": "Total Requests/min", "refId": "A" } ], @@ -144,7 +144,7 @@ "targets": [ { "expr": "histogram_quantile(0.95, rate(dynamo_frontend_time_to_first_token_seconds_bucket[5s]))", - "legendFormat": "P95 TTFT (dynamo_frontend_time_to_first_token_seconds)", + "legendFormat": "P95 TTFT", "refId": "A" } ], @@ -189,8 +189,8 @@ "pluginVersion": "10.2.2", "targets": [ { - "expr": "sglang:cache_hit_rate * 100", - "legendFormat": "Cache Hit Rate ({{instance}}) [sglang:cache_hit_rate * 100]", + "expr": "${backend}:cache_hit_rate * 100", + "legendFormat": "Cache Hit Rate ({{instance}})", "refId": "A" } ], @@ -245,17 +245,17 @@ "targets": [ { "expr": "histogram_quantile(0.5, 
rate(dynamo_frontend_time_to_first_token_seconds_bucket[5s]))", - "legendFormat": "P50 (dynamo_frontend_time_to_first_token_seconds)", + "legendFormat": "P50", "refId": "A" }, { "expr": "histogram_quantile(0.95, rate(dynamo_frontend_time_to_first_token_seconds_bucket[5s]))", - "legendFormat": "P95 (dynamo_frontend_time_to_first_token_seconds)", + "legendFormat": "P95", "refId": "B" }, { "expr": "histogram_quantile(0.99, rate(dynamo_frontend_time_to_first_token_seconds_bucket[5s]))", - "legendFormat": "P99 (dynamo_frontend_time_to_first_token_seconds)", + "legendFormat": "P99", "refId": "C" } ], @@ -310,17 +310,17 @@ "targets": [ { "expr": "histogram_quantile(0.5, rate(dynamo_frontend_inter_token_latency_seconds_bucket[5s]))", - "legendFormat": "P50 (dynamo_frontend_inter_token_latency_seconds)", + "legendFormat": "P50", "refId": "A" }, { "expr": "histogram_quantile(0.95, rate(dynamo_frontend_inter_token_latency_seconds_bucket[5s]))", - "legendFormat": "P95 (dynamo_frontend_inter_token_latency_seconds)", + "legendFormat": "P95", "refId": "B" }, { "expr": "histogram_quantile(0.99, rate(dynamo_frontend_inter_token_latency_seconds_bucket[5s]))", - "legendFormat": "P99 (dynamo_frontend_inter_token_latency_seconds)", + "legendFormat": "P99", "refId": "C" } ], @@ -374,18 +374,18 @@ "pluginVersion": "10.2.2", "targets": [ { - "expr": "sglang:gen_throughput", - "legendFormat": "Worker ({{instance}}) [sglang:gen_throughput]", + "expr": "${backend}:gen_throughput", + "legendFormat": "Worker ({{instance}}) [gen_throughput]", "refId": "A" }, { - "expr": "sum(sglang:gen_throughput)", - "legendFormat": "Total Workers (sum) [sum(sglang:gen_throughput)]", + "expr": "sum(${backend}:gen_throughput)", + "legendFormat": "Total Workers (sum)", "refId": "C" }, { "expr": "rate(dynamo_frontend_output_tokens_total{job=\"dynamo-frontend\"}[5s])", - "legendFormat": "Frontend Output (delivered) [rate(dynamo_frontend_output_tokens_total[5s])]", + "legendFormat": "Frontend Output (delivered)", 
"refId": "B" } ], @@ -439,14 +439,9 @@ "pluginVersion": "10.2.2", "targets": [ { - "expr": "sglang:token_usage * 100", - "legendFormat": "KV Cache % ({{instance}}) [sglang:token_usage * 100]", + "expr": "${backend}:token_usage * 100", + "legendFormat": "KV Cache % ({{instance}}) [token_usage]", "refId": "A" - }, - { - "expr": "sglang:cache_hit_rate * 100", - "legendFormat": "Prefix Cache % ({{instance}}) [sglang:cache_hit_rate * 100]", - "refId": "B" } ], "title": "KV Cache Usage", @@ -499,8 +494,8 @@ "pluginVersion": "10.2.2", "targets": [ { - "expr": "sglang:num_used_tokens", - "legendFormat": "Tokens in KV Cache ({{instance}}) [sglang:num_used_tokens]", + "expr": "${backend}:num_used_tokens", + "legendFormat": "Tokens in KV Cache ({{instance}}) [num_used_tokens]", "refId": "A" } ], @@ -555,32 +550,32 @@ "targets": [ { "expr": "sum(rate(dynamo_frontend_requests_total[5s]))", - "legendFormat": "1. Frontend (total) [sum(rate(dynamo_frontend_requests_total[5s]))]", + "legendFormat": "1. Frontend (total)", "refId": "A" }, { "expr": "sum(rate(dynamo_component_requests_total{dynamo_namespace=\"dynamo\",dynamo_component=\"backend\"}[5s]))", - "legendFormat": "2. Processor (backend) [sum(rate(dynamo_component_requests_total{backend}[5s]))]", + "legendFormat": "2. Processor (backend)", "refId": "B" }, { "expr": "sum(rate(dynamo_component_requests_total{dynamo_namespace=\"dynamo\",dynamo_component=\"router\",dynamo_endpoint=\"find_worker\"}[5s]))", - "legendFormat": "3. Router (find_worker) [sum(rate(dynamo_component_requests_total{router}[5s]))]", + "legendFormat": "3. Router (find_worker)", "refId": "C" }, { "expr": "rate(dynamo_component_requests_total{dynamo_namespace=\"workers\",dynamo_component=\"worker\",instance=\"localhost:18081\"}[5s])", - "legendFormat": "4. Worker 0 (18081) [rate(dynamo_component_requests_total{worker:18081}[5s])]", + "legendFormat": "4. 
Worker 0 (18081)", "refId": "D" }, { "expr": "rate(dynamo_component_requests_total{dynamo_namespace=\"workers\",dynamo_component=\"worker\",instance=\"localhost:18082\"}[5s])", - "legendFormat": "4. Worker 1 (18082) [rate(dynamo_component_requests_total{worker:18082}[5s])]", + "legendFormat": "4. Worker 1 (18082)", "refId": "E" }, { "expr": "sum(rate(dynamo_component_requests_total{dynamo_namespace=\"workers\",dynamo_component=\"worker\"}[5s]))", - "legendFormat": "4. Workers (total) [sum(rate(dynamo_component_requests_total{workers}[5s]))]", + "legendFormat": "4. Workers (total)", "refId": "F" } ], @@ -638,12 +633,12 @@ "pluginVersion": "10.2.2", "targets": [ { - "expr": "sglang:num_queue_reqs", - "legendFormat": "Queue Depth ({{instance}}) [sglang:num_queue_reqs]", + "expr": "${backend}:num_queue_reqs", + "legendFormat": "Queue Depth ({{instance}}) [num_queue_reqs]", "refId": "A" } ], - "title": "SGLang Queue Depth", + "title": "Worker Queue Depth", "type": "timeseries" }, { @@ -693,12 +688,12 @@ "pluginVersion": "10.2.2", "targets": [ { - "expr": "sglang:num_running_reqs", - "legendFormat": "Running Requests ({{instance}}) [sglang:num_running_reqs]", + "expr": "${backend}:num_running_reqs", + "legendFormat": "Running Requests ({{instance}}) [num_running_reqs]", "refId": "A" } ], - "title": "Worker Activity (SGLang)", + "title": "Worker Activity (Running Requests)", "type": "timeseries" }, { @@ -775,17 +770,17 @@ "pluginVersion": "10.2.2", "targets": [ { - "expr": "avg_over_time(sglang:cache_hit_rate[1m]) * 100", + "expr": "avg_over_time(${backend}:cache_hit_rate[1m]) * 100", "legendFormat": "KVES: Prefix Hit Rate % ({{instance}})", "refId": "A" }, { - "expr": "avg_over_time(sglang:token_usage[1m]) * 100", + "expr": "avg_over_time(${backend}:token_usage[1m]) * 100", "legendFormat": "KV Usage % ({{instance}})", "refId": "B" }, { - "expr": "last_over_time(sglang:num_used_tokens[1m])", + "expr": "last_over_time(${backend}:num_used_tokens[1m])", "legendFormat": "KV 
Tokens Used ({{instance}})", "refId": "C" }, @@ -808,7 +803,7 @@ "type": "prometheus", "uid": "prometheus" }, - "description": "KV Cache Efficiency Score (KVES) proxy using SGLang's native prefix cache hit rate. KVES ∈ [0,1]: 0 = no cache benefit, 1 = full reuse. This is a simplified proxy for the full KVES equation (which requires CPU/disk hit metrics not currently available in SGLang).", + "description": "KV Cache Efficiency Score (KVES) proxy using the backend's native prefix cache hit rate. KVES ∈ [0,1]: 0 = no cache benefit, 1 = full reuse. This is a simplified proxy for the full KVES equation (which requires CPU/disk hit metrics not currently available in all backends).", "fieldConfig": { "defaults": { "color": { @@ -864,7 +859,7 @@ "pluginVersion": "10.2.2", "targets": [ { - "expr": "sglang:cache_hit_rate", + "expr": "${backend}:cache_hit_rate", "legendFormat": "Worker ({{instance}})", "refId": "A" } @@ -932,7 +927,7 @@ "pluginVersion": "10.2.2", "targets": [ { - "expr": "sglang:token_usage * 100", + "expr": "${backend}:token_usage * 100", "legendFormat": "Worker ({{instance}})", "refId": "A" } @@ -944,9 +939,39 @@ "refresh": "2s", "schemaVersion": 38, "style": "dark", - "tags": ["dynamo", "llm", "inference"], + "tags": ["dynamo", "llm", "inference", "sglang", "vllm"], "templating": { - "list": [] + "list": [ + { + "current": { + "selected": true, + "text": "vllm", + "value": "vllm" + }, + "description": "Backend inference engine (sglang or vllm). 
Metrics are prefixed with this value.", + "hide": 0, + "includeAll": false, + "label": "Backend", + "multi": false, + "name": "backend", + "options": [ + { + "selected": false, + "text": "sglang", + "value": "sglang" + }, + { + "selected": true, + "text": "vllm", + "value": "vllm" + } + ], + "query": "vllm,sglang", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" + } + ] }, "time": { "from": "now-15m", @@ -959,5 +984,3 @@ "version": 1, "weekStart": "" } - - diff --git a/external/dynamo/monitoring/prometheus.yml b/external/dynamo/monitoring/prometheus.yml index abeaa3187b..1d2bf4be3a 100644 --- a/external/dynamo/monitoring/prometheus.yml +++ b/external/dynamo/monitoring/prometheus.yml @@ -3,6 +3,11 @@ # Prometheus configuration for Dynamo metrics collection # +# Supports both SGLang and vLLM backends: +# - SGLang metrics use 'sglang:' prefix (e.g., sglang:cache_hit_rate) +# - vLLM metrics use 'vllm:' prefix (e.g., vllm:cache_hit_rate) +# - Grafana dashboard uses ${backend} variable to switch between them +# # Metrics Endpoints (using 18xxx range to avoid conflicts): # - Frontend (8000): User-facing latency, throughput, tokens # - Workers (18081-180xx): KV cache stats, NATS metrics, internal stats (one per worker) @@ -17,6 +22,11 @@ global: scrape_interval: 2s evaluation_interval: 2s +# Recording rules to create vLLM metric aliases that match the dashboard expectations +# This allows the same dashboard queries to work for both SGLang and vLLM backends +rule_files: + - /etc/prometheus/rules/*.yml + scrape_configs: # Dynamo Frontend metrics (user-facing latency, throughput) - job_name: 'dynamo-frontend' @@ -26,6 +36,7 @@ scrape_configs: scrape_interval: 2s # Dynamo Worker metrics (KV cache, internal stats) + # Works for both SGLang and vLLM backends - same ports, different metric prefixes # Multiple workers use sequential ports starting at 18081 # Add/remove targets based on your NUM_WORKERS setting - job_name: 'dynamo-worker' diff --git 
a/external/dynamo/monitoring/rules/vllm-aliases.yml b/external/dynamo/monitoring/rules/vllm-aliases.yml
new file mode 100644
index 0000000000..075f5c1d55
--- /dev/null
+++ b/external/dynamo/monitoring/rules/vllm-aliases.yml
@@ -0,0 +1,70 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Prometheus Recording Rules for vLLM Metric Aliases
+# ===================================================
+#
+# vLLM metrics have different names than SGLang metrics.
+# These recording rules create vLLM metrics with names that match
+# what the Grafana dashboard expects, enabling a single parameterized
+# dashboard to work with both backends.
+#
+# Dashboard variable: ${backend} = "sglang" or "vllm"
+# Dashboard queries: ${backend}:metric_name
+#
+# Metric Mapping:
+#   Dashboard Query        | vLLM Source Metric(s)
+#   -----------------------|---------------------------------------
+#   vllm:cache_hit_rate    | prefix_cache_hits / prefix_cache_queries
+#   vllm:gen_throughput    | rate(generation_tokens_total)
+#   vllm:token_usage       | kv_cache_usage_perc
+#   vllm:num_running_reqs  | num_requests_running (names differ; aliased below)
+# vllm:num_queue_reqs | num_requests_waiting +# vllm:num_used_tokens | (computed from kv_cache_usage_perc * total_blocks) + +groups: + - name: vllm_metric_aliases + interval: 2s + rules: + # Cache hit rate: prefix_cache_hits / prefix_cache_queries + # This matches SGLang's cache_hit_rate metric + - record: "vllm:cache_hit_rate" + expr: | + ( + sum by (instance, model_name) (vllm:prefix_cache_hits_total) + / + clamp_min(sum by (instance, model_name) (vllm:prefix_cache_queries_total), 1) + ) + + # Generation throughput: rate of generation tokens + # This matches SGLang's gen_throughput metric + - record: "vllm:gen_throughput" + expr: | + rate(vllm:generation_tokens_total[5s]) + + # Token usage percentage: direct alias for kv_cache_usage_perc + # This matches SGLang's token_usage metric + - record: "vllm:token_usage" + expr: | + vllm:kv_cache_usage_perc + + # Number of requests in queue: alias for num_requests_waiting + # This matches SGLang's num_queue_reqs metric + - record: "vllm:num_queue_reqs" + expr: | + vllm:num_requests_waiting + + # Note: vllm:num_requests_running is already the correct name, + # but SGLang uses num_running_reqs. Create an alias. + - record: "vllm:num_running_reqs" + expr: | + vllm:num_requests_running + + # Number of used tokens: estimate from cache usage percentage + # Note: This is an approximation since vLLM doesn't expose exact token count + # The dashboard may need adjustment for accurate display + - record: "vllm:num_used_tokens" + expr: | + vllm:kv_cache_usage_perc * 100000 + + diff --git a/external/dynamo/monitoring/scripts/README.md b/external/dynamo/monitoring/scripts/README.md new file mode 100644 index 0000000000..65e347e6ba --- /dev/null +++ b/external/dynamo/monitoring/scripts/README.md @@ -0,0 +1,189 @@ +# KV Cache Event Observer + +Real-time monitoring of vLLM prefix cache events (block stored, evicted, and cache hits). 
+ +## Quick Start + +```bash +# Basic monitoring (ZMQ events only) +docker exec -it dynamo-vllm python /workspace/monitoring/scripts/kv_event_observer.py -p 20080 -v + +# With cache hit detection (polls Prometheus metrics) +docker exec -it dynamo-vllm python /workspace/monitoring/scripts/kv_event_observer.py -p 20080 -v -m 18081 +``` + +## Usage + +```bash +# Basic verbose monitoring +docker exec -it dynamo-vllm python /workspace/monitoring/scripts/kv_event_observer.py -p 20080 -v + +# With cache hit detection (recommended for experiments) +docker exec -it dynamo-vllm python /workspace/monitoring/scripts/kv_event_observer.py -p 20080 -v -m 18081 + +# Run for 60 seconds +docker exec -it dynamo-vllm python /workspace/monitoring/scripts/kv_event_observer.py -p 20080 -v -d 60 + +# Save events to file +docker exec -it dynamo-vllm python /workspace/monitoring/scripts/kv_event_observer.py -p 20080 -v -o /tmp/events.jsonl + +# Monitor worker 1 (port 20081, metrics 18082) +docker exec -it dynamo-vllm python /workspace/monitoring/scripts/kv_event_observer.py -p 20081 -v -m 18082 +``` + +## Options + +| Flag | Description | +|------|-------------| +| `-p`, `--port` | KV event ZMQ port (default: 20080, worker 1 = 20081, etc.) | +| `-m`, `--metrics-port` | Prometheus metrics port for cache hit detection (e.g., 18081) | +| `-v`, `--verbose` | Print each event as it happens | +| `-d`, `--duration` | Run for N seconds then stop | +| `-o`, `--output` | Save events to JSONL file | +| `-H`, `--host` | Worker host (default: localhost) | + +## Event Types + +| Symbol | Event | Source | Description | +|--------|-------|--------|-------------| +| 📦 | STORED | ZMQ | Block committed to prefix cache | +| 🗑️ | REMOVED | ZMQ | Block evicted from cache | +| 🧹 | CLEARED | ZMQ | Entire cache cleared | +| ✅ | CACHE HIT | Metrics | Tokens served from cache (requires `-m`) | + +## Example Output + +``` +[KV Observer] Listening for KV events (msgpack multipart)... 
+[KV Observer] Cache hits will show as ✅ [CACHE HIT] +------------------------------------------------------------ +📦 [STORED ] seq= 32 hash=df6f76832e34d5f5 tokens= 64 medium=GPU +🗑️ [REMOVED ] seq= 33 hash=eaacc201f3aaf753 medium=GPU +✅ [CACHE HIT] tokens= 64 queried= 128 hit_rate=50% +📦 [STORED ] seq= 34 hash=df6f76832e34d5f5 tokens= 64 medium=GPU +------------------------------------------------------------ +[KV Observer] Final Statistics: + stored_blocks: 2 + evicted_blocks: 1 + net_blocks: 1 + cache_hit_tokens: 64 + cache_query_tokens: 192 + cache_hit_rate: 33.3% +``` + +## Notes + +- **STORED/REMOVED events**: Published via ZMQ when cache state changes +- **CACHE HIT events**: Detected by polling Prometheus metrics (requires `-m` flag) +- **No event = cache hit**: If a repeated query shows no STORED event, the block was already cached +- Events only fire for **full blocks** (64 tokens with default block size) +- Short prompts (less than 64 tokens) may not generate STORED events for incomplete blocks +- With limited cache (e.g., 16 blocks), expect frequent evictions +- **Clearing the cache**: vLLM does not expose a direct cache clear API. To fully clear the cache, restart the vLLM worker. Alternatively, use `--flush` with the experiment script to fill the cache with unique queries, pushing out old entries via LRU eviction. + +## Port Mapping + +| Worker | ZMQ Port (`-p`) | Metrics Port (`-m`) | +|--------|-----------------|---------------------| +| Worker 0 | 20080 | 18081 | +| Worker 1 | 20081 | 18082 | +| Worker 2 | 20082 | 18083 | + +## Manual Cache Lifecycle Experiment + +This experiment demonstrates the full KV cache lifecycle: **STORE → STORE → EVICT → STORE → CACHE HIT**. + +### Setup + +```bash +# 1. Stop any running Dynamo stack +bash stop_dynamo.sh + +# 2. Configure limited cache for experiment (5 blocks) +export DYNAMO_GPU_DEVICES=0,1,2,3 +export DYNAMO_TP_SIZE=4 +export DYNAMO_KV_BLOCK_SIZE=64 +export DYNAMO_NUM_GPU_BLOCKS_OVERRIDE=5 + +# 3. 
Start vLLM with KV events enabled +bash start_dynamo_optimized_thompson_hints_vllm.sh > startup_output.txt + +# 4. In a separate terminal, start the observer +docker exec -it dynamo-vllm python /workspace/monitoring/scripts/kv_event_observer.py -p 20080 -v -m 18081 +``` + +### Run Queries + +Queries must be **65+ tokens** (including chat template) to generate cache events: + +```bash +# Query A (70 tokens) - STORE +curl -s http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" \ + -d '{"model":"llama-3.3-70b","messages":[{"role":"user","content":"Query A: The quick brown fox jumps over the lazy dog repeatedly. The quick brown fox jumps over the lazy dog repeatedly. The quick brown fox jumps over the lazy dog."}],"max_tokens":5}' + +# Query B (72 tokens) - STORE +curl -s http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" \ + -d '{"model":"llama-3.3-70b","messages":[{"role":"user","content":"Query B: Pack my box with five dozen liquor jugs today please. Pack my box with five dozen liquor jugs today please. Pack my box with five dozen liquor jugs."}],"max_tokens":5}' + +# Query C (76 tokens) - EVICT A, STORE C +curl -s http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" \ + -d '{"model":"llama-3.3-70b","messages":[{"role":"user","content":"Query C: How vexingly quick daft zebras jump over the moon tonight. How vexingly quick daft zebras jump over the moon tonight. How vexingly quick daft zebras jump."}],"max_tokens":5}' + +# Query C again - CACHE HIT +curl -s http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" \ + -d '{"model":"llama-3.3-70b","messages":[{"role":"user","content":"Query C: How vexingly quick daft zebras jump over the moon tonight. How vexingly quick daft zebras jump over the moon tonight. 
How vexingly quick daft zebras jump."}],"max_tokens":5}' +``` + +### Expected Observer Output + +``` +📦 [STORED ] seq= 0 hash=ca596e30d283c6f7 tokens= 64 medium=GPU ← Query A +📦 [STORED ] seq= 1 hash=41ccc959d03a1d21 tokens= 64 medium=GPU ← Query B +🗑️ [REMOVED ] seq= 2 hash=ca596e30d283c6f7 medium=GPU ← Query A evicted (LRU) +📦 [STORED ] seq= 2 hash=b5291e07de5d51cc tokens= 64 medium=GPU ← Query C +✅ [CACHE HIT] tokens= 64 queried= 76 hit_rate=84% ← Query C repeated +``` + +### Cache Size Guidelines + +| Blocks | Usable | Behavior | +|--------|--------|----------| +| 3-4 | ~0-1 | Constant thrashing, no cache benefit | +| 5-8 | ~2-4 | Good for demonstrating evictions + hits | +| 16+ | ~10+ | Production-like behavior | + +### Key Requirements + +- **Prompt length**: Must exceed 64 tokens (1 block) to generate STORED events +- **Cache size**: Use `DYNAMO_NUM_GPU_BLOCKS_OVERRIDE=5` to force evictions +- **Metrics flag**: Use `-m 18081` to detect cache hits (not published via ZMQ) + +## Cache Experiment Script + +Run a complete A → B → C → A cache experiment: + +```bash +# Basic experiment +./cache_experiment.sh + +# Flush cache first (recommended) +./cache_experiment.sh --flush + +# Verbose output (full API responses) +./cache_experiment.sh --flush --verbose +``` + +The script: +1. Optionally flushes the cache by filling it with unique queries +2. Starts the KV event observer in the background +3. Sends Query A (should STORE) +4. Sends Query B (should STORE) +5. Sends Query C (should STORE) +6. Sends Query A again (should show CACHE HIT) +7. 
Displays observer output and final statistics + +## Requirements + +- vLLM must be started with `--kv-events-config` containing `enable_kv_cache_events: true` +- The startup script `start_dynamo_optimized_thompson_hints_vllm.sh` configures this automatically when `DYNAMO_ENABLE_KV_EVENTS=true` + diff --git a/external/dynamo/monitoring/scripts/cache_experiment.sh b/external/dynamo/monitoring/scripts/cache_experiment.sh new file mode 100755 index 0000000000..77602d957b --- /dev/null +++ b/external/dynamo/monitoring/scripts/cache_experiment.sh @@ -0,0 +1,218 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# KV Cache Experiment Script +# +# Demonstrates prefix cache behavior with queries A → B → C → A +# Shows: STORED events, REMOVED (eviction) events, and CACHE HITs +# +# Usage: +# ./cache_experiment.sh # Run with defaults +# ./cache_experiment.sh --flush # Flush cache first by filling it +# ./cache_experiment.sh --verbose # Show full curl responses + +set -euo pipefail + +# Configuration +API_URL="${DYNAMO_API_URL:-http://localhost:8000}" +MODEL="${DYNAMO_MODEL_NAME:-llama-3.3-70b}" +ZMQ_PORT="${DYNAMO_KV_EVENT_PORT:-20080}" +METRICS_PORT="${DYNAMO_WORKER_METRICS_PORT:-18081}" +MAX_TOKENS=5 +VERBOSE=false +FLUSH_CACHE=false + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + --verbose|-v) VERBOSE=true; shift ;; + --flush|-f) FLUSH_CACHE=true; shift ;; + --help|-h) + echo "Usage: $0 [--verbose] [--flush]" + echo " --verbose, -v Show full API responses" + echo " --flush, -f Flush cache by filling it before experiment" + exit 0 + ;; + *) echo "Unknown option: $1"; exit 1 ;; + esac +done + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +CYAN='\033[0;36m' +NC='\033[0m' # No Color + +echo -e "${CYAN}=========================================================${NC}" +echo -e "${CYAN} KV Cache 
Experiment: A → B → C → A${NC}" +echo -e "${CYAN}=========================================================${NC}" +echo "" +echo -e "API: ${API_URL}" +echo -e "Model: ${MODEL}" +echo -e "ZMQ Port: ${ZMQ_PORT}" +echo -e "Metrics Port: ${METRICS_PORT}" +echo "" + +# Check if API is available +echo -e "${BLUE}Checking API availability...${NC}" +if ! curl -s --max-time 5 "${API_URL}/health" > /dev/null 2>&1; then + echo -e "${RED}ERROR: API not available at ${API_URL}${NC}" + echo "Make sure Dynamo is running: bash start_dynamo_optimized_thompson_hints_vllm.sh" + exit 1 +fi +echo -e "${GREEN}✓ API is available${NC}" +echo "" + +# Long prompts that will fill at least 1 complete block (64 tokens each) +# Each prompt is ~120+ tokens to ensure at least 1 full block is stored +QUERY_A="Query Alpha: Please provide a comprehensive and detailed explanation of quantum computing technology. Start by explaining what quantum bits (qubits) are and how they fundamentally differ from classical binary bits. Then thoroughly discuss the principle of quantum superposition and how it enables massive parallelism in quantum computations." + +QUERY_B="Query Beta: Please provide an in-depth explanation of machine learning and artificial intelligence. Begin by describing the fundamental differences between supervised, unsupervised, and reinforcement learning paradigms. Then explain neural network architectures including feedforward networks, convolutional neural networks, and transformers." + +QUERY_C="Query Charlie: Please provide a detailed overview of cloud computing infrastructure and services. Start by explaining the differences between Infrastructure as a Service (IaaS), Platform as a Service (PaaS), and Software as a Service (SaaS). Then discuss containerization technologies like Docker and Kubernetes orchestration." 
+ +# Function to send a query and display results +send_query() { + local name=$1 + local prompt=$2 + local color=$3 + + echo -e "${color}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" + echo -e "${color}Sending Query ${name}${NC}" + echo -e "${color}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" + + response=$(curl -s "${API_URL}/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d "{ + \"model\": \"${MODEL}\", + \"messages\": [{\"role\": \"user\", \"content\": \"${prompt}\"}], + \"max_tokens\": ${MAX_TOKENS} + }") + + if [ "$VERBOSE" = true ]; then + echo "$response" | jq . + else + prompt_tokens=$(echo "$response" | jq -r '.usage.prompt_tokens // "N/A"') + completion_tokens=$(echo "$response" | jq -r '.usage.completion_tokens // "N/A"') + echo -e " Prompt tokens: ${prompt_tokens}" + echo -e " Completion tokens: ${completion_tokens}" + fi + echo "" +} + +# Function to flush cache by sending many unique queries +flush_cache() { + echo -e "${YELLOW}Flushing cache by filling it with unique queries...${NC}" + echo -e "${YELLOW}(This may take a minute)${NC}" + echo "" + + for i in $(seq 1 20); do + curl -s "${API_URL}/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d "{ + \"model\": \"${MODEL}\", + \"messages\": [{\"role\": \"user\", \"content\": \"Flush query number ${i}: This is a unique cache flush query designed to evict existing cached blocks from the prefix cache. Random identifier: ${RANDOM}${RANDOM}${RANDOM}. 
Please provide a detailed explanation of topic ${i}.\"}], + \"max_tokens\": 1 + }" > /dev/null 2>&1 + echo -ne "\r Progress: ${i}/20" + done + echo -e "\n${GREEN}✓ Cache flushed${NC}" + echo "" +} + +# Get initial cache metrics +echo -e "${BLUE}Initial cache state:${NC}" +initial_hits=$(curl -s "http://localhost:${METRICS_PORT}/metrics" | grep "vllm:prefix_cache_hits_total{" | grep -oE '[0-9.]+$' || echo "0") +initial_queries=$(curl -s "http://localhost:${METRICS_PORT}/metrics" | grep "vllm:prefix_cache_queries_total{" | grep -oE '[0-9.]+$' || echo "0") +echo -e " Cache hits: ${initial_hits}" +echo -e " Cache queries: ${initial_queries}" +echo "" + +# Flush cache if requested +if [ "$FLUSH_CACHE" = true ]; then + flush_cache +fi + +# Start the KV event observer in the background +echo -e "${BLUE}Starting KV event observer...${NC}" +OBSERVER_LOG=$(mktemp) +docker exec dynamo-vllm python /workspace/monitoring/scripts/kv_event_observer.py \ + -p "${ZMQ_PORT}" -v -m "${METRICS_PORT}" -d 60 > "$OBSERVER_LOG" 2>&1 & +OBSERVER_PID=$! 
+sleep 2 +echo -e "${GREEN}✓ Observer started (PID: ${OBSERVER_PID})${NC}" +echo "" + +echo -e "${CYAN}=========================================================${NC}" +echo -e "${CYAN} Starting Query Sequence: A → B → C → A${NC}" +echo -e "${CYAN}=========================================================${NC}" +echo "" + +# Send queries with delays to allow event processing +send_query "A (first time)" "$QUERY_A" "$GREEN" +sleep 2 + +send_query "B" "$QUERY_B" "$YELLOW" +sleep 2 + +send_query "C" "$QUERY_C" "$RED" +sleep 2 + +send_query "A (repeated - expect cache hit)" "$QUERY_A" "$GREEN" +sleep 3 + +# Stop observer and show results +echo -e "${CYAN}=========================================================${NC}" +echo -e "${CYAN} Stopping Observer & Showing Results${NC}" +echo -e "${CYAN}=========================================================${NC}" +echo "" + +# Kill observer gracefully +kill $OBSERVER_PID 2>/dev/null || true +wait $OBSERVER_PID 2>/dev/null || true +sleep 1 + +# Display observer output +echo -e "${BLUE}KV Event Observer Output:${NC}" +echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" +cat "$OBSERVER_LOG" +echo "" + +# Get final cache metrics +echo -e "${BLUE}Final cache state:${NC}" +final_hits=$(curl -s "http://localhost:${METRICS_PORT}/metrics" | grep "vllm:prefix_cache_hits_total{" | grep -oE '[0-9.]+$' || echo "0") +final_queries=$(curl -s "http://localhost:${METRICS_PORT}/metrics" | grep "vllm:prefix_cache_queries_total{" | grep -oE '[0-9.]+$' || echo "0") +echo -e " Cache hits: ${final_hits} (delta: +$(echo "$final_hits - $initial_hits" | bc))" +echo -e " Cache queries: ${final_queries} (delta: +$(echo "$final_queries - $initial_queries" | bc))" +echo "" + +# Calculate hit rate for this experiment +delta_hits=$(echo "$final_hits - $initial_hits" | bc) +delta_queries=$(echo "$final_queries - $initial_queries" | bc) +if [ "$delta_queries" != "0" ]; then + hit_rate=$(echo "scale=1; $delta_hits * 100 / 
$delta_queries" | bc) + echo -e "${GREEN}Experiment hit rate: ${hit_rate}%${NC}" +fi + +# Cleanup +rm -f "$OBSERVER_LOG" + +echo "" +echo -e "${CYAN}=========================================================${NC}" +echo -e "${CYAN} Experiment Complete!${NC}" +echo -e "${CYAN}=========================================================${NC}" +echo "" +echo -e "Expected behavior:" +echo -e " • Query A (1st): ${GREEN}📦 STORED${NC} - new block cached" +echo -e " • Query B: ${YELLOW}📦 STORED${NC} - new block cached (may evict old blocks)" +echo -e " • Query C: ${RED}📦 STORED${NC} - new block cached (may evict old blocks)" +echo -e " • Query A (2nd): ${GREEN}✅ CACHE HIT${NC} - if A still in cache, or 📦 STORED if evicted" +echo "" +echo -e "With 16 blocks available, all 3 queries should fit without evicting each other." +echo -e "To force evictions, restart with: DYNAMO_NUM_GPU_BLOCKS_OVERRIDE=4" +echo "" + + diff --git a/external/dynamo/monitoring/scripts/kv_event_observer.py b/external/dynamo/monitoring/scripts/kv_event_observer.py new file mode 100755 index 0000000000..147030c5ff --- /dev/null +++ b/external/dynamo/monitoring/scripts/kv_event_observer.py @@ -0,0 +1,511 @@ +#!/usr/bin/env python3 +""" +KV Cache Event Observer for Dynamo vLLM Workers + +Subscribes to vLLM's ZMQ KV event publisher and logs/monitors block-level +events (stored, evicted) in real-time. Also polls Prometheus metrics to +detect cache hits (which don't generate ZMQ events). 
+ +vLLM publishes events in msgpack format via ZMQ multipart messages: + - Part 0: Topic (bytes, usually empty) + - Part 1: Sequence number (8 bytes, big-endian int64) + - Part 2: Payload (msgpack-encoded KVEventBatch) + +KVEventBatch structure (msgpack): + [timestamp, events_list, dp_rank] + +Event types (from ZMQ): + - BlockStored: A new block was committed to prefix cache + - BlockRemoved: A block was evicted from prefix cache + - AllBlocksCleared: Entire cache was cleared + +Metrics polling (for cache hits): + - vllm:prefix_cache_hits_total: Cumulative cache hit tokens + - vllm:prefix_cache_queries_total: Cumulative cache query tokens + +Usage: + # Inside container: + python /workspace/monitoring/scripts/kv_event_observer.py --port 20080 --verbose + + # With cache hit tracking (polls metrics endpoint): + python /workspace/monitoring/scripts/kv_event_observer.py -p 20080 -v --metrics-port 18081 + + # Output to file: + python kv_event_observer.py --port 20080 --verbose --output kv_events.jsonl +""" + +import argparse +import json +import signal +import sys +import time +import threading +import urllib.request +import re +from collections import defaultdict +from dataclasses import dataclass, field +from datetime import datetime, timezone +from typing import Any, Optional, List + +try: + import zmq +except ImportError: + print("ERROR: pyzmq not installed. Run: pip install pyzmq") + sys.exit(1) + +try: + import msgpack +except ImportError: + print("ERROR: msgpack not installed. 
Run: pip install msgpack") + sys.exit(1) + + +def format_hash(block_hash: Any) -> str: + """Format a block hash for display.""" + if isinstance(block_hash, bytes): + return block_hash.hex()[:16] + elif isinstance(block_hash, int): + return f"{block_hash:016x}"[:16] + return str(block_hash)[:16] + + +@dataclass +class KVCacheStats: + """Aggregated statistics for KV cache events.""" + stored_blocks: int = 0 + evicted_blocks: int = 0 + cleared_count: int = 0 + cache_hit_tokens: int = 0 # Tokens served from cache (from metrics) + cache_query_tokens: int = 0 # Total tokens queried (from metrics) + unique_hashes: set = field(default_factory=set) + hash_to_blocks: dict = field(default_factory=lambda: defaultdict(list)) + last_event_time: float = 0.0 + last_seq: int = -1 + + def record_stored(self, block_hashes: List[Any], parent_hash: Any = None): + """Record BlockStored event.""" + self.last_event_time = time.time() + for bh in block_hashes: + h = format_hash(bh) + self.stored_blocks += 1 + self.unique_hashes.add(h) + + def record_removed(self, block_hashes: List[Any]): + """Record BlockRemoved event.""" + self.last_event_time = time.time() + for bh in block_hashes: + h = format_hash(bh) + self.evicted_blocks += 1 + self.unique_hashes.discard(h) + + def record_cleared(self): + """Record AllBlocksCleared event.""" + self.last_event_time = time.time() + self.cleared_count += 1 + self.unique_hashes.clear() + + def record_cache_hit(self, hit_tokens: int, query_tokens: int): + """Record cache hit from metrics delta.""" + self.cache_hit_tokens += hit_tokens + self.cache_query_tokens += query_tokens + + def summary(self) -> dict: + """Return summary statistics.""" + hit_rate = (self.cache_hit_tokens / self.cache_query_tokens * 100) if self.cache_query_tokens > 0 else 0 + return { + "stored_blocks": self.stored_blocks, + "evicted_blocks": self.evicted_blocks, + "net_blocks": self.stored_blocks - self.evicted_blocks, + "cleared_count": self.cleared_count, + 
"unique_hashes_current": len(self.unique_hashes), + "cache_hit_tokens": self.cache_hit_tokens, + "cache_query_tokens": self.cache_query_tokens, + "cache_hit_rate": f"{hit_rate:.1f}%", + "last_seq": self.last_seq, + } + + +class KVEventObserver: + """Observes KV cache events from a vLLM worker via ZMQ. + + Also optionally polls Prometheus metrics to detect cache hits, + which don't generate ZMQ events. + """ + + def __init__( + self, + host: str = "localhost", + port: int = 20080, + verbose: bool = False, + output_file: Optional[str] = None, + metrics_port: Optional[int] = None, + ): + self.host = host + self.port = port + self.verbose = verbose + self.output_file = output_file + self.metrics_port = metrics_port + self.stats = KVCacheStats() + self.running = False + self._output_handle = None + + # Metrics polling state + self._last_hits = 0.0 + self._last_queries = 0.0 + self._metrics_thread = None + + self.context = zmq.Context() + self.socket = self.context.socket(zmq.SUB) + + def _parse_metric(self, metrics_text: str, metric_name: str) -> float: + """Extract a metric value from Prometheus text format.""" + pattern = rf'^{re.escape(metric_name)}\{{[^}}]*\}}\s+([0-9.e+-]+)' + for line in metrics_text.split('\n'): + match = re.match(pattern, line) + if match: + return float(match.group(1)) + return 0.0 + + def _poll_metrics(self): + """Background thread to poll Prometheus metrics for cache hits.""" + metrics_url = f"http://{self.host}:{self.metrics_port}/metrics" + + while self.running: + try: + with urllib.request.urlopen(metrics_url, timeout=2) as resp: + metrics_text = resp.read().decode('utf-8') + + hits = self._parse_metric(metrics_text, 'vllm:prefix_cache_hits_total') + queries = self._parse_metric(metrics_text, 'vllm:prefix_cache_queries_total') + + # Calculate deltas + hit_delta = hits - self._last_hits + query_delta = queries - self._last_queries + + if hit_delta > 0: + # Cache hit detected! 
+ self.stats.record_cache_hit(int(hit_delta), int(query_delta)) + if self.verbose: + hit_rate = (hit_delta / query_delta * 100) if query_delta > 0 else 0 + print(f"✅ [CACHE HIT] tokens={int(hit_delta):4d} queried={int(query_delta):4d} hit_rate={hit_rate:.0f}%") + elif query_delta > 0: + # Queries happened but no hits (cache miss) + self.stats.record_cache_hit(0, int(query_delta)) + + self._last_hits = hits + self._last_queries = queries + + except Exception as e: + if self.verbose: + print(f"[Metrics] Poll error: {e}") + + time.sleep(0.5) # Poll every 500ms + + def connect(self): + """Connect to the vLLM KV event publisher.""" + endpoint = f"tcp://{self.host}:{self.port}" + print(f"[KV Observer] Connecting to {endpoint}...") + self.socket.connect(endpoint) + # Subscribe to all topics (empty string = all) + self.socket.setsockopt_string(zmq.SUBSCRIBE, "") + self.socket.setsockopt(zmq.RCVTIMEO, 1000) + print(f"[KV Observer] ✓ Connected and subscribed") + + if self.output_file: + self._output_handle = open(self.output_file, "a") + print(f"[KV Observer] Writing events to: {self.output_file}") + + if self.metrics_port: + print(f"[KV Observer] Polling metrics at http://{self.host}:{self.metrics_port}/metrics") + # Initialize baseline metrics + try: + metrics_url = f"http://{self.host}:{self.metrics_port}/metrics" + with urllib.request.urlopen(metrics_url, timeout=2) as resp: + metrics_text = resp.read().decode('utf-8') + self._last_hits = self._parse_metric(metrics_text, 'vllm:prefix_cache_hits_total') + self._last_queries = self._parse_metric(metrics_text, 'vllm:prefix_cache_queries_total') + print(f"[KV Observer] ✓ Baseline: hits={self._last_hits:.0f} queries={self._last_queries:.0f}") + except Exception as e: + print(f"[KV Observer] ⚠ Could not get baseline metrics: {e}") + + def parse_multipart(self, parts: List[bytes]) -> Optional[dict]: + """Parse a ZMQ multipart message from vLLM. 
+ + Format: [topic, sequence, payload] + Payload is msgpack-encoded KVEventBatch: [timestamp, events_list, dp_rank] + + Note: The order is [ts, events, dp_rank], NOT [ts, dp_rank, events]! + """ + if len(parts) < 3: + if self.verbose: + print(f"[KV Observer] Warning: Expected 3 parts, got {len(parts)}") + return None + + topic, seq_bytes, payload = parts[0], parts[1], parts[2] + + try: + seq = int.from_bytes(seq_bytes, "big", signed=True) + self.stats.last_seq = seq + except Exception: + seq = -1 + + try: + # Decode msgpack payload + batch = msgpack.unpackb(payload, raw=False, strict_map_key=False) + + # vLLM KVEventBatch format: [timestamp, events_list, dp_rank] + # Note: events is at index 1, dp_rank at index 2! + if isinstance(batch, (list, tuple)) and len(batch) >= 3: + ts = batch[0] + events = batch[1] # Events are at index 1 + dp_rank = batch[2] # dp_rank is at index 2 + elif isinstance(batch, dict): + ts = batch.get("ts", time.time()) + dp_rank = batch.get("data_parallel_rank", 0) + events = batch.get("events", []) + else: + events = [batch] if batch else [] + ts = time.time() + dp_rank = 0 + + # Ensure events is a list + if not isinstance(events, list): + events = [events] if events else [] + + return { + "seq": seq, + "timestamp": ts, + "dp_rank": dp_rank, + "events": events, + "topic": topic.decode("utf-8", errors="replace") if topic else "", + } + except Exception as e: + if self.verbose: + print(f"[KV Observer] Parse error: {e}") + print(f"[KV Observer] Raw payload: {payload[:100]}...") + return None + + def handle_event(self, event_data: dict): + """Handle a parsed event batch.""" + seq = event_data.get("seq", -1) + ts = event_data.get("timestamp", 0) + dp_rank = event_data.get("dp_rank", 0) + events = event_data.get("events", []) + + for event in events: + # Events can be dicts or tuples/lists + # vLLM format (list): + # BlockRemoved: ['BlockRemoved', [hash_list], medium] + # BlockStored: ['BlockStored', [hash_list], parent_hash, token_ids, 
block_size, lora_id, medium] + # AllBlocksCleared: ['AllBlocksCleared'] + if isinstance(event, dict): + event_type = event.get("type", event.get("event_type", "unknown")) + block_hashes = event.get("block_hashes", []) + parent_hash = event.get("parent_block_hash") + medium = event.get("medium", "GPU") + token_ids = event.get("token_ids", []) + block_size = event.get("block_size", 0) + elif isinstance(event, (list, tuple)) and len(event) >= 1: + event_type = str(event[0]) if event else "unknown" + + if event_type == "BlockRemoved" and len(event) >= 2: + # ['BlockRemoved', [hashes], medium] + block_hashes = event[1] if isinstance(event[1], list) else [event[1]] + medium = event[2] if len(event) > 2 else "GPU" + parent_hash = None + token_ids = [] + block_size = 0 + elif event_type == "BlockStored" and len(event) >= 2: + # ['BlockStored', [hashes], parent_hash, token_ids, block_size, lora_id, medium] + block_hashes = event[1] if isinstance(event[1], list) else [event[1]] + parent_hash = event[2] if len(event) > 2 else None + token_ids = event[3] if len(event) > 3 else [] + block_size = event[4] if len(event) > 4 else 0 + medium = event[6] if len(event) > 6 else "GPU" + elif event_type == "AllBlocksCleared": + block_hashes = [] + parent_hash = None + medium = "GPU" + token_ids = [] + block_size = 0 + else: + block_hashes = event[1] if len(event) > 1 and isinstance(event[1], list) else [] + parent_hash = None + medium = event[-1] if len(event) > 2 and isinstance(event[-1], str) else "GPU" + token_ids = [] + block_size = 0 + else: + event_type = str(type(event).__name__) + block_hashes = [] + parent_hash = None + medium = "GPU" + token_ids = [] + block_size = 0 + + # Normalize event type (vLLM uses class names like "BlockStored") + event_type_lower = event_type.lower() + + if "stored" in event_type_lower or "blockstored" in event_type_lower: + self.stats.record_stored(block_hashes, parent_hash) + if self.verbose: + num_tokens = len(token_ids) if token_ids else block_size 
+ for bh in block_hashes: + print(f"📦 [STORED ] seq={seq:6d} hash={format_hash(bh)} tokens={num_tokens:3d} medium={medium}") + elif "removed" in event_type_lower or "blockremoved" in event_type_lower: + self.stats.record_removed(block_hashes) + if self.verbose: + for bh in block_hashes: + print(f"🗑️ [REMOVED ] seq={seq:6d} hash={format_hash(bh)} medium={medium}") + elif "cleared" in event_type_lower or "allblockscleared" in event_type_lower: + self.stats.record_cleared() + if self.verbose: + print(f"🧹 [CLEARED ] seq={seq:6d} All blocks cleared") + else: + if self.verbose: + print(f"❓ [UNKNOWN ] seq={seq:6d} type={event_type} data={event[:3] if isinstance(event, (list, tuple)) else event}") + + # Write to output file + if self._output_handle: + def get_event_type(e): + if isinstance(e, dict): + return str(e.get("type", "unknown")) + elif isinstance(e, (list, tuple)) and len(e) > 0: + return str(e[0]) + else: + return str(e) + + output = { + "_timestamp": datetime.now(timezone.utc).isoformat(), + "seq": seq, + "ts": ts, + "dp_rank": dp_rank, + "events": [{"type": get_event_type(e)} for e in events], + } + self._output_handle.write(json.dumps(output) + "\n") + self._output_handle.flush() + + def run(self, duration: Optional[float] = None): + """Run the observer loop.""" + self.running = True + start_time = time.time() + batches_received = 0 + + # Start metrics polling thread if configured + if self.metrics_port: + self._metrics_thread = threading.Thread( + target=self._poll_metrics, + daemon=True, + name="metrics-poller" + ) + self._metrics_thread.start() + + print(f"[KV Observer] Listening for KV events (msgpack multipart)...") + if self.metrics_port: + print(f"[KV Observer] Cache hits will show as ✅ [CACHE HIT]") + print(f"[KV Observer] Press Ctrl+C to stop") + print("-" * 60) + + try: + while self.running: + if duration and (time.time() - start_time) >= duration: + print(f"\n[KV Observer] Duration limit reached ({duration}s)") + break + + try: + # Receive multipart 
message + parts = self.socket.recv_multipart() + event_data = self.parse_multipart(parts) + + if event_data: + self.handle_event(event_data) + batches_received += 1 + + if batches_received % 20 == 0 and not self.verbose: + summary = self.stats.summary() + print( + f"[{batches_received:5d} batches] " + f"Stored: {summary['stored_blocks']:4d} | " + f"Removed: {summary['evicted_blocks']:4d} | " + f"Net: {summary['net_blocks']:4d} | " + f"Hashes: {summary['unique_hashes_current']} | " + f"Seq: {summary['last_seq']}" + ) + except zmq.Again: + # Timeout, continue loop + continue + + except KeyboardInterrupt: + print("\n[KV Observer] Interrupted") + finally: + self.stop() + + def stop(self): + """Stop and print final statistics.""" + self.running = False + + print("-" * 60) + print("[KV Observer] Final Statistics:") + for key, value in self.stats.summary().items(): + print(f" {key}: {value}") + + if self._output_handle: + self._output_handle.close() + + self.socket.close() + self.context.term() + print("[KV Observer] Stopped") + + +def main(): + parser = argparse.ArgumentParser( + description="Observe KV cache events from vLLM workers", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Monitor worker 0 (ZMQ events only): + python kv_event_observer.py -p 20080 -v + + # Monitor with cache hit detection (polls Prometheus metrics): + python kv_event_observer.py -p 20080 -v -m 18081 + + # Monitor worker 1: + python kv_event_observer.py -p 20081 -v -m 18082 + + # Save events to file: + python kv_event_observer.py -p 20080 -o events.jsonl + + # Run for 60 seconds: + python kv_event_observer.py -p 20080 -d 60 + +Event types: + 📦 STORED - Block committed to prefix cache (ZMQ) + 🗑️ REMOVED - Block evicted from cache (ZMQ) + ✅ CACHE HIT - Tokens served from cache (metrics polling) +""" + ) + parser.add_argument("--host", "-H", default="localhost", help="Worker host (default: localhost)") + parser.add_argument("--port", "-p", type=int, default=20080, 
help="KV event ZMQ port (default: 20080)") + parser.add_argument("--metrics-port", "-m", type=int, help="Prometheus metrics port for cache hit detection (e.g., 18081)") + parser.add_argument("--verbose", "-v", action="store_true", help="Print each event") + parser.add_argument("--output", "-o", help="Output file (JSONL format)") + parser.add_argument("--duration", "-d", type=float, help="Run duration in seconds") + + args = parser.parse_args() + + observer = KVEventObserver( + host=args.host, + port=args.port, + verbose=args.verbose, + output_file=args.output, + metrics_port=args.metrics_port, + ) + + signal.signal(signal.SIGINT, lambda s, f: setattr(observer, 'running', False)) + signal.signal(signal.SIGTERM, lambda s, f: setattr(observer, 'running', False)) + + observer.connect() + observer.run(duration=args.duration) + + +if __name__ == "__main__": + main() diff --git a/external/dynamo/optimized/ARCHITECTURE.md b/external/dynamo/optimized/ARCHITECTURE.md index 72e43e627a..86f8ba2206 100644 --- a/external/dynamo/optimized/ARCHITECTURE.md +++ b/external/dynamo/optimized/ARCHITECTURE.md @@ -357,6 +357,15 @@ export DYNAMO_TP_SIZE=2 | `DYN_SYSTEM_PORT` | `8081` | Dynamo system/metrics port | | `DYNAMO_ROUTER_WAIT_FOR_WORKERS_TIMEOUT_S` | `600` | Worker discovery timeout | +### Backend-Specific Configuration (REQUIRED) + +| Variable | Values | Description | +|----------|--------|-------------| +| `DYNAMO_WORKER_COMPONENT` | `worker` or `backend` | **REQUIRED.** Component name where workers register. SGLang uses `worker` (via `--endpoint workers.worker.generate`). vLLM uses `backend` (hardcoded in `dynamo.vllm`). | + +> **Important**: `DYNAMO_WORKER_COMPONENT` must be set for the router and processor to find +> the backend workers. Without this variable, startup will fail with an error. + > **Note on `DYN_ROUTER_MODE`**: The startup script passes `--router-mode round-robin` to the > default frontend, but this is **irrelevant** in our architecture. 
The frontend's built-in > router routes to `dynamo.backend.generate`, which is our Processor (not a real backend). diff --git a/external/dynamo/optimized/config.yaml b/external/dynamo/optimized/config.yaml index 2c6dbe8f59..ae496265e6 100644 --- a/external/dynamo/optimized/config.yaml +++ b/external/dynamo/optimized/config.yaml @@ -7,6 +7,11 @@ # This file contains all configurable parameters for the WorkloadAwareRouter. # Parameters can be overridden via CLI flags (see PARAMETERS.md for details). # +# REQUIRED ENVIRONMENT VARIABLE: +# DYNAMO_WORKER_COMPONENT - Component name where backend workers register. +# - SGLang: "worker" (workers register at workers.worker.generate) +# - vLLM: "backend" (workers register at workers.backend.generate) +# # CLI Override Examples: # python router.py --config config.yaml --affinity-base 0.5 # python router.py --config config.yaml --override affinity.reuse_weight=0.2 diff --git a/external/dynamo/optimized/processor.py b/external/dynamo/optimized/processor.py index 5c1f4871c8..4416d20cd8 100644 --- a/external/dynamo/optimized/processor.py +++ b/external/dynamo/optimized/processor.py @@ -104,6 +104,7 @@ import argparse import asyncio import logging +import os import time import uuid from collections.abc import AsyncIterator @@ -352,14 +353,23 @@ async def initialize(self): await self.router_pick_client.wait_for_instances() logger.info("Router clients initialized successfully") - # Connect to actual workers at workers.worker.generate + # Connect to actual workers at workers.{component}.generate # Workers are in the "workers" namespace (hidden from frontend discovery) # while this processor is in "dynamo" namespace (frontend discovers us) - worker_component = self.runtime.namespace("workers").component("worker") + # Component name varies by backend (REQUIRED - no default): + # - SGLang: uses "worker" (set via --endpoint workers.worker.generate) + # - vLLM: uses "backend" (hardcoded in dynamo.vllm) + worker_component_name = 
os.environ.get("DYNAMO_WORKER_COMPONENT") + if not worker_component_name: + raise ValueError( + "DYNAMO_WORKER_COMPONENT environment variable is required. " + "Set to 'worker' for SGLang or 'backend' for vLLM." + ) + worker_component = self.runtime.namespace("workers").component(worker_component_name) self.engine_client = await worker_component.endpoint("generate").client() - logger.info("Engine client created, waiting for worker instances...") + logger.info("Engine client created for workers/%s/generate, waiting for worker instances...", worker_component_name) await self.engine_client.wait_for_instances() - logger.info("Processor initialized successfully (routing to workers.worker.generate)") + logger.info("Processor initialized successfully (routing to workers/%s/generate)", worker_component_name) # ---- annotation extraction ---- @staticmethod diff --git a/external/dynamo/optimized/router.py b/external/dynamo/optimized/router.py index e1c02f6a05..e68ed5d9df 100644 --- a/external/dynamo/optimized/router.py +++ b/external/dynamo/optimized/router.py @@ -594,11 +594,19 @@ async def initialize(self): # Initialize Prometheus metrics self._metrics = _init_prometheus_metrics() - # Connect to actual SGLang workers at workers.worker.generate + # Connect to actual workers at workers.{component}.generate # Workers are in the "workers" namespace (hidden from frontend discovery) - # (NOT backend.generate - that's where the Processor registers to intercept frontend) - engine = self.runtime.namespace("workers").component("worker") - logger.info("Getting engine client for workers/worker/generate") + # Component name varies by backend (REQUIRED - no default): + # - SGLang: uses "worker" (set via --endpoint workers.worker.generate) + # - vLLM: uses "backend" (hardcoded in dynamo.vllm) + worker_component = os.environ.get("DYNAMO_WORKER_COMPONENT") + if not worker_component: + raise ValueError( + "DYNAMO_WORKER_COMPONENT environment variable is required. 
" + "Set to 'worker' for SGLang or 'backend' for vLLM." + ) + engine = self.runtime.namespace("workers").component(worker_component) + logger.info("Getting engine client for workers/%s/generate", worker_component) self.engine_client = await engine.endpoint("generate").client() min_workers = int(self.min_workers) diff --git a/external/dynamo/start_dynamo_disagg.sh b/external/dynamo/start_dynamo_disagg.sh index 8477a66a90..e9935afdfb 100755 --- a/external/dynamo/start_dynamo_disagg.sh +++ b/external/dynamo/start_dynamo_disagg.sh @@ -36,7 +36,7 @@ CONTAINER_NAME="dynamo-sglang" PREFILL_GPUS="${DYNAMO_PREFILL_GPUS:-0,1}" DECODE_GPUS="${DYNAMO_DECODE_GPUS:-2,3}" TP_SIZE="${DYNAMO_TP_SIZE:-2}" -HTTP_PORT="${DYNAMO_HTTP_PORT:-8099}" +HTTP_PORT="${DYNAMO_HTTP_PORT:-8000}" MODEL="/workspace/models/Llama-3.3-70B-Instruct" SERVED_MODEL_NAME="${DYNAMO_MODEL_NAME:-llama-3.3-70b}" IMAGE="nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.7.1" diff --git a/external/dynamo/start_dynamo_optimized_thompson_hints.sh b/external/dynamo/start_dynamo_optimized_thompson_hints_sglang.sh similarity index 99% rename from external/dynamo/start_dynamo_optimized_thompson_hints.sh rename to external/dynamo/start_dynamo_optimized_thompson_hints_sglang.sh index d9e546c977..820e04564b 100755 --- a/external/dynamo/start_dynamo_optimized_thompson_hints.sh +++ b/external/dynamo/start_dynamo_optimized_thompson_hints_sglang.sh @@ -343,6 +343,7 @@ docker run -d \ -e PROCESSOR_METRICS_PORT=$PROCESSOR_METRICS_PORT \ -e KV_BLOCK_SIZE=$KV_BLOCK_SIZE \ -e MEM_FRACTION_STATIC=$MEM_FRACTION_STATIC \ + -e DYNAMO_WORKER_COMPONENT=worker \ $IMAGE \ bash -c " set -e diff --git a/external/dynamo/start_dynamo_optimized_thompson_hints_vllm.sh b/external/dynamo/start_dynamo_optimized_thompson_hints_vllm.sh new file mode 100755 index 0000000000..0b489d714c --- /dev/null +++ b/external/dynamo/start_dynamo_optimized_thompson_hints_vllm.sh @@ -0,0 +1,1023 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA 
CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Dynamo vLLM with OPTIMIZED Thompson Sampling Router Architecture +# +# Key difference from SGLang version: +# - Uses vLLM backend instead of SGLang +# - vLLM has native KVBM support for KV event publishing +# - Different CLI flags (--block-size vs --page-size, etc.) +# - Enables radix/prefix caching by default (no --disable-radix-cache) +# +# Architecture: +# Client → Default Dynamo Frontend (tokenization + nvext parsing) +# ↓ PreprocessedRequest with annotations +# Custom Processor (extracts hints, queries router) +# ↓ RouterRequest +# Custom Router (Thompson Sampling + KV overlap) +# ↓ worker_id +# vLLM Backend Worker +# ↓ response tokens +# Processor sends feedback to Router +# +# Components: +# - ETCD (metadata and worker discovery) +# - NATS (message queue for KV events) +# - Default Dynamo Frontend (HTTP API on port 8000) +# - Custom Router (Thompson Sampling + KV overlap) +# - Custom Processor (hint extraction + routing) +# - vLLM Workers (unified mode, multiple workers with TP=2 each) +# +# Prometheus Metrics: +# - Frontend: http://localhost:8000/metrics +# - Backend/Router/Processor: http://localhost:8081/metrics +# +# To stop all components: bash stop_dynamo.sh + +set -euo pipefail + +# Configuration Variables (can be overridden via environment variables) +# See env.example for documentation on each variable 
+CONTAINER_NAME="dynamo-vllm" +WORKER_GPUS="${DYNAMO_GPU_DEVICES:-0,1,2,3,4,5,6,7}" +TP_SIZE="${DYNAMO_TP_SIZE:-2}" +HTTP_PORT="${DYNAMO_HTTP_PORT:-8000}" +# Metrics ports - each component gets its own port to avoid conflicts +# Using 18xxx range to avoid conflicts with common services +# Workers use sequential ports starting at WORKER_METRICS_PORT (18081, 18082, ...) +# Router and Processor are offset to allow for many workers +WORKER_METRICS_PORT="${DYNAMO_WORKER_METRICS_PORT:-18081}" +ROUTER_METRICS_PORT="${DYNAMO_ROUTER_METRICS_PORT:-18090}" +PROCESSOR_METRICS_PORT="${DYNAMO_PROCESSOR_METRICS_PORT:-18091}" +MODEL="/workspace/models/Llama-3.3-70B-Instruct" +SERVED_MODEL_NAME="${DYNAMO_MODEL_NAME:-llama-3.3-70b}" +# vLLM container image - update version as needed +IMAGE="${DYNAMO_VLLM_IMAGE:-nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.7.1}" +SHM_SIZE="${DYNAMO_SHM_SIZE:-16g}" +WORKER_INIT_TIMEOUT_S="${DYNAMO_WORKER_INIT_TIMEOUT_S:-1800}" + +# KV Cache Configuration +# Block size in tokens - must match between vLLM (--block-size) and Frontend (--kv-cache-block-size) +KV_BLOCK_SIZE="${DYNAMO_KV_BLOCK_SIZE:-16}" +# Fraction of GPU memory for KV cache (0.0-1.0). Reduce to test cache pressure/degradation. +# NOTE: 0.85 is safer than 0.9+ to avoid OOM during vLLM warmup with large max_num_seqs +GPU_MEMORY_UTILIZATION="${DYNAMO_GPU_MEMORY_UTILIZATION:-0.85}" +# Maximum concurrent sequences per worker. Lower values use less memory during warmup. +# vLLM default is 1024, but this can cause OOM on memory-constrained setups. +MAX_NUM_SEQS="${DYNAMO_MAX_NUM_SEQS:-256}" +# Override the number of GPU KV cache blocks (for experiments with limited cache). +# Set to a small number (e.g., 8-16) to force cache eviction behavior. +# Leave empty/unset to use automatic calculation based on GPU memory. +NUM_GPU_BLOCKS_OVERRIDE="${DYNAMO_NUM_GPU_BLOCKS_OVERRIDE:-}" + +# Compute container-internal GPU indices (GPUs are renumbered 0,1,2,... 
inside the container) +NUM_GPUS=$(echo "$WORKER_GPUS" | tr ',' '\n' | wc -l) +CONTAINER_GPU_INDICES=$(seq -s, 0 $((NUM_GPUS - 1))) + +# Calculate number of workers based on available GPUs and TP size +NUM_WORKERS=$((NUM_GPUS / TP_SIZE)) + +# vLLM-specific: Enable KVBM event publishing for radix tree observability +# Each worker needs a unique KV event port - configured via DYN_VLLM_KV_EVENT_PORT +# Port allocation: Worker 0 = 20080, Worker 1 = 20081, etc. +# This is set per-worker at startup time below +ENABLE_KV_EVENTS="${DYNAMO_ENABLE_KV_EVENTS:-true}" +KV_EVENT_BASE_PORT="${DYNAMO_KV_EVENT_BASE_PORT:-20080}" + +# Local paths - DYNAMO_MODEL_DIR must be set or script will error +if [ -z "${DYNAMO_MODEL_DIR:-}" ]; then + echo "ERROR: DYNAMO_MODEL_DIR environment variable must be set" + echo "" + echo "Example:" + echo " export DYNAMO_MODEL_DIR=\"/path/to/your/models/Llama-3.3-70B-Instruct\"" + echo "" + echo "Then run this script again." + exit 1 +fi + +# Validate model directory +if [ -d "${DYNAMO_MODEL_DIR}" ]; then + if [ ! -f "${DYNAMO_MODEL_DIR}/config.json" ]; then + echo "ERROR: ${DYNAMO_MODEL_DIR} exists but is not a valid model directory" + echo "" + echo "Missing: config.json" + echo "" + echo "Find it: find ~/.cache/huggingface/hub -name config.json -path '*Llama-3.3-70B*'" + exit 1 + fi + + if ! grep -q '"model_type"' "${DYNAMO_MODEL_DIR}/config.json" 2>/dev/null; then + echo "ERROR: ${DYNAMO_MODEL_DIR}/config.json is missing 'model_type' field" + echo "" + echo "This usually means incomplete/corrupted download. 
Try:" + echo " rm -rf ${DYNAMO_MODEL_DIR}" + echo " hf download meta-llama/Llama-3.3-70B-Instruct --local-dir ${DYNAMO_MODEL_DIR}" + exit 1 + fi +fi +LOCAL_MODEL_DIR="${DYNAMO_MODEL_DIR}" + +# Repository directory - auto-detect from script location +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CUSTOM_DYNAMO_DIR="${SCRIPT_DIR}/optimized" + +echo "=========================================================" +echo "Dynamo vLLM with OPTIMIZED Thompson Sampling Router" +echo "=========================================================" +echo "Model: Llama-3.3-70B-Instruct" +echo "Container: $CONTAINER_NAME" +echo "HTTP Port: $HTTP_PORT (default Dynamo frontend)" +echo "Metrics Ports:" +echo " - Worker: $WORKER_METRICS_PORT (KV cache, internal)" +echo " - Router: $ROUTER_METRICS_PORT (Thompson routing)" +echo " - Processor: $PROCESSOR_METRICS_PORT (KVE metrics)" +echo "" +echo "Architecture Differences (vs SGLang version):" +echo " - vLLM backend (native KVBM support)" +echo " - KV events enabled: $ENABLE_KV_EVENTS" +echo " - Different CLI flags (--block-size, --gpu-memory-utilization)" +echo " - Prefix caching enabled by default" +echo "" +echo "Components:" +echo " - ETCD (metadata and discovery)" +echo " - NATS (message queue for KV events)" +echo " - Default Frontend (HTTP API on port $HTTP_PORT)" +echo " - Custom Router (Thompson Sampling + KV overlap)" +echo " - Custom Processor (hint extraction + routing)" +echo " - vLLM Worker (unified mode)" +echo "" +echo "Backend Workers:" +echo " Workers: $NUM_WORKERS (GPUs: $NUM_GPUS, TP=$TP_SIZE per worker)" +echo " GPUs: $WORKER_GPUS" +echo " Mode: UNIFIED (no prefill/decode disaggregation)" +echo "" +echo "KV Cache Configuration:" +echo " Block Size: $KV_BLOCK_SIZE tokens (--block-size / --kv-cache-block-size)" +echo " GPU Mem Utilization: $GPU_MEMORY_UTILIZATION (--gpu-memory-utilization)" +echo " Max Concurrent Seqs: $MAX_NUM_SEQS (--max-num-seqs, prevents OOM during warmup)" +echo " KV Events: 
$ENABLE_KV_EVENTS (KVBM event publishing)" +if [ "$ENABLE_KV_EVENTS" = "true" ] && [ "$NUM_WORKERS" -gt 1 ]; then + echo " Per-worker ports: $KV_EVENT_BASE_PORT - $((KV_EVENT_BASE_PORT + NUM_WORKERS - 1))" +fi +if [ -n "$NUM_GPU_BLOCKS_OVERRIDE" ]; then + echo " ⚠️ GPU Blocks Override: $NUM_GPU_BLOCKS_OVERRIDE (EXPERIMENT MODE - limited cache!)" +fi +echo "" +echo "=========================================================" + +# Verify custom components exist +if [ ! -f "$CUSTOM_DYNAMO_DIR/router.py" ]; then + echo "✗ ERROR: Custom router.py not found at: $CUSTOM_DYNAMO_DIR/router.py" + exit 1 +fi +if [ ! -f "$CUSTOM_DYNAMO_DIR/processor.py" ]; then + echo "✗ ERROR: Custom processor.py not found at: $CUSTOM_DYNAMO_DIR/processor.py" + exit 1 +fi +echo "✓ Custom components found in: $CUSTOM_DYNAMO_DIR" +echo "" + +# Start ETCD if not running +if docker ps -a --format '{{.Names}}' | grep -q "^etcd-dynamo$"; then + echo "Removing existing ETCD container..." + docker rm -f etcd-dynamo +fi + +echo "Starting ETCD container..." +docker run -d \ + --name etcd-dynamo \ + --network host \ + -e ALLOW_NONE_AUTHENTICATION=yes \ + -e ETCD_LISTEN_CLIENT_URLS=http://0.0.0.0:2379 \ + -e ETCD_ADVERTISE_CLIENT_URLS=http://localhost:2379 \ + bitnamilegacy/etcd:3.6.1 + +# Wait for ETCD to be ready +echo "Waiting for ETCD to be ready..." +for i in {1..30}; do + if curl -s http://localhost:2379/health > /dev/null 2>&1; then + echo "✓ ETCD is ready" + sleep 2 + break + fi + if [ $i -eq 30 ]; then + echo "✗ ERROR: ETCD failed to start within 30 seconds" + docker logs etcd-dynamo + exit 1 + fi + sleep 1 +done + +# Start NATS if not running +if docker ps -a --format '{{.Names}}' | grep -q "^nats-dynamo$"; then + echo "Removing existing NATS container..." + docker rm -f nats-dynamo +fi + +echo "Starting NATS container..." +docker run -d \ + --name nats-dynamo \ + --network host \ + nats:2.11.4 \ + -js + +# Wait for NATS to be ready +echo "Waiting for NATS to be ready..." 
+for i in {1..30}; do + if timeout 2 bash -c 'cat < /dev/null > /dev/tcp/localhost/4222' 2>/dev/null; then + echo "✓ NATS is ready" + break + fi + if [ $i -eq 30 ]; then + echo "✗ ERROR: NATS failed to start within 30 seconds" + docker logs nats-dynamo + exit 1 + fi + sleep 1 +done +echo "" + +# Start monitoring stack (Prometheus + Grafana) if not running +MONITORING_DIR="${SCRIPT_DIR}/monitoring" +if [ -f "$MONITORING_DIR/docker-compose.yml" ]; then + PROMETHEUS_RUNNING=$(docker ps --format '{{.Names}}' | grep -q "^dynamo-prometheus$" && echo "true" || echo "false") + GRAFANA_RUNNING=$(docker ps --format '{{.Names}}' | grep -q "^dynamo-grafana$" && echo "true" || echo "false") + + if [ "$PROMETHEUS_RUNNING" = "false" ] || [ "$GRAFANA_RUNNING" = "false" ]; then + echo "Starting monitoring stack (Prometheus + Grafana)..." + cd "$MONITORING_DIR" + docker compose up -d + cd "$SCRIPT_DIR" + + # Wait for Prometheus to be ready + echo "Waiting for Prometheus to be ready..." + for i in {1..30}; do + if curl -s http://localhost:9090/-/ready > /dev/null 2>&1; then + echo "✓ Prometheus is ready (http://localhost:9090)" + break + fi + if [ $i -eq 30 ]; then + echo "⚠ WARNING: Prometheus may not be fully ready yet" + fi + sleep 1 + done + + # Wait for Grafana to be ready + echo "Waiting for Grafana to be ready..." 
+ for i in {1..30}; do + if curl -s http://localhost:3000/api/health > /dev/null 2>&1; then + echo "✓ Grafana is ready (http://localhost:3000)" + break + fi + if [ $i -eq 30 ]; then + echo "⚠ WARNING: Grafana may not be fully ready yet" + fi + sleep 1 + done + echo "" + else + echo "✓ Monitoring stack already running" + echo " Prometheus: http://localhost:9090" + echo " Grafana: http://localhost:3000" + echo "" + fi +else + echo "⚠ Monitoring docker-compose.yml not found at: $MONITORING_DIR" + echo " Skipping monitoring stack startup" + echo "" +fi + +# Clean up existing Dynamo container if it exists +if docker ps -a --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then + echo "Removing existing Dynamo container: $CONTAINER_NAME" + docker rm -f $CONTAINER_NAME +fi + +# Verify HF_TOKEN is set +if [ -z "${HF_TOKEN:-}" ]; then + echo "" + echo "⚠ HF_TOKEN environment variable is not set." + echo "" + if [ -d "$LOCAL_MODEL_DIR" ]; then + echo "✓ Local model found - proceeding without HF_TOKEN" + HF_TOKEN="dummy" + else + echo "✗ Local model NOT found and no HF_TOKEN to download it" + echo "" + read -p "Please enter your HuggingFace token (or press Enter to skip): " HF_TOKEN + if [ -z "$HF_TOKEN" ]; then + echo "WARNING: Proceeding without HF_TOKEN." + HF_TOKEN="dummy" + else + echo "✓ HuggingFace token received" + fi + fi +else + echo "✓ HuggingFace token is set" +fi +echo "" + +# Verify model exists locally +if [ ! -d "$LOCAL_MODEL_DIR" ]; then + echo "WARNING: Model directory not found at: $LOCAL_MODEL_DIR" + echo "" + echo "To download the model, run:" + echo " hf download meta-llama/Llama-3.3-70B-Instruct --local-dir $LOCAL_MODEL_DIR" + echo "" + read -p "Continue anyway (model will be downloaded from HuggingFace)? [y/N] " -n 1 -r + echo + if [[ ! 
$REPLY =~ ^[Yy]$ ]]; then + exit 1 + fi +fi + +# KV events are configured per-worker via --kv-events-config JSON inside the container +# Each worker gets a unique endpoint port: tcp://*:$KV_EVENT_PORT + +# Start container with optimized Thompson Sampling components +echo "" +echo "Starting Dynamo container with OPTIMIZED Thompson Sampling components (vLLM)..." +docker run -d \ + --name $CONTAINER_NAME \ + --gpus "\"device=${WORKER_GPUS}\"" \ + --network host \ + --ipc=host \ + --shm-size=$SHM_SIZE \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + -v $LOCAL_MODEL_DIR:$MODEL:ro \ + -v $CUSTOM_DYNAMO_DIR:/workspace/custom_dynamo:ro \ + -v ${SCRIPT_DIR}/monitoring/scripts:/workspace/monitoring/scripts:ro \ + -e HF_TOKEN="$HF_TOKEN" \ + -e HUGGING_FACE_HUB_TOKEN="$HF_TOKEN" \ + -e RUST_BACKTRACE=1 \ + -e PYTHONUNBUFFERED=1 \ + -e DYN_HTTP_PORT=$HTTP_PORT \ + -e DYN_ROUTER_MODE=round-robin \ + -e WORKER_METRICS_PORT=$WORKER_METRICS_PORT \ + -e ROUTER_METRICS_PORT=$ROUTER_METRICS_PORT \ + -e PROCESSOR_METRICS_PORT=$PROCESSOR_METRICS_PORT \ + -e KV_BLOCK_SIZE=$KV_BLOCK_SIZE \ + -e GPU_MEMORY_UTILIZATION=$GPU_MEMORY_UTILIZATION \ + -e MAX_NUM_SEQS=$MAX_NUM_SEQS \ + -e ENABLE_KV_EVENTS=$ENABLE_KV_EVENTS \ + -e KV_EVENT_BASE_PORT=$KV_EVENT_BASE_PORT \ + -e DYNAMO_WORKER_COMPONENT=backend \ + $IMAGE \ + bash -c " + set -e + + echo '=========================================================' + echo 'Verifying external infrastructure services...' 
+ echo '========================================================='
+
+ # Verify ETCD is accessible
+ if curl -s http://localhost:2379/health > /dev/null 2>&1; then
+ echo '✓ ETCD accessible at localhost:2379'
+ else
+ echo '✗ ERROR: ETCD not accessible at localhost:2379'
+ exit 1
+ fi
+
+ # Verify NATS is accessible
+ if timeout 2 bash -c '</dev/tcp/localhost/4222' 2>/dev/null; then
+ echo '✓ NATS accessible at localhost:4222'
+ else
+ echo '✗ ERROR: NATS not accessible at localhost:4222'
+ exit 1
+ fi
+
+ echo ''
+
+ # Function to wait for worker initialization via ETCD registration
+ wait_for_worker() {
+ local worker_type=\$1
+ local pid=\$2
+ # Use WORKER_INIT_TIMEOUT_S (defaults to 1800s / 30 min)
+ local max_wait=$WORKER_INIT_TIMEOUT_S
+ local elapsed=0
+ local poll_interval=5
+
+ echo \"Waiting for \$worker_type worker (PID \$pid) to initialize...\"
+ echo \" Detection: ETCD worker registration\"
+ echo \" Timeout: \${max_wait}s\"
+
+ while [ \$elapsed -lt \$max_wait ]; do
+ if ! kill -0 \$pid 2>/dev/null; then
+ echo \"ERROR: \$worker_type worker process died!\"
+ return 1
+ fi
+
+ local etcd_response=\$(curl -s --max-time 2 http://localhost:2379/v3/kv/range \
+ -X POST \
+ -H \"Content-Type: application/json\" \
+ -d '{\"key\":\"AA==\",\"range_end\":\"AA==\",\"keys_only\":true}' 2>&1)
+
+ if [ \$((elapsed % 30)) -eq 0 ] && [ \$elapsed -gt 0 ]; then
+ echo \" [DEBUG] ETCD count: \$(echo \"\$etcd_response\" | grep -o '\"count\":\"[^\"]*\"')\"
+ fi
+
+ if echo \"\$etcd_response\" | grep -q '\"count\"' && \
+ ! echo \"\$etcd_response\" | grep -q '\"count\":\"0\"'; then
+ echo \"✓ \$worker_type worker is ready (registered with ETCD at \${elapsed}s)\"
+ return 0
+ fi
+
+ sleep \$poll_interval
+ elapsed=\$((elapsed + poll_interval))
+ if [ \$((elapsed % 30)) -eq 0 ]; then
+ echo \" ... \${elapsed}s / \${max_wait}s (waiting for ETCD registration)\"
+ fi
+ done
+
+ echo \"ERROR: \$worker_type worker failed to register with ETCD within \${max_wait}s\"
+ return 1
+ }
+
+ # Function to wait for ALL workers to register with ETCD
+ # Counts workers registered at workers.backend.generate endpoint
+ wait_for_all_workers() {
+ local expected_count=\$1
+ local max_wait=$WORKER_INIT_TIMEOUT_S
+ local elapsed=0
+ local poll_interval=10
+
+ echo \"\"
+ echo \"Waiting for ALL \$expected_count vLLM workers to register with ETCD...\"
+ echo \" Detection: Count workers at workers.backend.generate endpoint\"
+ echo \" Timeout: \${max_wait}s\"
+ echo \"\"
+
+ while [ \$elapsed -lt \$max_wait ]; do
+ # Check all worker PIDs are still alive
+ for wpid in \"\${WORKER_PIDS[@]}\"; do
+ if ! kill -0 \$wpid 2>/dev/null; then
+ echo \"ERROR: Worker process \$wpid died during initialization!\"
+ return 1
+ fi
+ done
+
+ # Count worker registrations in ETCD
+ # Workers register with keys like: v1/instances/workers/backend/generate/
+ local worker_count=\$(curl -s --max-time 2 http://localhost:2379/v3/kv/range \
+ -X POST \
+ -H \"Content-Type: application/json\" \
+ -d '{
+ \"key\": \"'\"djEvaW5zdGFuY2VzL3dvcmtlcnMvYmFja2VuZC9nZW5lcmF0ZS8=\"'\",
+ \"range_end\": \"'\"djEvaW5zdGFuY2VzL3dvcmtlcnMvYmFja2VuZC9nZW5lcmF0ZTA=\"'\",
+ \"count_only\": true
+ }' 2>/dev/null | grep -o '\"count\":\"[^\"]*\"' | grep -o '[0-9]*' || echo \"0\")
+
+ if [ \"\$worker_count\" -ge \"\$expected_count\" ]; then
+ echo \"✓ All \$expected_count vLLM workers registered with ETCD (took \${elapsed}s)\"
+ return 0
+ fi
+
+ if [ \$((elapsed % 30)) -eq 0 ]; then
+ echo \" [\${elapsed}s] Workers registered: \$worker_count / \$expected_count\"
+ fi
+
+ sleep \$poll_interval
+ elapsed=\$((elapsed + poll_interval))
+ done
+
+ echo \"ERROR: Only \$worker_count / \$expected_count workers registered within \${max_wait}s\"
+ echo \" Some workers may still be initializing torch.compile (can take 10+ min first time)\"
+ return 1
+ }
+
+ # =========================================================================
+ # STARTUP ORDER WITH MODEL NAME ISOLATION
+ # =========================================================================
+ # Using different model names to force ALL traffic through the processor.
+ # Workers register with internal model name (${SERVED_MODEL_NAME}-internal),
+ # while processor registers with public model name (${SERVED_MODEL_NAME}).
+ # Frontend only routes to backends matching the requested model name.
+ #
+ # Order:
+ # 1. Workers (model=${SERVED_MODEL_NAME}-internal, not discovered for public model)
+ # 2. Router (needs workers to be present)
+ # 3. Processor (model=${SERVED_MODEL_NAME}, frontend discovers this)
+ # 4. Frontend (routes ${SERVED_MODEL_NAME} requests to processor ONLY)
+ # =========================================================================
+
+ echo '========================================================='
+ echo 'Step 1: Starting $NUM_WORKERS vLLM Unified Worker(s) (Host GPUs $WORKER_GPUS -> Container GPUs $CONTAINER_GPU_INDICES)...'
+ echo '=========================================================' + # Workers register at workers.worker.generate (in 'workers' namespace) + # They start first so the router can discover them during initialization + # DYN_SYSTEM_PORT sets the Prometheus metrics port for this component + + # KV events configuration + # NOTE: KV events are configured via --kv-events-config JSON, not --enable-kv-events flag + # Each worker gets a unique endpoint port via the config + # --enable-prefix-caching is a separate vLLM feature (always enabled by default in unified mode) + if [ \"\$ENABLE_KV_EVENTS\" = \"true\" ]; then + echo \"KV Events: ENABLED (per-worker ports starting at \$KV_EVENT_BASE_PORT)\" + else + echo \"KV Events: DISABLED (set DYNAMO_ENABLE_KV_EVENTS=true to enable)\" + fi + + # Build optional --num-gpu-blocks-override flag (for cache size experiments) + GPU_BLOCKS_OVERRIDE_OPT=\"\" + if [ -n \"$NUM_GPU_BLOCKS_OVERRIDE\" ]; then + GPU_BLOCKS_OVERRIDE_OPT=\"--num-gpu-blocks-override $NUM_GPU_BLOCKS_OVERRIDE\" + echo \"GPU Blocks Override: $NUM_GPU_BLOCKS_OVERRIDE (experiment mode - limited cache!)\" + fi + + # Start multiple workers, each using TP_SIZE GPUs + WORKER_PIDS=() + for i in \$(seq 0 \$(($NUM_WORKERS - 1))); do + # Calculate GPU range for this worker (e.g., worker 0: 0,1; worker 1: 2,3; etc.) + START_GPU=\$((i * $TP_SIZE)) + END_GPU=\$(((i + 1) * $TP_SIZE - 1)) + WORKER_GPU_LIST=\$(seq -s, \$START_GPU \$END_GPU) + + # Calculate port offsets for this worker to avoid ZMQ port conflicts + # + # 1. 
NIXL Side Channel Ports (for KV transfer handshake) + # Each worker's NIXL connector uses TP_SIZE consecutive ports + # Port spacing = TP_SIZE (minimum needed to avoid overlap) + # Examples: + # TP=1, 8 GPUs → 8 workers: 5557, 5558, 5559, 5560, 5561, 5562, 5563, 5564 + # TP=2, 8 GPUs → 4 workers: 5557-5558, 5559-5560, 5561-5562, 5563-5564 + # TP=4, 8 GPUs → 2 workers: 5557-5560, 5561-5564 + # TP=8, 8 GPUs → 1 worker: 5557-5564 + NIXL_BASE_PORT=\$((5557 + i * $TP_SIZE)) + + # 2. KV Event Publisher Port (for publishing KV cache events to subscriber) + # Each worker needs a unique port for its ZMQ publisher + # Set via DYN_VLLM_KV_EVENT_PORT environment variable + # Default base: 20080, Worker 0: 20080, Worker 1: 20081, etc. + KV_EVENT_PORT=\$(($KV_EVENT_BASE_PORT + i)) + + echo \"Starting vLLM Worker \$i: GPUs \$WORKER_GPU_LIST (internal model name)\" + echo \" KV Block Size: $KV_BLOCK_SIZE tokens, GPU Mem Util: $GPU_MEMORY_UTILIZATION, Max Seqs: $MAX_NUM_SEQS\" + echo \" NIXL Port Range: \$NIXL_BASE_PORT - \$((NIXL_BASE_PORT + $TP_SIZE - 1)) (TP=$TP_SIZE)\" + echo \" KV Event Port: \$KV_EVENT_PORT (KV Events: $ENABLE_KV_EVENTS)\" + # NOTE: dynamo.vllm does NOT accept --host/--port/--endpoint like dynamo.sglang + # Endpoint is set via DYN_ENDPOINT env var, namespace via DYN_NAMESPACE + # VLLM_NIXL_SIDE_CHANNEL_PORT sets the base port for NIXL handshake listener + # DYN_VLLM_KV_EVENT_PORT sets the port for KV event publishing (unique per worker) + # KV events are configured via --kv-events-config JSON with unique endpoint per worker + + # Build KV events config JSON for this worker (unique endpoint per worker) + KV_EVENTS_JSON=\"{\\\"enable_kv_cache_events\\\":true,\\\"publisher\\\":\\\"zmq\\\",\\\"endpoint\\\":\\\"tcp://*:\$KV_EVENT_PORT\\\"}\" + + if [ \"\$ENABLE_KV_EVENTS\" = \"true\" ]; then + CUDA_VISIBLE_DEVICES=\$WORKER_GPU_LIST \ + DYN_SYSTEM_PORT=\$((WORKER_METRICS_PORT + i)) \ + DYN_NAMESPACE=workers \ + DYN_ENDPOINT=workers.worker.generate \ + 
VLLM_NIXL_SIDE_CHANNEL_PORT=\$NIXL_BASE_PORT \ + DYN_VLLM_KV_EVENT_PORT=\$KV_EVENT_PORT \ + python3 -m dynamo.vllm \ + --model $MODEL \ + --served-model-name ${SERVED_MODEL_NAME}-internal \ + --tensor-parallel-size $TP_SIZE \ + --trust-remote-code \ + --block-size $KV_BLOCK_SIZE \ + --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \ + --max-num-seqs $MAX_NUM_SEQS \ + \$GPU_BLOCKS_OVERRIDE_OPT \ + --kv-events-config \"\$KV_EVENTS_JSON\" & + else + CUDA_VISIBLE_DEVICES=\$WORKER_GPU_LIST \ + DYN_SYSTEM_PORT=\$((WORKER_METRICS_PORT + i)) \ + DYN_NAMESPACE=workers \ + DYN_ENDPOINT=workers.worker.generate \ + VLLM_NIXL_SIDE_CHANNEL_PORT=\$NIXL_BASE_PORT \ + DYN_VLLM_KV_EVENT_PORT=\$KV_EVENT_PORT \ + python3 -m dynamo.vllm \ + --model $MODEL \ + --served-model-name ${SERVED_MODEL_NAME}-internal \ + --tensor-parallel-size $TP_SIZE \ + --trust-remote-code \ + --block-size $KV_BLOCK_SIZE \ + --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \ + --max-num-seqs $MAX_NUM_SEQS \ + \$GPU_BLOCKS_OVERRIDE_OPT & + fi + WORKER_PIDS+=(\$!) + echo \" Worker \$i PID: \${WORKER_PIDS[\$i]}\" + done + echo \"\" + echo \"Total workers started: \${#WORKER_PIDS[@]}\" + echo \"Worker PIDs: \${WORKER_PIDS[*]}\" + echo \"Registered at: workers.worker.generate (model: ${SERVED_MODEL_NAME}-internal)\" + echo \"NOTE: Workers use internal model name so frontend only discovers processor\" + echo \"\" + + # Wait for first worker to initialize (checks ETCD registration) + wait_for_worker \"vLLM Unified\" \${WORKER_PIDS[0]} || exit 1 + + # Wait for ALL workers to register with ETCD + # vLLM workers can take a long time to initialize due to torch.compile + if [ \${#WORKER_PIDS[@]} -gt 1 ]; then + wait_for_all_workers \${#WORKER_PIDS[@]} || { + echo \"WARNING: Not all workers initialized. 
Continuing with available workers.\" + echo \" Dashboard metrics may be incomplete.\" + } + fi + + echo '' + echo '=========================================================' + echo 'Step 2: Starting Custom Router (Thompson Sampling + Prometheus)...' + echo '=========================================================' + # Router uses config.yaml for all parameters + # It needs workers to be present (started in Step 1) + # DYN_SYSTEM_PORT sets the Prometheus metrics port for this component + DYN_SYSTEM_PORT=\$ROUTER_METRICS_PORT \ + python3 /workspace/custom_dynamo/router.py \ + --config /workspace/custom_dynamo/config.yaml & + ROUTER_PID=\$! + echo \"Router PID: \$ROUTER_PID\" + echo \"Metrics at: http://localhost:\$ROUTER_METRICS_PORT/metrics\" + sleep 15 + echo \"\" + + echo '' + echo '=========================================================' + echo 'Step 3: Starting Custom Processor (Static Mode)...' + echo '=========================================================' + # STATIC MODE: Processor uses @dynamo_worker(static=True) so it registers + # at dynamo.backend.generate WITHOUT an instance ID. This is required for + # --static-endpoint on the frontend to find it. + # DYN_SYSTEM_PORT sets the Prometheus metrics port for this component + DYN_SYSTEM_PORT=\$PROCESSOR_METRICS_PORT \ + python3 /workspace/custom_dynamo/processor.py \ + --enable-router \ + --model-path $MODEL \ + --model-name $SERVED_MODEL_NAME & + PROCESSOR_PID=\$! + echo \"Processor PID: \$PROCESSOR_PID\" + echo \"Model: $SERVED_MODEL_NAME (from $MODEL)\" + echo \"Registered at: dynamo.backend.generate (namespace=dynamo)\" + echo \"Forwards to: workers.worker.generate (actual vLLM workers)\" + echo \"Metrics at: http://localhost:\$PROCESSOR_METRICS_PORT/metrics\" + sleep 15 + echo \"\" + + echo '' + echo '=========================================================' + echo 'Step 4: Starting Default Dynamo Frontend (Namespace-Scoped Discovery)...' 
+ echo '=========================================================' + # NAMESPACE-SCOPED DISCOVERY: Frontend discovers backends via ETCD ModelWatcher, + # but only from the 'dynamo' namespace. Workers are in the 'workers' namespace, + # so the frontend will ONLY discover the processor (in 'dynamo' namespace). + # This ensures ALL requests go through the Thompson Sampling router. + echo \"Frontend KV Block Size: $KV_BLOCK_SIZE tokens (must match worker --block-size)\" + python3 -m dynamo.frontend \ + --http-port $HTTP_PORT \ + --model-name $SERVED_MODEL_NAME \ + --model-path $MODEL \ + --kv-cache-block-size $KV_BLOCK_SIZE \ + --namespace dynamo & + FRONTEND_PID=\$! + echo \"Frontend PID: \$FRONTEND_PID\" + echo \"Discovery: ETCD ModelWatcher (namespace=dynamo, discovers processor ONLY)\" + sleep 15 + echo \"\" + + echo '' + echo '=========================================================' + echo '✓ All components started successfully!' + echo '=========================================================' + echo \"Infrastructure Services (External):\" + echo \" ETCD: localhost:2379\" + echo \" NATS: localhost:4222\" + echo \"\" + echo \"Dynamo Components (This Container):\" + echo \" vLLM Unified Workers: \${#WORKER_PIDS[@]} workers (GPUs $WORKER_GPUS, TP=$TP_SIZE each)\" + for i in \$(seq 0 \$((\${#WORKER_PIDS[@]} - 1))); do + START_GPU=\$((i * $TP_SIZE)) + END_GPU=\$(((i + 1) * $TP_SIZE - 1)) + echo \" Worker \$i: PID \${WORKER_PIDS[\$i]}, GPUs \$START_GPU-\$END_GPU\" + done + echo \" → Registered at: workers.worker.generate (hidden from frontend)\" + echo \" Router: PID \$ROUTER_PID (Thompson Sampling + Prometheus)\" + echo \" → Registered at: dynamo.router.{find_worker,feedback}\" + echo \" → Metrics: http://localhost:\$ROUTER_METRICS_PORT/metrics\" + echo \" Processor: PID \$PROCESSOR_PID (NVExt annotation extraction)\" + echo \" → Registered at: dynamo.backend.generate (STATIC mode)\" + echo \" → Metrics: http://localhost:\$PROCESSOR_METRICS_PORT/metrics\" + echo \" 
Frontend: PID \$FRONTEND_PID (Default Dynamo HTTP API on port $HTTP_PORT)\" + echo \" → Discovery: ETCD ModelWatcher\" + echo \" → Metrics: http://localhost:$HTTP_PORT/metrics\" + echo '' + echo 'Request Flow (Dynamic Discovery - Thompson Sampling when routed to processor):' + echo ' Client → Default Frontend API (port $HTTP_PORT)' + echo ' ↓ (tokenization + nvext parsing)' + echo ' Frontend routes via ETCD ModelWatcher (processor OR workers)' + echo ' ↓' + echo ' IF routed to Processor (dynamo.backend.generate):' + echo ' ↓ (extract hints from annotations)' + echo ' ↓ (query Thompson Sampling router)' + echo ' Custom Router → worker_id' + echo ' ↓ (KV overlap + workload-aware selection)' + echo ' Processor routes to → workers.worker.generate (with worker_id)' + echo ' ↓' + echo ' vLLM Unified Worker (workers.worker.generate)' + echo ' ↓' + echo ' Response + Feedback to Router' + echo '' + echo 'Prometheus Metrics Endpoints:' + echo ' - Frontend: http://localhost:$HTTP_PORT/metrics (latency, throughput)' + echo ' - Workers: http://localhost:\$WORKER_METRICS_PORT/metrics - \$((WORKER_METRICS_PORT + \${#WORKER_PIDS[@]} - 1))/metrics (KV cache)' + echo ' - Router: http://localhost:\$ROUTER_METRICS_PORT/metrics (thompson_router_*)' + echo ' - Processor: http://localhost:\$PROCESSOR_METRICS_PORT/metrics (thompson_* KVE)' + echo '=========================================================' + + # Monitor all processes + while true; do + if ! kill -0 \$FRONTEND_PID 2>/dev/null; then + echo \"ERROR: Frontend died!\" + exit 1 + fi + if ! kill -0 \$PROCESSOR_PID 2>/dev/null; then + echo \"ERROR: Processor died!\" + exit 1 + fi + if ! kill -0 \$ROUTER_PID 2>/dev/null; then + echo \"ERROR: Router died!\" + exit 1 + fi + for i in \$(seq 0 \$((\${#WORKER_PIDS[@]} - 1))); do + if ! 
kill -0 \${WORKER_PIDS[\$i]} 2>/dev/null; then + echo \"ERROR: Worker \$i (PID \${WORKER_PIDS[\$i]}) died!\" + exit 1 + fi + done + sleep 10 + done + " + +# Wait for container to start +echo "" +echo "Waiting for container to start..." +sleep 15 + +# Check if container started successfully +if docker ps --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then + echo "" + echo "=========================================================" + echo "✓ Dynamo with OPTIMIZED Thompson Sampling Router Started! (vLLM)" + echo "=========================================================" + echo "" + echo "Architecture (Model Name Isolation - Thompson Sampling):" + echo "" + echo " Model Name Isolation Mode:" + echo " - Workers register with internal model name (${SERVED_MODEL_NAME}-internal)" + echo " - Processor registers with public model name (${SERVED_MODEL_NAME})" + echo " - Frontend routes ${SERVED_MODEL_NAME} requests to processor ONLY" + echo " - ALL requests go through Thompson Sampling router" + echo "" + echo " Startup Order:" + echo " 1. Workers → model=${SERVED_MODEL_NAME}-internal (not matched by frontend)" + echo " 2. Router → dynamo.router.{find_worker,feedback}" + echo " 3. Processor → model=${SERVED_MODEL_NAME} (matched by frontend)" + echo " 4. 
Frontend → routes to processor for ${SERVED_MODEL_NAME} requests" + echo "" + echo " Request Flow (ALL requests go through processor):" + echo " Client Request (with nvext.annotations)" + echo " ↓" + echo " Default Dynamo Frontend (port $HTTP_PORT)" + echo " ↓ ETCD ModelWatcher (namespace=dynamo) routes to processor" + echo " Custom Processor (dynamo.backend.generate)" + echo " ↓ extracts: prefix_id, total_requests, osl, iat" + echo " ↓ queries Thompson Sampling router" + echo " Custom Router → worker_id" + echo " ↓ KV overlap + workload-aware selection" + echo " Processor forwards to workers.worker.generate" + echo " ↓" + echo " vLLM Unified Workers ($NUM_WORKERS x TP=$TP_SIZE = $NUM_GPUS GPUs total)" + echo " ↓" + echo " Response + Feedback Loop" + echo "" + echo "Infrastructure Services (Managed):" + echo " ETCD: etcd-dynamo container, localhost:2379" + echo " NATS: nats-dynamo container, localhost:4222" + echo "" + echo "Prometheus Metrics Endpoints:" + echo " Frontend: http://localhost:$HTTP_PORT/metrics (latency, throughput)" + echo " Workers: http://localhost:$WORKER_METRICS_PORT/metrics - $((WORKER_METRICS_PORT + NUM_WORKERS - 1))/metrics (KV cache)" + echo " Router: http://localhost:$ROUTER_METRICS_PORT/metrics (routing)" + echo " Processor: http://localhost:$PROCESSOR_METRICS_PORT/metrics (KVE)" + echo "" + echo "Dynamo Components:" + echo " Frontend: HTTP API on port $HTTP_PORT" + echo " vLLM Unified Workers: $NUM_WORKERS workers (TP=$TP_SIZE each)" + echo "" + echo "KV Cache Settings:" + echo " Block Size: $KV_BLOCK_SIZE tokens (DYNAMO_KV_BLOCK_SIZE)" + echo " GPU Mem Utilization: $GPU_MEMORY_UTILIZATION (DYNAMO_GPU_MEMORY_UTILIZATION)" + echo " Max Concurrent Seqs: $MAX_NUM_SEQS (DYNAMO_MAX_NUM_SEQS)" + echo " KV Events: $ENABLE_KV_EVENTS (DYNAMO_ENABLE_KV_EVENTS)" + echo "" + echo "API Endpoint: http://localhost:$HTTP_PORT/v1/chat/completions" + echo "Health Check: http://localhost:$HTTP_PORT/health" + echo "" + echo "NVExt Annotations (in request 
body):" + echo " \"nvext\": {" + echo " \"annotations\": [" + echo " \"prefix_id:\"," + echo " \"total_requests:\"," + echo " \"osl:LOW|MEDIUM|HIGH\"," + echo " \"iat:LOW|MEDIUM|HIGH\"" + echo " ]" + echo " }" + echo "" + echo "Monitoring Dashboards:" + echo " Grafana: http://localhost:3000 (no login required)" + echo " Prometheus: http://localhost:9090" + echo "" + echo "Useful Commands:" + echo " Interactive shell: docker exec -it $CONTAINER_NAME bash" + echo " View Dynamo logs: docker logs -f $CONTAINER_NAME" + echo " View ETCD logs: docker logs -f etcd-dynamo" + echo " View NATS logs: docker logs -f nats-dynamo" + echo " GPU usage: watch -n 2 nvidia-smi" + echo " Stop all: bash stop_dynamo.sh" + echo " Stop all + metrics: bash stop_dynamo.sh --kill-metrics" + echo "" + echo "Query Metrics (vLLM uses 'vllm:' prefix):" + echo " curl http://localhost:$HTTP_PORT/metrics | grep dynamo_frontend" + echo " curl http://localhost:$WORKER_METRICS_PORT/metrics | grep vllm:" + echo " curl http://localhost:$ROUTER_METRICS_PORT/metrics | grep thompson_router" + echo " curl http://localhost:$PROCESSOR_METRICS_PORT/metrics | grep thompson_kve" + echo "" + echo "=========================================================" + echo "Test Request (with nvext annotations):" + echo "=========================================================" + echo "" + echo "# Basic test (no hints)" + echo "curl http://localhost:$HTTP_PORT/v1/chat/completions \\" + echo " -H 'Content-Type: application/json' \\" + echo " -d '{" + echo " \"model\": \"$SERVED_MODEL_NAME\"," + echo " \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}]," + echo " \"max_tokens\": 50" + echo " }'" + echo "" + echo "# Test with nvext annotations (routing hints)" + echo "curl http://localhost:$HTTP_PORT/v1/chat/completions \\" + echo " -H 'Content-Type: application/json' \\" + echo " -d '{" + echo " \"model\": \"$SERVED_MODEL_NAME\"," + echo " \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}]," + echo " 
\"max_tokens\": 50," + echo " \"nvext\": {" + echo " \"annotations\": [" + echo " \"prefix_id:test-session-001\"," + echo " \"total_requests:5\"," + echo " \"osl:MEDIUM\"," + echo " \"iat:LOW\"" + echo " ]" + echo " }" + echo " }'" + echo "" + echo "# Streaming test with hints" + echo "curl http://localhost:$HTTP_PORT/v1/chat/completions \\" + echo " -H 'Content-Type: application/json' \\" + echo " -d '{" + echo " \"model\": \"$SERVED_MODEL_NAME\"," + echo " \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}]," + echo " \"max_tokens\": 50," + echo " \"stream\": true," + echo " \"nvext\": {" + echo " \"annotations\": [\"prefix_id:stream-test\", \"total_requests:1\"]" + echo " }" + echo " }'" + echo "" + echo "=========================================================" + echo "" + echo "Waiting for vLLM to initialize (this may take 5-10 minutes for a 70B model)..." + echo "Monitoring logs (Ctrl+C to exit, container continues)..." + echo "" + + # Wait for server to be ready + echo "Checking for API availability (timeout=${WORKER_INIT_TIMEOUT_S}s)..." + max_attempts=$WORKER_INIT_TIMEOUT_S + attempt=0 + + while [ $attempt -lt $max_attempts ]; do + # Use || true to prevent curl connection failures from exiting due to set -e + # curl returns "000" for connection refused, so we just need to prevent the exit + health_response=$(curl -s --max-time 5 -o /dev/null -w "%{http_code}" http://localhost:$HTTP_PORT/health 2>/dev/null) || true + if [ "$health_response" = "200" ]; then + echo "✓ Dynamo API is ready! (health check passed)" + break + fi + attempt=$((attempt + 1)) + if [ $((attempt % 15)) -eq 0 ]; then + echo " ... still waiting ($attempt/$max_attempts) - health response: $health_response" + fi + sleep 1 + done + + if [ $attempt -ge $max_attempts ]; then + echo "" + echo "⚠ Timeout waiting for API. 
Check logs with: docker logs $CONTAINER_NAME" + echo "" + else + echo "" + echo "Quick test (polling every 15s for up to 5 minutes):" + echo "" + + quick_test_max_attempts=20 # 20 * 15s = 5 minutes + quick_test_attempt=0 + quick_test_success=false + + while [ $quick_test_attempt -lt $quick_test_max_attempts ]; do + quick_test_attempt=$((quick_test_attempt + 1)) + echo " Attempt $quick_test_attempt/$quick_test_max_attempts..." + + quick_test_response=$(curl -s --max-time 60 http://localhost:$HTTP_PORT/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "'$SERVED_MODEL_NAME'", + "messages": [{"role": "user", "content": "Say hello"}], + "max_tokens": 20 + }' 2>&1) || true + + # Check if response is empty/null + if [ -z "$quick_test_response" ]; then + echo " Empty response, retrying in 15s..." + sleep 15 + continue + fi + + # Check if response contains an error + error_message=$(echo "$quick_test_response" | jq -r '.error.message // .error // empty' 2>/dev/null) + if [ -n "$error_message" ]; then + echo "" + echo "=========================================================" + echo "✗ Quick test failed with error:" + echo " $error_message" + echo "=========================================================" + echo "" + echo "Full response:" + echo "$quick_test_response" | jq . 2>/dev/null || echo "$quick_test_response" + echo "" + echo "Check logs with: docker logs $CONTAINER_NAME" + exit 1 + fi + + # Check if response has valid choices (success) + choices_content=$(echo "$quick_test_response" | jq -r '.choices[0].message.content // empty' 2>/dev/null) + if [ -n "$choices_content" ]; then + echo "" + echo "=========================================================" + echo "✓ Quick test successful!" 
+ echo "=========================================================" + echo "" + echo "$quick_test_response" | jq '.choices[0].message.content, .usage' + echo "" + echo "=========================================================" + echo "Container is running. View logs with:" + echo " docker logs -f $CONTAINER_NAME" + echo "=========================================================" + quick_test_success=true + break + fi + + # Response exists but no choices - might still be loading + echo " Response received but no valid choices, retrying in 15s..." + echo " Response: $(echo "$quick_test_response" | head -c 200)..." + sleep 15 + done + + if [ "$quick_test_success" = false ]; then + echo "" + echo "=========================================================" + echo "⚠ Quick test timed out after 5 minutes" + echo "=========================================================" + echo "" + echo "Container is running but may not be fully ready." + echo "Try manually: curl http://localhost:$HTTP_PORT/v1/chat/completions ..." + echo "Check logs with: docker logs $CONTAINER_NAME" + fi + fi +else + echo "" + echo "=========================================================" + echo "✗ Container failed to start!" 
+ echo "=========================================================" + echo "" + echo "Check logs with: docker logs $CONTAINER_NAME" + exit 1 +fi + diff --git a/external/dynamo/start_dynamo_unified.sh b/external/dynamo/start_dynamo_unified.sh index 81c47410c6..5ed3e34bd5 100755 --- a/external/dynamo/start_dynamo_unified.sh +++ b/external/dynamo/start_dynamo_unified.sh @@ -34,7 +34,7 @@ CONTAINER_NAME="dynamo-sglang" WORKER_GPUS="${DYNAMO_GPU_DEVICES:-0,1,2,3,4,5,6,7}" TP_SIZE="${DYNAMO_TP_SIZE:-2}" -HTTP_PORT="${DYNAMO_HTTP_PORT:-8099}" +HTTP_PORT="${DYNAMO_HTTP_PORT:-8000}" MODEL="/workspace/models/Llama-3.3-70B-Instruct" SERVED_MODEL_NAME="${DYNAMO_MODEL_NAME:-llama-3.3-70b}" IMAGE="nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.7.1" diff --git a/external/dynamo/start_dynamo_unified_thompson_hints.sh b/external/dynamo/start_dynamo_unified_thompson_hints.sh index 86977029c2..3a804b892d 100755 --- a/external/dynamo/start_dynamo_unified_thompson_hints.sh +++ b/external/dynamo/start_dynamo_unified_thompson_hints.sh @@ -41,7 +41,7 @@ CONTAINER_NAME="dynamo-sglang" WORKER_GPUS="${DYNAMO_GPU_DEVICES:-0,1,2,3}" TP_SIZE="${DYNAMO_TP_SIZE:-4}" -HTTP_PORT="${DYNAMO_HTTP_PORT:-8099}" +HTTP_PORT="${DYNAMO_HTTP_PORT:-8000}" MODEL="/workspace/models/Llama-3.3-70B-Instruct" SERVED_MODEL_NAME="${DYNAMO_MODEL_NAME:-llama-3.3-70b}" IMAGE="nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.7.1" diff --git a/external/dynamo/stop_dynamo.sh b/external/dynamo/stop_dynamo.sh index b0c3f976f8..5afdc83edc 100755 --- a/external/dynamo/stop_dynamo.sh +++ b/external/dynamo/stop_dynamo.sh @@ -14,9 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# Dynamo SGLang Shutdown Script -# Stops all components: Dynamo worker container, ETCD, and NATS +# Dynamo Shutdown Script +# Stops all components: Dynamo worker container (SGLang or vLLM), ETCD, and NATS # Works for: UNIFIED, THOMPSON SAMPLING, and DISAGGREGATED modes +# Supports both SGLang and vLLM backends # # Usage: # bash stop_dynamo.sh # Stop Dynamo, ETCD, NATS only @@ -50,26 +51,36 @@ for arg in "$@"; do done echo "=========================================================" -echo "Stopping Dynamo SGLang FULL STACK" +echo "Stopping Dynamo FULL STACK (SGLang/vLLM)" echo "=========================================================" echo "" -# Stop Dynamo containers (check for both standard and thompson variants) +# Stop Dynamo containers (check for SGLang and vLLM variants) STOPPED_CONTAINER=false +# SGLang containers if docker ps --format '{{.Names}}' | grep -q "^dynamo-sglang$"; then - echo "Stopping Dynamo container (standard)..." + echo "Stopping Dynamo container (SGLang)..." docker stop dynamo-sglang docker rm dynamo-sglang - echo "✓ Dynamo container stopped and removed" + echo "✓ Dynamo SGLang container stopped and removed" STOPPED_CONTAINER=true fi if docker ps --format '{{.Names}}' | grep -q "^dynamo-sglang-thompson$"; then - echo "Stopping Dynamo container (Thompson Sampling)..." + echo "Stopping Dynamo container (SGLang Thompson Sampling)..." docker stop dynamo-sglang-thompson docker rm dynamo-sglang-thompson - echo "✓ Dynamo Thompson container stopped and removed" + echo "✓ Dynamo SGLang Thompson container stopped and removed" + STOPPED_CONTAINER=true +fi + +# vLLM containers +if docker ps --format '{{.Names}}' | grep -q "^dynamo-vllm$"; then + echo "Stopping Dynamo container (vLLM)..." 
+ docker stop dynamo-vllm + docker rm dynamo-vllm + echo "✓ Dynamo vLLM container stopped and removed" STOPPED_CONTAINER=true fi @@ -149,6 +160,7 @@ echo "=========================================================" echo "" echo "To restart:" echo " Standard Unified: bash start_dynamo_unified.sh" -echo " Thompson Sampling: bash start_dynamo_optimized_thompson_hints.sh" +echo " SGLang Thompson: bash start_dynamo_optimized_thompson_hints_sglang.sh" +echo " vLLM Thompson: bash start_dynamo_optimized_thompson_hints_vllm.sh" echo "" From 9249b3c40817b38db283b9f48adbb847ee94e12f Mon Sep 17 00:00:00 2001 From: bbednarski9 Date: Fri, 30 Jan 2026 03:02:19 +0000 Subject: [PATCH 08/13] vllm harness for MultiRLU Signed-off-by: bbednarski9 --- external/dynamo/.env.example | 18 +- external/dynamo/build_multi_lru_image.sh | 267 ++++ external/dynamo/demo_priority_eviction.sh | 167 ++ external/dynamo/generalized/processor.py | 2 +- external/dynamo/generalized/router.py | 2 +- external/dynamo/monitoring/scripts/README.md | 189 --- .../monitoring/scripts/cache_experiment.sh | 218 --- .../dynamo/optimized/processor_multilru.py | 833 ++++++++++ external/dynamo/optimized/router_multilru.py | 1404 +++++++++++++++++ ...rt_dynamo_optimized_thompson_hints_vllm.sh | 86 +- 10 files changed, 2766 insertions(+), 420 deletions(-) create mode 100755 external/dynamo/build_multi_lru_image.sh create mode 100755 external/dynamo/demo_priority_eviction.sh delete mode 100644 external/dynamo/monitoring/scripts/README.md delete mode 100755 external/dynamo/monitoring/scripts/cache_experiment.sh create mode 100644 external/dynamo/optimized/processor_multilru.py create mode 100644 external/dynamo/optimized/router_multilru.py diff --git a/external/dynamo/.env.example b/external/dynamo/.env.example index a9370b2d25..916f826481 100644 --- a/external/dynamo/.env.example +++ b/external/dynamo/.env.example @@ -112,4 +112,20 @@ DYNAMO_KV_BLOCK_SIZE=64 # Fraction of GPU memory for KV cache (0.0-1.0) # Reduce to test 
cache pressure/degradation scenarios
 # Default: 0.9 (90% of GPU memory for KV cache)
-DYNAMO_MEM_FRACTION_STATIC=0.9
+# DYNAMO_MEM_FRACTION_STATIC=0.9
+
+# =============================================================================
+# OPTIONAL VARIABLES - LRU development
+# =============================================================================
+
+# Path to Dynamo source for patching (auto-detected from DYNAMO_REPO_DIR)
+# DYNAMO_SOURCE_DIR=/path/to/dynamo
+
+# vLLM worker option 1: default
+DYNAMO_USE_MULTILRU=false
+DYNAMO_VLLM_IMAGE="nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.7.1"
+
+# vLLM worker option 2: MultiLRU from Ryan Olsen's dev branch
+# DYNAMO_USE_MULTILRU=true # will force script to use processor_multilru.py and router_multilru.py
+# DYNAMO_VLLM_IMAGE="dynamo-multi-lru:latest"
+
diff --git a/external/dynamo/build_multi_lru_image.sh b/external/dynamo/build_multi_lru_image.sh
new file mode 100755
index 0000000000..d1c86a2e0e
--- /dev/null
+++ b/external/dynamo/build_multi_lru_image.sh
@@ -0,0 +1,267 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Build Dynamo vLLM Image with MultiLruBackend from ryan/kvbm-next branch
+#
+# This script builds the Dynamo vLLM runtime image from source using the
+# ryan/kvbm-next branch, which includes the 4-pool MultiLruBackend for
+# frequency-based KV cache eviction.
+# +# The build uses the branch's native container/build.sh with: +# - Framework: VLLM +# - KVBM enabled (includes MultiLruBackend) +# - vLLM v0.14.0 +# - CUDA 12.9 +# - Python 3.12 +# +# Usage: +# ./build_multi_lru_image.sh [options] +# +# Options: +# --no-cache Build without Docker cache +# --skip-clone Skip cloning/updating the branch (use existing source) +# --source-dir DIR Source directory (default: auto-detect kvbm_next_source or kvbm_next_build) +# --target TARGET Docker build target (default: runtime) +# --tag TAG Custom image tag (default: dynamo-multi-lru:latest) +# --dry-run Print commands without executing +# --help Show this help message +# +# Environment Variables: +# DYNAMO_SOURCE_DIR Source directory (alternative to --source-dir) +# DYNAMO_BUILD_JOBS Cargo build parallelism (default: 4, reduce if OOM) +# DYNAMO_MAX_JOBS vLLM compilation parallelism (default: 8) + +set -euo pipefail + +# Script directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Configuration +BRANCH="ryan/kvbm-next" +REPO_URL="https://github.com/ai-dynamo/dynamo.git" + +# Build options (can be overridden by command line args) +KVBM_NEXT_DIR="" # Will be set after arg parsing +IMAGE_TAG="${DYNAMO_IMAGE_TAG:-dynamo-multi-lru:latest}" +BUILD_TARGET="${DYNAMO_BUILD_TARGET:-runtime}" +NO_CACHE="" +SKIP_CLONE=false +DRY_RUN="" +CARGO_BUILD_JOBS="${DYNAMO_BUILD_JOBS:-4}" +MAX_JOBS="${DYNAMO_MAX_JOBS:-8}" + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + --no-cache) + NO_CACHE="--no-cache" + shift + ;; + --skip-clone) + SKIP_CLONE=true + shift + ;; + --source-dir) + KVBM_NEXT_DIR="$2" + shift 2 + ;; + --target) + BUILD_TARGET="$2" + shift 2 + ;; + --tag) + IMAGE_TAG="$2" + shift 2 + ;; + --dry-run) + DRY_RUN="--dry-run" + shift + ;; + --help|-h) + head -42 "$0" | tail -37 + exit 0 + ;; + *) + echo "Unknown option: $1" + echo "Use --help for usage information" + exit 1 + ;; + esac +done + +# Auto-detect source directory if not specified +if [ 
-z "$KVBM_NEXT_DIR" ]; then + if [ -n "${DYNAMO_SOURCE_DIR:-}" ]; then + KVBM_NEXT_DIR="$DYNAMO_SOURCE_DIR" + elif [ -d "${SCRIPT_DIR}/kvbm_next_source" ] && [ -f "${SCRIPT_DIR}/kvbm_next_source/container/build.sh" ]; then + KVBM_NEXT_DIR="${SCRIPT_DIR}/kvbm_next_source" + echo "Auto-detected existing source: $KVBM_NEXT_DIR" + else + KVBM_NEXT_DIR="${SCRIPT_DIR}/kvbm_next_build" + fi +fi + +echo "=========================================================" +echo "Building Dynamo vLLM Image with MultiLruBackend" +echo "=========================================================" +echo "" +echo "Configuration:" +echo " Branch: $BRANCH" +echo " Source Dir: $KVBM_NEXT_DIR" +echo " Image Tag: $IMAGE_TAG" +echo " Build Target: $BUILD_TARGET" +echo " Cargo Jobs: $CARGO_BUILD_JOBS" +echo " vLLM Jobs: $MAX_JOBS" +echo " Skip Clone: $SKIP_CLONE" +echo " No Cache: ${NO_CACHE:-false}" +echo "" + +# Step 1: Clone or update the ryan/kvbm-next branch +if [ "$SKIP_CLONE" = false ]; then + if [ -d "$KVBM_NEXT_DIR" ]; then + echo "Updating existing $BRANCH branch..." + cd "$KVBM_NEXT_DIR" + git fetch origin + git checkout "$BRANCH" + git pull origin "$BRANCH" + git submodule update --init --recursive + else + echo "Cloning $BRANCH branch..." + git clone --branch "$BRANCH" --depth 1 "$REPO_URL" "$KVBM_NEXT_DIR" + cd "$KVBM_NEXT_DIR" + git submodule update --init --recursive + fi + echo "✓ Source code ready at $KVBM_NEXT_DIR" +else + if [ ! -d "$KVBM_NEXT_DIR" ]; then + echo "ERROR: --skip-clone specified but source directory doesn't exist: $KVBM_NEXT_DIR" + exit 1 + fi + echo "Using existing source at $KVBM_NEXT_DIR" + cd "$KVBM_NEXT_DIR" +fi +echo "" + +# Step 2: Apply MultiLruBackend patch (if needed) +# The scheduler at lib/bindings/kvbm/src/v2/scheduler/mod.rs may use LineageBackend by default. +# We patch it to use MultiLruBackend for frequency-based eviction. 
+SCHEDULER_FILE="lib/bindings/kvbm/src/v2/scheduler/mod.rs"
+
+if [ -f "$SCHEDULER_FILE" ]; then
+    if grep -q "with_lineage_backend" "$SCHEDULER_FILE"; then
+        echo "Patching scheduler to enable MultiLruBackend..."
+        sed -i 's/\.with_lineage_backend()/.with_multi_lru_backend()/g' "$SCHEDULER_FILE"
+
+        if grep -q "with_multi_lru_backend" "$SCHEDULER_FILE"; then
+            echo "✓ Scheduler patched: LineageBackend → MultiLruBackend"
+            grep -n "with_multi_lru_backend" "$SCHEDULER_FILE" | head -3
+        else
+            echo "WARNING: Patch may have failed - check $SCHEDULER_FILE"
+        fi
+    elif grep -q "with_multi_lru_backend" "$SCHEDULER_FILE"; then
+        echo "✓ Scheduler already uses MultiLruBackend"
+    else
+        echo "WARNING: Could not find backend configuration in $SCHEDULER_FILE"
+        echo "         The scheduler may use a different configuration method."
+    fi
+else
+    echo "WARNING: Scheduler file not found at $SCHEDULER_FILE"
+    echo "         This is expected if the branch structure has changed."
+fi
+echo ""
+
+# Step 3: Build the image using the branch's build.sh
+echo "========================================================="
+echo "Building Docker image..."
+echo "========================================================="
+echo ""
+echo "Build command:"
+echo "  ./container/build.sh \\"
+echo "    --framework VLLM \\"
+echo "    --target $BUILD_TARGET \\"
+echo "    --tag $IMAGE_TAG \\"
+echo "    --enable-kvbm \\"
+echo "    --build-arg CARGO_BUILD_JOBS=$CARGO_BUILD_JOBS \\"
+echo "    --vllm-max-jobs $MAX_JOBS \\"
+echo "    $NO_CACHE $DRY_RUN"
+echo ""
+
+# Make build.sh executable
+chmod +x container/build.sh
+
+# Run the build
+# Note: --enable-kvbm is automatically set for VLLM framework, but we set it explicitly for clarity
+./container/build.sh \
+    --framework VLLM \
+    --target "$BUILD_TARGET" \
+    --tag "$IMAGE_TAG" \
+    --enable-kvbm \
+    --build-arg "CARGO_BUILD_JOBS=$CARGO_BUILD_JOBS" \
+    --vllm-max-jobs "$MAX_JOBS" \
+    $NO_CACHE \
+    $DRY_RUN && BUILD_EXIT_CODE=0 || BUILD_EXIT_CODE=$?
+
+# NOTE: exit code is captured inline above. With `set -e`, a failing build would
+ +if [ $BUILD_EXIT_CODE -eq 0 ]; then + echo "" + echo "=========================================================" + echo "✓ Build successful!" + echo "=========================================================" + echo "" + echo "Image: $IMAGE_TAG" + echo "" + + # Verify the image has KVBM installed + echo "Verifying image contents..." + if docker run --rm "$IMAGE_TAG" python3 -c "import kvbm; print('✓ KVBM module installed')" 2>/dev/null; then + echo "" + else + echo "⚠ Warning: Could not verify KVBM installation in image" + fi + + # Check for DynamoScheduler + if docker run --rm "$IMAGE_TAG" python3 -c "from kvbm.v2.vllm.schedulers.dynamo import DynamoScheduler; print('✓ DynamoScheduler available')" 2>/dev/null; then + echo "" + else + echo "⚠ Warning: Could not verify DynamoScheduler in image" + fi + + echo "Features:" + echo " - vLLM v0.14.0 backend" + echo " - KVBM with MultiLruBackend (4-pool frequency-based eviction)" + echo " - CUDA 12.9" + echo " - Python 3.12" + echo " - NIXL 0.9.0 for KV transfer" + echo "" + echo "MultiLruBackend Configuration:" + echo " - 4 priority pools: Cold → Warm → Hot → VeryHot" + echo " - Default promotion thresholds: [2, 6, 15] accesses" + echo " - Frequently accessed blocks protected from eviction" + echo "" + echo "To use this image, update your startup script:" + echo " IMAGE=\"$IMAGE_TAG\"" + echo "" + echo "Or set the environment variable:" + echo " export DYNAMO_VLLM_IMAGE=\"$IMAGE_TAG\"" + echo "" + echo "Then run:" + echo " ./start_dynamo_optimized_thompson_hints_vllm_multilru.sh" + echo "" +else + echo "" + echo "=========================================================" + echo "✗ Build failed with exit code: $BUILD_EXIT_CODE" + echo "=========================================================" + echo "" + echo "Troubleshooting:" + echo " 1. Check Docker daemon is running" + echo " 2. Ensure sufficient disk space (needs ~50GB)" + echo " 3. 
Try reducing parallelism:" + echo " DYNAMO_BUILD_JOBS=2 DYNAMO_MAX_JOBS=4 ./build_multi_lru_image.sh" + echo " 4. Check build logs above for specific errors" + echo "" + exit $BUILD_EXIT_CODE +fi + diff --git a/external/dynamo/demo_priority_eviction.sh b/external/dynamo/demo_priority_eviction.sh new file mode 100755 index 0000000000..d876f4471b --- /dev/null +++ b/external/dynamo/demo_priority_eviction.sh @@ -0,0 +1,167 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# +# MultiLRU Priority Eviction Demo +# ================================ +# Demonstrates frequency-based cache eviction protection +# +# Prerequisites: +# - Start Dynamo with: DYNAMO_NUM_GPU_BLOCKS_OVERRIDE=12 +# - This gives us 12 blocks total (small cache for quick demo) +# +# ┌─────────────────────────────────────────────────────────────────────────┐ +# │ RECOMMENDED: Run the KV Event Observer in a separate terminal │ +# │ │ +# │ This lets you see cache events in real-time as the demo runs: │ +# │ 📦 STORED - Blocks committed to prefix cache │ +# │ 🗑️ REMOVED - Blocks evicted (should be COLD blocks, not HOT!) │ +# │ ✅ CACHE HIT - Tokens served from cache │ +# │ │ +# │ Run inside the container: │ +# │ docker exec -it dynamo-vllm python \ │ +# │ /workspace/monitoring/scripts/kv_event_observer.py \ │ +# │ --port 20080 --verbose --metrics-port 18081 │ +# │ │ +# │ This shows you EXACTLY what the MultiLRU eviction policy is doing: │ +# │ - Watch HOT blocks get stored and stay in cache │ +# │ - Watch COLD blocks get stored then evicted │ +# │ - Verify HOT blocks are protected when cache fills up │ +# └─────────────────────────────────────────────────────────────────────────┘ +# +# What this demo shows: +# 1. Access a "HOT" prompt multiple times (promotes to VeryHot pool) +# 2. Fill cache with unique "COLD" prompts (forces eviction) +# 3. Access HOT prompt again - it still gets cache hits! +# 4. 
Cold blocks were evicted, hot blocks protected
+
+set -euo pipefail
+
+API="http://localhost:8000/v1/completions"
+MODEL="llama-3.3-70b"
+
+# Long prompt to fill ~2 blocks (128+ tokens with block_size=64)
+HOT_PROMPT="HOT_DEMO: This prompt will be accessed frequently and should be protected from eviction by the MultiLRU frequency-based cache management system. The quick brown fox jumps over the lazy dog multiple times throughout this demonstration. First jump over the lazy dog. Second jump over the lazy dog. Third jump over the lazy dog. Fourth jump over the lazy dog. Fifth jump over the lazy dog. Sixth jump over the lazy dog. Seventh jump over the lazy dog. Eighth jump over the lazy dog. This text ensures we have enough tokens to fill at least two complete KV cache blocks for proper prefix caching behavior."
+
+echo "╔══════════════════════════════════════════════════════════════╗"
+echo "║              MultiLRU Priority Eviction Demo                 ║"
+echo "║     Thresholds: [3, 8, 15] accesses for pool promotion       ║"
+echo "╚══════════════════════════════════════════════════════════════╝"
+echo ""
+
+# Get baseline (falls back to "0" so `set -euo pipefail` does not abort the
+# demo when the metric has not been emitted yet, i.e. on a fresh stack)
+get_hits() {
+    docker exec dynamo-vllm curl -s http://localhost:18081/metrics 2>/dev/null | \
+        grep "prefix_cache_hits_total{" | grep -v external | awk '{print $NF}' || echo "0"
+}
+
+BASELINE=$(get_hits)
+echo "📊 Baseline cache hits: $BASELINE"
+echo ""
+
+# ============================================================
+# STEP 1: Make HOT prompt "hot" (20 accesses → VeryHot pool)
+# ============================================================
+echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+echo "🔥 STEP 1: Access HOT prompt 20 times (threshold for VeryHot: 15)"
+echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+
+for i in {1..20}; do
+    curl -s "$API" -H "Content-Type: application/json" -d "{
+        \"model\": \"$MODEL\",
+        \"prompt\": \"$HOT_PROMPT\",
+        \"max_tokens\": 2,
+        \"nvext\": {
+            \"annotations\": [
+                \"prefix_id:hot-demo-prompt\",
\"backend:frequency_multi_lru\" + ] + } + }" > /dev/null + echo -n "🔥" +done +echo "" + +AFTER_HOT=$(get_hits) +HOT_HITS=$((${AFTER_HOT%.*} - ${BASELINE%.*})) +echo " Cache hits from HOT prompt: $HOT_HITS tokens" +echo " → HOT blocks now in VeryHot pool (protected)" +echo "" + +# ============================================================ +# STEP 2: Fill cache with COLD prompts (forces eviction) +# ============================================================ +echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" +echo "❄️ STEP 2: Fill cache with 20 unique COLD prompts" +echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + +for i in {1..20}; do + # Each COLD prompt is unique and fills 2+ blocks + COLD="COLD_$i: This is unique cold prompt number $i designed to fill the KV cache and trigger eviction. The quick brown fox jumps over the lazy dog. First unique jump $i. Second unique jump $i. Third unique jump $i. Fourth unique jump $i. Fifth unique jump $i. Sixth unique jump $i. Adding more padding text to ensure this prompt fills at least two complete cache blocks. Extra content for block filling: $i $i $i $i $i $i $i $i." + curl -s "$API" -H "Content-Type: application/json" -d "{ + \"model\": \"$MODEL\", + \"prompt\": \"$COLD\", + \"max_tokens\": 2, + \"nvext\": { + \"annotations\": [ + \"prefix_id:cold-$i\", + \"backend:frequency_multi_lru\" + ] + } + }" > /dev/null + echo -n "❄️" +done +echo "" + +AFTER_COLD=$(get_hits) +echo " Cold prompts added (each unique, no cache hits expected)" +echo " → Eviction should have occurred (cache overflow)" +echo "" + +# ============================================================ +# STEP 3: Test HOT prompt - should still get cache hits! 
+# ============================================================ +echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" +echo "🎯 STEP 3: Access HOT prompt again (was it protected?)" +echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + +for i in {1..5}; do + curl -s "$API" -H "Content-Type: application/json" -d "{ + \"model\": \"$MODEL\", + \"prompt\": \"$HOT_PROMPT\", + \"max_tokens\": 2, + \"nvext\": { + \"annotations\": [ + \"prefix_id:hot-demo-prompt\", + \"backend:frequency_multi_lru\" + ] + } + }" > /dev/null + echo -n "🎯" +done +echo "" + +FINAL=$(get_hits) +FINAL_HITS=$((${FINAL%.*} - ${AFTER_COLD%.*})) +echo "" + +# ============================================================ +# RESULTS +# ============================================================ +echo "╔══════════════════════════════════════════════════════════════╗" +echo "║ RESULTS ║" +echo "╠══════════════════════════════════════════════════════════════╣" +printf "║ HOT prompt initial cache hits: %6d tokens ║\n" "$HOT_HITS" +printf "║ HOT prompt hits AFTER eviction: %6d tokens ║\n" "$FINAL_HITS" +echo "╠══════════════════════════════════════════════════════════════╣" + +if [ "$FINAL_HITS" -gt 0 ]; then + echo "║ ✅ SUCCESS: Hot blocks PROTECTED from eviction! ║" + echo "║ ║" + echo "║ MultiLRU frequency-based eviction kept the frequently ║" + echo "║ accessed blocks while evicting cold (single-access) ones. 
║" +else + echo "║ ❌ Hot blocks were evicted (no protection) ║" +fi +echo "╚══════════════════════════════════════════════════════════════╝" + diff --git a/external/dynamo/generalized/processor.py b/external/dynamo/generalized/processor.py index 7403000a55..c984842442 100644 --- a/external/dynamo/generalized/processor.py +++ b/external/dynamo/generalized/processor.py @@ -426,7 +426,7 @@ def parse_args(): async def worker(runtime: DistributedRuntime): args = parse_args() component = runtime.namespace("dynamo").component("processor") - await component.create_service() + # NOTE: create_service() was removed in Dynamo 0.8.x - endpoint creation handles registration handler = ProcessorRequestHandler(runtime, model_name=args.model, enable_router=args.enable_router) await handler.initialize() diff --git a/external/dynamo/generalized/router.py b/external/dynamo/generalized/router.py index 96274617b7..eb8bd04eb0 100644 --- a/external/dynamo/generalized/router.py +++ b/external/dynamo/generalized/router.py @@ -1067,7 +1067,7 @@ async def worker(runtime: DistributedRuntime): args = parse_args() component = runtime.namespace("dynamo").component("router") - await component.create_service() + # NOTE: create_service() was removed in Dynamo 0.8.x - endpoint creation handles registration logger.info("Initializing WorkloadAwareRouter (LinTS + feedback + timeout + traces)") router = WorkloadAwareRouter( diff --git a/external/dynamo/monitoring/scripts/README.md b/external/dynamo/monitoring/scripts/README.md deleted file mode 100644 index 65e347e6ba..0000000000 --- a/external/dynamo/monitoring/scripts/README.md +++ /dev/null @@ -1,189 +0,0 @@ -# KV Cache Event Observer - -Real-time monitoring of vLLM prefix cache events (block stored, evicted, and cache hits). 
- -## Quick Start - -```bash -# Basic monitoring (ZMQ events only) -docker exec -it dynamo-vllm python /workspace/monitoring/scripts/kv_event_observer.py -p 20080 -v - -# With cache hit detection (polls Prometheus metrics) -docker exec -it dynamo-vllm python /workspace/monitoring/scripts/kv_event_observer.py -p 20080 -v -m 18081 -``` - -## Usage - -```bash -# Basic verbose monitoring -docker exec -it dynamo-vllm python /workspace/monitoring/scripts/kv_event_observer.py -p 20080 -v - -# With cache hit detection (recommended for experiments) -docker exec -it dynamo-vllm python /workspace/monitoring/scripts/kv_event_observer.py -p 20080 -v -m 18081 - -# Run for 60 seconds -docker exec -it dynamo-vllm python /workspace/monitoring/scripts/kv_event_observer.py -p 20080 -v -d 60 - -# Save events to file -docker exec -it dynamo-vllm python /workspace/monitoring/scripts/kv_event_observer.py -p 20080 -v -o /tmp/events.jsonl - -# Monitor worker 1 (port 20081, metrics 18082) -docker exec -it dynamo-vllm python /workspace/monitoring/scripts/kv_event_observer.py -p 20081 -v -m 18082 -``` - -## Options - -| Flag | Description | -|------|-------------| -| `-p`, `--port` | KV event ZMQ port (default: 20080, worker 1 = 20081, etc.) | -| `-m`, `--metrics-port` | Prometheus metrics port for cache hit detection (e.g., 18081) | -| `-v`, `--verbose` | Print each event as it happens | -| `-d`, `--duration` | Run for N seconds then stop | -| `-o`, `--output` | Save events to JSONL file | -| `-H`, `--host` | Worker host (default: localhost) | - -## Event Types - -| Symbol | Event | Source | Description | -|--------|-------|--------|-------------| -| 📦 | STORED | ZMQ | Block committed to prefix cache | -| 🗑️ | REMOVED | ZMQ | Block evicted from cache | -| 🧹 | CLEARED | ZMQ | Entire cache cleared | -| ✅ | CACHE HIT | Metrics | Tokens served from cache (requires `-m`) | - -## Example Output - -``` -[KV Observer] Listening for KV events (msgpack multipart)... 
-[KV Observer] Cache hits will show as ✅ [CACHE HIT] ------------------------------------------------------------- -📦 [STORED ] seq= 32 hash=df6f76832e34d5f5 tokens= 64 medium=GPU -🗑️ [REMOVED ] seq= 33 hash=eaacc201f3aaf753 medium=GPU -✅ [CACHE HIT] tokens= 64 queried= 128 hit_rate=50% -📦 [STORED ] seq= 34 hash=df6f76832e34d5f5 tokens= 64 medium=GPU ------------------------------------------------------------- -[KV Observer] Final Statistics: - stored_blocks: 2 - evicted_blocks: 1 - net_blocks: 1 - cache_hit_tokens: 64 - cache_query_tokens: 192 - cache_hit_rate: 33.3% -``` - -## Notes - -- **STORED/REMOVED events**: Published via ZMQ when cache state changes -- **CACHE HIT events**: Detected by polling Prometheus metrics (requires `-m` flag) -- **No event = cache hit**: If a repeated query shows no STORED event, the block was already cached -- Events only fire for **full blocks** (64 tokens with default block size) -- Short prompts (less than 64 tokens) may not generate STORED events for incomplete blocks -- With limited cache (e.g., 16 blocks), expect frequent evictions -- **Clearing the cache**: vLLM does not expose a direct cache clear API. To fully clear the cache, restart the vLLM worker. Alternatively, use `--flush` with the experiment script to fill the cache with unique queries, pushing out old entries via LRU eviction. - -## Port Mapping - -| Worker | ZMQ Port (`-p`) | Metrics Port (`-m`) | -|--------|-----------------|---------------------| -| Worker 0 | 20080 | 18081 | -| Worker 1 | 20081 | 18082 | -| Worker 2 | 20082 | 18083 | - -## Manual Cache Lifecycle Experiment - -This experiment demonstrates the full KV cache lifecycle: **STORE → STORE → EVICT → STORE → CACHE HIT**. - -### Setup - -```bash -# 1. Stop any running Dynamo stack -bash stop_dynamo.sh - -# 2. Configure limited cache for experiment (5 blocks) -export DYNAMO_GPU_DEVICES=0,1,2,3 -export DYNAMO_TP_SIZE=4 -export DYNAMO_KV_BLOCK_SIZE=64 -export DYNAMO_NUM_GPU_BLOCKS_OVERRIDE=5 - -# 3. 
Start vLLM with KV events enabled -bash start_dynamo_optimized_thompson_hints_vllm.sh > startup_output.txt - -# 4. In a separate terminal, start the observer -docker exec -it dynamo-vllm python /workspace/monitoring/scripts/kv_event_observer.py -p 20080 -v -m 18081 -``` - -### Run Queries - -Queries must be **65+ tokens** (including chat template) to generate cache events: - -```bash -# Query A (70 tokens) - STORE -curl -s http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" \ - -d '{"model":"llama-3.3-70b","messages":[{"role":"user","content":"Query A: The quick brown fox jumps over the lazy dog repeatedly. The quick brown fox jumps over the lazy dog repeatedly. The quick brown fox jumps over the lazy dog."}],"max_tokens":5}' - -# Query B (72 tokens) - STORE -curl -s http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" \ - -d '{"model":"llama-3.3-70b","messages":[{"role":"user","content":"Query B: Pack my box with five dozen liquor jugs today please. Pack my box with five dozen liquor jugs today please. Pack my box with five dozen liquor jugs."}],"max_tokens":5}' - -# Query C (76 tokens) - EVICT A, STORE C -curl -s http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" \ - -d '{"model":"llama-3.3-70b","messages":[{"role":"user","content":"Query C: How vexingly quick daft zebras jump over the moon tonight. How vexingly quick daft zebras jump over the moon tonight. How vexingly quick daft zebras jump."}],"max_tokens":5}' - -# Query C again - CACHE HIT -curl -s http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" \ - -d '{"model":"llama-3.3-70b","messages":[{"role":"user","content":"Query C: How vexingly quick daft zebras jump over the moon tonight. How vexingly quick daft zebras jump over the moon tonight. 
How vexingly quick daft zebras jump."}],"max_tokens":5}' -``` - -### Expected Observer Output - -``` -📦 [STORED ] seq= 0 hash=ca596e30d283c6f7 tokens= 64 medium=GPU ← Query A -📦 [STORED ] seq= 1 hash=41ccc959d03a1d21 tokens= 64 medium=GPU ← Query B -🗑️ [REMOVED ] seq= 2 hash=ca596e30d283c6f7 medium=GPU ← Query A evicted (LRU) -📦 [STORED ] seq= 2 hash=b5291e07de5d51cc tokens= 64 medium=GPU ← Query C -✅ [CACHE HIT] tokens= 64 queried= 76 hit_rate=84% ← Query C repeated -``` - -### Cache Size Guidelines - -| Blocks | Usable | Behavior | -|--------|--------|----------| -| 3-4 | ~0-1 | Constant thrashing, no cache benefit | -| 5-8 | ~2-4 | Good for demonstrating evictions + hits | -| 16+ | ~10+ | Production-like behavior | - -### Key Requirements - -- **Prompt length**: Must exceed 64 tokens (1 block) to generate STORED events -- **Cache size**: Use `DYNAMO_NUM_GPU_BLOCKS_OVERRIDE=5` to force evictions -- **Metrics flag**: Use `-m 18081` to detect cache hits (not published via ZMQ) - -## Cache Experiment Script - -Run a complete A → B → C → A cache experiment: - -```bash -# Basic experiment -./cache_experiment.sh - -# Flush cache first (recommended) -./cache_experiment.sh --flush - -# Verbose output (full API responses) -./cache_experiment.sh --flush --verbose -``` - -The script: -1. Optionally flushes the cache by filling it with unique queries -2. Starts the KV event observer in the background -3. Sends Query A (should STORE) -4. Sends Query B (should STORE) -5. Sends Query C (should STORE) -6. Sends Query A again (should show CACHE HIT) -7. 
Displays observer output and final statistics - -## Requirements - -- vLLM must be started with `--kv-events-config` containing `enable_kv_cache_events: true` -- The startup script `start_dynamo_optimized_thompson_hints_vllm.sh` configures this automatically when `DYNAMO_ENABLE_KV_EVENTS=true` - diff --git a/external/dynamo/monitoring/scripts/cache_experiment.sh b/external/dynamo/monitoring/scripts/cache_experiment.sh deleted file mode 100755 index 77602d957b..0000000000 --- a/external/dynamo/monitoring/scripts/cache_experiment.sh +++ /dev/null @@ -1,218 +0,0 @@ -#!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# KV Cache Experiment Script -# -# Demonstrates prefix cache behavior with queries A → B → C → A -# Shows: STORED events, REMOVED (eviction) events, and CACHE HITs -# -# Usage: -# ./cache_experiment.sh # Run with defaults -# ./cache_experiment.sh --flush # Flush cache first by filling it -# ./cache_experiment.sh --verbose # Show full curl responses - -set -euo pipefail - -# Configuration -API_URL="${DYNAMO_API_URL:-http://localhost:8000}" -MODEL="${DYNAMO_MODEL_NAME:-llama-3.3-70b}" -ZMQ_PORT="${DYNAMO_KV_EVENT_PORT:-20080}" -METRICS_PORT="${DYNAMO_WORKER_METRICS_PORT:-18081}" -MAX_TOKENS=5 -VERBOSE=false -FLUSH_CACHE=false - -# Parse arguments -while [[ $# -gt 0 ]]; do - case $1 in - --verbose|-v) VERBOSE=true; shift ;; - --flush|-f) FLUSH_CACHE=true; shift ;; - --help|-h) - echo "Usage: $0 [--verbose] [--flush]" - echo " --verbose, -v Show full API responses" - echo " --flush, -f Flush cache by filling it before experiment" - exit 0 - ;; - *) echo "Unknown option: $1"; exit 1 ;; - esac -done - -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -CYAN='\033[0;36m' -NC='\033[0m' # No Color - -echo -e "${CYAN}=========================================================${NC}" -echo -e "${CYAN} KV Cache 
Experiment: A → B → C → A${NC}" -echo -e "${CYAN}=========================================================${NC}" -echo "" -echo -e "API: ${API_URL}" -echo -e "Model: ${MODEL}" -echo -e "ZMQ Port: ${ZMQ_PORT}" -echo -e "Metrics Port: ${METRICS_PORT}" -echo "" - -# Check if API is available -echo -e "${BLUE}Checking API availability...${NC}" -if ! curl -s --max-time 5 "${API_URL}/health" > /dev/null 2>&1; then - echo -e "${RED}ERROR: API not available at ${API_URL}${NC}" - echo "Make sure Dynamo is running: bash start_dynamo_optimized_thompson_hints_vllm.sh" - exit 1 -fi -echo -e "${GREEN}✓ API is available${NC}" -echo "" - -# Long prompts that will fill at least 1 complete block (64 tokens each) -# Each prompt is ~120+ tokens to ensure at least 1 full block is stored -QUERY_A="Query Alpha: Please provide a comprehensive and detailed explanation of quantum computing technology. Start by explaining what quantum bits (qubits) are and how they fundamentally differ from classical binary bits. Then thoroughly discuss the principle of quantum superposition and how it enables massive parallelism in quantum computations." - -QUERY_B="Query Beta: Please provide an in-depth explanation of machine learning and artificial intelligence. Begin by describing the fundamental differences between supervised, unsupervised, and reinforcement learning paradigms. Then explain neural network architectures including feedforward networks, convolutional neural networks, and transformers." - -QUERY_C="Query Charlie: Please provide a detailed overview of cloud computing infrastructure and services. Start by explaining the differences between Infrastructure as a Service (IaaS), Platform as a Service (PaaS), and Software as a Service (SaaS). Then discuss containerization technologies like Docker and Kubernetes orchestration." 
- -# Function to send a query and display results -send_query() { - local name=$1 - local prompt=$2 - local color=$3 - - echo -e "${color}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" - echo -e "${color}Sending Query ${name}${NC}" - echo -e "${color}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" - - response=$(curl -s "${API_URL}/v1/chat/completions" \ - -H "Content-Type: application/json" \ - -d "{ - \"model\": \"${MODEL}\", - \"messages\": [{\"role\": \"user\", \"content\": \"${prompt}\"}], - \"max_tokens\": ${MAX_TOKENS} - }") - - if [ "$VERBOSE" = true ]; then - echo "$response" | jq . - else - prompt_tokens=$(echo "$response" | jq -r '.usage.prompt_tokens // "N/A"') - completion_tokens=$(echo "$response" | jq -r '.usage.completion_tokens // "N/A"') - echo -e " Prompt tokens: ${prompt_tokens}" - echo -e " Completion tokens: ${completion_tokens}" - fi - echo "" -} - -# Function to flush cache by sending many unique queries -flush_cache() { - echo -e "${YELLOW}Flushing cache by filling it with unique queries...${NC}" - echo -e "${YELLOW}(This may take a minute)${NC}" - echo "" - - for i in $(seq 1 20); do - curl -s "${API_URL}/v1/chat/completions" \ - -H "Content-Type: application/json" \ - -d "{ - \"model\": \"${MODEL}\", - \"messages\": [{\"role\": \"user\", \"content\": \"Flush query number ${i}: This is a unique cache flush query designed to evict existing cached blocks from the prefix cache. Random identifier: ${RANDOM}${RANDOM}${RANDOM}. 
Please provide a detailed explanation of topic ${i}.\"}], - \"max_tokens\": 1 - }" > /dev/null 2>&1 - echo -ne "\r Progress: ${i}/20" - done - echo -e "\n${GREEN}✓ Cache flushed${NC}" - echo "" -} - -# Get initial cache metrics -echo -e "${BLUE}Initial cache state:${NC}" -initial_hits=$(curl -s "http://localhost:${METRICS_PORT}/metrics" | grep "vllm:prefix_cache_hits_total{" | grep -oE '[0-9.]+$' || echo "0") -initial_queries=$(curl -s "http://localhost:${METRICS_PORT}/metrics" | grep "vllm:prefix_cache_queries_total{" | grep -oE '[0-9.]+$' || echo "0") -echo -e " Cache hits: ${initial_hits}" -echo -e " Cache queries: ${initial_queries}" -echo "" - -# Flush cache if requested -if [ "$FLUSH_CACHE" = true ]; then - flush_cache -fi - -# Start the KV event observer in the background -echo -e "${BLUE}Starting KV event observer...${NC}" -OBSERVER_LOG=$(mktemp) -docker exec dynamo-vllm python /workspace/monitoring/scripts/kv_event_observer.py \ - -p "${ZMQ_PORT}" -v -m "${METRICS_PORT}" -d 60 > "$OBSERVER_LOG" 2>&1 & -OBSERVER_PID=$! 
-sleep 2 -echo -e "${GREEN}✓ Observer started (PID: ${OBSERVER_PID})${NC}" -echo "" - -echo -e "${CYAN}=========================================================${NC}" -echo -e "${CYAN} Starting Query Sequence: A → B → C → A${NC}" -echo -e "${CYAN}=========================================================${NC}" -echo "" - -# Send queries with delays to allow event processing -send_query "A (first time)" "$QUERY_A" "$GREEN" -sleep 2 - -send_query "B" "$QUERY_B" "$YELLOW" -sleep 2 - -send_query "C" "$QUERY_C" "$RED" -sleep 2 - -send_query "A (repeated - expect cache hit)" "$QUERY_A" "$GREEN" -sleep 3 - -# Stop observer and show results -echo -e "${CYAN}=========================================================${NC}" -echo -e "${CYAN} Stopping Observer & Showing Results${NC}" -echo -e "${CYAN}=========================================================${NC}" -echo "" - -# Kill observer gracefully -kill $OBSERVER_PID 2>/dev/null || true -wait $OBSERVER_PID 2>/dev/null || true -sleep 1 - -# Display observer output -echo -e "${BLUE}KV Event Observer Output:${NC}" -echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" -cat "$OBSERVER_LOG" -echo "" - -# Get final cache metrics -echo -e "${BLUE}Final cache state:${NC}" -final_hits=$(curl -s "http://localhost:${METRICS_PORT}/metrics" | grep "vllm:prefix_cache_hits_total{" | grep -oE '[0-9.]+$' || echo "0") -final_queries=$(curl -s "http://localhost:${METRICS_PORT}/metrics" | grep "vllm:prefix_cache_queries_total{" | grep -oE '[0-9.]+$' || echo "0") -echo -e " Cache hits: ${final_hits} (delta: +$(echo "$final_hits - $initial_hits" | bc))" -echo -e " Cache queries: ${final_queries} (delta: +$(echo "$final_queries - $initial_queries" | bc))" -echo "" - -# Calculate hit rate for this experiment -delta_hits=$(echo "$final_hits - $initial_hits" | bc) -delta_queries=$(echo "$final_queries - $initial_queries" | bc) -if [ "$delta_queries" != "0" ]; then - hit_rate=$(echo "scale=1; $delta_hits * 100 / 
$delta_queries" | bc) - echo -e "${GREEN}Experiment hit rate: ${hit_rate}%${NC}" -fi - -# Cleanup -rm -f "$OBSERVER_LOG" - -echo "" -echo -e "${CYAN}=========================================================${NC}" -echo -e "${CYAN} Experiment Complete!${NC}" -echo -e "${CYAN}=========================================================${NC}" -echo "" -echo -e "Expected behavior:" -echo -e " • Query A (1st): ${GREEN}📦 STORED${NC} - new block cached" -echo -e " • Query B: ${YELLOW}📦 STORED${NC} - new block cached (may evict old blocks)" -echo -e " • Query C: ${RED}📦 STORED${NC} - new block cached (may evict old blocks)" -echo -e " • Query A (2nd): ${GREEN}✅ CACHE HIT${NC} - if A still in cache, or 📦 STORED if evicted" -echo "" -echo -e "With 16 blocks available, all 3 queries should fit without evicting each other." -echo -e "To force evictions, restart with: DYNAMO_NUM_GPU_BLOCKS_OVERRIDE=4" -echo "" - - diff --git a/external/dynamo/optimized/processor_multilru.py b/external/dynamo/optimized/processor_multilru.py new file mode 100644 index 0000000000..0e114d377b --- /dev/null +++ b/external/dynamo/optimized/processor_multilru.py @@ -0,0 +1,833 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Optimized Processor for Thompson Sampling Router Architecture. 
+ +This processor uses the "Processor-as-Backend" pattern with DYNAMIC DISCOVERY +to intercept requests from the default Dynamo frontend and apply custom Thompson +Sampling routing. + +## Dynamic Discovery Mode (Forward-Compatible) + +Instead of using the deprecated `--static-endpoint` flag on the frontend, this +processor registers a model card in ETCD so the frontend can discover it via +its ModelWatcher. This is the forward-compatible approach. + +### Requirements: +- Processor must be started with `--model-path` and `--model-name` arguments +- Model path must point to a valid model directory with tokenizer files +- Model name must match what the frontend expects (e.g., "llama-3.3-70b") + +### Endpoint Registration Pattern + +1. **This Processor registers as `dynamo.backend.generate`** - Dynamically with instance ID +2. **Processor calls `register_llm()`** - Advertises model card in ETCD +3. **Frontend's ModelWatcher discovers us** - Routes requests to our endpoint +4. **SGLang Worker registers as `workers.worker.generate`** - We forward to actual workers + +## Request Flow + +``` +Frontend (discovers backends via ETCD ModelWatcher) + → routes to dynamo.backend.generate-{instance_id} + → THIS PROCESSOR (discovered via model card!) 
+ → extracts hints from nvext annotations + → queries Thompson Sampling router → worker_id + → forwards to workers.worker.generate (actual SGLang workers) +``` + +Key differences from generalized/processor.py: +- Uses dynamic discovery (no --static-endpoint on frontend) +- Registers model card via register_llm() for ETCD discovery +- Registers as `dynamo.backend.generate` (not `dynamo.processor.process`) +- Forwards to `workers.worker.generate` (workers in separate namespace) +- Receives PreprocessedRequest instead of ChatCompletionRequest +- Extracts hints from nvext annotations (prefix_id:value format) +- Uses Dynamo metrics API for Prometheus integration (auto-exposed at /metrics) +- No tokenization (handled by frontend preprocessor) + +## Metrics + +All metrics are exposed via Dynamo's `/metrics` endpoint (requires DYN_SYSTEM_PORT). +Metrics use the `dynamo_component_` prefix and include standard Dynamo labels: +- `dynamo_namespace`, `dynamo_component`, `dynamo_endpoint` + +Custom metrics for Thompson Sampling routing: +- `requests_total` - Total requests processed +- `request_latency_seconds` - End-to-end request latency histogram +- `tokens_in_total` / `tokens_out_total` - Token throughput counters +- `routing_decisions_total` - Per-worker routing decision counter +- `router_errors_total` / `engine_errors_total` - Error counters +- `active_requests` - Current in-flight request gauge + +KV Cache Efficiency (KVE) metrics: +- `kve_prompt_tokens_total` - Total prompt tokens (efficiency denominator) +- `kve_cached_tokens_total` - Total cached tokens hit (efficiency numerator) +- `kve_device_blocks_total` - Cache hits from device (GPU) memory +- `kve_host_blocks_total` - Cache hits from host (CPU) memory +- `kve_disk_blocks_total` - Cache hits from disk + +## Grafana Integration + +Metrics are exposed at `/metrics` in Prometheus format. Enable with: + DYN_SYSTEM_PORT=8081 python processor.py --model-path ... --model-name ... 
+ +Full metric names include the `dynamo_component_` prefix: + dynamo_component_requests_total{dynamo_namespace="dynamo",dynamo_component="backend",dynamo_endpoint="generate"} + +Example PromQL queries for Grafana dashboards: + # KV Cache Efficiency (%) + rate(dynamo_component_kve_cached_tokens_total[5m]) / rate(dynamo_component_kve_prompt_tokens_total[5m]) * 100 + + # Request latency p99 + histogram_quantile(0.99, rate(dynamo_component_request_latency_seconds_bucket[5m])) + +## Data Source Requirements + +KVE metrics require the underlying engine to return cache efficiency data: +- `usage.prompt_tokens_details.cached_tokens` - Standard OpenAI field (should work with prefix caching enabled) +- `nvext.cache_hit_breakdown` - Engine-specific extension (NOT standard Dynamo NvExt) +""" + +import argparse +import asyncio +import logging +import os +import time +import uuid +from collections.abc import AsyncIterator +from typing import Any + +import uvloop +from dynamo.llm import ModelInput +from dynamo.llm import ModelType +from dynamo.llm import register_llm +from dynamo.runtime import DistributedRuntime +from dynamo.runtime import dynamo_worker +from dynamo.runtime.logging import configure_dynamo_logging +from pydantic import BaseModel + +configure_dynamo_logging() +logger = logging.getLogger(__name__) + + +# ----------------------- request / response models ----------------------- # +class RouterRequest(BaseModel): + """Request to the Thompson Sampling router.""" + + tokens: list[int] + prefix_id: str = "" + reuse_budget: int = 0 # remaining *after this request* + expected_osl: str | None = "MEDIUM" + interarrival: str | None = "MEDIUM" + + +class RouterFeedbackRequest(BaseModel): + """Feedback to the router after request completion.""" + + decision_id: str + latency_ms: float + success: bool | None = True + tokens_in: int | None = None + tokens_out: int | None = None + finish_reason: str | None = None + + +# ----------------------- KV efficiency data 
----------------------- # +class KVEfficiencyData: + """ + Container for KV cache efficiency data extracted from worker responses. + + This data is used to compute and publish KVE metrics asynchronously, + ensuring zero impact on routing throughput. + """ + + __slots__ = ("prompt_tokens", "cached_tokens", "device_blocks", "host_blocks", "disk_blocks") + + def __init__(self): + self.prompt_tokens: int = 0 + self.cached_tokens: int = 0 + self.device_blocks: int = 0 + self.host_blocks: int = 0 + self.disk_blocks: int = 0 + + def has_data(self) -> bool: + """Check if any KVE data was collected.""" + return self.prompt_tokens > 0 + + @classmethod + def from_response(cls, data: dict[str, Any]) -> "KVEfficiencyData": + """ + Extract KVE data from a worker response chunk. + + Expected fields in response (OpenAI-compatible): + - usage.prompt_tokens: Total prompt tokens + - usage.prompt_tokens_details.cached_tokens: Cached token count + + Optional engine-specific fields (may not be present): + - nvext.cache_hit_breakdown.{device,host,disk}_blocks: Per-tier hits + + Note: cache_hit_breakdown is NOT a standard Dynamo NvExt field. + It must be enabled/configured in the underlying engine (vLLM/SGLang). 
+ """ + kve = cls() + + # Extract from usage field (OpenAI-compatible, should always work) + usage = data.get("usage") + if isinstance(usage, dict): + kve.prompt_tokens = usage.get("prompt_tokens", 0) or 0 + prompt_details = usage.get("prompt_tokens_details") + if isinstance(prompt_details, dict): + kve.cached_tokens = prompt_details.get("cached_tokens", 0) or 0 + + # Extract cache breakdown from nvext (engine-specific, may not be present) + # This is NOT a standard Dynamo NvExt field - requires engine configuration + nvext = data.get("nvext") + if isinstance(nvext, dict): + breakdown = nvext.get("cache_hit_breakdown") + if isinstance(breakdown, dict): + kve.device_blocks = breakdown.get("device_blocks", 0) or 0 + kve.host_blocks = breakdown.get("host_blocks", 0) or 0 + kve.disk_blocks = breakdown.get("disk_blocks", 0) or 0 + + return kve + + +# ----------------------- metrics dataclass ----------------------- # +class ProcessorMetrics: + """ + Container for Thompson Sampling processor metrics. + + Uses prometheus_client directly (compatible with ryan/kvbm-next branch). + Metrics are registered with Dynamo's endpoint via register_prometheus_expfmt_callback. + """ + + def __init__(self, endpoint): + """ + Initialize metrics using prometheus_client directly. + + Args: + endpoint: Dynamo endpoint object for registering metrics callback. 
+ """ + from prometheus_client import Counter, Gauge, Histogram, REGISTRY + + # Request throughput (prefixed with thompson_ to avoid conflicts with + # serve_endpoint's built-in work handler metrics) + self.requests_total = Counter( + "thompson_requests_total", + "Total requests processed by the Thompson Sampling processor", + ) + + # Latency histogram + self.request_latency_seconds = Histogram( + "thompson_request_latency_seconds", + "End-to-end request latency in seconds", + ) + + # Token throughput + self.tokens_in_total = Counter( + "thompson_tokens_in_total", + "Total input tokens processed", + ) + self.tokens_out_total = Counter( + "thompson_tokens_out_total", + "Total output tokens generated", + ) + + # Routing decisions by worker (for analyzing load distribution) + self.routing_decisions_total = Counter( + "thompson_routing_decisions_total", + "Routing decisions by worker", + ["worker_id"], + ) + + # Error tracking + self.router_errors_total = Counter( + "thompson_router_errors_total", + "Router communication errors (failed to pick worker)", + ) + self.engine_errors_total = Counter( + "thompson_engine_errors_total", + "Backend engine errors (failed during streaming)", + ) + + # Active request gauge + self.active_requests = Gauge( + "thompson_active_requests", + "Currently active requests being processed", + ) + + # ----------------------------------------------------------------- + # KV Cache Efficiency (KVE) metrics + # These track cache hit rates for analyzing routing effectiveness. 
+ # Efficiency = kve_cached_tokens_total / kve_prompt_tokens_total + # ----------------------------------------------------------------- + self.kve_prompt_tokens_total = Counter( + "thompson_kve_prompt_tokens_total", + "Total prompt tokens processed (KV efficiency denominator)", + ) + self.kve_cached_tokens_total = Counter( + "thompson_kve_cached_tokens_total", + "Total cached tokens hit (KV efficiency numerator)", + ) + + # Cache hit breakdown by memory tier (for analyzing cache hierarchy) + self.kve_device_blocks_total = Counter( + "thompson_kve_device_blocks_total", + "KV cache blocks hit from device (GPU) memory", + ) + self.kve_host_blocks_total = Counter( + "thompson_kve_host_blocks_total", + "KV cache blocks hit from host (CPU) memory", + ) + self.kve_disk_blocks_total = Counter( + "thompson_kve_disk_blocks_total", + "KV cache blocks hit from disk storage", + ) + + # Register metrics with Dynamo's endpoint for /metrics exposure + from dynamo.common.utils.prometheus import register_engine_metrics_callback + register_engine_metrics_callback( + endpoint, REGISTRY, metric_prefix_filters=["thompson_"] + ) + + logger.info("Processor metrics initialized via prometheus_client") + + +# -------------------------- processor handler -------------------------- # +class ProcessorRequestHandler: + """ + Processor that receives PreprocessedRequest from the default Dynamo frontend, + extracts routing hints from nvext annotations, and coordinates with the + Thompson Sampling router for intelligent worker selection. + """ + + def __init__( + self, + runtime: DistributedRuntime, + endpoint, + enable_router: bool = True, + ): + """ + Initialize the processor request handler. + + Args: + runtime: Dynamo distributed runtime for client connections. + endpoint: Dynamo endpoint for metrics registration. + enable_router: Whether to use Thompson Sampling router (default: True). 
+ """ + self.runtime = runtime + self.endpoint = endpoint + self.enable_router = enable_router + + # Client connections (initialized in initialize()) + self.router_pick_client = None + self.router_feedback_client = None + self.engine_client = None + + # Prefix-level state: {prefix_id: {"total": int, "processed": int}} + self._prefix_state: dict[str, dict[str, int]] = {} + self._prefix_lock = asyncio.Lock() + + # Metrics (initialized in initialize()) + self._metrics: ProcessorMetrics | None = None + + async def initialize(self): + """Initialize processor by setting up metrics and connecting to services.""" + # Initialize metrics using Dynamo's metrics API + self._metrics = ProcessorMetrics(self.endpoint) + + # Connect to Thompson Sampling router + if self.enable_router: + router_component = self.runtime.namespace("dynamo").component("router") + self.router_pick_client = await router_component.endpoint("find_worker").client() + self.router_feedback_client = await router_component.endpoint("feedback").client() + logger.info("Router clients created, waiting for instances...") + await self.router_pick_client.wait_for_instances() + logger.info("Router clients initialized successfully") + + # Connect to actual workers at workers.{component}.generate + # Workers are in the "workers" namespace (hidden from frontend discovery) + # while this processor is in "dynamo" namespace (frontend discovers us) + # Component name varies by backend (REQUIRED - no default): + # - SGLang: uses "worker" (set via --endpoint workers.worker.generate) + # - vLLM: uses "backend" (hardcoded in dynamo.vllm) + worker_component_name = os.environ.get("DYNAMO_WORKER_COMPONENT") + if not worker_component_name: + raise ValueError( + "DYNAMO_WORKER_COMPONENT environment variable is required. " + "Set to 'worker' for SGLang or 'backend' for vLLM." 
+ ) + worker_component = self.runtime.namespace("workers").component(worker_component_name) + self.engine_client = await worker_component.endpoint("generate").client() + logger.info("Engine client created for workers/%s/generate, waiting for worker instances...", worker_component_name) + await self.engine_client.wait_for_instances() + logger.info("Processor initialized successfully (routing to workers/%s/generate)", worker_component_name) + + # ---- annotation extraction ---- + @staticmethod + def _extract_annotation(annotations: list[str], key: str, default: str | None = None) -> str | None: + """Extract value from annotations list (format: 'key:value').""" + prefix = f"{key}:" + for ann in annotations: + if ann.startswith(prefix): + return ann[len(prefix):] + return default + + def _extract_hints(self, request: dict[str, Any]) -> tuple[str, int, str, str]: + """ + Extract routing hints from PreprocessedRequest annotations. + + Returns: (prefix_id, total_requests, osl, iat, use_frequency_backend) + """ + annotations = request.get("annotations", []) + if not isinstance(annotations, list): + annotations = [] + + # Extract prefix_id (generate one if not provided) + prefix_id = self._extract_annotation(annotations, "prefix_id") + if not prefix_id: + prefix_id = f"auto-{uuid.uuid4().hex}" + + # Extract total_requests count + total_str = self._extract_annotation(annotations, "total_requests", "1") + try: + total_requests = max(1, int(total_str)) + except (ValueError, TypeError): + total_requests = 1 + + # Extract expected output sequence length category + osl = self._extract_annotation(annotations, "osl", "MEDIUM") + osl = osl.upper() if osl else "MEDIUM" + if osl not in ("LOW", "MEDIUM", "HIGH"): + osl = "MEDIUM" + + # Extract interarrival time category + iat = self._extract_annotation(annotations, "iat", "MEDIUM") + iat = iat.upper() if iat else "MEDIUM" + if iat not in ("LOW", "MEDIUM", "HIGH"): + iat = "MEDIUM" + + # Extract backend selection (determines v1 vs v2 
routing) + backend_selector = self._extract_annotation(annotations, "backend") + use_frequency_backend = backend_selector == "frequency_multi_lru" if backend_selector else False + + return prefix_id, total_requests, osl, iat, use_frequency_backend + + async def _update_prefix_state(self, prefix_id: str, total_requests: int) -> int: + """ + Update prefix counters and return remaining_after (reuse_budget). + + This tracks how many requests remain for a given prefix, allowing the + router to make informed decisions about KV cache placement. + """ + async with self._prefix_lock: + state = self._prefix_state.get(prefix_id) + if state is None: + state = {"total": total_requests, "processed": 0} + self._prefix_state[prefix_id] = state + else: + # Update total if a higher count is reported + state["total"] = max(state["total"], total_requests) + + state["processed"] += 1 + remaining_after = max(state["total"] - state["processed"], 0) + + # Clean up completed prefixes immediately + if remaining_after == 0: + self._prefix_state.pop(prefix_id, None) + + return remaining_after + + async def _pick_worker( + self, + token_ids: list[int], + prefix_id: str, + reuse_budget: int, + osl: str, + iat: str, + ) -> tuple[int | None, str | None]: + """ + Pick a worker via the Thompson Sampling router. + + Returns: (worker_id, decision_id) or (None, None) if routing fails. 
+ """ + if not self.router_pick_client: + return None, None + + req = RouterRequest( + tokens=token_ids, + prefix_id=prefix_id, + reuse_budget=max(int(reuse_budget), 0), + expected_osl=osl, + interarrival=iat, + ) + + try: + stream = await self.router_pick_client.generate(req.model_dump()) + + worker_id: int | None = None + decision_id: str | None = None + + async for chunk in stream: + data = chunk.data() + if "error" in data: + logger.error("Router error: %s", data["error"]) + self._metrics.router_errors_total.inc() + break + + wid = data.get("worker_id", -1) + if wid == -1: + break + + worker_id = int(wid) + decision_id = data.get("decision_id") + break + + # Record routing decision + if worker_id is not None: + self._metrics.routing_decisions_total.labels(worker_id=str(worker_id)).inc() + else: + logger.warning("Router stream ended without worker_id; falling back to engine load balancing.") + + return worker_id, decision_id + + except Exception as e: + logger.error("Failed to pick worker: %s", e) + self._metrics.router_errors_total.inc() + return None, None + + async def _send_feedback_safely( + self, + decision_id: str | None, + latency_ms: float, + success: bool, + tokens_in: int, + tokens_out: int, + finish_reason: str | None, + ): + """ + Send feedback to router (fire-and-forget style). + + This feedback is used by the Thompson Sampling algorithm to update + its model of worker performance. 
+ """ + if not decision_id or not self.router_feedback_client: + return + + try: + feedback = RouterFeedbackRequest( + decision_id=decision_id, + latency_ms=float(latency_ms), + success=bool(success), + tokens_in=int(tokens_in), + tokens_out=int(tokens_out), + finish_reason=finish_reason or "", + ) + stream = await self.router_feedback_client.generate(feedback.model_dump()) + async for _ in stream: + pass + except Exception: + logger.exception("Failed to send router feedback") + + def _update_kve_metrics_sync(self, kve: KVEfficiencyData) -> None: + """ + Update KV cache efficiency metrics (synchronous, called from background task). + + This is intentionally synchronous - counter increments are atomic and + extremely fast (microseconds). The async wrapper exists only to allow + fire-and-forget scheduling via create_task(). + """ + if not kve.has_data(): + return + + # Update counters - these are atomic operations + self._metrics.kve_prompt_tokens_total.inc(kve.prompt_tokens) + self._metrics.kve_cached_tokens_total.inc(kve.cached_tokens) + self._metrics.kve_device_blocks_total.inc(kve.device_blocks) + self._metrics.kve_host_blocks_total.inc(kve.host_blocks) + self._metrics.kve_disk_blocks_total.inc(kve.disk_blocks) + + # Log efficiency for debugging (only if we have meaningful data) + if kve.prompt_tokens > 0: + efficiency = kve.cached_tokens / kve.prompt_tokens * 100 + logger.debug( + "KVE update: prompt=%d cached=%d eff=%.1f%% (dev=%d host=%d disk=%d)", + kve.prompt_tokens, + kve.cached_tokens, + efficiency, + kve.device_blocks, + kve.host_blocks, + kve.disk_blocks, + ) + + async def _update_kve_metrics_async(self, kve: KVEfficiencyData) -> None: + """ + Async wrapper for KVE metric updates (fire-and-forget via create_task). + + This allows the main streaming path to continue without waiting for + metric updates, ensuring zero impact on routing throughput. 
+ """ + try: + self._update_kve_metrics_sync(kve) + except Exception: + # Never let metric updates crash the system + logger.exception("Failed to update KVE metrics") + + async def _stream_from_engine( + self, + request: dict[str, Any], + worker_id: int | None, + decision_id: str | None, + tokens_in: int, + ) -> AsyncIterator[dict[str, Any]]: + """ + Stream response from the backend engine. + + Yields response chunks and sends feedback to the router on completion. + Also updates Prometheus metrics for latency and token throughput. + + KV cache efficiency (KVE) metrics are updated asynchronously via + create_task() to ensure zero impact on routing throughput. + """ + t0 = time.perf_counter() + tokens_out = 0 + finish_reason: str | None = None + kve_data: KVEfficiencyData | None = None # Collected from response + + try: + # Route to specific worker or use engine's load balancing + if worker_id is not None: + stream = await self.engine_client.direct(request, worker_id) + else: + stream = await self.engine_client.generate(request) + + async for chunk in stream: + data = chunk.data() + + # Handle engine errors + if "error" in data: + latency_ms = (time.perf_counter() - t0) * 1000.0 + await self._send_feedback_safely(decision_id, latency_ms, False, tokens_in, tokens_out, "error") + self._metrics.engine_errors_total.inc() + yield {"error": data["error"]} + return + + # Count output tokens + if "token_ids" in data and isinstance(data["token_ids"], list): + tokens_out += len(data["token_ids"]) + + # Extract KVE data if present (typically in final chunk or usage chunk) + # We check for 'usage' field which contains cache efficiency info + if "usage" in data or "nvext" in data: + extracted = KVEfficiencyData.from_response(data) + if extracted.has_data(): + kve_data = extracted + + # Pass through the chunk + yield data + + # Handle completion + if "finish_reason" in data and data["finish_reason"] is not None: + finish_reason = data["finish_reason"] + latency_seconds = 
time.perf_counter() - t0 + latency_ms = latency_seconds * 1000.0 + + # Send feedback to router (this is already fire-and-forget) + await self._send_feedback_safely(decision_id, + latency_ms, + True, + tokens_in, + tokens_out, + finish_reason) + + # Update core Prometheus metrics (fast atomic operations) + self._metrics.request_latency_seconds.observe(latency_seconds) + self._metrics.tokens_in_total.inc(tokens_in) + self._metrics.tokens_out_total.inc(tokens_out) + + # Fire-and-forget KVE metric update (async, non-blocking) + # This ensures KVE computation has ZERO impact on routing throughput + if kve_data is not None: + asyncio.create_task(self._update_kve_metrics_async(kve_data)) + + return + + except Exception as e: + latency_ms = (time.perf_counter() - t0) * 1000.0 + await self._send_feedback_safely(decision_id, latency_ms, False, tokens_in, tokens_out, "exception") + self._metrics.engine_errors_total.inc() + logger.exception("Engine stream exception") + yield {"error": str(e)} + return + + # ---- main generation endpoint ---- + async def generate(self, raw: dict[str, Any]): + """ + Processor endpoint: receives PreprocessedRequest from frontend. + + Expected format (from Dynamo preprocessor): + { + "token_ids": [...], + "annotations": ["prefix_id:xyz", "total_requests:10", ...], + "sampling_options": {...}, + "stop_conditions": {...}, + ... 
+ } + """ + # Track active requests + self._metrics.active_requests.inc() + + try: + # Increment request counter + self._metrics.requests_total.inc() + + # Extract routing hints from annotations + prefix_id, total_requests, osl, iat, use_frequency_backend = self._extract_hints(raw) + + # Determine KVBM routing path based on backend selection + kvbm_version = "v2" if use_frequency_backend else "v1" + + # Get token IDs from preprocessed request + token_ids = raw.get("token_ids", []) + if not isinstance(token_ids, list): + token_ids = [] + + tokens_in = len(token_ids) + logger.info( + "Processing request: prefix=%s total=%d osl=%s iat=%s tokens=%d kvbm=%s backend=%s", + prefix_id, + total_requests, + osl, + iat, + tokens_in, + kvbm_version, + "frequency_multi_lru" if use_frequency_backend else "default_3pool", + ) + + # Compute reuse_budget := remaining AFTER this request + reuse_budget = await self._update_prefix_state(prefix_id, total_requests) + + # Pick worker via Thompson Sampling router + worker_id, decision_id = await self._pick_worker(token_ids, prefix_id, reuse_budget, osl, iat) + + logger.info( + "Routing decision: worker=%s decision=%s reuse_budget=%d", + worker_id, + decision_id, + reuse_budget, + ) + + # Stream response from engine + async for resp in self._stream_from_engine(raw, worker_id, decision_id, tokens_in): + yield resp + + finally: + self._metrics.active_requests.dec() + + +# -------------------------- worker entry point -------------------------- # +def parse_args(): + """Parse command-line arguments for the processor.""" + parser = argparse.ArgumentParser(description="Optimized Thompson Sampling Processor") + parser.add_argument( + "--enable-router", + action="store_true", + default=True, + help="Enable Thompson Sampling router integration", + ) + parser.add_argument( + "--no-router", + action="store_false", + dest="enable_router", + help="Disable router (use engine load balancing only)", + ) + parser.add_argument( + "--model-path", + 
type=str, + required=True, + help="Path to the model directory (for loading tokenizer and model card)", + ) + parser.add_argument( + "--model-name", + type=str, + required=True, + help="Served model name (must match frontend's --model-name)", + ) + return parser.parse_args() + + +@dynamo_worker() # Dynamic mode - required to call router/workers which are also dynamic +async def worker(runtime: DistributedRuntime): + """ + Main worker entry point for the Thompson Sampling processor. + + This processor registers as a backend that the frontend can discover via ETCD, + then forwards requests to actual workers after applying Thompson Sampling routing. + """ + args = parse_args() + + # DYNAMIC DISCOVERY MODE: + # Instead of using --static-endpoint on the frontend, we register a model card + # in ETCD so the frontend can discover us via its ModelWatcher. + # + # This is the forward-compatible approach since --static-endpoint is deprecated. + # + # Flow: + # 1. We register as dynamo.backend.generate (dynamically with instance ID) + # 2. We call register_llm() to advertise ourselves in ETCD + # 3. Frontend's ModelWatcher discovers us and routes requests to us + # 4. 
We forward to actual workers at workers.worker.generate + + component = runtime.namespace("dynamo").component("backend") + # NOTE: create_service() was removed in Dynamo 0.8.x - endpoint creation handles registration + + # Create the endpoint FIRST (needed for register_llm and metrics) + endpoint = component.endpoint("generate") + + # Register the model card with ETCD so the frontend can discover us + # We accept preprocessed tokens (ModelInput.Tokens) and serve chat/completions + logger.info( + "Registering model card: model_name=%s, model_path=%s", + args.model_name, + args.model_path, + ) + # IMPORTANT: kv_cache_block_size must match what workers use (default page_size=1) + # Otherwise checksums will differ and frontend will reject the processor's model card + await register_llm( + model_input=ModelInput.Tokens, # We accept tokenized input from frontend + model_type=ModelType.Chat | ModelType.Completions, # Chat and completions endpoints + endpoint=endpoint, + model_path=args.model_path, + model_name=args.model_name, + kv_cache_block_size=1, # Must match worker page_size to ensure same checksum + ) + logger.info("Model card registered successfully - frontend can now discover us via ETCD") + + # Initialize the request handler with the endpoint for metrics + handler = ProcessorRequestHandler( + runtime=runtime, + endpoint=endpoint, + enable_router=args.enable_router, + ) + await handler.initialize() + + # Serve as "backend.generate" - frontend will route to us after ETCD discovery + await endpoint.serve_endpoint(handler.generate) + + +if __name__ == "__main__": + uvloop.install() + asyncio.run(worker()) # pylint: disable=no-value-for-parameter diff --git a/external/dynamo/optimized/router_multilru.py b/external/dynamo/optimized/router_multilru.py new file mode 100644 index 0000000000..bca8881add --- /dev/null +++ b/external/dynamo/optimized/router_multilru.py @@ -0,0 +1,1404 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Optimized Thompson Sampling Router with Prometheus Metrics. + +This router implements Contextual Thompson Sampling with: + - KV overlap locality + - Remaining per-prefix requests (reuse_budget) + - OSL-based decode cost, ISL/prefill cost per worker + - IAT-based stickiness/opportunity weighting + - Instant & outstanding load (no TTL decay) + - Delayed bandit update using observed latency via `feedback` endpoint + - Timeout penalty for missing feedback + - Prometheus metrics (instead of CSV) + - Debug traces for offline analysis + +Key differences from generalized/router.py: + - Uses Prometheus metrics instead of CSV logging + - Removed CSV file I/O + - Added comprehensive Prometheus gauges, counters, and histograms +""" + +import argparse +import asyncio +import json +import logging +import math +import os +import random +import threading +import time +import uuid +from collections import deque +from functools import wraps +from pathlib import Path +from typing import Any + +import numpy as np +import uvloop +import yaml +from dynamo.runtime import DistributedRuntime +from dynamo.runtime import dynamo_worker +from dynamo.runtime.logging import configure_dynamo_logging +from pydantic import BaseModel + +# Try to import KV routing classes from dynamo.llm, fallback to stubs if unavailable +try: + from dynamo.llm import KvIndexer + from dynamo.llm import 
OverlapScores +except ImportError: + logger_init = logging.getLogger(__name__) + logger_init.warning("dynamo.llm KV classes not available, using fallback implementations") + + class OverlapScores: + """Fallback: KV cache overlap scores between a request and workers.""" + + def __init__(self, scores: dict[int, float] | None = None): + self.scores = scores if scores is not None else {} + + class KvIndexer: + """Fallback: KV cache indexer for finding overlap between requests and workers.""" + + def __init__(self, engine: Any, block_size: int): + self.engine = engine + self.block_size = block_size + + async def find_matches_for_request(self, tokens: list[int], min_overlap: int) -> OverlapScores: + """Find overlap scores for each worker. Returns empty scores (round-robin fallback).""" + return OverlapScores({}) + + +configure_dynamo_logging() +logger = logging.getLogger(__name__) + +WorkerId = int + + +# ---------------------- config loading ---------------------- # +def get_default_config_path() -> Path: + """Get path to default config.yaml in the same directory as this script.""" + return Path(__file__).parent / "config.yaml" + + +def load_config(config_path: str | Path | None = None) -> dict[str, Any]: + """Load configuration from YAML file. + + Args: + config_path: Path to YAML config file. If None, uses default config.yaml. + + Returns: + Configuration dictionary with nested structure. 
+ """ + if config_path is None: + config_path = get_default_config_path() + + config_path = Path(config_path) + if not config_path.exists(): + logger.warning("Config file not found: %s, using built-in defaults", config_path) + return get_builtin_defaults() + + with open(config_path, encoding="utf-8") as f: + config = yaml.safe_load(f) + + logger.info("Loaded config from: %s", config_path) + return config + + +def get_builtin_defaults() -> dict[str, Any]: + """Return built-in default configuration (matches config.yaml).""" + return { + "infrastructure": { + "block_size": 64, + "router_type": "kv", + "min_workers": 1, + }, + "affinity": { + "base": 0.30, + "reuse_weight": 0.15, + "iat_weight": 0.20, + "sticky_load_floor": 0.70, + }, + "exploration": { + "base_ts_weight": 0.10, + "temperature": { + "base": 1.0, + "min": 0.15, + "max": 2.0, + }, + }, + "switching_cost": { + "base": 0.20, + "reuse_penalty": 0.08, + "iat_penalty": 0.05, + }, + "load_balancing": { + "queue_penalty_weight": 0.50, + "gpu_penalty_weight": 1.00, + "outstanding_work_weight": 0.45, + "job_gpu_coupling_weight": 0.40, + "job_queue_coupling_weight": 0.20, + }, + "prefill": { + "token_scale": 1024.0, + "weight": 1.0, + }, + "lints": { + "lambda": 1.0, + "v": 0.25, + "forget_rate": 0.995, + }, + "feedback": { + "timeout_seconds": 120.0, + "sweep_interval_seconds": 5.0, + "timeout_reward": 0.0, + "latency_ema_alpha": 0.2, + }, + "debug": { + "traces_enabled": False, + "trace_dir": "/tmp/dynamo_router_traces", + "buffer_size": 2000, + }, + } + + +def get_nested(config: dict, dotted_key: str, default: Any = None) -> Any: + """Get a nested value from config using dot notation. + + Args: + config: Configuration dictionary + dotted_key: Key in dot notation, e.g., "affinity.base" + default: Default value if key not found + + Returns: + Value at the nested key, or default if not found. 
+ """ + keys = dotted_key.split(".") + obj = config + for k in keys: + if not isinstance(obj, dict) or k not in obj: + return default + obj = obj[k] + return obj + + +def set_nested(config: dict, dotted_key: str, value: Any) -> None: + """Set a nested value in config using dot notation. + + Args: + config: Configuration dictionary (modified in place) + dotted_key: Key in dot notation, e.g., "affinity.base" + value: Value to set + """ + keys = dotted_key.split(".") + obj = config + for k in keys[:-1]: + if k not in obj: + obj[k] = {} + obj = obj[k] + obj[keys[-1]] = value + + +def auto_cast(value_str: str) -> Any: + """Auto-cast a string value to appropriate type. + + Args: + value_str: String value from CLI + + Returns: + Value cast to int, float, bool, or str as appropriate. + """ + # Boolean + if value_str.lower() in ("true", "yes", "1"): + return True + if value_str.lower() in ("false", "no", "0"): + return False + + # Integer + try: + return int(value_str) + except ValueError: + pass + + # Float + try: + return float(value_str) + except ValueError: + pass + + # String + return value_str + + +def apply_cli_overrides(config: dict, args: argparse.Namespace) -> dict: + """Apply CLI argument overrides to configuration. + + Args: + config: Base configuration dictionary + args: Parsed CLI arguments + + Returns: + Configuration with CLI overrides applied. 
+ """ + # Apply explicit CLI flags + if args.affinity_base is not None: + set_nested(config, "affinity.base", args.affinity_base) + logger.info("CLI override: affinity.base = %s", args.affinity_base) + + if args.temp_base is not None: + set_nested(config, "exploration.temperature.base", args.temp_base) + logger.info("CLI override: exploration.temperature.base = %s", args.temp_base) + + if args.lints_v is not None: + set_nested(config, "lints.v", args.lints_v) + logger.info("CLI override: lints.v = %s", args.lints_v) + + # Apply generic --override flags + if args.override: + for override in args.override: + if "=" not in override: + logger.warning("Invalid override format (expected key=value): %s", override) + continue + key, value_str = override.split("=", 1) + value = auto_cast(value_str) + set_nested(config, key, value) + logger.info("CLI override: %s = %s", key, value) + + return config + + +def _init_prometheus_metrics(): + """Initialize Prometheus metrics lazily.""" + import functools + + @functools.lru_cache(maxsize=1) + def _init() -> dict: + metrics: dict = {} + try: + from prometheus_client import REGISTRY + from prometheus_client import Counter + from prometheus_client import Gauge + from prometheus_client import Histogram + + metrics["decisions_total"] = Counter( + "thompson_router_decisions_total", + "Total routing decisions by worker", + ["worker_id"], + registry=REGISTRY, + ) + metrics["kv_overlap"] = Gauge( + "thompson_router_kv_overlap", + "KV cache overlap score for last decision by worker", + ["worker_id"], + registry=REGISTRY, + ) + metrics["feedback_latency"] = Histogram( + "thompson_router_feedback_latency_seconds", + "Latency from feedback by worker", + ["worker_id"], + buckets=[0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, 120.0], + registry=REGISTRY, + ) + metrics["reward"] = Gauge( + "thompson_router_reward", + "Last computed reward by worker", + ["worker_id"], + registry=REGISTRY, + ) + metrics["pending_decisions"] = Gauge( + 
"thompson_router_pending_decisions", + "Number of pending decisions awaiting feedback", + registry=REGISTRY, + ) + metrics["timeout_penalties"] = Counter( + "thompson_router_timeout_penalties_total", + "Total timeout penalties applied", + registry=REGISTRY, + ) + metrics["sticky_decisions"] = Counter( + "thompson_router_sticky_decisions_total", + "Decisions that stayed on the same worker (sticky)", + registry=REGISTRY, + ) + metrics["switch_decisions"] = Counter( + "thompson_router_switch_decisions_total", + "Decisions that switched to a different worker", + registry=REGISTRY, + ) + metrics["beta_alpha"] = Gauge( + "thompson_router_beta_alpha", + "Beta distribution alpha parameter by worker", + ["worker_id"], + registry=REGISTRY, + ) + metrics["beta_beta"] = Gauge( + "thompson_router_beta_beta", + "Beta distribution beta parameter by worker", + ["worker_id"], + registry=REGISTRY, + ) + metrics["prefix_state_size"] = Gauge( + "thompson_router_prefix_state_size", + "Number of active prefix states", + registry=REGISTRY, + ) + metrics["reuse_budget"] = Histogram( + "thompson_router_reuse_budget", + "Distribution of reuse_budget values", + buckets=[0, 1, 2, 5, 10, 20, 50, 100], + registry=REGISTRY, + ) + metrics["tokens_per_request"] = Histogram( + "thompson_router_tokens_per_request", + "Distribution of input token counts", + buckets=[32, 64, 128, 256, 512, 1024, 2048, 4096, 8192], + registry=REGISTRY, + ) + logger.info("Prometheus metrics initialized for router") + except ImportError: + logger.warning("prometheus_client not available, metrics disabled") + + return metrics + + return _init() + + +# ---------------------- request / response models ---------------------- # +class RouterRequest(BaseModel): + tokens: list[int] + prefix_id: str = "" + reuse_budget: int = 0 # remaining *after this request* + expected_osl: str | None = "MEDIUM" + interarrival: str | None = "MEDIUM" + + +class RouterResponse(BaseModel): + worker_id: int + prefix_hit_rate: float + decision_id: 
str | None = None + + +class FeedbackRequest(BaseModel): + decision_id: str + latency_ms: float + success: bool | None = True + tokens_in: int | None = None + tokens_out: int | None = None + finish_reason: str | None = None + + +class FeedbackAck(BaseModel): + ok: bool + used_baseline: float + reward: float + worker_id: int | None = None + error: str | None = None + + +# ---------------------- helper decorator ---------------------- # +def safe_update(lock_name: str): + + def decorator(fn): + + @wraps(fn) + def wrapper(self, *args, **kwargs): + lock = getattr(self, lock_name) + with lock: + return fn(self, *args, **kwargs) + + return wrapper + + return decorator + + +# ---------------------- router implementation ---------------------- # +class WorkloadAwareRouter: + """ + Contextual Thompson Sampling router with Prometheus metrics. + """ + + def __init__( + self, + runtime: DistributedRuntime, + block_size: int = 64, + router_type: str = "kv", + min_workers: int = 1, + # Affinity / exploration + affinity_base: float = 0.30, + affinity_reuse_weight: float = 0.15, + affinity_iat_weight: float = 0.20, + base_ts_weight: float = 0.10, + sticky_load_floor: float = 0.70, + # Softmax temperature + temp_base: float = 1.0, + temp_min: float = 0.15, + temp_max: float = 2.0, + # Switching cost + switch_cost_base: float = 0.20, + switch_cost_reuse: float = 0.08, + switch_cost_iat: float = 0.05, + # Load / opportunity cost + queue_penalty_weight: float = 0.50, + gpu_penalty_weight: float = 1.00, + outstanding_work_weight: float = 0.45, + job_gpu_coupling_weight: float = 0.40, + job_queue_coupling_weight: float = 0.20, + # Prefill / ISL + prefill_token_scale: float = 1024.0, + prefill_weight: float = 1.0, + # LinTS + lints_lambda: float = 1.0, + lints_v: float = 0.25, + lints_forget: float = 0.995, + # ---------- Feedback timeout / sweep ---------- + feedback_timeout_seconds: float = 120.0, + pending_sweep_interval_seconds: float = 5.0, + timeout_reward: float = 0.0, + # 
---------- Latency EMA (reward normalization) ---------- + latency_ema_alpha: float = 0.2, + # ---------- Debug traces ---------- + debug_traces: bool = False, + debug_trace_dir: str = "/tmp/dynamo_router_traces", + debug_buffer_size: int = 2000, + ): + self.runtime = runtime + self.block_size = block_size + self.router_type = router_type + self.min_workers = min_workers + + # clients / helpers (initialized later) + self.engine_client = None + self.indexer: KvIndexer | None = None + + # concurrency primitives + self._init_lock = threading.Lock() + self._bandit_lock = threading.Lock() + self._prefix_lock = threading.Lock() + self._lin_lock = threading.Lock() + self._pending_lock = threading.Lock() + + # prefix state: pid -> {"worker": int|None, "reuse_remaining": int} + self.prefix_cache_state: dict[str, dict[str, int | None]] = {} + # pid -> {"decode_cost","prefill_cost","iat_factor"} + self.prefix_meta: dict[str, dict[str, float]] = {} + + # Beta bandits and LinTS params + self.worker_bandits: dict[int, tuple[float, float]] = {} + self.feature_dim = 9 + self.lin_lambda = float(lints_lambda) + self.lin_v = float(lints_v) + self.lin_forget = float(lints_forget) + self.lin_forget = max(1e-6, min(self.lin_forget, 0.999999)) + self.linA: dict[int, np.ndarray] = {} + self.linb: dict[int, np.ndarray] = {} + + # knobs + self.affinity_base = float(affinity_base) + self.affinity_reuse_weight = float(affinity_reuse_weight) + self.affinity_iat_weight = float(affinity_iat_weight) + self.base_ts_weight = float(base_ts_weight) + self.sticky_load_floor = float(sticky_load_floor) + self.temp_base = float(temp_base) + self.temp_min = float(temp_min) + self.temp_max = float(temp_max) + self.switch_cost_base = float(switch_cost_base) + self.switch_cost_reuse = float(switch_cost_reuse) + self.switch_cost_iat = float(switch_cost_iat) + self.queue_penalty_weight = float(queue_penalty_weight) + self.gpu_penalty_weight = float(gpu_penalty_weight) + self.outstanding_work_weight = 
float(outstanding_work_weight) + self.job_gpu_coupling_weight = float(job_gpu_coupling_weight) + self.job_queue_coupling_weight = float(job_queue_coupling_weight) + self.prefill_token_scale = float(prefill_token_scale) + self.prefill_weight = float(prefill_weight) + + # LinTS numerics + self._jt_base = 1e-9 + self._jt_mult = 10.0 + self._jt_max = 1e-3 + self._eig_floor = 1e-10 + + # Feedback timeout / sweep + self.feedback_timeout_seconds = float(feedback_timeout_seconds) + self.pending_sweep_interval_seconds = float(pending_sweep_interval_seconds) + self.timeout_reward = float(max(0.0, min(1.0, timeout_reward))) + self._last_pending_sweep = 0.0 + + # Latency EMA baselines (two modes: raw ms, or ms/token) + self.latency_ema_alpha = float(latency_ema_alpha) + # Global (per-mode) + self.lat_ema_global: dict[bool, float | None] = {False: None, True: None} + # Per worker (per-mode) + self.lat_ema_worker: dict[tuple[int, bool], float] = {} + # Per bucket (per-mode): (wid, osl, prefill_bin, per_tok) -> value + self.lat_ema_bucket: dict[tuple[int, str, str, bool], float] = {} + + # Pending decisions waiting for feedback + self.pending: dict[str, dict[str, Any]] = {} + + # Debug traces + self.debug_traces = bool(debug_traces) + self.debug_trace_dir = str(debug_trace_dir) + self.recent_traces: deque = deque(maxlen=int(debug_buffer_size)) + if self.debug_traces: + os.makedirs(self.debug_trace_dir, exist_ok=True) + logger.info("Router debug traces enabled -> %s", self.debug_trace_dir) + + # Prometheus metrics + self._metrics = {} + + # --------------------- tracing --------------------- # + def _emit_trace(self, kind: str, payload: dict[str, Any]): + if not self.debug_traces: + return + item = {"ts": time.time(), "kind": kind, **payload} + self.recent_traces.append(item) + try: + path = os.path.join(self.debug_trace_dir, "router_traces.jsonl") + with open(path, "a", encoding="utf-8") as f: + f.write(json.dumps(item, separators=(",", ":")) + "\n") + except Exception as e: + 
logger.debug("Trace write failed: %s", e) + + # --------------------- level mappings --------------------- # + @staticmethod + def _norm_level(s: str | None, default: str = "MEDIUM") -> str: + if not s: + return default + s = str(s).strip().upper() + return s if s in ("LOW", "MEDIUM", "HIGH") else default + + @staticmethod + def _decode_cost(osl: str) -> float: + return {"LOW": 1.0, "MEDIUM": 2.0, "HIGH": 3.0}[osl] + + @staticmethod + def _iat_factor(iat: str) -> float: + return {"LOW": 1.5, "MEDIUM": 1.0, "HIGH": 0.6}[iat] + + # --------------------- init --------------------- # + async def initialize(self): + """Initialize router by polling for backend workers.""" + # Initialize Prometheus metrics + self._metrics = _init_prometheus_metrics() + + # Connect to actual workers at workers.{component}.generate + # Workers are in the "workers" namespace (hidden from frontend discovery) + # Component name varies by backend (REQUIRED - no default): + # - SGLang: uses "worker" (set via --endpoint workers.worker.generate) + # - vLLM: uses "backend" (hardcoded in dynamo.vllm) + worker_component = os.environ.get("DYNAMO_WORKER_COMPONENT") + if not worker_component: + raise ValueError( + "DYNAMO_WORKER_COMPONENT environment variable is required. " + "Set to 'worker' for SGLang or 'backend' for vLLM." 
+ ) + engine = self.runtime.namespace("workers").component(worker_component) + logger.info("Getting engine client for workers/%s/generate", worker_component) + self.engine_client = await engine.endpoint("generate").client() + + min_workers = int(self.min_workers) + if min_workers < 0: + raise ValueError(f"min_workers must be >= 0, got {min_workers}") + + timeout_s = float(os.environ.get("DYNAMO_ROUTER_WAIT_FOR_WORKERS_TIMEOUT_S", "600")) + if not math.isfinite(timeout_s) or timeout_s <= 0: + raise ValueError("DYNAMO_ROUTER_WAIT_FOR_WORKERS_TIMEOUT_S must be a finite number > 0") + + deadline = time.monotonic() + timeout_s + backoff_s = 0.5 + + logger.info("Waiting for backend workers (min_workers=%d, timeout_s=%.1f)...", min_workers, timeout_s) + + if min_workers == 0: + instance_ids_raw = list(self.engine_client.instance_ids()) + logger.info("Backend workers discovered (min_workers=0): %s", instance_ids_raw) + else: + while True: + remaining = deadline - time.monotonic() + if remaining <= 0: + raise TimeoutError(f"Timed out after {timeout_s}s waiting for >= {min_workers} backend worker(s)") + + try: + await asyncio.wait_for( + self.engine_client.wait_for_instances(), + timeout=min(remaining, 10.0), + ) + except TimeoutError: + pass + + instance_ids_raw = list(self.engine_client.instance_ids()) + if len(instance_ids_raw) >= min_workers: + try: + instance_ids = [int(w) for w in instance_ids_raw] + except Exception: + instance_ids = instance_ids_raw + logger.info("Backend workers discovered: %s", instance_ids) + break + + await asyncio.sleep(backoff_s) + backoff_s = min(backoff_s * 1.5, 5.0) + + self.indexer = KvIndexer(engine, self.block_size) + + self._initialize_bandits() + self._initialize_contextual() + logger.info("WorkloadAwareRouter initialized with %d backend worker(s)", + len(list(self.engine_client.instance_ids()))) + + @safe_update("_init_lock") + def _initialize_bandits(self): + for wid in self.engine_client.instance_ids(): + wid = int(wid) + 
self.worker_bandits.setdefault(wid, (1.0, 1.0)) + # Update Prometheus metrics + if self._metrics.get("beta_alpha"): + self._metrics["beta_alpha"].labels(worker_id=str(wid)).set(1.0) + if self._metrics.get("beta_beta"): + self._metrics["beta_beta"].labels(worker_id=str(wid)).set(1.0) + + @safe_update("_init_lock") + def _initialize_contextual(self): + for wid in self.engine_client.instance_ids(): + wid = int(wid) + if wid not in self.linA: + self.linA[wid] = self.lin_lambda * np.eye(self.feature_dim, dtype=np.float64) + self.linb[wid] = np.zeros(self.feature_dim, dtype=np.float64) + + def _ensure_worker_context(self, worker_id: int): + if worker_id not in self.linA: + with self._lin_lock: + if worker_id not in self.linA: + self.linA[worker_id] = self.lin_lambda * np.eye(self.feature_dim, dtype=np.float64) + self.linb[worker_id] = np.zeros(self.feature_dim, dtype=np.float64) + + # --------------------- prefix state --------------------- # + @safe_update("_prefix_lock") + def _get_prefix(self, pid: str) -> tuple[int | None, int]: + info = self.prefix_cache_state.get(pid) + if info: + return info.get("worker"), int(info.get("reuse_remaining") or 0) + return None, 0 + + @safe_update("_prefix_lock") + def _set_prefix( + self, + pid: str, + wid: int, + reuse_remaining: int, + decode_cost: float, + prefill_cost: float, + iat_factor: float, + ): + """Record/refresh prefix assignment.""" + if reuse_remaining <= 0: + self.prefix_cache_state.pop(pid, None) + self.prefix_meta.pop(pid, None) + else: + self.prefix_cache_state[pid] = {"worker": wid, "reuse_remaining": max(0, int(reuse_remaining))} + self.prefix_meta[pid] = { + "decode_cost": float(decode_cost), + "prefill_cost": float(max(prefill_cost, 0.0)), + "iat_factor": float(iat_factor), + } + + # Update prefix state size metric + if self._metrics.get("prefix_state_size"): + self._metrics["prefix_state_size"].set(len(self.prefix_cache_state)) + + def _worker_outstanding(self, wid: int) -> tuple[int, float]: + """Returns 
(reuse_total, work_total) for a worker.""" + reuse_total = 0 + work_total = 0.0 + for pid, info in self.prefix_cache_state.items(): + if info.get("worker") != wid: + continue + r = int(info.get("reuse_remaining") or 0) + reuse_total += r + meta = self.prefix_meta.get(pid) + if meta: + work_total += float(r) * (float(meta.get("decode_cost", 2.0)) + + float(meta.get("prefill_cost", 0.0))) * float(meta.get("iat_factor", 1.0)) + return reuse_total, work_total + + # --------------------- bandits --------------------- # + def _linTS_sample(self, wid: int, x: np.ndarray) -> float: + self._ensure_worker_context(wid) + with self._lin_lock: + A = np.array(self.linA[wid], dtype=np.float64, copy=True) + b = np.array(self.linb[wid], dtype=np.float64, copy=True) + + A = 0.5 * (A + A.T) + eye = np.eye(self.feature_dim, dtype=np.float64) + jitter = self._jt_base + L = None + while True: + try: + L = np.linalg.cholesky(A + jitter * eye) + break + except np.linalg.LinAlgError: + jitter = jitter * self._jt_mult if jitter > 0 else self._jt_base + if jitter > self._jt_max: + vals, vecs = np.linalg.eigh(A) + vals = np.maximum(vals, self._eig_floor) + A_inv = vecs @ (np.diag(1.0 / vals)) @ vecs.T + mu = A_inv @ b + z = np.random.normal(size=self.feature_dim) + noise = vecs @ (z / np.sqrt(vals)) + theta = mu + (self.lin_v * noise) + return float(theta @ x) + + y = np.linalg.solve(L, b) + mu = np.linalg.solve(L.T, y) + z = np.random.normal(size=self.feature_dim) + noise = np.linalg.solve(L.T, z) + theta = mu + (self.lin_v * noise) + return float(theta @ x) + + def _update_contextual(self, wid: int, x: np.ndarray, reward: float): + r = float(max(0.0, min(1.0, reward))) + with self._lin_lock: + A = self.linA[wid] + b = self.linb[wid] + A *= self.lin_forget + b *= self.lin_forget + A += np.outer(x, x) + ridge = (1.0 - self.lin_forget) * self.lin_lambda + if ridge > 0.0: + A += ridge * np.eye(self.feature_dim, dtype=np.float64) + self.linA[wid] = 0.5 * (A + A.T) + self.linb[wid] = b + x * r + 
    def _ts_sample(self, worker_id: int) -> float:
        """Draw a Beta(alpha, beta) Thompson sample for a worker
        (defaults to the uninformative Beta(1, 1) prior)."""
        with self._bandit_lock:
            alpha, beta = self.worker_bandits.get(worker_id, (1.0, 1.0))
        return np.random.beta(alpha, beta)

    def _update_bandit(self, worker_id: int, reward: float):
        """Update the Beta bandit for *worker_id* with a reward in [0, 1]."""
        with self._bandit_lock:
            alpha, beta = self.worker_bandits.get(worker_id, (1.0, 1.0))
            r = float(max(0.0, min(1.0, reward)))
            new_alpha = alpha + r
            new_beta = beta + 1.0 - r
            self.worker_bandits[worker_id] = (new_alpha, new_beta)

        # Update Prometheus metrics
        if self._metrics.get("beta_alpha"):
            self._metrics["beta_alpha"].labels(worker_id=str(worker_id)).set(new_alpha)
        if self._metrics.get("beta_beta"):
            self._metrics["beta_beta"].labels(worker_id=str(worker_id)).set(new_beta)

    # --------------------- features / scores --------------------- #
    def _prefill_cost_for_worker(self, tokens: list[int], overlap: float) -> float:
        """Estimate prefill cost: uncached input tokens scaled by
        prefill_token_scale and prefill_weight."""
        isl = max(0, len(tokens))
        frac = min(max(float(overlap), 0.0), 1.0)
        uncached = max(0.0, float(isl) * (1.0 - frac))
        return (uncached / self.prefill_token_scale) * self.prefill_weight

    @staticmethod
    def _prefill_bin(prefill_cost: float) -> str:
        """Bucket a prefill cost into LOW/MEDIUM/HIGH for EMA keys."""
        if prefill_cost < 0.25:
            return "LOW"
        if prefill_cost < 0.75:
            return "MEDIUM"
        return "HIGH"

    def _feature_vector(
        self,
        wid: int,
        metrics: dict[str, Any] | None,
        scores: "OverlapScores",
        last_w: int | None,
        reuse_after: int,
        decode_cost: float,
        prefill_cost: float,
        iat_factor: float,
    ) -> np.ndarray:
        """Build the 9-dim LinTS feature vector for one worker
        (bias, inverse load, KV overlap, affinity flag, and normalized
        outstanding/decode/prefill/IAT/reuse signals)."""
        gpu = 0.0
        queue = 0.0
        if metrics and isinstance(metrics, dict) and "endpoints" in metrics:
            for ep in metrics["endpoints"]:
                if ep.get("worker_id") == wid:
                    gpu = float(ep.get("gpu_cache_usage_perc", 0.0))
                    queue = float(ep.get("num_requests_waiting", 0.0))
                    break
        inv_load = 1.0 / (1.0 + self.gpu_penalty_weight * max(0.0, gpu) + self.queue_penalty_weight * max(0.0, queue))

        overlap = float(scores.scores.get(wid, 0.0))
        affinity = 1.0 if (last_w is not None and wid == last_w) else 0.0
        _, work_out = self._worker_outstanding(wid)

        # Squash each raw signal into a comparable [0, ~1] range.
        decode_norm = decode_cost / 3.0
        prefill_norm = math.tanh(prefill_cost)
        iat_norm = iat_factor / 1.5
        outstanding_norm = math.tanh(0.1 * work_out)
        reuse_norm = math.tanh(0.25 * float(max(reuse_after, 0)))

        return np.array([
            1.0,
            inv_load,
            overlap,
            affinity,
            outstanding_norm,
            decode_norm,
            prefill_norm,
            iat_norm,
            reuse_norm,
        ],
                        dtype=np.float64)

    def _load_score(self, wid: int, metrics: dict[str, Any] | None, job_cost_total: float) -> float:
        """Multiplicative load modifier in (0, 1]: 1 when idle, shrinking
        with GPU usage, queue depth, outstanding work, and job coupling."""
        gpu = 0.0
        queue = 0.0
        if metrics and isinstance(metrics, dict) and "endpoints" in metrics:
            for ep in metrics["endpoints"]:
                if ep.get("worker_id") == wid:
                    gpu = float(ep.get("gpu_cache_usage_perc", 0.0))
                    queue = float(ep.get("num_requests_waiting", 0.0))
                    break
        _, work_out = self._worker_outstanding(wid)
        penalty = (self.gpu_penalty_weight * gpu + self.queue_penalty_weight * queue +
                   self.outstanding_work_weight * max(0.0, work_out) +
                   self.job_gpu_coupling_weight * job_cost_total * gpu +
                   self.job_queue_coupling_weight * job_cost_total * queue)
        return 1.0 / (1.0 + max(0.0, penalty))

    def _softmax(self, scores: list[float], temp: float) -> list[float]:
        """Temperature-clamped, max-shifted softmax; uniform fallback on
        numerical breakdown."""
        t = float(min(max(temp, self.temp_min), self.temp_max))
        m = float(np.max(scores))
        exps = np.exp((np.array(scores) - m) / max(1e-6, t))
        s = float(np.sum(exps))
        if s <= 0.0 or not np.isfinite(s):
            return [1.0 / len(scores)] * len(scores)
        return list((exps / s).astype(float))

    # --------------------- selection --------------------- #
    def _select_worker(
        self,
        worker_ids,
        req: RouterRequest,
        metrics: dict[str, Any] | None,
        scores: OverlapScores,
    ) -> tuple[int, dict[str, float], dict[int, dict[str, float]], list[float], list[float]]:
        """Score every worker (LinTS + Beta exploration + affinity bonus -
        switch cost, all scaled by load) and sample one via softmax.

        Returns (chosen_wid, chosen_ctx, all_ctx, raw_scores, probs).
        """
        osl = self._norm_level(req.expected_osl, "MEDIUM")
        iat = self._norm_level(req.interarrival, "MEDIUM")
        last_w, _ = self._get_prefix(req.prefix_id)

        reuse_after = max(int(req.reuse_budget), 0)
        decode_cost = self._decode_cost(osl)
        iat_factor = self._iat_factor(iat)

        # More remaining reuse and tighter IAT -> colder (more greedy) softmax.
        temp = self.temp_base / (1.0 + float(reuse_after) * iat_factor)
        temp = min(max(temp, self.temp_min), self.temp_max)

        raw_scores: list[float] = []
        worker_list: list[int] = [int(w) for w in worker_ids]
        per_worker_ctx: dict[int, dict[str, float]] = {}
        load_mods: list[float] = []
        overlaps: list[float] = []

        for wid in worker_list:
            overlap = float(scores.scores.get(wid, 0.0))
            prefill_cost = self._prefill_cost_for_worker(req.tokens, overlap)
            job_cost_total = decode_cost + prefill_cost

            x = self._feature_vector(
                wid=wid,
                metrics=metrics,
                scores=scores,
                last_w=last_w,
                reuse_after=reuse_after,
                decode_cost=decode_cost,
                prefill_cost=prefill_cost,
                iat_factor=iat_factor,
            )

            val = self._linTS_sample(wid, x)
            # Beta-bandit exploration shrinks as the prefix nears reuse exhaustion.
            explore_w = self.base_ts_weight / (1.0 + float(reuse_after) * iat_factor)
            val += explore_w * self._ts_sample(wid)

            # Sticky bonus for the incumbent worker, scaled by KV overlap.
            if last_w == wid and (reuse_after > 0):
                val += (self.affinity_base + self.affinity_reuse_weight * float(reuse_after) +
                        self.affinity_iat_weight * iat_factor) * (0.5 + 0.5 * overlap)

            # Penalty for abandoning the incumbent mid-budget.
            if last_w is not None and wid != last_w and (reuse_after > 0):
                val -= (self.switch_cost_base + self.switch_cost_reuse * float(reuse_after) +
                        self.switch_cost_iat * iat_factor)

            load_mod = self._load_score(wid, metrics, job_cost_total=job_cost_total)
            if last_w == wid and reuse_after > 0:
                # Don't let load alone evict a sticky assignment below the floor.
                load_mod = max(load_mod, self.sticky_load_floor)
            val *= load_mod

            if np.isnan(val) or np.isinf(val):
                val = -1e9

            raw_scores.append(float(val))
            load_mods.append(float(load_mod))
            overlaps.append(float(overlap))
            per_worker_ctx[wid] = {
                "decode_cost": decode_cost,
                "prefill_cost": prefill_cost,
                "iat_factor": iat_factor,
                "overlap": overlap,
                "reuse_after": float(reuse_after),
                "load_mod": load_mod,
            }

        # Roulette-wheel draw from the softmax distribution.
        # NOTE(review): if float rounding leaves r > cum after the loop,
        # idx stays 0 (first worker) rather than the last — confirm intended.
        probs = self._softmax(raw_scores, temp)
        r = random.random()
        cum = 0.0
        idx = 0
        for i, p in enumerate(probs):
            cum += p
            if r <= cum:
                idx = i
                break
        chosen = int(worker_list[idx])

        return chosen, per_worker_ctx[chosen], per_worker_ctx, raw_scores, probs

    # --------------------- latency baselines & reward --------------------- #
    def _ema_update(self, old: float | None, new: float) -> float:
        """One EMA step; seeds with *new* when no previous value exists."""
        a = self.latency_ema_alpha
        return new if old is None else (a * new + (1.0 - a) * old)

    def _get_latency_baseline(self, wid: int, osl: str, prefill_bin: str, per_tok: bool, fallback: float) -> float:
        """Hierarchical baseline lookup: bucket -> worker -> global -> fallback."""
        key_b = (wid, osl, prefill_bin, per_tok)
        key_w = (wid, per_tok)
        if key_b in self.lat_ema_bucket:
            return self.lat_ema_bucket[key_b]
        if key_w in self.lat_ema_worker:
            return self.lat_ema_worker[key_w]
        if self.lat_ema_global[per_tok] is not None:
            return self.lat_ema_global[per_tok]  # type: ignore
        return max(1.0, float(fallback))

    def _update_latency_baselines(self, wid: int, osl: str, prefill_bin: str, metric: float, per_tok: bool) -> float:
        """Fold a new observation into the global, per-worker, and
        per-bucket EMAs; returns the updated bucket value."""
        self.lat_ema_global[per_tok] = self._ema_update(self.lat_ema_global[per_tok], metric)
        key_w = (wid, per_tok)
        self.lat_ema_worker[key_w] = self._ema_update(self.lat_ema_worker.get(key_w), metric)
        key_b = (wid, osl, prefill_bin, per_tok)
        self.lat_ema_bucket[key_b] = self._ema_update(self.lat_ema_bucket.get(key_b), metric)
        return self.lat_ema_bucket[key_b]

    @staticmethod
    def _latency_metric(latency_ms: float, tokens_out: int | None) -> tuple[float, bool]:
        """Return (metric, per_tok): ms/token when token count is known,
        otherwise raw ms."""
        if tokens_out is not None and int(tokens_out) > 0:
            return float(latency_ms) / float(max(1, int(tokens_out))), True
        return float(latency_ms), False

    @staticmethod
    def _metric_to_reward(metric: float, baseline: float, success: bool) -> float:
        """Map latency vs. baseline to a reward in (0, 1]; 0 on failure.
        metric == baseline -> 0.5; faster -> higher."""
        if not success:
            return 0.0
        denom = max(1e-3, baseline)
        ratio = metric / denom
        return float(1.0 / (1.0 + ratio))

    # --------------------- timeout sweep --------------------- #
    def _sweep_pending(self, now: float):
        """Expire pending decisions older than feedback_timeout_seconds and
        apply the timeout reward; throttled by the sweep interval."""
        if now - self._last_pending_sweep < self.pending_sweep_interval_seconds:
            return
        self._last_pending_sweep = now
        expired: list[tuple[str, dict[str, Any]]] = []
        with self._pending_lock:
            for did, rec in list(self.pending.items()):
                if now - float(rec.get("start_ts", now)) >= self.feedback_timeout_seconds:
                    expired.append((did, rec))
                    self.pending.pop(did, None)

            # Update pending count metric
            if self._metrics.get("pending_decisions"):
                self._metrics["pending_decisions"].set(len(self.pending))

        for did, rec in expired:
            wid = int(rec["wid"])
            x = rec["x"]
            reward = float(self.timeout_reward)
            self._update_bandit(wid, reward)
            self._update_contextual(wid, x, reward)

            if self._metrics.get("timeout_penalties"):
                self._metrics["timeout_penalties"].inc()

            self._emit_trace(
                "timeout",
                {
                    "decision_id": did,
                    "wid": wid,
                    "reward": reward,
                    "age": self.feedback_timeout_seconds,
                    "prefix_id": rec.get("prefix_id"),
                    "osl": rec.get("osl"),
                    "prefill_bin": rec.get("prefill_bin"),
                })
            logger.warning("Timeout feedback: wid=%s decision=%s reward=%.3f", wid, did, reward)

    # --------------------- main endpoint: find_worker --------------------- #
    async def generate(self, request: dict):
        """Routing endpoint: pick a worker for the request and register a
        pending decision for later feedback. Yields one RouterResponse dict
        (worker_id == -1 when no workers are available)."""
        req = RouterRequest(**request)

        worker_ids = [int(w) for w in self.engine_client.instance_ids()]
        if not worker_ids:
            yield RouterResponse(worker_id=-1, prefix_hit_rate=0.0).model_dump()
            return

        now = time.time()
        self._sweep_pending(now)

        # Track tokens per request
        if self._metrics.get("tokens_per_request"):
            self._metrics["tokens_per_request"].observe(len(req.tokens))
        if self._metrics.get("reuse_budget"):
            self._metrics["reuse_budget"].observe(req.reuse_budget)

        metrics = None  # TODO: Replace with proper metrics query when API is available
        if self.router_type == "kv_load":
            # Pure load-based routing: skip bandits entirely.
            wid, _ = self._get_underloaded(metrics)
            yield RouterResponse(worker_id=wid, prefix_hit_rate=0.0).model_dump()
            return

        scores: OverlapScores = await self.indexer.find_matches_for_request(req.tokens, 0)
        chosen, chosen_ctx, all_ctx, raw_scores, probs = self._select_worker(worker_ids, req, metrics, scores)

        last_w, _ = self._get_prefix(req.prefix_id)

        osl = self._norm_level(req.expected_osl, "MEDIUM")
        iat = self._norm_level(req.interarrival, "MEDIUM")
        decode_cost = self._decode_cost(osl)
        overlap_chosen = float(scores.scores.get(chosen, 0.0))
        prefill_cost_chosen = self._prefill_cost_for_worker(req.tokens, overlap_chosen)
        iat_factor = self._iat_factor(iat)

        # Update prefix state
        self._set_prefix(
            req.prefix_id,
            chosen,
            reuse_remaining=max(int(req.reuse_budget), 0),
            decode_cost=decode_cost,
            prefill_cost=prefill_cost_chosen,
            iat_factor=iat_factor,
        )

        # Build feature x for chosen & store pending decision
        x = self._feature_vector(
            wid=chosen,
            metrics=metrics,
            scores=scores,
            last_w=last_w,
            reuse_after=max(int(req.reuse_budget), 0),
            decode_cost=decode_cost,
            prefill_cost=prefill_cost_chosen,
            iat_factor=iat_factor,
        )
        decision_id = uuid.uuid4().hex
        with self._pending_lock:
            self.pending[decision_id] = {
                "wid": int(chosen),
                "x": x,
                "osl": osl,
                "prefill_bin": self._prefill_bin(prefill_cost_chosen),
                "start_ts": now,
                "prefix_id": req.prefix_id,
                "tokens_in": len(req.tokens),
                "reuse_after": int(req.reuse_budget),
                "overlap": overlap_chosen,
                "prefill_cost": float(prefill_cost_chosen),
                "decode_cost": float(decode_cost),
            }
            # Update pending count metric
            if self._metrics.get("pending_decisions"):
                self._metrics["pending_decisions"].set(len(self.pending))

        # Update Prometheus metrics
        if self._metrics.get("decisions_total"):
            self._metrics["decisions_total"].labels(worker_id=str(chosen)).inc()
        if self._metrics.get("kv_overlap"):
            self._metrics["kv_overlap"].labels(worker_id=str(chosen)).set(overlap_chosen)

        # Track sticky vs switch decisions
        if last_w is not None:
            if chosen == last_w:
                if self._metrics.get("sticky_decisions"):
                    self._metrics["sticky_decisions"].inc()
            elif self._metrics.get("switch_decisions"):
                self._metrics["switch_decisions"].inc()

        # Decision trace
        if self.debug_traces:
            worker_list = [int(w) for w in worker_ids]
            details = {
                wid: {
                    "score": float(raw_scores[i]),
                    "prob": float(probs[i]),
                    **all_ctx[wid],
                }
                for i, wid in enumerate(worker_list)
            }
            self._emit_trace("decision",
                             {
                                 "decision_id": decision_id,
                                 "prefix_id": req.prefix_id,
                                 "chosen": int(chosen),
                                 "workers": details,
                             })

        logger.info(
            "Router picked worker=%s decision=%s prefix=%s (last=%s reuse_after=%s osl=%s "
            "prefill_cost=%.3f iat=%s overlap=%.3f)",
            chosen,
            decision_id,
            req.prefix_id,
            last_w,
            req.reuse_budget,
            osl,
            prefill_cost_chosen,
            iat,
            overlap_chosen,
        )

        resp = RouterResponse(worker_id=chosen, prefix_hit_rate=overlap_chosen, decision_id=decision_id)
        yield resp.model_dump()
        return

    # --------------------- feedback endpoint --------------------- #
    async def feedback(self, request: dict):
        """Ex-post reward update from processor with observed latency."""
        try:
            fb = FeedbackRequest(**request)
        except Exception as e:
            # Malformed payload: acknowledge with the validation error.
            ack = FeedbackAck(ok=False, used_baseline=0.0, reward=0.0, error=str(e))
            yield ack.model_dump()
            return

        with self._pending_lock:
            decision = self.pending.pop(fb.decision_id, None)
            # Update pending count metric
            if self._metrics.get("pending_decisions"):
                self._metrics["pending_decisions"].set(len(self.pending))

        if not decision:
            # Already swept (timeout) or never issued by this router.
            ack = FeedbackAck(ok=False, used_baseline=0.0, reward=0.0, error="unknown_decision")
            yield ack.model_dump()
            return

        wid: int = int(decision["wid"])
        x: np.ndarray = decision["x"]
        osl: str = str(decision["osl"])
        prefill_bin: str = str(decision["prefill_bin"])
        tokens_out = None if fb.tokens_out is None else int(fb.tokens_out)
        metric, per_tok = self._latency_metric(float(fb.latency_ms), tokens_out)

        # Baseline lookup (hierarchical)
        baseline_before = self._get_latency_baseline(wid, osl, prefill_bin, per_tok, fallback=metric)
        reward = self._metric_to_reward(metric, baseline_before, bool(fb.success))

        # Update EMAs only on successes
        if fb.success:
            baseline_after = self._update_latency_baselines(wid, osl, prefill_bin, metric, per_tok)
        else:
            baseline_after = baseline_before

        # Update bandits with ex-post reward
        self._update_bandit(wid, reward)
        self._update_contextual(wid, x, reward)

        # Update Prometheus metrics
        if self._metrics.get("feedback_latency"):
            self._metrics["feedback_latency"].labels(worker_id=str(wid)).observe(fb.latency_ms / 1000.0)
        if self._metrics.get("reward"):
            self._metrics["reward"].labels(worker_id=str(wid)).set(reward)

        self._emit_trace(
            "feedback",
            {
                "decision_id": fb.decision_id,
                "wid": wid,
                "latency_ms": float(fb.latency_ms),
                "tokens_out": tokens_out,
                "metric": metric,
                "per_tok": per_tok,
                "baseline_used": baseline_before,
                "baseline_after": baseline_after,
                "reward": reward,
                "success": bool(fb.success),
                "finish_reason": fb.finish_reason or "",
            })

        logger.info(
            "Feedback: wid=%s decision=%s metric=%.3f%s baseline=%.3f reward=%.3f success=%s",
            wid,
            fb.decision_id,
            metric,
            " ms/tok" if per_tok else " ms",
            baseline_before,
            reward,
            fb.success,
        )

        ack = FeedbackAck(ok=True, used_baseline=float(baseline_before), reward=float(reward), worker_id=wid)
        yield ack.model_dump()
        return

    # --------------------- helpers --------------------- #
    def _get_underloaded(self, metrics: dict[str, Any] | None):
        """Return (worker_id, load) for a least-loaded worker; random pick
        when no metrics are available or on ties."""
        if not metrics or not metrics.get("endpoints"):
            wid = int(random.choice(list(self.engine_client.instance_ids())))
            return wid, 0.0
        loads = {ep.get("worker_id"): ep.get("gpu_cache_usage_perc", 0.0) for ep in metrics["endpoints"]}
        min_val = min(loads.values())
        candidates = [wid for wid, v in loads.items() if v == min_val]
        return random.choice(candidates), min_val


# ---------------------- worker entry point ---------------------- #
def parse_args():
    """Parse minimal CLI arguments.

    The router uses a YAML config file for most parameters.
    Only frequently-tuned parameters have dedicated CLI flags.
    Use --override for any other parameter.

    See PARAMETERS.md for full documentation.
    """
    parser = argparse.ArgumentParser(
        description="Optimized Thompson Sampling Router with Prometheus Metrics",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Use default config
  python router.py

  # Use custom config file
  python router.py --config /path/to/config.yaml

  # Override specific values
  python router.py --config config.yaml --affinity-base 0.5 --temp-base 1.5

  # Override any config value
  python router.py --config config.yaml --override load_balancing.gpu_penalty_weight=2.0

See PARAMETERS.md for full parameter documentation.
        """,
    )

    # Config file
    parser.add_argument(
        "--config",
        type=str,
        default=None,
        help="Path to YAML config file (default: config.yaml in script directory)",
    )

    # Primary tuning knobs (explicit CLI flags)
    parser.add_argument(
        "--affinity-base",
        type=float,
        default=None,
        help="Primary stickiness control [0.0-1.0] (overrides config)",
    )
    parser.add_argument(
        "--temp-base",
        type=float,
        default=None,
        help="Primary exploration control [0.15-2.0] (overrides config)",
    )
    parser.add_argument(
        "--lints-v",
        type=float,
        default=None,
        help="LinTS exploration variance [0.0-1.0] (overrides config)",
    )

    # Generic override for any config value
    parser.add_argument(
        "--override",
        action="append",
        default=[],
        metavar="KEY=VALUE",
        help="Override any config value using dot notation (repeatable)",
    )

    return parser.parse_args()


@dynamo_worker()
async def worker(runtime: DistributedRuntime):
    """Dynamo worker entry point: load config, apply CLI overrides, and
    construct the WorkloadAwareRouter under the dynamo/router component."""
    # Parse CLI and load config
    args = parse_args()
    config = load_config(args.config)
    config = apply_cli_overrides(config, args)

    component = runtime.namespace("dynamo").component("router")
    # NOTE: create_service() was removed in Dynamo 0.8.x - endpoint creation handles registration
    logger.info("Initializing Optimized Thompson Sampling Router (Prometheus metrics)")

    # Extract config values with nested access
    router = WorkloadAwareRouter(
        runtime,
        # Infrastructure
        block_size=get_nested(config, "infrastructure.block_size", 64),
        router_type=str(get_nested(config, "infrastructure.router_type", "kv")).lower(),
        min_workers=get_nested(config, "infrastructure.min_workers", 1),
        # Affinity
        affinity_base=get_nested(config, "affinity.base", 0.30),
        affinity_reuse_weight=get_nested(config, "affinity.reuse_weight", 0.15),
        affinity_iat_weight=get_nested(config, "affinity.iat_weight", 0.20),
        sticky_load_floor=get_nested(config, "affinity.sticky_load_floor", 0.70),
        # Exploration
        base_ts_weight=get_nested(config, "exploration.base_ts_weight", 0.10),
        temp_base=get_nested(config, "exploration.temperature.base", 1.0),
        temp_min=get_nested(config, "exploration.temperature.min", 0.15),
        temp_max=get_nested(config, "exploration.temperature.max", 2.0),
        # Switching cost
        switch_cost_base=get_nested(config, "switching_cost.base", 0.20),
        switch_cost_reuse=get_nested(config, "switching_cost.reuse_penalty", 0.08),
        switch_cost_iat=get_nested(config, "switching_cost.iat_penalty", 0.05),
        # Load balancing
        queue_penalty_weight=get_nested(config, "load_balancing.queue_penalty_weight", 0.50),
        gpu_penalty_weight=get_nested(config, "load_balancing.gpu_penalty_weight", 1.00),
        outstanding_work_weight=get_nested(config, "load_balancing.outstanding_work_weight", 0.45),
        job_gpu_coupling_weight=get_nested(config, "load_balancing.job_gpu_coupling_weight", 0.40),
        job_queue_coupling_weight=get_nested(config, "load_balancing.job_queue_coupling_weight", 0.20),
        # Prefill
        prefill_token_scale=get_nested(config, "prefill.token_scale", 1024.0),
        prefill_weight=get_nested(config, "prefill.weight", 1.0),
        # LinTS
        lints_lambda=get_nested(config,
"lints.lambda", 1.0), + lints_v=get_nested(config, "lints.v", 0.25), + lints_forget=get_nested(config, "lints.forget_rate", 0.995), + # Feedback + feedback_timeout_seconds=get_nested(config, "feedback.timeout_seconds", 120.0), + pending_sweep_interval_seconds=get_nested(config, "feedback.sweep_interval_seconds", 5.0), + timeout_reward=get_nested(config, "feedback.timeout_reward", 0.0), + latency_ema_alpha=get_nested(config, "feedback.latency_ema_alpha", 0.2), + # Debug + debug_traces=get_nested(config, "debug.traces_enabled", False), + debug_trace_dir=get_nested(config, "debug.trace_dir", "/tmp/dynamo_router_traces"), + debug_buffer_size=get_nested(config, "debug.buffer_size", 2000), + ) + await router.initialize() + + # Serve both endpoints + await asyncio.gather( + component.endpoint("find_worker").serve_endpoint(router.generate), + component.endpoint("feedback").serve_endpoint(router.feedback), + ) + + +if __name__ == "__main__": + uvloop.install() + asyncio.run(worker()) diff --git a/external/dynamo/start_dynamo_optimized_thompson_hints_vllm.sh b/external/dynamo/start_dynamo_optimized_thompson_hints_vllm.sh index 0b489d714c..17bf20fe20 100755 --- a/external/dynamo/start_dynamo_optimized_thompson_hints_vllm.sh +++ b/external/dynamo/start_dynamo_optimized_thompson_hints_vllm.sh @@ -53,7 +53,7 @@ set -euo pipefail # See env.example for documentation on each variable CONTAINER_NAME="dynamo-vllm" WORKER_GPUS="${DYNAMO_GPU_DEVICES:-0,1,2,3,4,5,6,7}" -TP_SIZE="${DYNAMO_TP_SIZE:-2}" +TP_SIZE="${DYNAMO_TP_SIZE:-4}" HTTP_PORT="${DYNAMO_HTTP_PORT:-8000}" # Metrics ports - each component gets its own port to avoid conflicts # Using 18xxx range to avoid conflicts with common services @@ -64,8 +64,38 @@ ROUTER_METRICS_PORT="${DYNAMO_ROUTER_METRICS_PORT:-18090}" PROCESSOR_METRICS_PORT="${DYNAMO_PROCESSOR_METRICS_PORT:-18091}" MODEL="/workspace/models/Llama-3.3-70B-Instruct" SERVED_MODEL_NAME="${DYNAMO_MODEL_NAME:-llama-3.3-70b}" -# vLLM container image - update version as 
needed -IMAGE="${DYNAMO_VLLM_IMAGE:-nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.7.1}" + +# ============================================================================ +# MultiLRU Configuration Logic +# ============================================================================ +# Default behavior (standard vLLM 0.7.1 image): +# - Uses router.py and processor.py (with @dynamo_worker(static=False)) +# - Uses standard vLLM scheduler (no MultiLRU) +# - Works with nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.7.1 +# +# To enable MultiLRU (requires custom-built image): +# export DYNAMO_USE_MULTILRU=true +# export DYNAMO_VLLM_IMAGE=dynamo-multi-lru:latest +# bash start_dynamo_optimized_thompson_hints_vllm.sh +# ============================================================================ + +# Enforce safe defaults: only use multilru if EXPLICITLY enabled +if [ "${DYNAMO_USE_MULTILRU:-}" != "true" ]; then + # Not explicitly set to true - use standard configuration + DYNAMO_USE_MULTILRU="false" + # If image wasn't explicitly set to custom multilru image, use standard + if [ "${DYNAMO_VLLM_IMAGE:-}" != "dynamo-multi-lru:latest" ]; then + IMAGE="nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.7.1" + else + IMAGE="${DYNAMO_VLLM_IMAGE}" + fi +else + # Explicitly enabled - use multilru configuration + DYNAMO_USE_MULTILRU="true" + # Default to custom image if not specified + IMAGE="${DYNAMO_VLLM_IMAGE:-dynamo-multi-lru:latest}" +fi + SHM_SIZE="${DYNAMO_SHM_SIZE:-16g}" WORKER_INIT_TIMEOUT_S="${DYNAMO_WORKER_INIT_TIMEOUT_S:-1800}" @@ -137,6 +167,11 @@ CUSTOM_DYNAMO_DIR="${SCRIPT_DIR}/optimized" echo "=========================================================" echo "Dynamo vLLM with OPTIMIZED Thompson Sampling Router" echo "=========================================================" +if [ "$DYNAMO_USE_MULTILRU" = "true" ]; then + echo "Configuration: MultiLRU Mode (custom image: $IMAGE)" +else + echo "Configuration: Standard Mode (image: $IMAGE)" +fi echo "Model: Llama-3.3-70B-Instruct" echo 
"Container: $CONTAINER_NAME" echo "HTTP Port: $HTTP_PORT (default Dynamo frontend)" @@ -178,16 +213,27 @@ fi echo "" echo "=========================================================" -# Verify custom components exist -if [ ! -f "$CUSTOM_DYNAMO_DIR/router.py" ]; then - echo "✗ ERROR: Custom router.py not found at: $CUSTOM_DYNAMO_DIR/router.py" +# Select router/processor scripts based on DYNAMO_USE_MULTILRU +if [ "$DYNAMO_USE_MULTILRU" = "true" ]; then + ROUTER_SCRIPT="router_multilru.py" + PROCESSOR_SCRIPT="processor_multilru.py" +else + ROUTER_SCRIPT="router.py" + PROCESSOR_SCRIPT="processor.py" +fi + +# Verify selected components exist +if [ ! -f "$CUSTOM_DYNAMO_DIR/$ROUTER_SCRIPT" ]; then + echo "✗ ERROR: Custom $ROUTER_SCRIPT not found at: $CUSTOM_DYNAMO_DIR/$ROUTER_SCRIPT" exit 1 fi -if [ ! -f "$CUSTOM_DYNAMO_DIR/processor.py" ]; then - echo "✗ ERROR: Custom processor.py not found at: $CUSTOM_DYNAMO_DIR/processor.py" +if [ ! -f "$CUSTOM_DYNAMO_DIR/$PROCESSOR_SCRIPT" ]; then + echo "✗ ERROR: Custom $PROCESSOR_SCRIPT not found at: $CUSTOM_DYNAMO_DIR/$PROCESSOR_SCRIPT" exit 1 fi echo "✓ Custom components found in: $CUSTOM_DYNAMO_DIR" +echo " Router: $ROUTER_SCRIPT" +echo " Processor: $PROCESSOR_SCRIPT" echo "" # Start ETCD if not running @@ -375,6 +421,7 @@ docker run -d \ -e MAX_NUM_SEQS=$MAX_NUM_SEQS \ -e ENABLE_KV_EVENTS=$ENABLE_KV_EVENTS \ -e KV_EVENT_BASE_PORT=$KV_EVENT_BASE_PORT \ + -e DYNAMO_USE_MULTILRU=$DYNAMO_USE_MULTILRU \ -e DYNAMO_WORKER_COMPONENT=backend \ $IMAGE \ bash -c " @@ -577,6 +624,16 @@ docker run -d \ # Build KV events config JSON for this worker (unique endpoint per worker) KV_EVENTS_JSON=\"{\\\"enable_kv_cache_events\\\":true,\\\"publisher\\\":\\\"zmq\\\",\\\"endpoint\\\":\\\"tcp://*:\$KV_EVENT_PORT\\\"}\" + # Build scheduler class option - use DynamoScheduler for MultiLruBackend if available + # Set DYNAMO_USE_MULTILRU=false to disable + SCHEDULER_OPT=\"\" + if [ \"\${DYNAMO_USE_MULTILRU:-false}\" = \"true\" ]; then + 
SCHEDULER_OPT=\"--scheduler-cls kvbm.v2.vllm.schedulers.dynamo.DynamoScheduler\" + echo \" Scheduler: DynamoScheduler with MultiLruBackend (frequency-based eviction)\" + else + echo \" Scheduler: Default vLLM scheduler\" + fi + if [ \"\$ENABLE_KV_EVENTS\" = \"true\" ]; then CUDA_VISIBLE_DEVICES=\$WORKER_GPU_LIST \ DYN_SYSTEM_PORT=\$((WORKER_METRICS_PORT + i)) \ @@ -592,6 +649,7 @@ docker run -d \ --block-size $KV_BLOCK_SIZE \ --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \ --max-num-seqs $MAX_NUM_SEQS \ + \$SCHEDULER_OPT \ \$GPU_BLOCKS_OVERRIDE_OPT \ --kv-events-config \"\$KV_EVENTS_JSON\" & else @@ -609,6 +667,7 @@ docker run -d \ --block-size $KV_BLOCK_SIZE \ --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \ --max-num-seqs $MAX_NUM_SEQS \ + \$SCHEDULER_OPT \ \$GPU_BLOCKS_OVERRIDE_OPT & fi WORKER_PIDS+=(\$!) @@ -641,7 +700,7 @@ docker run -d \ # It needs workers to be present (started in Step 1) # DYN_SYSTEM_PORT sets the Prometheus metrics port for this component DYN_SYSTEM_PORT=\$ROUTER_METRICS_PORT \ - python3 /workspace/custom_dynamo/router.py \ + python3 /workspace/custom_dynamo/$ROUTER_SCRIPT \ --config /workspace/custom_dynamo/config.yaml & ROUTER_PID=\$! echo \"Router PID: \$ROUTER_PID\" @@ -658,7 +717,7 @@ docker run -d \ # --static-endpoint on the frontend to find it. 
# DYN_SYSTEM_PORT sets the Prometheus metrics port for this component DYN_SYSTEM_PORT=\$PROCESSOR_METRICS_PORT \ - python3 /workspace/custom_dynamo/processor.py \ + python3 /workspace/custom_dynamo/$PROCESSOR_SCRIPT \ --enable-router \ --model-path $MODEL \ --model-name $SERVED_MODEL_NAME & @@ -826,6 +885,13 @@ if docker ps --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then echo " GPU Mem Utilization: $GPU_MEMORY_UTILIZATION (DYNAMO_GPU_MEMORY_UTILIZATION)" echo " Max Concurrent Seqs: $MAX_NUM_SEQS (DYNAMO_MAX_NUM_SEQS)" echo " KV Events: $ENABLE_KV_EVENTS (DYNAMO_ENABLE_KV_EVENTS)" + if [ "${DYNAMO_USE_MULTILRU:-false}" = "true" ]; then + echo " Scheduler: DynamoScheduler with MultiLruBackend (DYNAMO_USE_MULTILRU=true)" + echo " → 4-pool system: Cold→Warm→Hot→VeryHot" + echo " → Promotion thresholds: [2, 6, 15] accesses" + else + echo " Scheduler: Default vLLM scheduler (DYNAMO_USE_MULTILRU=false)" + fi echo "" echo "API Endpoint: http://localhost:$HTTP_PORT/v1/chat/completions" echo "Health Check: http://localhost:$HTTP_PORT/health" From 58f91e274b361f377e6a7bdf33b2aef28411979c Mon Sep 17 00:00:00 2001 From: bbednarski9 Date: Fri, 30 Jan 2026 03:26:54 +0000 Subject: [PATCH 09/13] ruff checks Signed-off-by: bbednarski9 --- ...eval_config_no_rethinking_minimal_test.yml | 2 +- .../monitoring/scripts/kv_event_observer.py | 188 +++++++++--------- external/dynamo/optimized/processor.py | 9 +- .../dynamo/optimized/processor_multilru.py | 22 +- external/dynamo/optimized/router.py | 6 +- external/dynamo/optimized/router_multilru.py | 6 +- 6 files changed, 116 insertions(+), 117 deletions(-) diff --git a/examples/dynamo_integration/react_benchmark_agent/src/react_benchmark_agent/configs/eval_config_no_rethinking_minimal_test.yml b/examples/dynamo_integration/react_benchmark_agent/src/react_benchmark_agent/configs/eval_config_no_rethinking_minimal_test.yml index 6d3b8aff31..e5a695ba0a 100644 --- 
a/examples/dynamo_integration/react_benchmark_agent/src/react_benchmark_agent/configs/eval_config_no_rethinking_minimal_test.yml +++ b/examples/dynamo_integration/react_benchmark_agent/src/react_benchmark_agent/configs/eval_config_no_rethinking_minimal_test.yml @@ -78,7 +78,7 @@ llms: dynamo_llm: _type: dynamo model_name: llama-3.3-70b - base_url: http://localhost:8099/v1 + base_url: http://localhost:8000/v1 api_key: dummy # _type: nim # model_name: meta/llama-3.3-70b-instruct diff --git a/external/dynamo/monitoring/scripts/kv_event_observer.py b/external/dynamo/monitoring/scripts/kv_event_observer.py index 147030c5ff..93627e980e 100755 --- a/external/dynamo/monitoring/scripts/kv_event_observer.py +++ b/external/dynamo/monitoring/scripts/kv_event_observer.py @@ -36,16 +36,18 @@ import argparse import json +import re import signal import sys -import time import threading +import time import urllib.request -import re from collections import defaultdict -from dataclasses import dataclass, field -from datetime import datetime, timezone -from typing import Any, Optional, List +from dataclasses import dataclass +from dataclasses import field +from datetime import UTC +from datetime import datetime +from typing import Any try: import zmq @@ -81,34 +83,34 @@ class KVCacheStats: hash_to_blocks: dict = field(default_factory=lambda: defaultdict(list)) last_event_time: float = 0.0 last_seq: int = -1 - - def record_stored(self, block_hashes: List[Any], parent_hash: Any = None): + + def record_stored(self, block_hashes: list[Any], parent_hash: Any = None): """Record BlockStored event.""" self.last_event_time = time.time() for bh in block_hashes: h = format_hash(bh) self.stored_blocks += 1 self.unique_hashes.add(h) - - def record_removed(self, block_hashes: List[Any]): + + def record_removed(self, block_hashes: list[Any]): """Record BlockRemoved event.""" self.last_event_time = time.time() for bh in block_hashes: h = format_hash(bh) self.evicted_blocks += 1 
self.unique_hashes.discard(h) - + def record_cleared(self): """Record AllBlocksCleared event.""" self.last_event_time = time.time() self.cleared_count += 1 self.unique_hashes.clear() - + def record_cache_hit(self, hit_tokens: int, query_tokens: int): """Record cache hit from metrics delta.""" self.cache_hit_tokens += hit_tokens self.cache_query_tokens += query_tokens - + def summary(self) -> dict: """Return summary statistics.""" hit_rate = (self.cache_hit_tokens / self.cache_query_tokens * 100) if self.cache_query_tokens > 0 else 0 @@ -131,14 +133,14 @@ class KVEventObserver: Also optionally polls Prometheus metrics to detect cache hits, which don't generate ZMQ events. """ - + def __init__( self, host: str = "localhost", port: int = 20080, verbose: bool = False, - output_file: Optional[str] = None, - metrics_port: Optional[int] = None, + output_file: str | None = None, + metrics_port: int | None = None, ): self.host = host self.port = port @@ -148,15 +150,15 @@ def __init__( self.stats = KVCacheStats() self.running = False self._output_handle = None - + # Metrics polling state self._last_hits = 0.0 self._last_queries = 0.0 self._metrics_thread = None - + self.context = zmq.Context() self.socket = self.context.socket(zmq.SUB) - + def _parse_metric(self, metrics_text: str, metric_name: str) -> float: """Extract a metric value from Prometheus text format.""" pattern = rf'^{re.escape(metric_name)}\{{[^}}]*\}}\s+([0-9.e+-]+)' @@ -165,42 +167,44 @@ def _parse_metric(self, metrics_text: str, metric_name: str) -> float: if match: return float(match.group(1)) return 0.0 - + def _poll_metrics(self): """Background thread to poll Prometheus metrics for cache hits.""" metrics_url = f"http://{self.host}:{self.metrics_port}/metrics" - + while self.running: try: with urllib.request.urlopen(metrics_url, timeout=2) as resp: metrics_text = resp.read().decode('utf-8') - + hits = self._parse_metric(metrics_text, 'vllm:prefix_cache_hits_total') queries = 
self._parse_metric(metrics_text, 'vllm:prefix_cache_queries_total') - + # Calculate deltas hit_delta = hits - self._last_hits query_delta = queries - self._last_queries - + if hit_delta > 0: # Cache hit detected! self.stats.record_cache_hit(int(hit_delta), int(query_delta)) if self.verbose: hit_rate = (hit_delta / query_delta * 100) if query_delta > 0 else 0 - print(f"✅ [CACHE HIT] tokens={int(hit_delta):4d} queried={int(query_delta):4d} hit_rate={hit_rate:.0f}%") + print( + f"✅ [CACHE HIT] tokens={int(hit_delta):4d} queried={int(query_delta):4d} hit_rate={hit_rate:.0f}%" + ) elif query_delta > 0: # Queries happened but no hits (cache miss) self.stats.record_cache_hit(0, int(query_delta)) - + self._last_hits = hits self._last_queries = queries - + except Exception as e: if self.verbose: print(f"[Metrics] Poll error: {e}") - + time.sleep(0.5) # Poll every 500ms - + def connect(self): """Connect to the vLLM KV event publisher.""" endpoint = f"tcp://{self.host}:{self.port}" @@ -209,12 +213,12 @@ def connect(self): # Subscribe to all topics (empty string = all) self.socket.setsockopt_string(zmq.SUBSCRIBE, "") self.socket.setsockopt(zmq.RCVTIMEO, 1000) - print(f"[KV Observer] ✓ Connected and subscribed") - + print("[KV Observer] ✓ Connected and subscribed") + if self.output_file: self._output_handle = open(self.output_file, "a") print(f"[KV Observer] Writing events to: {self.output_file}") - + if self.metrics_port: print(f"[KV Observer] Polling metrics at http://{self.host}:{self.metrics_port}/metrics") # Initialize baseline metrics @@ -227,8 +231,8 @@ def connect(self): print(f"[KV Observer] ✓ Baseline: hits={self._last_hits:.0f} queries={self._last_queries:.0f}") except Exception as e: print(f"[KV Observer] ⚠ Could not get baseline metrics: {e}") - - def parse_multipart(self, parts: List[bytes]) -> Optional[dict]: + + def parse_multipart(self, parts: list[bytes]) -> dict | None: """Parse a ZMQ multipart message from vLLM. 
Format: [topic, sequence, payload] @@ -240,19 +244,19 @@ def parse_multipart(self, parts: List[bytes]) -> Optional[dict]: if self.verbose: print(f"[KV Observer] Warning: Expected 3 parts, got {len(parts)}") return None - + topic, seq_bytes, payload = parts[0], parts[1], parts[2] - + try: seq = int.from_bytes(seq_bytes, "big", signed=True) self.stats.last_seq = seq except Exception: seq = -1 - + try: # Decode msgpack payload batch = msgpack.unpackb(payload, raw=False, strict_map_key=False) - + # vLLM KVEventBatch format: [timestamp, events_list, dp_rank] # Note: events is at index 1, dp_rank at index 2! if isinstance(batch, (list, tuple)) and len(batch) >= 3: @@ -267,11 +271,11 @@ def parse_multipart(self, parts: List[bytes]) -> Optional[dict]: events = [batch] if batch else [] ts = time.time() dp_rank = 0 - + # Ensure events is a list if not isinstance(events, list): events = [events] if events else [] - + return { "seq": seq, "timestamp": ts, @@ -284,14 +288,14 @@ def parse_multipart(self, parts: List[bytes]) -> Optional[dict]: print(f"[KV Observer] Parse error: {e}") print(f"[KV Observer] Raw payload: {payload[:100]}...") return None - + def handle_event(self, event_data: dict): """Handle a parsed event batch.""" seq = event_data.get("seq", -1) ts = event_data.get("timestamp", 0) dp_rank = event_data.get("dp_rank", 0) events = event_data.get("events", []) - + for event in events: # Events can be dicts or tuples/lists # vLLM format (list): @@ -307,7 +311,7 @@ def handle_event(self, event_data: dict): block_size = event.get("block_size", 0) elif isinstance(event, (list, tuple)) and len(event) >= 1: event_type = str(event[0]) if event else "unknown" - + if event_type == "BlockRemoved" and len(event) >= 2: # ['BlockRemoved', [hashes], medium] block_hashes = event[1] if isinstance(event[1], list) else [event[1]] @@ -341,16 +345,17 @@ def handle_event(self, event_data: dict): medium = "GPU" token_ids = [] block_size = 0 - + # Normalize event type (vLLM uses class names 
like "BlockStored") event_type_lower = event_type.lower() - + if "stored" in event_type_lower or "blockstored" in event_type_lower: self.stats.record_stored(block_hashes, parent_hash) if self.verbose: num_tokens = len(token_ids) if token_ids else block_size for bh in block_hashes: - print(f"📦 [STORED ] seq={seq:6d} hash={format_hash(bh)} tokens={num_tokens:3d} medium={medium}") + print( + f"📦 [STORED ] seq={seq:6d} hash={format_hash(bh)} tokens={num_tokens:3d} medium={medium}") elif "removed" in event_type_lower or "blockremoved" in event_type_lower: self.stats.record_removed(block_hashes) if self.verbose: @@ -360,12 +365,14 @@ def handle_event(self, event_data: dict): self.stats.record_cleared() if self.verbose: print(f"🧹 [CLEARED ] seq={seq:6d} All blocks cleared") - else: - if self.verbose: - print(f"❓ [UNKNOWN ] seq={seq:6d} type={event_type} data={event[:3] if isinstance(event, (list, tuple)) else event}") - + elif self.verbose: + print( + f"❓ [UNKNOWN ] seq={seq:6d} type={event_type} data={event[:3] if isinstance(event, (list, tuple)) else event}" + ) + # Write to output file if self._output_handle: + def get_event_type(e): if isinstance(e, dict): return str(e.get("type", "unknown")) @@ -373,94 +380,89 @@ def get_event_type(e): return str(e[0]) else: return str(e) - + output = { - "_timestamp": datetime.now(timezone.utc).isoformat(), + "_timestamp": datetime.now(UTC).isoformat(), "seq": seq, "ts": ts, "dp_rank": dp_rank, - "events": [{"type": get_event_type(e)} for e in events], + "events": [{ + "type": get_event_type(e) + } for e in events], } self._output_handle.write(json.dumps(output) + "\n") self._output_handle.flush() - - def run(self, duration: Optional[float] = None): + + def run(self, duration: float | None = None): """Run the observer loop.""" self.running = True start_time = time.time() batches_received = 0 - + # Start metrics polling thread if configured if self.metrics_port: - self._metrics_thread = threading.Thread( - target=self._poll_metrics, 
- daemon=True, - name="metrics-poller" - ) + self._metrics_thread = threading.Thread(target=self._poll_metrics, daemon=True, name="metrics-poller") self._metrics_thread.start() - - print(f"[KV Observer] Listening for KV events (msgpack multipart)...") + + print("[KV Observer] Listening for KV events (msgpack multipart)...") if self.metrics_port: - print(f"[KV Observer] Cache hits will show as ✅ [CACHE HIT]") - print(f"[KV Observer] Press Ctrl+C to stop") + print("[KV Observer] Cache hits will show as ✅ [CACHE HIT]") + print("[KV Observer] Press Ctrl+C to stop") print("-" * 60) - + try: while self.running: if duration and (time.time() - start_time) >= duration: print(f"\n[KV Observer] Duration limit reached ({duration}s)") break - + try: # Receive multipart message parts = self.socket.recv_multipart() event_data = self.parse_multipart(parts) - + if event_data: self.handle_event(event_data) batches_received += 1 - + if batches_received % 20 == 0 and not self.verbose: summary = self.stats.summary() - print( - f"[{batches_received:5d} batches] " - f"Stored: {summary['stored_blocks']:4d} | " - f"Removed: {summary['evicted_blocks']:4d} | " - f"Net: {summary['net_blocks']:4d} | " - f"Hashes: {summary['unique_hashes_current']} | " - f"Seq: {summary['last_seq']}" - ) + print(f"[{batches_received:5d} batches] " + f"Stored: {summary['stored_blocks']:4d} | " + f"Removed: {summary['evicted_blocks']:4d} | " + f"Net: {summary['net_blocks']:4d} | " + f"Hashes: {summary['unique_hashes_current']} | " + f"Seq: {summary['last_seq']}") except zmq.Again: # Timeout, continue loop continue - + except KeyboardInterrupt: print("\n[KV Observer] Interrupted") finally: self.stop() - + def stop(self): """Stop and print final statistics.""" self.running = False - + print("-" * 60) print("[KV Observer] Final Statistics:") for key, value in self.stats.summary().items(): print(f" {key}: {value}") - + if self._output_handle: self._output_handle.close() - + self.socket.close() self.context.term() 
print("[KV Observer] Stopped") def main(): - parser = argparse.ArgumentParser( - description="Observe KV cache events from vLLM workers", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" + parser = argparse.ArgumentParser(description="Observe KV cache events from vLLM workers", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" Examples: # Monitor worker 0 (ZMQ events only): python kv_event_observer.py -p 20080 -v @@ -481,17 +483,19 @@ def main(): 📦 STORED - Block committed to prefix cache (ZMQ) 🗑️ REMOVED - Block evicted from cache (ZMQ) ✅ CACHE HIT - Tokens served from cache (metrics polling) -""" - ) +""") parser.add_argument("--host", "-H", default="localhost", help="Worker host (default: localhost)") parser.add_argument("--port", "-p", type=int, default=20080, help="KV event ZMQ port (default: 20080)") - parser.add_argument("--metrics-port", "-m", type=int, help="Prometheus metrics port for cache hit detection (e.g., 18081)") + parser.add_argument("--metrics-port", + "-m", + type=int, + help="Prometheus metrics port for cache hit detection (e.g., 18081)") parser.add_argument("--verbose", "-v", action="store_true", help="Print each event") parser.add_argument("--output", "-o", help="Output file (JSONL format)") parser.add_argument("--duration", "-d", type=float, help="Run duration in seconds") - + args = parser.parse_args() - + observer = KVEventObserver( host=args.host, port=args.port, @@ -499,10 +503,10 @@ def main(): output_file=args.output, metrics_port=args.metrics_port, ) - + signal.signal(signal.SIGINT, lambda s, f: setattr(observer, 'running', False)) signal.signal(signal.SIGTERM, lambda s, f: setattr(observer, 'running', False)) - + observer.connect() observer.run(duration=args.duration) diff --git a/external/dynamo/optimized/processor.py b/external/dynamo/optimized/processor.py index 4416d20cd8..c814b04e26 100644 --- a/external/dynamo/optimized/processor.py +++ b/external/dynamo/optimized/processor.py @@ -361,13 
+361,12 @@ async def initialize(self): # - vLLM: uses "backend" (hardcoded in dynamo.vllm) worker_component_name = os.environ.get("DYNAMO_WORKER_COMPONENT") if not worker_component_name: - raise ValueError( - "DYNAMO_WORKER_COMPONENT environment variable is required. " - "Set to 'worker' for SGLang or 'backend' for vLLM." - ) + raise ValueError("DYNAMO_WORKER_COMPONENT environment variable is required. " + "Set to 'worker' for SGLang or 'backend' for vLLM.") worker_component = self.runtime.namespace("workers").component(worker_component_name) self.engine_client = await worker_component.endpoint("generate").client() - logger.info("Engine client created for workers/%s/generate, waiting for worker instances...", worker_component_name) + logger.info("Engine client created for workers/%s/generate, waiting for worker instances...", + worker_component_name) await self.engine_client.wait_for_instances() logger.info("Processor initialized successfully (routing to workers/%s/generate)", worker_component_name) diff --git a/external/dynamo/optimized/processor_multilru.py b/external/dynamo/optimized/processor_multilru.py index 0e114d377b..e3a0c7f412 100644 --- a/external/dynamo/optimized/processor_multilru.py +++ b/external/dynamo/optimized/processor_multilru.py @@ -221,7 +221,10 @@ def __init__(self, endpoint): Args: endpoint: Dynamo endpoint object for registering metrics callback. 
""" - from prometheus_client import Counter, Gauge, Histogram, REGISTRY + from prometheus_client import REGISTRY + from prometheus_client import Counter + from prometheus_client import Gauge + from prometheus_client import Histogram # Request throughput (prefixed with thompson_ to avoid conflicts with # serve_endpoint's built-in work handler metrics) @@ -299,9 +302,7 @@ def __init__(self, endpoint): # Register metrics with Dynamo's endpoint for /metrics exposure from dynamo.common.utils.prometheus import register_engine_metrics_callback - register_engine_metrics_callback( - endpoint, REGISTRY, metric_prefix_filters=["thompson_"] - ) + register_engine_metrics_callback(endpoint, REGISTRY, metric_prefix_filters=["thompson_"]) logger.info("Processor metrics initialized via prometheus_client") @@ -366,13 +367,12 @@ async def initialize(self): # - vLLM: uses "backend" (hardcoded in dynamo.vllm) worker_component_name = os.environ.get("DYNAMO_WORKER_COMPONENT") if not worker_component_name: - raise ValueError( - "DYNAMO_WORKER_COMPONENT environment variable is required. " - "Set to 'worker' for SGLang or 'backend' for vLLM." - ) + raise ValueError("DYNAMO_WORKER_COMPONENT environment variable is required. 
" + "Set to 'worker' for SGLang or 'backend' for vLLM.") worker_component = self.runtime.namespace("workers").component(worker_component_name) self.engine_client = await worker_component.endpoint("generate").client() - logger.info("Engine client created for workers/%s/generate, waiting for worker instances...", worker_component_name) + logger.info("Engine client created for workers/%s/generate, waiting for worker instances...", + worker_component_name) await self.engine_client.wait_for_instances() logger.info("Processor initialized successfully (routing to workers/%s/generate)", worker_component_name) @@ -420,7 +420,7 @@ def _extract_hints(self, request: dict[str, Any]) -> tuple[str, int, str, str]: if iat not in ("LOW", "MEDIUM", "HIGH"): iat = "MEDIUM" - # Extract backend selection (determines v1 vs v2 routing) + # Extract backend selection (determines v1 vs v2 routing) backend_selector = self._extract_annotation(annotations, "backend") use_frequency_backend = backend_selector == "frequency_multi_lru" if backend_selector else False @@ -699,7 +699,7 @@ async def generate(self, raw: dict[str, Any]): # Determine KVBM routing path based on backend selection kvbm_version = "v2" if use_frequency_backend else "v1" - + # Get token IDs from preprocessed request token_ids = raw.get("token_ids", []) if not isinstance(token_ids, list): diff --git a/external/dynamo/optimized/router.py b/external/dynamo/optimized/router.py index e68ed5d9df..24684a5714 100644 --- a/external/dynamo/optimized/router.py +++ b/external/dynamo/optimized/router.py @@ -601,10 +601,8 @@ async def initialize(self): # - vLLM: uses "backend" (hardcoded in dynamo.vllm) worker_component = os.environ.get("DYNAMO_WORKER_COMPONENT") if not worker_component: - raise ValueError( - "DYNAMO_WORKER_COMPONENT environment variable is required. " - "Set to 'worker' for SGLang or 'backend' for vLLM." - ) + raise ValueError("DYNAMO_WORKER_COMPONENT environment variable is required. 
" + "Set to 'worker' for SGLang or 'backend' for vLLM.") engine = self.runtime.namespace("workers").component(worker_component) logger.info("Getting engine client for workers/%s/generate", worker_component) self.engine_client = await engine.endpoint("generate").client() diff --git a/external/dynamo/optimized/router_multilru.py b/external/dynamo/optimized/router_multilru.py index bca8881add..bbeb9ef6e8 100644 --- a/external/dynamo/optimized/router_multilru.py +++ b/external/dynamo/optimized/router_multilru.py @@ -601,10 +601,8 @@ async def initialize(self): # - vLLM: uses "backend" (hardcoded in dynamo.vllm) worker_component = os.environ.get("DYNAMO_WORKER_COMPONENT") if not worker_component: - raise ValueError( - "DYNAMO_WORKER_COMPONENT environment variable is required. " - "Set to 'worker' for SGLang or 'backend' for vLLM." - ) + raise ValueError("DYNAMO_WORKER_COMPONENT environment variable is required. " + "Set to 'worker' for SGLang or 'backend' for vLLM.") engine = self.runtime.namespace("workers").component(worker_component) logger.info("Getting engine client for workers/%s/generate", worker_component) self.engine_client = await engine.endpoint("generate").client() From 144e30f240541292d813fa17bc1da5c2628ceb65 Mon Sep 17 00:00:00 2001 From: bbednarski9 Date: Fri, 30 Jan 2026 08:16:23 +0000 Subject: [PATCH 10/13] sequence diagrams Signed-off-by: bbednarski9 --- external/dynamo/END_TO_END_EVALS.md | 0 external/dynamo/monitoring/README.md | 387 ++++++++++++++++++++++----- 2 files changed, 326 insertions(+), 61 deletions(-) create mode 100644 external/dynamo/END_TO_END_EVALS.md diff --git a/external/dynamo/END_TO_END_EVALS.md b/external/dynamo/END_TO_END_EVALS.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/external/dynamo/monitoring/README.md b/external/dynamo/monitoring/README.md index 910b02115f..060ce892aa 100644 --- a/external/dynamo/monitoring/README.md +++ b/external/dynamo/monitoring/README.md @@ -1,6 +1,6 @@ # Dynamo Monitoring Stack 
-This directory contains a Prometheus + Grafana monitoring setup for the Dynamo LLM inference stack with Thompson Sampling router. +This directory contains a Prometheus + Grafana monitoring setup for the Dynamo LLM inference stack with Thompson Sampling router. Metrics are collected at **2-second resolution** directly from ai-dynamo's Prometheus API for per-request granularity. ## Supported Backends @@ -15,31 +15,137 @@ The Grafana dashboard includes a **Backend** dropdown selector to switch between ## Quick Start +The monitoring stack starts **automatically** when you run the Dynamo startup script: + ```bash -# Start the monitoring stack +# Start Dynamo (monitoring starts automatically) +bash start_dynamo_optimized_thompson_hints_vllm.sh + +# Or start monitoring manually if needed cd monitoring docker compose up -d +``` -# Access the dashboards -# Prometheus: http://localhost:9090 -# Grafana: http://localhost:3000 (admin/admin) +**Access the dashboards:** +- **Grafana**: http://localhost:3000 (no login required) +- **Prometheus**: http://localhost:9090 -# In Grafana, use the "Backend" dropdown to select sglang or vllm +**Direct dashboard link:** ``` +http://localhost:3000/d/dynamo-overview/dynamo-llm-overview +``` + +In Grafana, use the **Backend** dropdown to select `sglang` or `vllm` based on your deployment. ## Prerequisites - Docker and Docker Compose - Dynamo stack running (see `../start_dynamo_optimized_thompson_hints_sglang.sh` or `../start_dynamo_optimized_thompson_hints_vllm.sh`) +## Accessing Grafana Dashboard + +### Local Access + +If running on your local machine: + +1. Open your browser +2. Navigate to: **http://localhost:3000/d/dynamo-overview/dynamo-llm-overview** +3. No login required (anonymous access enabled) +4. Use the **Backend** dropdown (top left) to select `sglang` or `vllm` +5. 
Use the **time filter** (top right) to adjust the time range + +### Remote Access via SSH Tunnel + +If Dynamo and monitoring are running on a remote server (for example, a GPU cluster), use SSH port forwarding: + +**Step 1: Create SSH tunnel** +```bash +# Replace and with your credentials +ssh -L 3000:localhost:3000 @ + +# Example with VPN-accessible server: +ssh -L 3000:localhost:3000 myuser@10.57.201.5 +``` + +**Step 2: Open browser** +Navigate to: **http://localhost:3000/d/dynamo-overview/dynamo-llm-overview** + +**Step 3: Set time filter** +- Click the time picker in the top-right corner of Grafana +- Select a preset range (Last 1 hour, Last 6 hours, Last 24 hours) +- Or set a custom range to view historical data from previous benchmark runs + +> **Tip**: Data persists across restarts. Zoom out to the last 12-24 hours to see multiple benchmark intervals. + +### Viewing Historical Data + +Prometheus stores metrics data persistently. To view data from previous runs: + +1. Open the Grafana dashboard +2. Use the time picker (top right) to expand the time range +3. Look for intervals of activity separated by gaps +4. Compare KV Efficiency scores across different runs + +**Example observation**: With a tool-calling agent (20 tools) on 4xH100 with 2 workers, you might see: +- Worker 18081: 25.4% average KV Efficiency +- Worker 18082: 16.4% average KV Efficiency + +### Sharing Dashboard Access + +Anyone with SSH access to the remote server can view the same data: + +1. Share the SSH tunnel command with team members +2. They can connect and view real-time or historical metrics +3. Useful for collaborative debugging and performance analysis + ## Architecture +The monitoring stack collects metrics from all Dynamo components. The architecture uses **model name isolation** to ensure all requests flow through the Thompson Sampling router. 
+ +### Request Flow (Model Name Isolation) + +``` +Client Request (with nvext.annotations) + ↓ +┌─────────────────────────────────────────────────────────────────────────┐ +│ Default Dynamo Frontend (:8000) │ +│ - Tokenization + nvext parsing │ +│ - ETCD ModelWatcher (namespace=dynamo) │ +│ - Routes to processor ONLY (workers use internal model name) │ +└─────────────────────────────────────────────────────────────────────────┘ + ↓ discovers processor (model: llama-3.3-70b) +┌─────────────────────────────────────────────────────────────────────────┐ +│ Custom Processor (:18091/metrics) │ +│ - Extracts hints: prefix_id, total_requests, osl, iat │ +│ - Queries Thompson Sampling router │ +│ - Registered at: dynamo.backend.generate (namespace=dynamo) │ +└─────────────────────────────────────────────────────────────────────────┘ + ↓ queries router +┌─────────────────────────────────────────────────────────────────────────┐ +│ Custom Router (:18090/metrics) │ +│ - Thompson Sampling + KV overlap scoring │ +│ - Returns optimal worker_id │ +│ - Registered at: dynamo.router.{find_worker,feedback} │ +└─────────────────────────────────────────────────────────────────────────┘ + ↓ returns worker_id +┌─────────────────────────────────────────────────────────────────────────┐ +│ vLLM/SGLang Workers (:18081, :18082, ... 
/metrics) │ +│ - Registered at: workers.worker.generate (namespace=workers) │ +│ - Model: llama-3.3-70b-internal (hidden from frontend) │ +│ - Each worker uses TP_SIZE GPUs │ +└─────────────────────────────────────────────────────────────────────────┘ + ↓ +Response + Feedback to Router +``` + +### Metrics Collection + ``` ┌──────────────────────────────────────────────────────────────────────────────┐ │ Dynamo Stack │ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ -│ │ Frontend │ │ Worker │ │ Router │ │ Processor │ │ -│ │ :8000 │ │ :8081 │ │ :8082 │ │ :8083 │ │ +│ │ Frontend │ │ Workers │ │ Router │ │ Processor │ │ +│ │ :8000 │ │ :18081-180xx│ │ :18090 │ │ :18091 │ │ │ │ /metrics │ │ /metrics │ │ /metrics │ │ /metrics │ │ │ │ (latency) │ │ (KV cache) │ │ (routing) │ │ (KVE) │ │ │ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ │ @@ -50,32 +156,56 @@ docker compose up -d │ Monitoring Stack │ │ ┌────────────────────────────────────────────────────────────────────────┐ │ │ │ Prometheus :9090 │ │ -│ │ Scrapes all 4 endpoints every 5 seconds: │ │ -│ │ - Frontend (:8000) - latency, throughput, tokens │ │ -│ │ - Worker (:8081) - KV cache, NATS, internal stats │ │ -│ │ - Router (:8082) - Thompson Sampling routing metrics │ │ -│ │ - Processor (:8083) - Thompson Sampling KVE metrics │ │ +│ │ Scrapes all endpoints every 2 seconds for per-request granularity: │ │ +│ │ - Frontend (:8000) - latency, throughput, tokens │ │ +│ │ - Workers (:18081-180xx) - KV cache, backend stats (per-worker) │ │ +│ │ - Router (:18090) - Thompson Sampling routing metrics │ │ +│ │ - Processor (:18091) - Thompson Sampling KVE metrics │ │ │ └────────────────────────────────┬───────────────────────────────────────┘ │ │ │ │ │ ▼ │ │ ┌────────────────────────────────────────────────────────────────────────┐ │ │ │ Grafana :3000 │ │ -│ │ Pre-configured dashboard: "Dynamo LLM Overview" │ │ -│ │ Login: admin / admin │ │ +│ │ Dashboard: "Dynamo LLM Overview" │ │ +│ │ URL: 
/d/dynamo-overview/dynamo-llm-overview │ │ +│ │ Access: Anonymous (no login required) │ │ │ └────────────────────────────────────────────────────────────────────────┘ │ └──────────────────────────────────────────────────────────────────────────────┘ ``` +### Model Name Isolation Explained + +| Component | Model Name | Namespace | Purpose | +|-----------|------------|-----------|---------| +| Workers | `llama-3.3-70b-internal` | `workers` | Hidden from frontend discovery | +| Processor | `llama-3.3-70b` | `dynamo` | Discovered by frontend | +| Router | N/A | `dynamo` | Internal routing service | + +This isolation ensures **ALL requests** go through the Thompson Sampling router, enabling: +- KV overlap-aware worker selection +- Workload hint extraction (prefix_id, osl, iat) +- Per-request feedback for router learning + ## Metrics Endpoints -| Component | Port | URL | Description | -|-----------|------|-----|-------------| -| Frontend | 8000 | `http://localhost:8000/metrics` | User-facing metrics (latency, throughput) | -| Workers | 18081-180xx | `http://localhost:18081/metrics` | Internal metrics (KV cache, NATS stats) - one port per worker | -| Router | 18090 | `http://localhost:18090/metrics` | Thompson Sampling routing metrics | -| Processor | 18091 | `http://localhost:18091/metrics` | Thompson Sampling KVE metrics | +| Component | Port(s) | URL | Description | +|-----------|---------|-----|-------------| +| Frontend | 8000 | `http://localhost:8000/metrics` | User-facing metrics (latency, throughput, tokens) | +| Workers | 18081+ | `http://localhost:18081/metrics` | KV cache, backend stats - one port per worker | +| Router | 18090 | `http://localhost:18090/metrics` | Thompson Sampling routing decisions | +| Processor | 18091 | `http://localhost:18091/metrics` | Thompson Sampling KVE (KV Efficiency) metrics | + +### Worker Port Allocation -**Note**: Worker metrics ports are sequential starting at 18081. With 2 workers: 18081, 18082. With 4 workers: 18081-18084. 
+Worker metrics ports are sequential starting at `DYNAMO_WORKER_METRICS_PORT` (default: 18081): + +| Configuration | Workers | GPU Allocation | Metrics Ports | +|---------------|---------|----------------|---------------| +| 8 GPUs, TP=4 | 2 | GPUs 0-3, 4-7 | 18081, 18082 | +| 8 GPUs, TP=2 | 4 | GPUs 0-1, 2-3, 4-5, 6-7 | 18081-18084 | +| 4 GPUs, TP=2 | 2 | GPUs 0-1, 2-3 | 18081, 18082 | + +Each worker is identified in Grafana by its metrics port (for example, `instance="localhost:18081"`). ## Key Metrics @@ -98,7 +228,7 @@ User-facing HTTP API metrics for latency, throughput, and token statistics. | `dynamo_frontend_` | `dynamo_frontend_model_context_length` | Gauge | Model context window size | | `dynamo_frontend_` | `dynamo_frontend_model_kv_cache_block_size` | Gauge | KV cache block size | -### Worker Metrics (`:8081/metrics`) +### Worker Metrics (`:18081+/metrics`) Backend worker metrics including KV cache, scheduling, and internal statistics. Both SGLang and vLLM expose similar metrics with different prefixes: - **SGLang**: Metrics prefixed with `sglang:` (e.g., `sglang:cache_hit_rate`) @@ -155,7 +285,7 @@ Both SGLang and vLLM expose similar native metrics with their respective prefixe | `vllm:` | `vllm:generation_tokens_total` | Counter | Total generation tokens | | `vllm:` | `vllm:prompt_tokens_total` | Counter | Total prompt tokens | -### Router Metrics (`:8082/metrics`) +### Router Metrics (`:18090/metrics`) Dynamo component metrics for the Thompson Sampling router (uses standard `dynamo_component_*` prefix). @@ -175,7 +305,7 @@ Dynamo component metrics for the Thompson Sampling router (uses standard `dynamo - `find_worker` - Worker selection requests - `feedback` - Feedback from completed requests -### Thompson Sampling Processor Metrics (`:8083/metrics`) +### Thompson Sampling Processor Metrics (`:18091/metrics`) Custom Thompson Sampling KV Efficiency (KVE) metrics from the processor component. 
@@ -309,7 +439,15 @@ vllm:cache_hit_rate * 100 ## Grafana Dashboard -The pre-configured dashboard "Dynamo LLM Overview" includes: +### Dashboard Access + +| Property | Value | +|----------|-------| +| Dashboard Name | Dynamo LLM Overview | +| Direct URL | `http://localhost:3000/d/dynamo-overview/dynamo-llm-overview` | +| Authentication | None required (anonymous access enabled) | +| Data Refresh | Every 2 seconds (configurable) | +| Data Retention | Persistent (survives restarts) | ### Backend Selector @@ -319,6 +457,13 @@ The dashboard includes a **Backend** dropdown variable at the top. Select: All backend-specific panels automatically update based on your selection. +### Time Controls + +Use the time picker (top right) to: +- Select preset ranges: Last 5 minutes, Last 1 hour, Last 6 hours, Last 24 hours +- Set custom absolute time ranges for specific benchmark intervals +- Use the refresh dropdown to control auto-refresh frequency + ### Dashboard Panels 1. **Inflight Requests** - Current load across all components @@ -371,9 +516,30 @@ monitoring/ ## Usage -### Start Monitoring +### Automatic Startup (Recommended) + +The monitoring stack starts **automatically** when you run the Dynamo startup script: + +```bash +# Start Dynamo with monitoring (vLLM backend) +bash start_dynamo_optimized_thompson_hints_vllm.sh + +# Or SGLang backend +bash start_dynamo_optimized_thompson_hints_sglang.sh +``` + +The script will: +1. Start ETCD and NATS infrastructure +2. Start Prometheus and Grafana containers +3. Wait for monitoring services to be ready +4. 
Start Dynamo components (workers, router, processor, frontend) + +### Manual Startup + +If you need to start monitoring separately: ```bash +cd monitoring docker compose up -d ``` @@ -423,54 +589,100 @@ bash stop_dynamo.sh --kill-metrics docker volume rm monitoring_prometheus_data # Restart everything (monitoring will start automatically) -bash start_dynamo_optimized_thompson_hints.sh +bash start_dynamo_optimized_thompson_hints_vllm.sh ``` ## Remote Access via SSH Port Forwarding -If the monitoring stack is running on a remote server, use SSH port forwarding to access Grafana and Prometheus locally. +If the monitoring stack is running on a remote GPU server (for example, a leased cluster node), use SSH port forwarding to access Grafana and Prometheus from your local machine. + +### Step-by-Step Remote Access -### General Syntax +**1. Create SSH tunnel to the remote server:** ```bash -ssh -L :localhost: @ +# General syntax +ssh -L 3000:localhost:3000 @ + +# Example with VPN-accessible server +ssh -L 3000:localhost:3000 myuser@10.57.201.5 ``` -### Access Grafana (Port 3000) +**2. Open the Grafana dashboard in your browser:** -```bash -ssh -L 3000:localhost:3000 @ ``` +http://localhost:3000/d/dynamo-overview/dynamo-llm-overview +``` + +**3. Configure the time range:** +- Click the time picker (top right corner of Grafana UI) +- Select a preset: Last 1 hour, Last 6 hours, Last 12 hours, Last 24 hours +- Or set a custom absolute time range to view specific benchmark intervals + +**4. Select your backend:** +- Use the **Backend** dropdown (top left) to choose `sglang` or `vllm` +- All panels will automatically update to show backend-specific metrics -Then open http://localhost:3000 in your browser. 
+### Sharing Data with Team Members -### Access Prometheus (Port 9090) +Anyone with SSH access to the same server can view the monitoring data: ```bash -ssh -L 9090:localhost:9090 @ +# Team member creates their own tunnel +ssh -L 3000:localhost:3000 @ + +# Then opens the same dashboard URL +# http://localhost:3000/d/dynamo-overview/dynamo-llm-overview ``` -Then open http://localhost:9090 in your browser. +This enables collaborative analysis - multiple people can view the same data simultaneously to focus on specific signals. ### Forward Multiple Ports To access both Grafana and Prometheus simultaneously: ```bash -ssh -L 3000:localhost:3000 -L 9090:localhost:9090 @ +ssh -L 3000:localhost:3000 -L 9090:localhost:9090 @ ``` +Access: +- Grafana: http://localhost:3000/d/dynamo-overview/dynamo-llm-overview +- Prometheus: http://localhost:9090 + ### Background SSH Tunnel -To run the tunnel in the background: +To run the tunnel in the background (stays open after terminal closes): ```bash -ssh -f -N -L 3000:localhost:3000 -L 9090:localhost:9090 @ +ssh -f -N -L 3000:localhost:3000 -L 9090:localhost:9090 @ ``` - `-f`: Run in background after authentication - `-N`: Don't execute remote commands (tunnel only) +To kill a background tunnel: +```bash +# Find the SSH process +ps aux | grep "ssh -f -N -L 3000" + +# Kill it +kill +``` + +### Viewing Historical Benchmark Data + +Prometheus persists all metrics data. To view historical benchmarks: + +1. Open the Grafana dashboard +2. Expand the time range using the time picker (top right) +3. Zoom out to 12-24 hours to see multiple benchmark intervals +4. 
Gaps between data intervals indicate periods when Dynamo was stopped + +**Example**: After running multiple benchmark sessions, you might see: +- Interval 1: Baseline configuration +- Interval 2: Optimized parameters (small gap) +- Interval 3: Best KV Efficiency (for example, Worker 18081: 25.4%, Worker 18082: 16.4%) + ## Manual Metrics Queries ### Prometheus UI (http://localhost:9090) @@ -518,20 +730,24 @@ sglang:num_queue_reqs # All frontend metrics curl -s http://localhost:8000/metrics -# All worker metrics (Dynamo + SGLang) -curl -s http://localhost:8081/metrics +# All worker metrics (Worker 0) +curl -s http://localhost:18081/metrics + +# All worker metrics (Worker 1, if running multiple workers) +curl -s http://localhost:18082/metrics # All router metrics -curl -s http://localhost:8082/metrics +curl -s http://localhost:18090/metrics # All processor metrics (Thompson Sampling) -curl -s http://localhost:8083/metrics +curl -s http://localhost:18091/metrics # Filter specific metrics curl -s http://localhost:8000/metrics | grep time_to_first_token -curl -s http://localhost:8081/metrics | grep kvstats -curl -s http://localhost:8081/metrics | grep "sglang:" -curl -s http://localhost:8083/metrics | grep thompson +curl -s http://localhost:18081/metrics | grep kvstats +curl -s http://localhost:18081/metrics | grep "sglang:" # SGLang backend +curl -s http://localhost:18081/metrics | grep "vllm:" # vLLM backend +curl -s http://localhost:18091/metrics | grep thompson ``` ## Troubleshooting @@ -540,15 +756,52 @@ curl -s http://localhost:8083/metrics | grep thompson Check if Dynamo is running: ```bash +# Check frontend health curl http://localhost:8000/health -curl http://localhost:8081/metrics + +# Check worker metrics (Worker 0) +curl http://localhost:18081/metrics + +# Check router metrics +curl http://localhost:18090/metrics + +# Check processor metrics +curl http://localhost:18091/metrics ``` ### Grafana shows "No data" -1. 
Verify Prometheus is scraping: http://localhost:9090/targets -2. Check if metrics exist: http://localhost:9090/graph (query a metric name) -3. Ensure time range is correct in Grafana +1. **Verify Prometheus is scraping**: http://localhost:9090/targets + - All targets should show "UP" state + - Check for scrape errors in the "Error" column + +2. **Check if metrics exist**: http://localhost:9090/graph + - Query a metric name (for example, `dynamo_frontend_requests_total`) + - If no data, Dynamo may not be running or generating traffic + +3. **Ensure time range is correct in Grafana**: + - Click the time picker (top right) + - Select "Last 1 hour" or expand to see historical data + - If you just started, wait 30-60 seconds for initial data + +4. **Check backend selector**: + - Make sure the Backend dropdown matches your deployment (sglang vs vllm) + - Backend mismatch will result in empty panels + +### SSH tunnel issues + +If you can't access Grafana via SSH tunnel: + +```bash +# Verify the tunnel is active +ps aux | grep "ssh -L 3000" + +# Test if port 3000 is accessible locally +curl -s http://localhost:3000/api/health + +# If "connection refused", recreate the tunnel +ssh -L 3000:localhost:3000 @ +``` ### Port conflicts @@ -563,12 +816,24 @@ environment: - GF_SERVER_HTTP_PORT=3001 # Different port ``` +### Stale metrics after restart + +If you see old worker instances in Grafana after restarting Dynamo: + +```bash +# Clear Prometheus data and restart +docker stop dynamo-prometheus +docker rm dynamo-prometheus +docker volume rm monitoring_prometheus_data +cd monitoring && docker compose up -d +``` + ## Alternative: File-Based Collection If you don't want to run Prometheus/Grafana, use the collection script: ```bash -cd /localhome/local-bbednarski/NeMo-Agent-Toolkit/external/dynamo +cd external/dynamo ./collect_metrics.sh ./metrics_output 30 # Collect every 30s ``` @@ -578,12 +843,12 @@ This creates timestamped `.prom` files that can be analyzed later or imported in 
### Summary by Component -| Component | Port | Metric Count | Key Prefixes | -|-----------|------|--------------|--------------| +| Component | Port(s) | Metric Count | Key Prefixes | +|-----------|---------|--------------|--------------| | Frontend | 8000 | ~22 | `dynamo_frontend_*` | -| Worker | 8081 | ~50 | `dynamo_component_kvstats_*`, `sglang:*` | -| Router | 8082 | ~20 | `dynamo_component_*` (labeled `router`) | -| Processor | 8083 | ~35 | `dynamo_component_thompson_*` | +| Workers | 18081+ | ~50 | `dynamo_component_kvstats_*`, `sglang:*` or `vllm:*` | +| Router | 18090 | ~20 | `dynamo_component_*` (labeled `router`) | +| Processor | 18091 | ~35 | `dynamo_component_thompson_*` | ### All Metric Names by Component @@ -608,7 +873,7 @@ dynamo_frontend_time_to_first_token_seconds_{bucket,count,sum}
-Worker (port 8081) - 50 metrics +Worker (ports 18081+) - 50 metrics per worker **Dynamo Component Metrics:** ``` @@ -658,7 +923,7 @@ sglang:utilization
-Router (port 8082) - 20 metrics +Router (port 18090) - 20 metrics ``` dynamo_component_inflight_requests{dynamo_component="router"} @@ -683,7 +948,7 @@ dynamo_component_uptime_seconds
-Processor (port 8083) - 35 metrics +Processor (port 18091) - 35 metrics **Standard Dynamo Component Metrics:** ``` From 25a8bd53e891840222dd92d7200f5b42fce92dfc Mon Sep 17 00:00:00 2001 From: bbednarski9 Date: Fri, 30 Jan 2026 08:18:12 +0000 Subject: [PATCH 11/13] sequence diagrams Signed-off-by: bbednarski9 --- external/dynamo/E2E_SEQUENCE.md | 12 ++++++++++++ external/dynamo/END_TO_END_EVALS.md | 0 2 files changed, 12 insertions(+) create mode 100644 external/dynamo/E2E_SEQUENCE.md delete mode 100644 external/dynamo/END_TO_END_EVALS.md diff --git a/external/dynamo/E2E_SEQUENCE.md b/external/dynamo/E2E_SEQUENCE.md new file mode 100644 index 0000000000..b50020825f --- /dev/null +++ b/external/dynamo/E2E_SEQUENCE.md @@ -0,0 +1,12 @@ +```mermaid +graph TB + A[Incoming Request] --> B{Has Priority?} + B -->|Yes| C[Route to v2 Path] + B -->|No| D[Route to v1 Path] + C --> E[KVBM v2 PyScheduler] + E --> F[ExplicitMultiLruBackend] + D --> G[Existing KVBM v1] + G --> H[Current 3-Pool or Default LRU] + F --> I[vLLM Worker] + H --> I +``` \ No newline at end of file diff --git a/external/dynamo/END_TO_END_EVALS.md b/external/dynamo/END_TO_END_EVALS.md deleted file mode 100644 index e69de29bb2..0000000000 From 1379b12c9faa1b3a0ee1488890835ceb8a9e1f34 Mon Sep 17 00:00:00 2001 From: bbednarski9 Date: Fri, 30 Jan 2026 08:20:15 +0000 Subject: [PATCH 12/13] sequence diagrams 3 Signed-off-by: bbednarski9 --- external/dynamo/E2E_SEQUENCE.md | 792 +++++++++++++++++++++++++++++++- 1 file changed, 781 insertions(+), 11 deletions(-) diff --git a/external/dynamo/E2E_SEQUENCE.md b/external/dynamo/E2E_SEQUENCE.md index b50020825f..e5aa8916bf 100644 --- a/external/dynamo/E2E_SEQUENCE.md +++ b/external/dynamo/E2E_SEQUENCE.md @@ -1,12 +1,782 @@ +# End-to-End Sequence Diagram: NeMo Agent Toolkit → Dynamo Integration + +This document captures the information flow from NeMo Agent Toolkit chat requests through `dynamo_llm.py` to the custom components launched by 
`start_dynamo_optimized_thompson_hints_vllm.sh`. + +## Architecture Overview + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ NeMo Agent Toolkit │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ DynamoModelConfig (dynamo_llm.py) │ │ +│ │ prefix_template: "react-benchmark-{uuid}" │ │ +│ │ prefix_total_requests: 10 │ │ +│ │ prefix_osl: MEDIUM │ │ +│ │ prefix_iat: MEDIUM │ │ +│ │ # reuse_budget: (computed by processor: total_requests - count) │ │ +│ │ │ │ +│ │ _DynamoTransport injects: │ │ +│ │ → HTTP Headers: x-prefix-id, x-prefix-total-requests, ... │ │ +│ │ → nvext.annotations in request body │ │ +│ └─────────────────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ Dynamo Stack (Docker Container) │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ Default Frontend (port 8000) │ │ +│ │ → Tokenization + nvext parsing │ │ +│ │ → ETCD ModelWatcher (namespace=dynamo) │ │ +│ │ → Discovers processor ONLY (workers hidden) │ │ +│ └─────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ Custom Processor (processor.py / processor_multilru.py) │ │ +│ │ → Registered at: dynamo.backend.generate │ │ +│ │ → Extracts: prefix_id, total_requests, osl, iat │ │ +│ │ → Manages reuse_budget tracking │ │ +│ │ → Queries Router, forwards to Workers │ │ +│ └─────────────────────────────────────────────────────────────────────┘ │ +│ │ │ │ +│ ▼ ▼ │ +│ ┌────────────────────────────┐ ┌─────────────────────────────────────┐ │ +│ │ Custom Router (router.py) │ │ vLLM Workers (dynamo.vllm) │ │ +│ │ → Thompson Sampling │ │ → workers.backend.generate │ │ +│ │ → KV Overlap Scoring │ │ → MultiLRU (optional) │ 
│ +│ │ → LinTS + Beta-TS │ │ → KV Events via ZMQ │ │ +│ └────────────────────────────┘ └─────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +## Sequence Diagram: Full Request Flow + ```mermaid -graph TB - A[Incoming Request] --> B{Has Priority?} - B -->|Yes| C[Route to v2 Path] - B -->|No| D[Route to v1 Path] - C --> E[KVBM v2 PyScheduler] - E --> F[ExplicitMultiLruBackend] - D --> G[Existing KVBM v1] - G --> H[Current 3-Pool or Default LRU] - F --> I[vLLM Worker] - H --> I -``` \ No newline at end of file +sequenceDiagram + autonumber + + box rgb(45, 50, 80) NeMo Agent Toolkit + participant Client as Agent/Client
(LangChain/LlamaIndex) + participant DynamoLLM as DynamoModelConfig
(dynamo_llm.py) + participant Transport as _DynamoTransport
(httpx wrapper) + end + + box rgb(50, 70, 50) Infrastructure + participant ETCD as ETCD
(Service Discovery) + participant NATS as NATS
(KV Events) + end + + box rgb(70, 50, 50) Dynamo Stack + participant Frontend as Default Frontend
(dynamo.frontend) + participant Processor as Custom Processor
(processor.py) + participant Router as Thompson Router
(router.py) + participant Worker as vLLM Worker
(dynamo.vllm) + participant KVBM as MultiLRU Backend
(kvbm.v2) + end + + box rgb(60, 60, 40) Observability + participant Prometheus as Prometheus
(Metrics) + end + + %% ==================== INITIALIZATION PHASE ==================== + Note over ETCD,NATS: Infrastructure Startup + + Worker->>ETCD: Register at workers.backend.generate
(model: llama-3.3-70b-internal) + Note over Worker: Workers use internal model name
to hide from frontend discovery + + Router->>ETCD: Register at dynamo.router.find_worker
& dynamo.router.feedback + + Processor->>ETCD: Register at dynamo.backend.generate
(model: llama-3.3-70b) + Note over Processor: Processor uses PUBLIC model name
→ Frontend discovers ONLY processor + + Frontend->>ETCD: ModelWatcher (namespace=dynamo)
Discovers processor only + + Worker->>NATS: Subscribe to KV event streams + + %% ==================== REQUEST PHASE ==================== + Note over Client,Prometheus: Request Flow with Prefix Hints + + rect rgb(35, 40, 60) + Note right of Client: User initiates chat request + Client->>DynamoLLM: chat.completions.create()
with DynamoPrefixContext + + DynamoLLM->>DynamoLLM: Generate prefix_id from template
"react-benchmark-{uuid}" + + DynamoLLM->>Transport: Build request with config:
prefix_total_requests=10
prefix_osl=MEDIUM
prefix_iat=MEDIUM + end + + rect rgb(40, 50, 45) + Note right of Transport: Transport Layer Injection + Transport->>Transport: Inject HTTP Headers:
x-prefix-id: react-benchmark-abc123
x-prefix-total-requests: 10
x-prefix-osl: MEDIUM
x-prefix-iat: MEDIUM + + Transport->>Transport: Inject nvext.annotations:
["prefix_id:react-benchmark-abc123",
"total_requests:10",
"osl:MEDIUM", "iat:MEDIUM"] + + Transport->>Frontend: POST /v1/chat/completions
(HTTP + nvext.annotations) + end + + rect rgb(50, 40, 40) + Note right of Frontend: Frontend Processing + Frontend->>Frontend: Parse nvext.annotations
from request body + + Frontend->>Frontend: Tokenize messages
→ token_ids: [128000, 9906, ...] + + Frontend->>Frontend: Build PreprocessedRequest:
{token_ids, annotations, sampling_options} + + Frontend->>ETCD: Query ModelWatcher
(namespace=dynamo) + ETCD-->>Frontend: Discovered: dynamo.backend.generate
(processor, NOT workers) + + Frontend->>Processor: Forward PreprocessedRequest
via dynamo.backend.generate + end + + rect rgb(55, 45, 45) + Note right of Processor: Processor - Hint Extraction + Processor->>Processor: Extract from annotations:
prefix_id = "react-benchmark-abc123"
total_requests = 10
osl = "MEDIUM"
iat = "MEDIUM" + + Processor->>Processor: Update _prefix_state:
reuse_budget = total - processed + + Processor->>Processor: Build RouterRequest:
{tokens, prefix_id, reuse_budget, osl, iat} + end + + rect rgb(45, 55, 50) + Note right of Router: Thompson Sampling Routing + Processor->>Router: Query find_worker(RouterRequest) + + Router->>Router: Get available workers
from engine_client.instance_ids() + + Router->>Router: KvIndexer.find_matches_for_request()
→ OverlapScores per worker + + loop For each worker + Router->>Router: Build 9-dim feature vector:
[1.0, inv_load, overlap, affinity,
outstanding_norm, decode_norm,
prefill_norm, iat_norm, reuse_norm] + + Router->>Router: LinTS sample: θ ~ N(μ, v²Σ⁻¹)
score = θᵀx + + Router->>Router: Beta-TS sample: p ~ Beta(α, β)
Add exploration bonus + + Router->>Router: Apply affinity bonus (if sticky)
Apply switching penalty (if switch) + + Router->>Router: Compute load modifier
(GPU util, queue depth, outstanding work) + end + + Router->>Router: Softmax selection with temperature
temp = base / (1 + reuse * iat_factor) + + Router->>Router: Store pending decision:
{decision_id, wid, x, start_ts, ...} + + Router-->>Processor: RouterResponse:
{worker_id, decision_id, overlap} + + Router->>Prometheus: thompson_router_decisions_total++
thompson_router_kv_overlap.set() + end + + rect rgb(50, 50, 55) + Note right of Worker: Worker Execution + Processor->>Processor: thompson_routing_decisions_total++
(worker_id label) + + Processor->>Worker: Forward PreprocessedRequest
via workers.backend.generate
(direct routing to worker_id) + + alt MultiLRU Enabled (DYNAMO_USE_MULTILRU=true) + Worker->>KVBM: DynamoScheduler.schedule() + + Note over KVBM: MultiLRU 4-Pool Architecture:
Cold (freq < 2) → Warm (2-5)
→ Hot (6-14) → VeryHot (≥15) + + KVBM->>KVBM: FrequencyTracker.touch(hash)
Calculate priority level + + KVBM->>KVBM: find_matches() across pools
Evict from coldest first + + KVBM-->>Worker: Scheduled sequences
with KV cache allocation + else Standard vLLM Scheduler + Worker->>Worker: Standard LRU scheduling + end + + Worker->>Worker: Execute prefill + decode
with prefix caching + + Worker->>NATS: Publish KV events
(cache state changes) + + loop Stream tokens + Worker-->>Processor: Token chunks
{token_ids, finish_reason, usage} + + Processor->>Processor: Extract KVEfficiencyData:
cached_tokens, device_blocks, etc. + + Processor-->>Frontend: Forward token chunks + Frontend-->>Transport: SSE stream + Transport-->>Client: Streaming response + end + end + + rect rgb(45, 50, 55) + Note right of Processor: Feedback Loop + Processor->>Processor: Calculate latency_ms
tokens_in, tokens_out + + Processor->>Router: FeedbackRequest:
{decision_id, latency_ms, success,
tokens_in, tokens_out, finish_reason} + + Router->>Router: Retrieve pending decision
by decision_id + + Router->>Router: Compute reward:
metric = latency_ms / tokens_out
baseline = EMA(worker, osl, prefill)
reward = 1 / (1 + metric/baseline) + + Router->>Router: Update Beta bandit:
α' = α + reward
β' = β + (1 - reward) + + Router->>Router: Update LinTS:
A = forget·A + xxᵀ + ridge·I
b = forget·b + x·reward + + Router->>Prometheus: thompson_router_feedback_latency
thompson_router_reward.set() + + Router-->>Processor: FeedbackAck:
{ok, reward, baseline_used} + end + + rect rgb(40, 45, 50) + Note right of Prometheus: Metrics Collection + Processor->>Prometheus: thompson_kve_prompt_tokens_total
thompson_kve_cached_tokens_total
thompson_kve_device_blocks_total + + Processor->>Prometheus: thompson_request_latency_seconds
thompson_tokens_in/out_total + + Worker->>Prometheus: vllm:gpu_cache_usage_perc
vllm:num_requests_waiting + end +``` + +## Detailed Data Structures + +### 1. NeMo Agent Toolkit → Frontend + +**HTTP Request with nvext.annotations:** +```json +{ + "model": "llama-3.3-70b", + "messages": [{"role": "user", "content": "Hello!"}], + "max_tokens": 50, + "stream": true, + "nvext": { + "annotations": [ + "prefix_id:react-benchmark-abc123", + "total_requests:10", + "osl:MEDIUM", + "iat:MEDIUM" + ] + } +} +``` + +**HTTP Headers (legacy support):** +``` +x-prefix-id: react-benchmark-abc123 +x-prefix-total-requests: 10 +x-prefix-osl: MEDIUM +x-prefix-iat: MEDIUM +``` + +### 2. Frontend → Processor (PreprocessedRequest) + +```json +{ + "token_ids": [128000, 9906, 0, ...], + "annotations": [ + "prefix_id:react-benchmark-abc123", + "total_requests:10", + "osl:MEDIUM", + "iat:MEDIUM" + ], + "sampling_options": { + "temperature": 0.7, + "top_p": 0.9 + }, + "stop_conditions": { + "max_tokens": 50 + } +} +``` + +### 3. Processor → Router (RouterRequest) + +```json +{ + "tokens": [128000, 9906, 0, ...], + "prefix_id": "react-benchmark-abc123", + "reuse_budget": 9, + "expected_osl": "MEDIUM", + "interarrival": "MEDIUM" +} +``` + +### 4. Router → Processor (RouterResponse) + +```json +{ + "worker_id": 0, + "prefix_hit_rate": 0.85, + "decision_id": "a1b2c3d4e5f6..." +} +``` + +### 5. Processor → Router (FeedbackRequest) + +```json +{ + "decision_id": "a1b2c3d4e5f6...", + "latency_ms": 1234.56, + "success": true, + "tokens_in": 128, + "tokens_out": 50, + "finish_reason": "stop" +} +``` + +## KvIndexer: Router ↔ Worker KV State Binding + +The router accesses KV cache overlap data via Python bindings to the Rust `KvIndexer`. This is how the router determines which worker has the best prefix cache match. 

### KvIndexer Python Binding Interface

```python
# From kvbm_next_source/lib/bindings/python/src/dynamo/_core.pyi

class OverlapScores:
    """Collection of prefix matching scores for workers."""

    @property
    def scores(self) -> Dict[int, int]:
        """Map of worker_id → number of matching blocks."""
        ...

    @property
    def frequencies(self) -> List[int]:
        """Access frequencies for matched blocks (0 entries omitted)."""
        ...

class KvIndexer:
    """Tracks KV events emitted by workers (add_block, remove_block)."""

    def __init__(self, component: Component, block_size: int) -> None:
        """Create KvIndexer attached to a Dynamo component."""

    def find_matches(self, sequence: List[int]) -> OverlapScores:
        """Find prefix matches for block hash sequence."""
        ...

    def find_matches_for_request(self, token_ids: List[int], lora_id: int) -> OverlapScores:
        """Return overlap scores for workers given token sequence."""
        ...

    def block_size(self) -> int:
        """Return configured block size."""
        ...
```

### Router KvIndexer Usage

```python
# From router.py - initialization
self.indexer = KvIndexer(engine, self.block_size)

# From router.py - find_matches_for_request call
scores: OverlapScores = await self.indexer.find_matches_for_request(req.tokens, 0)

# scores.scores is Dict[int, int] with worker_id → matched block count
# (per OverlapScores above); cast to float here for downstream scoring
overlap = float(scores.scores.get(wid, 0.0))
```

### KV State Update Flow

```mermaid
sequenceDiagram
    participant Worker as vLLM Worker
    participant NATS as NATS JetStream
    participant Indexer as KvIndexer (Rust)
    participant Router as Thompson Router

    Note over Worker,Router: KV Event Publishing (via ZMQ/NATS)

    Worker->>Worker: Allocate/evict KV blocks
    Worker->>NATS: Publish KvCacheEvent
{event_id, stored/removed, block_hashes} + + Note over Indexer: Background event subscription + NATS->>Indexer: Stream KV events + Indexer->>Indexer: Apply events to RadixTree
Update per-worker block state + + Note over Router,Indexer: Router Query Path + Router->>Indexer: find_matches_for_request(tokens, lora_id) + Indexer->>Indexer: Hash tokens → block hashes
Search RadixTree for matches + Indexer-->>Router: OverlapScores
{scores: {wid: count}, frequencies: [...]} + + Router->>Router: Use overlap in feature vector
for Thompson Sampling +``` + +## MultiLRU Architecture Detail + +The MultiLRU backend is an advanced KV cache eviction strategy that uses frequency-based pool promotion. + +```mermaid +flowchart TB + subgraph MultiLRU["MultiLRU Backend (4-Pool System)"] + direction TB + + subgraph FreqTracker["TinyLFU Frequency Tracker"] + FT[FrequencyTracker
count(hash) → u8] + end + + subgraph Pools["Priority Pools"] + direction LR + Cold["Cold Pool
freq < 2
🥶"] + Warm["Warm Pool
freq 2-5
🌡️"] + Hot["Hot Pool
freq 6-14
🔥"] + VeryHot["VeryHot Pool
freq ≥ 15
⭐"] + end + + subgraph Operations["Operations"] + Insert["insert(block)
→ Pool by frequency"] + FindMatch["find_matches(hashes)
→ Search all pools"] + Allocate["allocate(count)
→ Evict Cold first"] + end + end + + subgraph DynamoScheduler["DynamoScheduler (vLLM Integration)"] + Sched["RustScheduler

vLLM Shadow Observer"] + end + + Worker["vLLM Worker
workers.backend.generate"] --> DynamoScheduler + DynamoScheduler --> MultiLRU + + FT --> |"touch(hash)"| Cold + Cold --> |"freq ≥ 2"| Warm + Warm --> |"freq ≥ 6"| Hot + Hot --> |"freq ≥ 15"| VeryHot + + style Cold fill:#4a90d9 + style Warm fill:#f5a623 + style Hot fill:#d0021b + style VeryHot fill:#f8e71c +``` + +### DynamoScheduler Integration (Expanded) + +The `DynamoScheduler` is the vLLM integration point that enables MultiLRU. It implements an **inverted shadow observer pattern** where: +- **Rust scheduler** is the primary decision maker (with MultiLRU backend) +- **vLLM scheduler** runs in shadow mode for comparison + +```mermaid +sequenceDiagram + participant vLLM as vLLM Engine + participant DS as DynamoScheduler + participant RS as RustScheduler + participant VS as vLLM Scheduler (Shadow) + participant ML as MultiLruBackend + + Note over vLLM,ML: Request Addition + vLLM->>DS: add_request(Request) + DS->>DS: Store request for output reconstruction
_requests[req_id] = request + DS->>RS: add_request(req_id, prompt_token_ids) + DS->>VS: add_request(request) [shadow mode] + + Note over vLLM,ML: Schedule Call + vLLM->>DS: schedule() + + DS->>VS: schedule() [get finished_req_ids first] + VS-->>DS: vllm_output (with finished_req_ids) + + DS->>RS: finish_requests(finished_ids) [sync completions] + + DS->>RS: schedule() [PRIMARY decision] + + rect rgb(60, 50, 50) + Note over RS,ML: Rust Scheduler Internal + RS->>ML: find_matches(block_hashes) + ML->>ML: Search all 4 pools
Touch frequency tracker + ML-->>RS: Matched blocks + frequencies + RS->>RS: Compute schedule output
(new_reqs, cached_reqs, blocks) + end + + RS-->>DS: rust_output_dict + + DS->>DS: _rust_output_to_scheduler_output()
Convert to vLLM format + DS->>DS: _compare_outputs(rust, vllm)
Print divergence warnings + + DS-->>vLLM: RustSchedulerOutput
(with vLLM's finished_req_ids) + + Note over vLLM,ML: Output Update + vLLM->>DS: update_from_output(scheduler_output, model_output) + DS->>VS: update_from_output() [shadow] + DS->>RS: update_from_output(finished_ids, output_tokens) + RS->>ML: Update block states based on output +``` + +### DynamoScheduler Key Implementation Details + +```python +# From kvbm_next_source/lib/bindings/kvbm/python/kvbm/v2/vllm/schedulers/dynamo.py + +class DynamoScheduler(SchedulerInterface): + """Scheduler with inverted shadow observer pattern.""" + + def __init__(self, vllm_config, kv_cache_config, ...): + # Create vLLM scheduler (shadow mode) + self._scheduler = Scheduler(vllm_config, kv_cache_config, ...) + + # Initialize Rust scheduler (primary) if available + if _RUST_SCHEDULER_AVAILABLE: + rust_config = RustSchedulerConfig( + max_num_batched_tokens=..., + max_num_seqs=..., + block_size=block_size, + enable_prefix_caching=True, # Required for MultiLRU + total_blocks=total_blocks, + ) + self._rust_scheduler = RustScheduler(rust_config) + + def schedule(self) -> SchedulerOutput: + # 1. Get vLLM schedule first (for finished_req_ids) + vllm_output = self._scheduler.schedule() + + # 2. Sync finished requests to Rust BEFORE it schedules + if vllm_output.finished_req_ids: + self._rust_scheduler.finish_requests( + list(vllm_output.finished_req_ids), + RustRequestStatus.finished_stopped(), + ) + + # 3. Get Rust scheduler decision (PRIMARY) + rust_output_dict = self._rust_scheduler.schedule() + rust_output = self._rust_output_to_scheduler_output(rust_output_dict) + + # 4. Use vLLM's finished_req_ids (vLLM tracks completion) + rust_output.finished_req_ids = vllm_output.finished_req_ids + + # 5. 
Compare and warn on divergence + self._compare_outputs(rust_output, vllm_output) + + return rust_output +``` + +### MultiLruBackend Rust Implementation + +```rust +// From kvbm_next_source/lib/kvbm/src/v2/logical/pools/inactive/backends/multi_lru_backend.rs + +pub struct MultiLruBackend { + priority_pools: [LruCache>; 4], + frequency_tracker: Arc>, + frequency_thresholds: [u8; 3], // [cold→warm, warm→hot, hot→very_hot] +} + +impl MultiLruBackend { + /// Calculate priority level based on access frequency + fn calculate_priority_level(&self, seq_hash: SequenceHash) -> usize { + let frequency = self.frequency_tracker.count(seq_hash.as_u128()); + let [t1, t2, t3] = self.frequency_thresholds; + + if frequency < t1 as u32 { 0 } // Cold: 0 to (t1 - 1) + else if frequency < t2 as u32 { 1 } // Warm: t1 to (t2 - 1) + else if frequency < t3 as u32 { 2 } // Hot: t2 to (t3 - 1) + else { 3 } // VeryHot: t3+ + } +} + +impl InactivePoolBackend for MultiLruBackend { + /// Evict blocks starting from coldest pool + fn allocate(&mut self, count: usize) -> Vec> { + let mut allocated = Vec::with_capacity(count); + for _ in 0..count { + for pool in &mut self.priority_pools { // Cold first + if let Some((_, block)) = pool.pop_lru() { + allocated.push(block); + break; + } + } + } + allocated + } + + /// Insert block into appropriate pool based on frequency + fn insert(&mut self, block: Block) { + let level = self.calculate_priority_level(block.sequence_hash()); + self.priority_pools[level].put(block.sequence_hash(), block); + } +} +``` + +## Component Registration (ETCD) + +```mermaid +flowchart LR + subgraph Workers["workers namespace"] + W1["workers.backend.generate
instance_0
model: llama-3.3-70b-internal"] + W2["workers.backend.generate
instance_1
model: llama-3.3-70b-internal"] + end + + subgraph Dynamo["dynamo namespace"] + R["dynamo.router.find_worker
dynamo.router.feedback"] + P["dynamo.backend.generate
model: llama-3.3-70b"] + end + + FE["Frontend
ModelWatcher
namespace=dynamo"]

    FE -.->|"Discovers"| P
    FE -.-x|"Cannot see"| Workers

    P -->|"Queries"| R
    P -->|"Forwards to"| W1
    P -->|"Forwards to"| W2
    R -->|"Selects"| W1
    R -->|"Selects"| W2

    style FE fill:#4a5568
    style P fill:#48bb78
    style R fill:#ed8936
    style W1 fill:#667eea
    style W2 fill:#667eea
```

## Thompson Sampling Algorithm

```mermaid
flowchart TB
    subgraph Input["Request Context"]
        Req["RouterRequest
tokens, prefix_id, reuse_budget, osl, iat"] + end + + subgraph Features["9-Dimensional Feature Vector"] + F1["1.0 (bias)"] + F2["inv_load = 1/(1 + gpu×w_gpu + queue×w_queue)"] + F3["overlap = KvIndexer.find_matches()"] + F4["affinity = 1 if sticky else 0"] + F5["outstanding_norm = tanh(0.1 × work)"] + F6["decode_norm = decode_cost / 3.0"] + F7["prefill_norm = tanh(prefill_cost)"] + F8["iat_norm = iat_factor / 1.5"] + F9["reuse_norm = tanh(0.25 × reuse_budget)"] + end + + subgraph LinTS["Contextual Bandit (LinTS)"] + A["A = λI + Σ xxᵀ
(precision matrix)"] + b["b = Σ x×reward"] + Theta["θ ~ N(A⁻¹b, v²A⁻¹)"] + LinScore["score_lin = θᵀx"] + end + + subgraph BetaTS["Beta Bandit"] + Alpha["α (successes)"] + Beta["β (failures)"] + BetaSample["p ~ Beta(α, β)"] + BetaScore["score_beta = base_weight × p"] + end + + subgraph Modifiers["Score Modifiers"] + Affinity["+ affinity_base × (0.5 + 0.5×overlap)
if sticky and reuse > 0"] + SwitchCost["- switch_cost_base
if switching and reuse > 0"] + LoadMod["× load_modifier
(GPU util, queue, outstanding)"] + end + + subgraph Selection["Worker Selection"] + Softmax["Softmax(scores, temperature)
temp = base / (1 + reuse × iat)"] + Sample["Random sample from distribution"] + Result["Selected worker_id"] + end + + Req --> Features + Features --> LinTS + Features --> BetaTS + LinTS --> LinScore + BetaTS --> BetaScore + LinScore --> Modifiers + BetaScore --> Modifiers + Modifiers --> Selection + Selection --> Result +``` + +## Data Flow Bridges (Potential Optimization Points) + +| Bridge | From | To | Data | Current State | Optimization Opportunity | +|--------|------|-----|------|---------------|-------------------------| +| **A** | `dynamo_llm.py` | Frontend | nvext.annotations | ✅ Working | Add backend selector annotation | +| **B** | Frontend | Processor | PreprocessedRequest.annotations | ✅ Working | Passthrough preserved | +| **C** | Processor | Router | RouterRequest | ✅ Working | Add `use_frequency_backend` hint | +| **D** | Router | KvIndexer | Token hashes | ✅ Working | Integrate with MultiLRU frequency data | +| **E** | Router | Workers | worker_id | ✅ Working | Send expected frequency hint | +| **F** | Worker | NATS | KV events | ✅ Working | Include frequency counts | +| **G** | NATS | Router | KV state updates | ⚠️ Partial | Real-time frequency sync | +| **H** | MultiLRU | Prometheus | Pool distribution | ❌ Missing | Export pool occupancy metrics | + +## Prometheus Metrics Summary + +> **Note**: All custom components (router, processor) use `prometheus_client.REGISTRY` directly for metrics registration. They do **not** use NATS for metrics—only for KV cache event streaming. 
+ +### Processor Metrics (`thompson_*`) +- `thompson_requests_total` - Total requests processed +- `thompson_request_latency_seconds` - E2E latency histogram +- `thompson_tokens_in_total` / `thompson_tokens_out_total` - Throughput +- `thompson_routing_decisions_total{worker_id}` - Per-worker routing +- `thompson_kve_prompt_tokens_total` - KV efficiency denominator +- `thompson_kve_cached_tokens_total` - KV efficiency numerator +- `thompson_kve_device_blocks_total` - GPU cache hits + +### Router Metrics (`thompson_router_*`) + +```python +# From router.py - uses prometheus_client directly +from prometheus_client import REGISTRY, Counter, Gauge, Histogram + +metrics["decisions_total"] = Counter( + "thompson_router_decisions_total", ..., registry=REGISTRY) +metrics["kv_overlap"] = Gauge( + "thompson_router_kv_overlap", ..., registry=REGISTRY) +# ... etc +``` + +- `thompson_router_decisions_total{worker_id}` - Routing decisions +- `thompson_router_kv_overlap{worker_id}` - Overlap scores +- `thompson_router_feedback_latency_seconds{worker_id}` - Feedback latency +- `thompson_router_reward{worker_id}` - Computed rewards +- `thompson_router_pending_decisions` - Awaiting feedback +- `thompson_router_beta_alpha{worker_id}` / `beta_beta` - Bandit params +- `thompson_router_sticky_decisions_total` - Affinity hits +- `thompson_router_switch_decisions_total` - Worker switches +- `thompson_router_reuse_budget` - Distribution of reuse_budget values +- `thompson_router_tokens_per_request` - Distribution of input token counts + +### Worker Metrics (`vllm:*`) +- `vllm:gpu_cache_usage_perc` - GPU memory utilization +- `vllm:num_requests_waiting` - Queue depth +- `vllm:prompt_tokens_total` / `generation_tokens_total` - Throughput + +## Configuration Reference + +### DynamoModelConfig (dynamo_llm.py) +```python +prefix_template: str = "nat-dynamo-{uuid}" # Template with {uuid} placeholder +prefix_total_requests: int = 10 # Expected requests per conversation +prefix_osl: Literal["LOW", 
"MEDIUM", "HIGH"] = "MEDIUM" # Output length hint +prefix_iat: Literal["LOW", "MEDIUM", "HIGH"] = "MEDIUM" # Inter-arrival hint +# NOTE: reuse_budget is computed by processor from total_requests - processed_count +# Future enhancement: allow explicit reuse_budget override via annotation +``` + +### Router Config (config.yaml) +```yaml +affinity: + base: 0.30 # Primary stickiness + reuse_weight: 0.15 # Reuse budget bonus + iat_weight: 0.20 # IAT multiplier +exploration: + base_ts_weight: 0.10 # Beta-TS exploration + temperature: + base: 1.0 # Softmax temperature +lints: + lambda: 1.0 # LinTS regularization + v: 0.25 # Sampling variance + forget_rate: 0.995 # Forgetting factor +``` + +### MultiLRU Config (kvbm.v2) +```rust +frequency_thresholds: [2, 6, 15] // Cold→Warm, Warm→Hot, Hot→VeryHot +// Pool 0 (Cold): frequency 0-1 +// Pool 1 (Warm): frequency 2-5 +// Pool 2 (Hot): frequency 6-14 +// Pool 3 (VeryHot): frequency 15+ +``` + +--- + +*Generated from codebase analysis of:* +- `NeMo-Agent-Toolkit/src/nat/llm/dynamo_llm.py` +- `NeMo-Agent-Toolkit/external/dynamo/optimized/processor.py` +- `NeMo-Agent-Toolkit/external/dynamo/optimized/router.py` +- `NeMo-Agent-Toolkit/external/dynamo/start_dynamo_optimized_thompson_hints_vllm.sh` +- `kvbm_next_source/lib/kvbm/src/v2/logical/pools/inactive/backends/multi_lru_backend.rs` +- `kvbm_next_source/components/src/dynamo/frontend/main.py` + From 54e10a1a6b07ae54c28ace399b23977a430159b9 Mon Sep 17 00:00:00 2001 From: bbednarski9 Date: Fri, 30 Jan 2026 08:21:22 +0000 Subject: [PATCH 13/13] sequence diagrams 4 Signed-off-by: bbednarski9 --- external/dynamo/E2E_SEQUENCE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/dynamo/E2E_SEQUENCE.md b/external/dynamo/E2E_SEQUENCE.md index e5aa8916bf..25941026a8 100644 --- a/external/dynamo/E2E_SEQUENCE.md +++ b/external/dynamo/E2E_SEQUENCE.md @@ -606,7 +606,7 @@ flowchart LR FE["Frontend
ModelWatcher
namespace=dynamo"] FE -.->|"Discovers"| P - FE -.x|"Cannot see"| Workers + FE -.-x|"Cannot see"| Workers P -->|"Queries"| R P -->|"Forwards to"| W1