diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index fb2bc8368e..ac451bc0fa 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -38,7 +38,8 @@ repos:
name: Clear Jupyter Notebook Output Cells
entry: ci/scripts/clear_notebook_output_cells.sh
files: "\\.ipynb$"
- language: script
+ language: python
+ additional_dependencies: ["nbconvert"]
- repo: https://github.com/tcort/markdown-link-check
rev: v3.14.1
diff --git a/docs/source/components/integrations/frameworks.md b/docs/source/components/integrations/frameworks.md
index 094065b0bc..ab27845314 100644
--- a/docs/source/components/integrations/frameworks.md
+++ b/docs/source/components/integrations/frameworks.md
@@ -148,7 +148,7 @@ uv pip install "nvidia-nat[crewai]"
LangChain is a framework for building applications that utilize large language models (LLMs) to interact with data. It provides a set of tools for creating chains of LLM calls, allowing for complex workflows powered by LLMs. LangChain focuses on modularity and extensibility, making it suitable for integrating custom data pipelines and enhancing intelligent applications.
-For more information, visit the [LangChain website](https://www.langchain.com/).
+For more information, visit the [LangChain documentation](https://docs.langchain.com/oss/python/langchain/overview).
| Capability | Providers / Details |
diff --git a/examples/dynamo_integration/react_benchmark_agent/src/react_benchmark_agent/configs/eval_config_no_rethinking_minimal_test.yml b/examples/dynamo_integration/react_benchmark_agent/src/react_benchmark_agent/configs/eval_config_no_rethinking_minimal_test.yml
index 6d3b8aff31..e5a695ba0a 100644
--- a/examples/dynamo_integration/react_benchmark_agent/src/react_benchmark_agent/configs/eval_config_no_rethinking_minimal_test.yml
+++ b/examples/dynamo_integration/react_benchmark_agent/src/react_benchmark_agent/configs/eval_config_no_rethinking_minimal_test.yml
@@ -78,7 +78,7 @@ llms:
dynamo_llm:
_type: dynamo
model_name: llama-3.3-70b
- base_url: http://localhost:8099/v1
+ base_url: http://localhost:8000/v1
api_key: dummy
# _type: nim
# model_name: meta/llama-3.3-70b-instruct
diff --git a/examples/dynamo_integration/react_benchmark_agent/src/react_benchmark_agent/configs/profile_rethinking_full_test.yml b/examples/dynamo_integration/react_benchmark_agent/src/react_benchmark_agent/configs/profile_rethinking_full_test.yml
index 47b7e243fb..f2d25d58f0 100644
--- a/examples/dynamo_integration/react_benchmark_agent/src/react_benchmark_agent/configs/profile_rethinking_full_test.yml
+++ b/examples/dynamo_integration/react_benchmark_agent/src/react_benchmark_agent/configs/profile_rethinking_full_test.yml
@@ -26,12 +26,24 @@
# - Bottleneck analysis with nested call stacks
# - Concurrency spike detection
# - Prompt caching prefix identification
+# - Dynamo inference stack metrics (KVE, TTFT, ITL from Prometheus)
+#
+# Core Dynamo Optimization Metrics:
+# 1. KV Efficiency (KVE) = cached_tokens / prompt_tokens
+# - Measures fraction of computational work saved via KV cache
+# - Higher is better (0.8 = 80% of tokens from cache)
+# 2. Time to First Token (TTFT) - User-perceived initial latency
+# 3. Inter-Token Latency (ITL) - Streaming smoothness
#
# Combines self-evaluating agent with detailed profiler for:
# - Understanding performance characteristics of rethinking
# - Identifying optimization opportunities
# - Generating data for throughput analysis scripts
#
+# Prerequisites:
+# - Prometheus running at localhost:9090 (for Dynamo metrics)
+# - Dynamo stack with monitoring enabled
+#
# Usage:
# nat profile --config_file configs/profile_rethinking_full_test.yml
#
@@ -130,7 +142,7 @@ llms:
dynamo_llm:
_type: dynamo
model_name: llama-3.3-70b
- base_url: http://localhost:8099/v1
+ base_url: http://localhost:8000/v1
api_key: dummy
temperature: 0.0
max_tokens: 8192
@@ -151,7 +163,7 @@ llms:
eval_llm:
_type: dynamo
model_name: llama-3.3-70b
- base_url: http://localhost:8099/v1
+ base_url: http://localhost:8000/v1
api_key: dummy
temperature: 0.0
max_tokens: 1024
@@ -187,7 +199,7 @@ workflow:
eval:
general:
- max_concurrency: 36
+ max_concurrency: 8
output:
dir: ./examples/dynamo_integration/react_benchmark_agent/outputs/dynamo_evals/rethinking_full_test_for_profiling/
@@ -223,6 +235,25 @@ eval:
concurrency_spike_analysis:
enable: true
spike_threshold: 24 # Alert when concurrent functions >= 24
+ # Dynamo inference stack metrics - collect from Prometheus
+ # Core optimization metrics: KV Efficiency, TTFT, ITL
+ dynamo_metrics:
+ enable: true
+ prometheus_url: http://localhost:9090
+ # Time range for rate calculations - should match experiment duration
+ # Minimum: 15s (Prometheus scrapes every 5s, need ≥3 points for reliable rates)
+ # Options: 15s (very short), 30s, 1m, 2m, 5m
+ # Shorter = more accurate for brief experiments, but noisier
+ # Longer = smoother averages, but may include pre-experiment data
+ query_range: 30s
+ # Core metrics (primary optimization targets)
+ collect_kv_cache: true # KVE = cached_tokens/prompt_tokens (work saved)
+ collect_ttft: true # Time to First Token (P50/P95/P99)
+ collect_itl: true # Inter-Token Latency (P50/P95/P99)
+ # Supplementary metrics (context and diagnostics)
+ collect_inflight_requests: true
+ collect_throughput: true
+ collect_token_throughput: true
evaluators:
tool_selection_quality:
diff --git a/external/dynamo/.env.example b/external/dynamo/.env.example
index 3510684153..916f826481 100644
--- a/external/dynamo/.env.example
+++ b/external/dynamo/.env.example
@@ -13,10 +13,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-# Required: Set your model directory path
-export DYNAMO_MODEL_DIR="/path/to/your/models/Llama-3.3-70B-Instruct"
-# Optional: Set repository directory (for Thompson Sampling router)
-export DYNAMO_REPO_DIR="/path/to/NeMo-Agent-Toolkit"
+export HF_HOME=/path/to/local/storage/.cache/huggingface
+
+export HF_TOKEN=my_huggingface_read_token
+
+# Required: Set your model directory path with model weights
+# Example 'ls' output from a properly configured model directory:
+# ~/models/Llama-3.3-70B-Instruct$ ls
+# LICENSE model-00003-of-00030.safetensors model-00010-of-00030.safetensors model-00017-of-00030.safetensors model-00024-of-00030.safetensors model.safetensors.index.json
+# README.md model-00004-of-00030.safetensors model-00011-of-00030.safetensors model-00018-of-00030.safetensors model-00025-of-00030.safetensors original
+# USE_POLICY.md model-00005-of-00030.safetensors model-00012-of-00030.safetensors model-00019-of-00030.safetensors model-00026-of-00030.safetensors special_tokens_map.json
+# config.json model-00006-of-00030.safetensors model-00013-of-00030.safetensors model-00020-of-00030.safetensors model-00027-of-00030.safetensors tokenizer.json
+# generation_config.json model-00007-of-00030.safetensors model-00014-of-00030.safetensors model-00021-of-00030.safetensors model-00028-of-00030.safetensors tokenizer_config.json
+# model-00001-of-00030.safetensors model-00008-of-00030.safetensors model-00015-of-00030.safetensors model-00022-of-00030.safetensors model-00029-of-00030.safetensors
+# model-00002-of-00030.safetensors model-00009-of-00030.safetensors model-00016-of-00030.safetensors model-00023-of-00030.safetensors model-00030-of-00030.safetensors
+export DYNAMO_MODEL_DIR=/path/to/your/models/Llama-3.3-70B-Instruct
+
+# Set repository directory (for Thompson Sampling router)
+export DYNAMO_REPO_DIR=/path/to/NeMo-Agent-Toolkit/external/dynamo
# =============================================================================
# OPTIONAL VARIABLES - GPU Configuration
@@ -24,39 +38,39 @@ export DYNAMO_REPO_DIR="/path/to/NeMo-Agent-Toolkit"
# GPU device IDs for unified mode (comma-separated)
# Default: 0,1,2,3
-DYNAMO_GPU_DEVICES="0,1,2,3"
+export DYNAMO_GPU_DEVICES=0,1,2,3
# GPU device IDs for disaggregated mode prefill workers
# Default: 0,1
-# DYNAMO_PREFILL_GPUS="0,1"
+# DYNAMO_PREFILL_GPUS=0,1
# GPU device IDs for disaggregated mode decode workers
# Default: 2,3
-# DYNAMO_DECODE_GPUS="2,3"
+# DYNAMO_DECODE_GPUS=2,3
# Tensor parallelism size (number of GPUs per worker)
# Default: 4 for unified mode, 2 for disaggregated mode
-# DYNAMO_TP_SIZE="4"
+# DYNAMO_TP_SIZE=4
# =============================================================================
# OPTIONAL VARIABLES - Network Configuration
# =============================================================================
# HTTP port for Dynamo frontend API
-# Default: 8099
-# DYNAMO_HTTP_PORT="8099"
+# Default: 8000
+DYNAMO_HTTP_PORT=8000
# ETCD client port for metadata and discovery
# Default: 2379
-# DYNAMO_ETCD_PORT="2379"
+# DYNAMO_ETCD_PORT=2379
# ETCD peer port
# Default: 2390
-# DYNAMO_ETCD_PEER_PORT="2390"
+# DYNAMO_ETCD_PEER_PORT=2390
# NATS messaging port
# Default: 4222
-# DYNAMO_NATS_PORT="4222"
+# DYNAMO_NATS_PORT=4222
# =============================================================================
# OPTIONAL VARIABLES - Model Configuration
@@ -64,11 +78,11 @@ DYNAMO_GPU_DEVICES="0,1,2,3"
# Model name as exposed by the API
# Default: llama-3.3-70b
-# DYNAMO_MODEL_NAME="llama-3.3-70b"
+# DYNAMO_MODEL_NAME=llama-3.3-70b
# Shared memory size for Docker container
# Default: 16g
-# DYNAMO_SHM_SIZE="16g"
+# DYNAMO_SHM_SIZE=16g
# =============================================================================
# OPTIONAL VARIABLES - Disaggregated Mode
@@ -76,21 +90,42 @@ DYNAMO_GPU_DEVICES="0,1,2,3"
# Bootstrap port for disaggregated mode communication
# Default: 12345
-# DYNAMO_DISAGG_BOOTSTRAP_PORT="12345"
+# DYNAMO_DISAGG_BOOTSTRAP_PORT=12345
# Transfer backend for KV cache (nixl, nccl, or gloo)
# Default: nixl
-# DYNAMO_DISAGG_TRANSFER_BACKEND="nixl"
+# DYNAMO_DISAGG_TRANSFER_BACKEND=nixl
+
+# =============================================================================
+# OPTIONAL VARIABLES - Performance Tuning
+# =============================================================================
+
+# Worker initialization timeout (seconds)
+# Increase for large models (70B+) or cold starts
+# Default: 1800 (30 minutes)
+# DYNAMO_WORKER_INIT_TIMEOUT_S=1800
+
+# Block size in tokens - must match between SGLang (--page-size) and Frontend (--kv-cache-block-size)
+# Default: 64 tokens per block
+DYNAMO_KV_BLOCK_SIZE=64
+# Fraction of GPU memory for KV cache (0.0-1.0)
+# Reduce to test cache pressure/degradation scenarios
+# Default: 0.9 (90% of GPU memory for KV cache)
+# DYNAMO_MEM_FRACTION_STATIC=0.9
# =============================================================================
-# OPTIONAL VARIABLES - Custom Thompson Sampler
+# OPTIONAL VARIABLES - MultiLRU Development
# =============================================================================
-# Path to CSV file for router decision logging
-# Default: router_metrics.csv
-# ROUTER_METRICS_CSV = "router_metrics.csv"
+# Path to Dynamo source for patching (auto-detected from DYNAMO_REPO_DIR)
+# DYNAMO_SOURCE_DIR=/path/to/dynamo
+
+# vLLM worker option 1: default
+DYNAMO_USE_MULTILRU=false
+DYNAMO_VLLM_IMAGE="nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.7.1"
+
+# vLLM worker option 2: MultiLRU from Ryan Olsen's dev branch
+# DYNAMO_USE_MULTILRU=true # forces the script to use processor_multilru.py and router_multilru.py
+# DYNAMO_VLLM_IMAGE="dynamo-multi-lru:latest"
-# timeout period for dynamo worker initialization
-# Default: 300
-# DYNAMO_WORKER_INIT_TIMEOUT_S = 300
diff --git a/external/dynamo/E2E_SEQUENCE.md b/external/dynamo/E2E_SEQUENCE.md
new file mode 100644
index 0000000000..25941026a8
--- /dev/null
+++ b/external/dynamo/E2E_SEQUENCE.md
@@ -0,0 +1,782 @@
+# End-to-End Sequence Diagram: NeMo Agent Toolkit → Dynamo Integration
+
+This document captures the information flow from NeMo Agent Toolkit chat requests through `dynamo_llm.py` to the custom components launched by `start_dynamo_optimized_thompson_hints_vllm.sh`.
+
+## Architecture Overview
+
+```
+┌─────────────────────────────────────────────────────────────────────────────┐
+│ NeMo Agent Toolkit │
+│ ┌─────────────────────────────────────────────────────────────────────┐ │
+│ │ DynamoModelConfig (dynamo_llm.py) │ │
+│ │ prefix_template: "react-benchmark-{uuid}" │ │
+│ │ prefix_total_requests: 10 │ │
+│ │ prefix_osl: MEDIUM │ │
+│ │ prefix_iat: MEDIUM │ │
+│ │ # reuse_budget: (computed by processor: total_requests - count) │ │
+│ │ │ │
+│ │ _DynamoTransport injects: │ │
+│ │ → HTTP Headers: x-prefix-id, x-prefix-total-requests, ... │ │
+│ │ → nvext.annotations in request body │ │
+│ └─────────────────────────────────────────────────────────────────────┘ │
+└─────────────────────────────────────────────────────────────────────────────┘
+ │
+ ▼
+┌─────────────────────────────────────────────────────────────────────────────┐
+│ Dynamo Stack (Docker Container) │
+│ ┌─────────────────────────────────────────────────────────────────────┐ │
+│ │ Default Frontend (port 8000) │ │
+│ │ → Tokenization + nvext parsing │ │
+│ │ → ETCD ModelWatcher (namespace=dynamo) │ │
+│ │ → Discovers processor ONLY (workers hidden) │ │
+│ └─────────────────────────────────────────────────────────────────────┘ │
+│ │ │
+│ ▼ │
+│ ┌─────────────────────────────────────────────────────────────────────┐ │
+│ │ Custom Processor (processor.py / processor_multilru.py) │ │
+│ │ → Registered at: dynamo.backend.generate │ │
+│ │ → Extracts: prefix_id, total_requests, osl, iat │ │
+│ │ → Manages reuse_budget tracking │ │
+│ │ → Queries Router, forwards to Workers │ │
+│ └─────────────────────────────────────────────────────────────────────┘ │
+│ │ │ │
+│ ▼ ▼ │
+│ ┌────────────────────────────┐ ┌─────────────────────────────────────┐ │
+│ │ Custom Router (router.py) │ │ vLLM Workers (dynamo.vllm) │ │
+│ │ → Thompson Sampling │ │ → workers.backend.generate │ │
+│ │ → KV Overlap Scoring │ │ → MultiLRU (optional) │ │
+│ │ → LinTS + Beta-TS │ │ → KV Events via ZMQ │ │
+│ └────────────────────────────┘ └─────────────────────────────────────┘ │
+└─────────────────────────────────────────────────────────────────────────────┘
+```
+
+## Sequence Diagram: Full Request Flow
+
+```mermaid
+sequenceDiagram
+ autonumber
+
+ box rgb(45, 50, 80) NeMo Agent Toolkit
+ participant Client as Agent/Client<br/>(LangChain/LlamaIndex)
+ participant DynamoLLM as DynamoModelConfig<br/>(dynamo_llm.py)
+ participant Transport as _DynamoTransport<br/>(httpx wrapper)
+ end
+
+ box rgb(50, 70, 50) Infrastructure
+ participant ETCD as ETCD<br/>(Service Discovery)
+ participant NATS as NATS<br/>(KV Events)
+ end
+
+ box rgb(70, 50, 50) Dynamo Stack
+ participant Frontend as Default Frontend<br/>(dynamo.frontend)
+ participant Processor as Custom Processor<br/>(processor.py)
+ participant Router as Thompson Router<br/>(router.py)
+ participant Worker as vLLM Worker<br/>(dynamo.vllm)
+ participant KVBM as MultiLRU Backend<br/>(kvbm.v2)
+ end
+
+ box rgb(60, 60, 40) Observability
+ participant Prometheus as Prometheus<br/>(Metrics)
+ end
+
+ %% ==================== INITIALIZATION PHASE ====================
+ Note over ETCD,NATS: Infrastructure Startup
+
+ Worker->>ETCD: Register at workers.backend.generate<br/>(model: llama-3.3-70b-internal)
+ Note over Worker: Workers use internal model name<br/>to hide from frontend discovery
+
+ Router->>ETCD: Register at dynamo.router.find_worker<br/>& dynamo.router.feedback
+
+ Processor->>ETCD: Register at dynamo.backend.generate<br/>(model: llama-3.3-70b)
+ Note over Processor: Processor uses PUBLIC model name<br/>→ Frontend discovers ONLY processor
+
+ Frontend->>ETCD: ModelWatcher (namespace=dynamo)<br/>Discovers processor only
+
+ Worker->>NATS: Subscribe to KV event streams
+
+ %% ==================== REQUEST PHASE ====================
+ Note over Client,Prometheus: Request Flow with Prefix Hints
+
+ rect rgb(35, 40, 60)
+ Note right of Client: User initiates chat request
+ Client->>DynamoLLM: chat.completions.create()<br/>with DynamoPrefixContext
+
+ DynamoLLM->>DynamoLLM: Generate prefix_id from template<br/>"react-benchmark-{uuid}"
+
+ DynamoLLM->>Transport: Build request with config:<br/>prefix_total_requests=10<br/>prefix_osl=MEDIUM<br/>prefix_iat=MEDIUM
+ end
+
+ rect rgb(40, 50, 45)
+ Note right of Transport: Transport Layer Injection
+ Transport->>Transport: Inject HTTP Headers:<br/>x-prefix-id: react-benchmark-abc123<br/>x-prefix-total-requests: 10<br/>x-prefix-osl: MEDIUM<br/>x-prefix-iat: MEDIUM
+
+ Transport->>Transport: Inject nvext.annotations:<br/>["prefix_id:react-benchmark-abc123",<br/>"total_requests:10",<br/>"osl:MEDIUM", "iat:MEDIUM"]
+
+ Transport->>Frontend: POST /v1/chat/completions<br/>(HTTP + nvext.annotations)
+ end
+
+ rect rgb(50, 40, 40)
+ Note right of Frontend: Frontend Processing
+ Frontend->>Frontend: Parse nvext.annotations<br/>from request body
+
+ Frontend->>Frontend: Tokenize messages<br/>→ token_ids: [128000, 9906, ...]
+
+ Frontend->>Frontend: Build PreprocessedRequest:<br/>{token_ids, annotations, sampling_options}
+
+ Frontend->>ETCD: Query ModelWatcher<br/>(namespace=dynamo)
+ ETCD-->>Frontend: Discovered: dynamo.backend.generate<br/>(processor, NOT workers)
+
+ Frontend->>Processor: Forward PreprocessedRequest<br/>via dynamo.backend.generate
+ end
+
+ rect rgb(55, 45, 45)
+ Note right of Processor: Processor - Hint Extraction
+ Processor->>Processor: Extract from annotations:<br/>prefix_id = "react-benchmark-abc123"<br/>total_requests = 10<br/>osl = "MEDIUM"<br/>iat = "MEDIUM"
+
+ Processor->>Processor: Update _prefix_state:<br/>reuse_budget = total - processed
+
+ Processor->>Processor: Build RouterRequest:<br/>{tokens, prefix_id, reuse_budget, osl, iat}
+ end
+
+ rect rgb(45, 55, 50)
+ Note right of Router: Thompson Sampling Routing
+ Processor->>Router: Query find_worker(RouterRequest)
+
+ Router->>Router: Get available workers<br/>from engine_client.instance_ids()
+
+ Router->>Router: KvIndexer.find_matches_for_request()<br/>→ OverlapScores per worker
+
+ loop For each worker
+ Router->>Router: Build 9-dim feature vector:<br/>[1.0, inv_load, overlap, affinity,<br/>outstanding_norm, decode_norm,<br/>prefill_norm, iat_norm, reuse_norm]
+
+ Router->>Router: LinTS sample: θ ~ N(μ, v²Σ⁻¹)<br/>score = θᵀx
+
+ Router->>Router: Beta-TS sample: p ~ Beta(α, β)<br/>Add exploration bonus
+
+ Router->>Router: Apply affinity bonus (if sticky)<br/>Apply switching penalty (if switch)
+
+ Router->>Router: Compute load modifier<br/>(GPU util, queue depth, outstanding work)
+ end
+
+ Router->>Router: Softmax selection with temperature<br/>temp = base / (1 + reuse * iat_factor)
+
+ Router->>Router: Store pending decision:<br/>{decision_id, wid, x, start_ts, ...}
+
+ Router-->>Processor: RouterResponse:<br/>{worker_id, decision_id, overlap}
+
+ Router->>Prometheus: thompson_router_decisions_total++<br/>thompson_router_kv_overlap.set()
+ end
+
+ rect rgb(50, 50, 55)
+ Note right of Worker: Worker Execution
+ Processor->>Processor: thompson_routing_decisions_total++<br/>(worker_id label)
+
+ Processor->>Worker: Forward PreprocessedRequest<br/>via workers.backend.generate<br/>(direct routing to worker_id)
+
+ alt MultiLRU Enabled (DYNAMO_USE_MULTILRU=true)
+ Worker->>KVBM: DynamoScheduler.schedule()
+
+ Note over KVBM: MultiLRU 4-Pool Architecture:<br/>Cold (freq < 2) → Warm (2-5)<br/>→ Hot (6-14) → VeryHot (≥15)
+
+ KVBM->>KVBM: FrequencyTracker.touch(hash)<br/>Calculate priority level
+
+ KVBM->>KVBM: find_matches() across pools<br/>Evict from coldest first
+
+ KVBM-->>Worker: Scheduled sequences<br/>with KV cache allocation
+ else Standard vLLM Scheduler
+ Worker->>Worker: Standard LRU scheduling
+ end
+
+ Worker->>Worker: Execute prefill + decode<br/>with prefix caching
+
+ Worker->>NATS: Publish KV events<br/>(cache state changes)
+
+ loop Stream tokens
+ Worker-->>Processor: Token chunks<br/>{token_ids, finish_reason, usage}
+
+ Processor->>Processor: Extract KVEfficiencyData:<br/>cached_tokens, device_blocks, etc.
+
+ Processor-->>Frontend: Forward token chunks
+ Frontend-->>Transport: SSE stream
+ Transport-->>Client: Streaming response
+ end
+ end
+
+ rect rgb(45, 50, 55)
+ Note right of Processor: Feedback Loop
+ Processor->>Processor: Calculate latency_ms<br/>tokens_in, tokens_out
+
+ Processor->>Router: FeedbackRequest:<br/>{decision_id, latency_ms, success,<br/>tokens_in, tokens_out, finish_reason}
+
+ Router->>Router: Retrieve pending decision<br/>by decision_id
+
+ Router->>Router: Compute reward:<br/>metric = latency_ms / tokens_out<br/>baseline = EMA(worker, osl, prefill)<br/>reward = 1 / (1 + metric/baseline)
+
+ Router->>Router: Update Beta bandit:<br/>α' = α + reward<br/>β' = β + (1 - reward)
+
+ Router->>Router: Update LinTS:<br/>A = forget·A + xxᵀ + ridge·I<br/>b = forget·b + x·reward
+
+ Router->>Prometheus: thompson_router_feedback_latency<br/>thompson_router_reward.set()
+
+ Router-->>Processor: FeedbackAck:<br/>{ok, reward, baseline_used}
+ end
+
+ rect rgb(40, 45, 50)
+ Note right of Prometheus: Metrics Collection
+ Processor->>Prometheus: thompson_kve_prompt_tokens_total<br/>thompson_kve_cached_tokens_total<br/>thompson_kve_device_blocks_total
+
+ Processor->>Prometheus: thompson_request_latency_seconds<br/>thompson_tokens_in/out_total
+
+ Worker->>Prometheus: vllm:gpu_cache_usage_perc<br/>vllm:num_requests_waiting
+ end
+```
+
+## Detailed Data Structures
+
+### 1. NeMo Agent Toolkit → Frontend
+
+**HTTP Request with nvext.annotations:**
+```json
+{
+ "model": "llama-3.3-70b",
+ "messages": [{"role": "user", "content": "Hello!"}],
+ "max_tokens": 50,
+ "stream": true,
+ "nvext": {
+ "annotations": [
+ "prefix_id:react-benchmark-abc123",
+ "total_requests:10",
+ "osl:MEDIUM",
+ "iat:MEDIUM"
+ ]
+ }
+}
+```
+
+**HTTP Headers (legacy support):**
+```
+x-prefix-id: react-benchmark-abc123
+x-prefix-total-requests: 10
+x-prefix-osl: MEDIUM
+x-prefix-iat: MEDIUM
+```
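+
+As a rough sketch of what the transport layer does here, the following is a minimal `httpx` transport wrapper that injects both the legacy headers and the `nvext.annotations` shown above. The class name and constructor are illustrative assumptions, not the actual `_DynamoTransport` implementation:
+
+```python
+import json
+
+import httpx
+
+
+class PrefixHintTransport(httpx.BaseTransport):
+    """Hypothetical sketch: wraps a transport and injects Dynamo prefix hints."""
+
+    def __init__(self, inner: httpx.BaseTransport, prefix_id: str,
+                 total_requests: int, osl: str = "MEDIUM", iat: str = "MEDIUM"):
+        self._inner = inner
+        self._hints = {"prefix_id": prefix_id, "total_requests": total_requests,
+                       "osl": osl, "iat": iat}
+
+    def handle_request(self, request: httpx.Request) -> httpx.Response:
+        h = self._hints
+        # Legacy header form
+        request.headers["x-prefix-id"] = h["prefix_id"]
+        request.headers["x-prefix-total-requests"] = str(h["total_requests"])
+        request.headers["x-prefix-osl"] = h["osl"]
+        request.headers["x-prefix-iat"] = h["iat"]
+        # nvext.annotations form, merged into the JSON body
+        body = json.loads(request.content or b"{}")
+        body.setdefault("nvext", {})["annotations"] = [
+            f"prefix_id:{h['prefix_id']}",
+            f"total_requests:{h['total_requests']}",
+            f"osl:{h['osl']}",
+            f"iat:{h['iat']}",
+        ]
+        headers = dict(request.headers)
+        headers.pop("content-length", None)  # recomputed for the new body
+        new_request = httpx.Request(request.method, request.url,
+                                    headers=headers, json=body)
+        return self._inner.handle_request(new_request)
+```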
+
+### 2. Frontend → Processor (PreprocessedRequest)
+
+```json
+{
+ "token_ids": [128000, 9906, 0, ...],
+ "annotations": [
+ "prefix_id:react-benchmark-abc123",
+ "total_requests:10",
+ "osl:MEDIUM",
+ "iat:MEDIUM"
+ ],
+ "sampling_options": {
+ "temperature": 0.7,
+ "top_p": 0.9
+ },
+ "stop_conditions": {
+ "max_tokens": 50
+ }
+}
+```
+
+### 3. Processor → Router (RouterRequest)
+
+```json
+{
+ "tokens": [128000, 9906, 0, ...],
+ "prefix_id": "react-benchmark-abc123",
+ "reuse_budget": 9,
+ "expected_osl": "MEDIUM",
+ "interarrival": "MEDIUM"
+}
+```
+
+### 4. Router → Processor (RouterResponse)
+
+```json
+{
+ "worker_id": 0,
+ "prefix_hit_rate": 0.85,
+ "decision_id": "a1b2c3d4e5f6..."
+}
+```
+
+### 5. Processor → Router (FeedbackRequest)
+
+```json
+{
+ "decision_id": "a1b2c3d4e5f6...",
+ "latency_ms": 1234.56,
+ "success": true,
+ "tokens_in": 128,
+ "tokens_out": 50,
+ "finish_reason": "stop"
+}
+```
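+
+To make the feedback math concrete, here is a small sketch of the reward computation described in the feedback loop above: the metric is latency per output token, normalized against an EMA baseline and squashed into (0, 1]. The EMA decay constant is an illustrative assumption; the formulas themselves come from the sequence diagram:
+
+```python
+def update_ema(baseline: float, metric: float, decay: float = 0.9) -> float:
+    """EMA baseline per (worker, osl, prefill) bucket; the decay value is an assumption."""
+    return decay * baseline + (1.0 - decay) * metric
+
+
+def compute_reward(latency_ms: float, tokens_out: int, baseline: float) -> float:
+    """reward = 1 / (1 + metric / baseline), with metric = latency_ms / tokens_out."""
+    metric = latency_ms / max(tokens_out, 1)
+    return 1.0 / (1.0 + metric / baseline)
+
+
+# Beta bandit update from the feedback loop: alpha' = alpha + reward, beta' = beta + (1 - reward)
+reward = compute_reward(latency_ms=1234.56, tokens_out=50, baseline=30.0)
+alpha, beta = 1.0, 1.0
+alpha, beta = alpha + reward, beta + (1.0 - reward)
+```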
+
+## KvIndexer: Router ↔ Worker KV State Binding
+
+The router accesses KV cache overlap data via Python bindings to the Rust `KvIndexer`. This is how the router determines which worker has the best prefix cache match.
+
+### KvIndexer Python Binding Interface
+
+```python
+# From kvbm_next_source/lib/bindings/python/src/dynamo/_core.pyi
+
+class OverlapScores:
+ """Collection of prefix matching scores for workers."""
+
+ @property
+ def scores(self) -> Dict[int, int]:
+ """Map of worker_id → number of matching blocks."""
+ ...
+
+ @property
+ def frequencies(self) -> List[int]:
+ """Access frequencies for matched blocks (0 entries omitted)."""
+ ...
+
+class KvIndexer:
+ """Tracks KV events emitted by workers (add_block, remove_block)."""
+
+ def __init__(self, component: Component, block_size: int) -> None:
+ """Create KvIndexer attached to a Dynamo component."""
+
+ def find_matches(self, sequence: List[int]) -> OverlapScores:
+ """Find prefix matches for block hash sequence."""
+ ...
+
+ def find_matches_for_request(self, token_ids: List[int], lora_id: int) -> OverlapScores:
+ """Return overlap scores for workers given token sequence."""
+ ...
+
+ def block_size(self) -> int:
+ """Return configured block size."""
+ ...
+```
+
+### Router KvIndexer Usage
+
+```python
+# From router.py - initialization
+self.indexer = KvIndexer(engine, self.block_size)
+
+# From router.py - find_matches_for_request call
+scores: OverlapScores = await self.indexer.find_matches_for_request(req.tokens, 0)
+
+# scores.scores maps worker_id → matched block count (see OverlapScores above); used here as a float overlap signal
+overlap = float(scores.scores.get(wid, 0.0))
+```
+
+### KV State Update Flow
+
+```mermaid
+sequenceDiagram
+ participant Worker as vLLM Worker
+ participant NATS as NATS JetStream
+ participant Indexer as KvIndexer (Rust)
+ participant Router as Thompson Router
+
+ Note over Worker,Router: KV Event Publishing (via ZMQ/NATS)
+
+ Worker->>Worker: Allocate/evict KV blocks
+ Worker->>NATS: Publish KvCacheEvent<br/>{event_id, stored/removed, block_hashes}
+
+ Note over Indexer: Background event subscription
+ NATS->>Indexer: Stream KV events
+ Indexer->>Indexer: Apply events to RadixTree<br/>Update per-worker block state
+
+ Note over Router,Indexer: Router Query Path
+ Router->>Indexer: find_matches_for_request(tokens, lora_id)
+ Indexer->>Indexer: Hash tokens → block hashes<br/>Search RadixTree for matches
+ Indexer-->>Router: OverlapScores<br/>{scores: {wid: count}, frequencies: [...]}
+
+ Router->>Router: Use overlap in feature vector<br/>for Thompson Sampling
+```
+
+## MultiLRU Architecture Detail
+
+The MultiLRU backend is an advanced KV cache eviction strategy that uses frequency-based pool promotion.
+
+```mermaid
+flowchart TB
+ subgraph MultiLRU["MultiLRU Backend (4-Pool System)"]
+ direction TB
+
+ subgraph FreqTracker["TinyLFU Frequency Tracker"]
+ FT["FrequencyTracker<br/>count(hash) → u8"]
+ end
+
+ subgraph Pools["Priority Pools"]
+ direction LR
+ Cold["Cold Pool
freq < 2
🥶"]
+ Warm["Warm Pool
freq 2-5
🌡️"]
+ Hot["Hot Pool
freq 6-14
🔥"]
+ VeryHot["VeryHot Pool
freq ≥ 15
⭐"]
+ end
+
+ subgraph Operations["Operations"]
+ Insert["insert(block)
→ Pool by frequency"]
+ FindMatch["find_matches(hashes)
→ Search all pools"]
+ Allocate["allocate(count)
→ Evict Cold first"]
+ end
+ end
+
+ subgraph DynamoScheduler["DynamoScheduler (vLLM Integration)"]
+ Sched["RustScheduler
↕
vLLM Shadow Observer"]
+ end
+
+ Worker["vLLM Worker
workers.backend.generate"] --> DynamoScheduler
+ DynamoScheduler --> MultiLRU
+
+ FT --> |"touch(hash)"| Cold
+ Cold --> |"freq ≥ 2"| Warm
+ Warm --> |"freq ≥ 6"| Hot
+ Hot --> |"freq ≥ 15"| VeryHot
+
+ style Cold fill:#4a90d9
+ style Warm fill:#f5a623
+ style Hot fill:#d0021b
+ style VeryHot fill:#f8e71c
+```
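+
+The pool mechanics above can be modeled in a few lines of Python. This is a simplified behavioral sketch (promotion by access frequency, eviction from the coldest pool first, LRU order within a pool), not the actual Rust implementation:
+
+```python
+from collections import OrderedDict
+
+THRESHOLDS = (2, 6, 15)  # cold→warm, warm→hot, hot→very_hot
+
+
+class MultiLruModel:
+    def __init__(self):
+        self.pools = [OrderedDict() for _ in range(4)]  # Cold, Warm, Hot, VeryHot
+        self.freq = {}  # simplified stand-in for the TinyLFU frequency tracker
+
+    def _level(self, h):
+        f = self.freq.get(h, 0)
+        return sum(f >= t for t in THRESHOLDS)  # 0..3
+
+    def insert(self, h, block):
+        self.freq[h] = self.freq.get(h, 0) + 1   # touch frequency tracker
+        self.pools[self._level(h)][h] = block    # pool chosen by frequency
+
+    def allocate(self, count):
+        """Evict up to `count` blocks, coldest pool first, LRU order within a pool."""
+        evicted = []
+        for pool in self.pools:
+            while pool and len(evicted) < count:
+                evicted.append(pool.popitem(last=False))  # pop from the LRU end
+        return evicted
+```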
+
+### DynamoScheduler Integration (Expanded)
+
+The `DynamoScheduler` is the vLLM integration point that enables MultiLRU. It implements an **inverted shadow observer pattern** where:
+- **Rust scheduler** is the primary decision maker (with MultiLRU backend)
+- **vLLM scheduler** runs in shadow mode for comparison
+
+```mermaid
+sequenceDiagram
+ participant vLLM as vLLM Engine
+ participant DS as DynamoScheduler
+ participant RS as RustScheduler
+ participant VS as vLLM Scheduler (Shadow)
+ participant ML as MultiLruBackend
+
+ Note over vLLM,ML: Request Addition
+ vLLM->>DS: add_request(Request)
+ DS->>DS: Store request for output reconstruction<br/>_requests[req_id] = request
+ DS->>RS: add_request(req_id, prompt_token_ids)
+ DS->>VS: add_request(request) [shadow mode]
+
+ Note over vLLM,ML: Schedule Call
+ vLLM->>DS: schedule()
+
+ DS->>VS: schedule() [get finished_req_ids first]
+ VS-->>DS: vllm_output (with finished_req_ids)
+
+ DS->>RS: finish_requests(finished_ids) [sync completions]
+
+ DS->>RS: schedule() [PRIMARY decision]
+
+ rect rgb(60, 50, 50)
+ Note over RS,ML: Rust Scheduler Internal
+ RS->>ML: find_matches(block_hashes)
+ ML->>ML: Search all 4 pools<br/>Touch frequency tracker
+ ML-->>RS: Matched blocks + frequencies
+ RS->>RS: Compute schedule output<br/>(new_reqs, cached_reqs, blocks)
+ end
+
+ RS-->>DS: rust_output_dict
+
+ DS->>DS: _rust_output_to_scheduler_output()<br/>Convert to vLLM format
+ DS->>DS: _compare_outputs(rust, vllm)<br/>Print divergence warnings
+
+ DS-->>vLLM: RustSchedulerOutput<br/>(with vLLM's finished_req_ids)
+
+ Note over vLLM,ML: Output Update
+ vLLM->>DS: update_from_output(scheduler_output, model_output)
+ DS->>VS: update_from_output() [shadow]
+ DS->>RS: update_from_output(finished_ids, output_tokens)
+ RS->>ML: Update block states based on output
+```
+
+### DynamoScheduler Key Implementation Details
+
+```python
+# From kvbm_next_source/lib/bindings/kvbm/python/kvbm/v2/vllm/schedulers/dynamo.py
+
+class DynamoScheduler(SchedulerInterface):
+ """Scheduler with inverted shadow observer pattern."""
+
+ def __init__(self, vllm_config, kv_cache_config, ...):
+ # Create vLLM scheduler (shadow mode)
+ self._scheduler = Scheduler(vllm_config, kv_cache_config, ...)
+
+ # Initialize Rust scheduler (primary) if available
+ if _RUST_SCHEDULER_AVAILABLE:
+ rust_config = RustSchedulerConfig(
+ max_num_batched_tokens=...,
+ max_num_seqs=...,
+ block_size=block_size,
+ enable_prefix_caching=True, # Required for MultiLRU
+ total_blocks=total_blocks,
+ )
+ self._rust_scheduler = RustScheduler(rust_config)
+
+ def schedule(self) -> SchedulerOutput:
+ # 1. Get vLLM schedule first (for finished_req_ids)
+ vllm_output = self._scheduler.schedule()
+
+ # 2. Sync finished requests to Rust BEFORE it schedules
+ if vllm_output.finished_req_ids:
+ self._rust_scheduler.finish_requests(
+ list(vllm_output.finished_req_ids),
+ RustRequestStatus.finished_stopped(),
+ )
+
+ # 3. Get Rust scheduler decision (PRIMARY)
+ rust_output_dict = self._rust_scheduler.schedule()
+ rust_output = self._rust_output_to_scheduler_output(rust_output_dict)
+
+ # 4. Use vLLM's finished_req_ids (vLLM tracks completion)
+ rust_output.finished_req_ids = vllm_output.finished_req_ids
+
+ # 5. Compare and warn on divergence
+ self._compare_outputs(rust_output, vllm_output)
+
+ return rust_output
+```
+
+### MultiLruBackend Rust Implementation
+
+```rust
+// From kvbm_next_source/lib/kvbm/src/v2/logical/pools/inactive/backends/multi_lru_backend.rs
+
+pub struct MultiLruBackend {
+ priority_pools: [LruCache<SequenceHash, Block>; 4],
+ frequency_tracker: Arc<FrequencyTracker>,
+ frequency_thresholds: [u8; 3], // [cold→warm, warm→hot, hot→very_hot]
+}
+
+impl MultiLruBackend {
+ /// Calculate priority level based on access frequency
+ fn calculate_priority_level(&self, seq_hash: SequenceHash) -> usize {
+ let frequency = self.frequency_tracker.count(seq_hash.as_u128());
+ let [t1, t2, t3] = self.frequency_thresholds;
+
+ if frequency < t1 as u32 { 0 } // Cold: 0 to (t1 - 1)
+ else if frequency < t2 as u32 { 1 } // Warm: t1 to (t2 - 1)
+ else if frequency < t3 as u32 { 2 } // Hot: t2 to (t3 - 1)
+ else { 3 } // VeryHot: t3+
+ }
+}
+
+impl InactivePoolBackend for MultiLruBackend {
+ /// Evict blocks starting from coldest pool
+ fn allocate(&mut self, count: usize) -> Vec<Block> {
+ let mut allocated = Vec::with_capacity(count);
+ for _ in 0..count {
+ for pool in &mut self.priority_pools { // Cold first
+ if let Some((_, block)) = pool.pop_lru() {
+ allocated.push(block);
+ break;
+ }
+ }
+ }
+ allocated
+ }
+
+ /// Insert block into appropriate pool based on frequency
+ fn insert(&mut self, block: Block) {
+ let level = self.calculate_priority_level(block.sequence_hash());
+ self.priority_pools[level].put(block.sequence_hash(), block);
+ }
+}
+```
+
+## Component Registration (ETCD)
+
+```mermaid
+flowchart LR
+ subgraph Workers["workers namespace"]
+ W1["workers.backend.generate
instance_0
model: llama-3.3-70b-internal"]
+ W2["workers.backend.generate
instance_1
model: llama-3.3-70b-internal"]
+ end
+
+ subgraph Dynamo["dynamo namespace"]
+ R["dynamo.router.find_worker
dynamo.router.feedback"]
+ P["dynamo.backend.generate
model: llama-3.3-70b"]
+ end
+
+ FE["Frontend
ModelWatcher
namespace=dynamo"]
+
+ FE -.->|"Discovers"| P
+ FE -.-x|"Cannot see"| Workers
+
+ P -->|"Queries"| R
+ P -->|"Forwards to"| W1
+ P -->|"Forwards to"| W2
+ R -->|"Selects"| W1
+ R -->|"Selects"| W2
+
+ style FE fill:#4a5568
+ style P fill:#48bb78
+ style R fill:#ed8936
+ style W1 fill:#667eea
+ style W2 fill:#667eea
+```
+
+## Thompson Sampling Algorithm
+
+```mermaid
+flowchart TB
+ subgraph Input["Request Context"]
+ Req["RouterRequest
tokens, prefix_id, reuse_budget, osl, iat"]
+ end
+
+ subgraph Features["9-Dimensional Feature Vector"]
+ F1["1.0 (bias)"]
+ F2["inv_load = 1/(1 + gpu×w_gpu + queue×w_queue)"]
+ F3["overlap = KvIndexer.find_matches()"]
+ F4["affinity = 1 if sticky else 0"]
+ F5["outstanding_norm = tanh(0.1 × work)"]
+ F6["decode_norm = decode_cost / 3.0"]
+ F7["prefill_norm = tanh(prefill_cost)"]
+ F8["iat_norm = iat_factor / 1.5"]
+ F9["reuse_norm = tanh(0.25 × reuse_budget)"]
+ end
+
+ subgraph LinTS["Contextual Bandit (LinTS)"]
+ A["A = λI + Σ xxᵀ
(precision matrix)"]
+ b["b = Σ x×reward"]
+ Theta["θ ~ N(A⁻¹b, v²A⁻¹)"]
+ LinScore["score_lin = θᵀx"]
+ end
+
+ subgraph BetaTS["Beta Bandit"]
+ Alpha["α (successes)"]
+ Beta["β (failures)"]
+ BetaSample["p ~ Beta(α, β)"]
+ BetaScore["score_beta = base_weight × p"]
+ end
+
+ subgraph Modifiers["Score Modifiers"]
+ Affinity["+ affinity_base × (0.5 + 0.5×overlap)
if sticky and reuse > 0"]
+ SwitchCost["- switch_cost_base
if switching and reuse > 0"]
+ LoadMod["× load_modifier
(GPU util, queue, outstanding)"]
+ end
+
+ subgraph Selection["Worker Selection"]
+ Softmax["Softmax(scores, temperature)
temp = base / (1 + reuse × iat)"]
+ Sample["Random sample from distribution"]
+ Result["Selected worker_id"]
+ end
+
+ Req --> Features
+ Features --> LinTS
+ Features --> BetaTS
+ LinTS --> LinScore
+ BetaTS --> BetaScore
+ LinScore --> Modifiers
+ BetaScore --> Modifiers
+ Modifiers --> Selection
+ Selection --> Result
+```
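+
+A compact sketch of the per-worker scoring implied by this flowchart. The sampling variance `v` and temperature base come from the Router Config reference below; the feature values and array handling are illustrative assumptions:
+
+```python
+import numpy as np
+
+
+def lints_score(A: np.ndarray, b: np.ndarray, x: np.ndarray, v: float = 0.25) -> float:
+    """Sample theta ~ N(A^-1 b, v^2 A^-1) and return theta^T x."""
+    A_inv = np.linalg.inv(A)
+    mu = A_inv @ b
+    theta = np.random.multivariate_normal(mu, (v ** 2) * A_inv)
+    return float(theta @ x)
+
+
+def softmax_pick(scores: np.ndarray, reuse: float, iat_factor: float,
+                 base_temp: float = 1.0) -> int:
+    """Temperature shrinks with reuse budget, sharpening selection toward the sticky worker."""
+    temp = base_temp / (1.0 + reuse * iat_factor)
+    z = (scores - scores.max()) / temp
+    p = np.exp(z) / np.exp(z).sum()
+    return int(np.random.choice(len(scores), p=p))
+
+
+# Example: 9-dim feature vector for one worker, lambda = 1.0 ridge prior
+d = 9
+A, b = np.eye(d), np.zeros(d)
+x = np.array([1.0, 0.8, 0.85, 1.0, 0.2, 0.3, 0.4, 0.67, 0.9])
+score = lints_score(A, b, x)
+```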
+
+## Data Flow Bridges (Potential Optimization Points)
+
+| Bridge | From | To | Data | Current State | Optimization Opportunity |
+|--------|------|-----|------|---------------|-------------------------|
+| **A** | `dynamo_llm.py` | Frontend | nvext.annotations | ✅ Working | Add backend selector annotation |
+| **B** | Frontend | Processor | PreprocessedRequest.annotations | ✅ Working | Passthrough preserved |
+| **C** | Processor | Router | RouterRequest | ✅ Working | Add `use_frequency_backend` hint |
+| **D** | Router | KvIndexer | Token hashes | ✅ Working | Integrate with MultiLRU frequency data |
+| **E** | Router | Workers | worker_id | ✅ Working | Send expected frequency hint |
+| **F** | Worker | NATS | KV events | ✅ Working | Include frequency counts |
+| **G** | NATS | Router | KV state updates | ⚠️ Partial | Real-time frequency sync |
+| **H** | MultiLRU | Prometheus | Pool distribution | ❌ Missing | Export pool occupancy metrics |
+
+## Prometheus Metrics Summary
+
+> **Note**: All custom components (router, processor) use `prometheus_client.REGISTRY` directly for metrics registration. They do **not** use NATS for metrics—only for KV cache event streaming.
+
+### Processor Metrics (`thompson_*`)
+- `thompson_requests_total` - Total requests processed
+- `thompson_request_latency_seconds` - E2E latency histogram
+- `thompson_tokens_in_total` / `thompson_tokens_out_total` - Throughput
+- `thompson_routing_decisions_total{worker_id}` - Per-worker routing
+- `thompson_kve_prompt_tokens_total` - KV efficiency denominator
+- `thompson_kve_cached_tokens_total` - KV efficiency numerator
+- `thompson_kve_device_blocks_total` - GPU cache hits
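+
+Given these counters, the headline KVE ratio can be computed with two Prometheus instant queries. A minimal sketch using only the standard library, assuming Prometheus at `localhost:9090` as elsewhere in this setup:
+
+```python
+import json
+import urllib.parse
+import urllib.request
+
+PROM = "http://localhost:9090/api/v1/query"
+
+
+def prom_value(query: str) -> float:
+    """Run an instant query and return the first sample value (0.0 if empty)."""
+    url = f"{PROM}?{urllib.parse.urlencode({'query': query})}"
+    with urllib.request.urlopen(url) as resp:
+        result = json.load(resp)["data"]["result"]
+    return float(result[0]["value"][1]) if result else 0.0
+
+
+cached = prom_value("sum(thompson_kve_cached_tokens_total)")
+prompt = prom_value("sum(thompson_kve_prompt_tokens_total)")
+kve = cached / prompt if prompt else 0.0  # KVE = cached_tokens / prompt_tokens
+print(f"KV efficiency: {kve:.2%}")
+```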
+
+### Router Metrics (`thompson_router_*`)
+
+```python
+# From router.py - uses prometheus_client directly
+from prometheus_client import REGISTRY, Counter, Gauge, Histogram
+
+metrics["decisions_total"] = Counter(
+ "thompson_router_decisions_total", ..., registry=REGISTRY)
+metrics["kv_overlap"] = Gauge(
+ "thompson_router_kv_overlap", ..., registry=REGISTRY)
+# ... etc
+```
+
+- `thompson_router_decisions_total{worker_id}` - Routing decisions
+- `thompson_router_kv_overlap{worker_id}` - Overlap scores
+- `thompson_router_feedback_latency_seconds{worker_id}` - Feedback latency
+- `thompson_router_reward{worker_id}` - Computed rewards
+- `thompson_router_pending_decisions` - Awaiting feedback
+- `thompson_router_beta_alpha{worker_id}` / `beta_beta` - Bandit params
+- `thompson_router_sticky_decisions_total` - Affinity hits
+- `thompson_router_switch_decisions_total` - Worker switches
+- `thompson_router_reuse_budget` - Distribution of reuse_budget values
+- `thompson_router_tokens_per_request` - Distribution of input token counts
+
+### Worker Metrics (`vllm:*`)
+- `vllm:gpu_cache_usage_perc` - GPU memory utilization
+- `vllm:num_requests_waiting` - Queue depth
+- `vllm:prompt_tokens_total` / `generation_tokens_total` - Throughput
+
+## Configuration Reference
+
+### DynamoModelConfig (dynamo_llm.py)
+```python
+prefix_template: str = "nat-dynamo-{uuid}" # Template with {uuid} placeholder
+prefix_total_requests: int = 10 # Expected requests per conversation
+prefix_osl: Literal["LOW", "MEDIUM", "HIGH"] = "MEDIUM" # Output length hint
+prefix_iat: Literal["LOW", "MEDIUM", "HIGH"] = "MEDIUM" # Inter-arrival hint
+# NOTE: reuse_budget is computed by processor from total_requests - processed_count
+# Future enhancement: allow explicit reuse_budget override via annotation
+```
+
+### Router Config (config.yaml)
+```yaml
+affinity:
+ base: 0.30 # Primary stickiness
+ reuse_weight: 0.15 # Reuse budget bonus
+ iat_weight: 0.20 # IAT multiplier
+exploration:
+ base_ts_weight: 0.10 # Beta-TS exploration
+ temperature:
+ base: 1.0 # Softmax temperature
+lints:
+ lambda: 1.0 # LinTS regularization
+ v: 0.25 # Sampling variance
+ forget_rate: 0.995 # Forgetting factor
+```
+
+### MultiLRU Config (kvbm.v2)
+```rust
+frequency_thresholds: [2, 6, 15] // Cold→Warm, Warm→Hot, Hot→VeryHot
+// Pool 0 (Cold): frequency 0-1
+// Pool 1 (Warm): frequency 2-5
+// Pool 2 (Hot): frequency 6-14
+// Pool 3 (VeryHot): frequency 15+
+```
+
+---
+
+*Generated from codebase analysis of:*
+- `NeMo-Agent-Toolkit/src/nat/llm/dynamo_llm.py`
+- `NeMo-Agent-Toolkit/external/dynamo/optimized/processor.py`
+- `NeMo-Agent-Toolkit/external/dynamo/optimized/router.py`
+- `NeMo-Agent-Toolkit/external/dynamo/start_dynamo_optimized_thompson_hints_vllm.sh`
+- `kvbm_next_source/lib/kvbm/src/v2/logical/pools/inactive/backends/multi_lru_backend.rs`
+- `kvbm_next_source/components/src/dynamo/frontend/main.py`
+
diff --git a/external/dynamo/build_multi_lru_image.sh b/external/dynamo/build_multi_lru_image.sh
new file mode 100755
index 0000000000..d1c86a2e0e
--- /dev/null
+++ b/external/dynamo/build_multi_lru_image.sh
@@ -0,0 +1,267 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Build Dynamo vLLM Image with MultiLruBackend from ryan/kvbm-next branch
+#
+# This script builds the Dynamo vLLM runtime image from source using the
+# ryan/kvbm-next branch, which includes the 4-pool MultiLruBackend for
+# frequency-based KV cache eviction.
+#
+# The build uses the branch's native container/build.sh with:
+# - Framework: VLLM
+# - KVBM enabled (includes MultiLruBackend)
+# - vLLM v0.14.0
+# - CUDA 12.9
+# - Python 3.12
+#
+# Usage:
+# ./build_multi_lru_image.sh [options]
+#
+# Options:
+# --no-cache Build without Docker cache
+# --skip-clone Skip cloning/updating the branch (use existing source)
+# --source-dir DIR Source directory (default: auto-detect kvbm_next_source or kvbm_next_build)
+# --target TARGET Docker build target (default: runtime)
+# --tag TAG Custom image tag (default: dynamo-multi-lru:latest)
+# --dry-run Print commands without executing
+# --help Show this help message
+#
+# Environment Variables:
+# DYNAMO_SOURCE_DIR Source directory (alternative to --source-dir)
+# DYNAMO_BUILD_JOBS Cargo build parallelism (default: 4, reduce if OOM)
+# DYNAMO_MAX_JOBS vLLM compilation parallelism (default: 8)
+
+set -euo pipefail
+
+# Script directory
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Configuration
+BRANCH="ryan/kvbm-next"
+REPO_URL="https://github.com/ai-dynamo/dynamo.git"
+
+# Build options (can be overridden by command line args)
+KVBM_NEXT_DIR="" # Will be set after arg parsing
+IMAGE_TAG="${DYNAMO_IMAGE_TAG:-dynamo-multi-lru:latest}"
+BUILD_TARGET="${DYNAMO_BUILD_TARGET:-runtime}"
+NO_CACHE=""
+SKIP_CLONE=false
+DRY_RUN=""
+CARGO_BUILD_JOBS="${DYNAMO_BUILD_JOBS:-4}"
+MAX_JOBS="${DYNAMO_MAX_JOBS:-8}"
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+ case $1 in
+ --no-cache)
+ NO_CACHE="--no-cache"
+ shift
+ ;;
+ --skip-clone)
+ SKIP_CLONE=true
+ shift
+ ;;
+ --source-dir)
+ KVBM_NEXT_DIR="$2"
+ shift 2
+ ;;
+ --target)
+ BUILD_TARGET="$2"
+ shift 2
+ ;;
+ --tag)
+ IMAGE_TAG="$2"
+ shift 2
+ ;;
+ --dry-run)
+ DRY_RUN="--dry-run"
+ shift
+ ;;
+ --help|-h)
+ head -33 "$0" | tail -29
+ exit 0
+ ;;
+ *)
+ echo "Unknown option: $1"
+ echo "Use --help for usage information"
+ exit 1
+ ;;
+ esac
+done
+
+# Auto-detect source directory if not specified
+if [ -z "$KVBM_NEXT_DIR" ]; then
+ if [ -n "${DYNAMO_SOURCE_DIR:-}" ]; then
+ KVBM_NEXT_DIR="$DYNAMO_SOURCE_DIR"
+ elif [ -d "${SCRIPT_DIR}/kvbm_next_source" ] && [ -f "${SCRIPT_DIR}/kvbm_next_source/container/build.sh" ]; then
+ KVBM_NEXT_DIR="${SCRIPT_DIR}/kvbm_next_source"
+ echo "Auto-detected existing source: $KVBM_NEXT_DIR"
+ else
+ KVBM_NEXT_DIR="${SCRIPT_DIR}/kvbm_next_build"
+ fi
+fi
+
+echo "========================================================="
+echo "Building Dynamo vLLM Image with MultiLruBackend"
+echo "========================================================="
+echo ""
+echo "Configuration:"
+echo " Branch: $BRANCH"
+echo " Source Dir: $KVBM_NEXT_DIR"
+echo " Image Tag: $IMAGE_TAG"
+echo " Build Target: $BUILD_TARGET"
+echo " Cargo Jobs: $CARGO_BUILD_JOBS"
+echo " vLLM Jobs: $MAX_JOBS"
+echo " Skip Clone: $SKIP_CLONE"
+echo " No Cache: ${NO_CACHE:-false}"
+echo ""
+
+# Step 1: Clone or update the ryan/kvbm-next branch
+if [ "$SKIP_CLONE" = false ]; then
+ if [ -d "$KVBM_NEXT_DIR" ]; then
+ echo "Updating existing $BRANCH branch..."
+ cd "$KVBM_NEXT_DIR"
+ git fetch origin
+ git checkout "$BRANCH"
+ git pull origin "$BRANCH"
+ git submodule update --init --recursive
+ else
+ echo "Cloning $BRANCH branch..."
+ git clone --branch "$BRANCH" --depth 1 "$REPO_URL" "$KVBM_NEXT_DIR"
+ cd "$KVBM_NEXT_DIR"
+ git submodule update --init --recursive
+ fi
+ echo "✓ Source code ready at $KVBM_NEXT_DIR"
+else
+ if [ ! -d "$KVBM_NEXT_DIR" ]; then
+ echo "ERROR: --skip-clone specified but source directory doesn't exist: $KVBM_NEXT_DIR"
+ exit 1
+ fi
+ echo "Using existing source at $KVBM_NEXT_DIR"
+ cd "$KVBM_NEXT_DIR"
+fi
+echo ""
+
+# Step 2: Apply MultiLruBackend patch (if needed)
+# The scheduler at lib/bindings/kvbm/src/v2/scheduler/mod.rs may use LineageBackend by default.
+# We patch it to use MultiLruBackend for frequency-based eviction.
+SCHEDULER_FILE="lib/bindings/kvbm/src/v2/scheduler/mod.rs"
+
+if [ -f "$SCHEDULER_FILE" ]; then
+ if grep -q "with_lineage_backend" "$SCHEDULER_FILE"; then
+ echo "Patching scheduler to enable MultiLruBackend..."
+ sed -i 's/\.with_lineage_backend()/.with_multi_lru_backend()/g' "$SCHEDULER_FILE"
+
+ if grep -q "with_multi_lru_backend" "$SCHEDULER_FILE"; then
+ echo "✓ Scheduler patched: LineageBackend → MultiLruBackend"
+ grep -n "with_multi_lru_backend" "$SCHEDULER_FILE" | head -3
+ else
+ echo "WARNING: Patch may have failed - check $SCHEDULER_FILE"
+ fi
+ elif grep -q "with_multi_lru_backend" "$SCHEDULER_FILE"; then
+ echo "✓ Scheduler already uses MultiLruBackend"
+ else
+ echo "WARNING: Could not find backend configuration in $SCHEDULER_FILE"
+ echo " The scheduler may use a different configuration method."
+ fi
+else
+ echo "WARNING: Scheduler file not found at $SCHEDULER_FILE"
+ echo " This is expected if the branch structure has changed."
+fi
+echo ""
+
+# Step 3: Build the image using the branch's build.sh
+echo "========================================================="
+echo "Building Docker image..."
+echo "========================================================="
+echo ""
+echo "Build command:"
+echo " ./container/build.sh \\"
+echo " --framework VLLM \\"
+echo " --target $BUILD_TARGET \\"
+echo " --tag $IMAGE_TAG \\"
+echo " --enable-kvbm \\"
+echo " --build-arg CARGO_BUILD_JOBS=$CARGO_BUILD_JOBS \\"
+echo " --vllm-max-jobs $MAX_JOBS \\"
+echo " $NO_CACHE $DRY_RUN"
+echo ""
+
+# Make build.sh executable
+chmod +x container/build.sh
+
+# Run the build
+# Note: --enable-kvbm is automatically set for the VLLM framework, but we pass it explicitly for clarity.
+# Capture the exit code explicitly: with `set -e`, a bare failing command would
+# abort the script before the troubleshooting branch below could run.
+BUILD_EXIT_CODE=0
+./container/build.sh \
+ --framework VLLM \
+ --target "$BUILD_TARGET" \
+ --tag "$IMAGE_TAG" \
+ --enable-kvbm \
+ --build-arg "CARGO_BUILD_JOBS=$CARGO_BUILD_JOBS" \
+ --vllm-max-jobs "$MAX_JOBS" \
+ $NO_CACHE \
+ $DRY_RUN || BUILD_EXIT_CODE=$?
+
+if [ $BUILD_EXIT_CODE -eq 0 ]; then
+ echo ""
+ echo "========================================================="
+ echo "✓ Build successful!"
+ echo "========================================================="
+ echo ""
+ echo "Image: $IMAGE_TAG"
+ echo ""
+
+ # Verify the image has KVBM installed
+ echo "Verifying image contents..."
+ if docker run --rm "$IMAGE_TAG" python3 -c "import kvbm; print('✓ KVBM module installed')" 2>/dev/null; then
+ echo ""
+ else
+ echo "⚠ Warning: Could not verify KVBM installation in image"
+ fi
+
+ # Check for DynamoScheduler
+ if docker run --rm "$IMAGE_TAG" python3 -c "from kvbm.v2.vllm.schedulers.dynamo import DynamoScheduler; print('✓ DynamoScheduler available')" 2>/dev/null; then
+ echo ""
+ else
+ echo "⚠ Warning: Could not verify DynamoScheduler in image"
+ fi
+
+ echo "Features:"
+ echo " - vLLM v0.14.0 backend"
+ echo " - KVBM with MultiLruBackend (4-pool frequency-based eviction)"
+ echo " - CUDA 12.9"
+ echo " - Python 3.12"
+ echo " - NIXL 0.9.0 for KV transfer"
+ echo ""
+ echo "MultiLruBackend Configuration:"
+ echo " - 4 priority pools: Cold → Warm → Hot → VeryHot"
+ echo " - Default promotion thresholds: [2, 6, 15] accesses"
+ echo " - Frequently accessed blocks protected from eviction"
+ echo ""
+ echo "To use this image, update your startup script:"
+ echo " IMAGE=\"$IMAGE_TAG\""
+ echo ""
+ echo "Or set the environment variable:"
+ echo " export DYNAMO_VLLM_IMAGE=\"$IMAGE_TAG\""
+ echo ""
+ echo "Then run:"
+ echo " ./start_dynamo_optimized_thompson_hints_vllm_multilru.sh"
+ echo ""
+else
+ echo ""
+ echo "========================================================="
+ echo "✗ Build failed with exit code: $BUILD_EXIT_CODE"
+ echo "========================================================="
+ echo ""
+ echo "Troubleshooting:"
+ echo " 1. Check Docker daemon is running"
+ echo " 2. Ensure sufficient disk space (needs ~50GB)"
+ echo " 3. Try reducing parallelism:"
+ echo " DYNAMO_BUILD_JOBS=2 DYNAMO_MAX_JOBS=4 ./build_multi_lru_image.sh"
+ echo " 4. Check build logs above for specific errors"
+ echo ""
+ exit $BUILD_EXIT_CODE
+fi
+
diff --git a/external/dynamo/collect_metrics.sh b/external/dynamo/collect_metrics.sh
new file mode 100755
index 0000000000..aeeb3dc0ea
--- /dev/null
+++ b/external/dynamo/collect_metrics.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Dynamo Metrics Collector
+# Saves metrics from Frontend and Worker to timestamped files
+
+OUTPUT_DIR="${1:-./metrics_logs}"
+INTERVAL="${2:-30}" # Collection interval in seconds
+
+mkdir -p "$OUTPUT_DIR"
+
+echo "=== Dynamo Metrics Collector ==="
+echo "Output directory: $OUTPUT_DIR"
+echo "Collection interval: ${INTERVAL}s"
+echo "Press Ctrl+C to stop"
+echo ""
+
+collect_metrics() {
+ local timestamp=$(date +%Y%m%d_%H%M%S)
+ local frontend_file="$OUTPUT_DIR/frontend_${timestamp}.prom"
+ local worker_file="$OUTPUT_DIR/worker_${timestamp}.prom"
+ local combined_file="$OUTPUT_DIR/combined_${timestamp}.prom"
+
+ echo "[$(date)] Collecting metrics..."
+
+ # Collect frontend metrics
+ curl -s http://localhost:8000/metrics > "$frontend_file" 2>/dev/null
+
+ # Collect worker metrics
+ curl -s http://localhost:8081/metrics > "$worker_file" 2>/dev/null
+
+ # Create combined file with headers
+ {
+ echo "# Collected at: $(date -Iseconds)"
+ echo "# === FRONTEND METRICS ==="
+ cat "$frontend_file"
+ echo ""
+ echo "# === WORKER METRICS ==="
+ cat "$worker_file"
+ } > "$combined_file"
+
+ # Also append key metrics to a rolling log for quick trend inspection
+ {
+ echo "# Timestamp: $(date -Iseconds)"
+ grep -E '^dynamo_frontend_(requests_total|time_to_first_token|inter_token_latency|inflight)' "$frontend_file" 2>/dev/null
+ grep -E '^dynamo_component_(request_duration|inflight|kvstats)' "$worker_file" 2>/dev/null
+ echo ""
+ } >> "$OUTPUT_DIR/rolling_metrics.log"
+
+ echo " Saved: $combined_file"
+}
+
+# Collect once immediately
+collect_metrics
+
+# Then collect at intervals
+while true; do
+ sleep "$INTERVAL"
+ collect_metrics
+done
+
diff --git a/external/dynamo/demo_priority_eviction.sh b/external/dynamo/demo_priority_eviction.sh
new file mode 100755
index 0000000000..d876f4471b
--- /dev/null
+++ b/external/dynamo/demo_priority_eviction.sh
@@ -0,0 +1,167 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+#
+# MultiLRU Priority Eviction Demo
+# ================================
+# Demonstrates frequency-based cache eviction protection
+#
+# Prerequisites:
+# - Start Dynamo with: DYNAMO_NUM_GPU_BLOCKS_OVERRIDE=12
+# - This gives us 12 blocks total (small cache for quick demo)
+#
+# ┌─────────────────────────────────────────────────────────────────────────┐
+# │ RECOMMENDED: Run the KV Event Observer in a separate terminal │
+# │ │
+# │ This lets you see cache events in real-time as the demo runs: │
+# │ 📦 STORED - Blocks committed to prefix cache │
+# │ 🗑️ REMOVED - Blocks evicted (should be COLD blocks, not HOT!) │
+# │ ✅ CACHE HIT - Tokens served from cache │
+# │ │
+# │ Run inside the container: │
+# │ docker exec -it dynamo-vllm python \ │
+# │ /workspace/monitoring/scripts/kv_event_observer.py \ │
+# │ --port 20080 --verbose --metrics-port 18081 │
+# │ │
+# │ This shows you EXACTLY what the MultiLRU eviction policy is doing: │
+# │ - Watch HOT blocks get stored and stay in cache │
+# │ - Watch COLD blocks get stored then evicted │
+# │ - Verify HOT blocks are protected when cache fills up │
+# └─────────────────────────────────────────────────────────────────────────┘
+#
+# What this demo shows:
+# 1. Access a "HOT" prompt multiple times (promotes to VeryHot pool)
+# 2. Fill cache with unique "COLD" prompts (forces eviction)
+# 3. Access HOT prompt again - it still gets cache hits!
+# 4. Cold blocks were evicted, hot blocks protected
+
+set -euo pipefail
+
+API="http://localhost:8000/v1/completions"
+MODEL="llama-3.3-70b"
+
+# Long prompt to fill ~2 blocks (128+ tokens with block_size=64)
+HOT_PROMPT="HOT_DEMO: This prompt will be accessed frequently and should be protected from eviction by the MultiLRU frequency-based cache management system. The quick brown fox jumps over the lazy dog multiple times throughout this demonstration. First jump over the lazy dog. Second jump over the lazy dog. Third jump over the lazy dog. Fourth jump over the lazy dog. Fifth jump over the lazy dog. Sixth jump over the lazy dog. Seventh jump over the lazy dog. Eighth jump over the lazy dog. This text ensures we have enough tokens to fill at least two complete KV cache blocks for proper prefix caching behavior."
+
+echo "╔══════════════════════════════════════════════════════════════╗"
+echo "║ MultiLRU Priority Eviction Demo ║"
+echo "║ Thresholds: [3, 8, 15] accesses for pool promotion ║"
+echo "╚══════════════════════════════════════════════════════════════╝"
+echo ""
+
+# Get baseline
+get_hits() {
+ docker exec dynamo-vllm curl -s http://localhost:18081/metrics 2>/dev/null | \
+ grep "prefix_cache_hits_total{" | grep -v external | awk '{print $NF}'
+}
+
+BASELINE=$(get_hits)
+echo "📊 Baseline cache hits: $BASELINE"
+echo ""
+
+# ============================================================
+# STEP 1: Make HOT prompt "hot" (20 accesses → VeryHot pool)
+# ============================================================
+echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+echo "🔥 STEP 1: Access HOT prompt 20 times (threshold for VeryHot: 15)"
+echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+
+for i in {1..20}; do
+ curl -s "$API" -H "Content-Type: application/json" -d "{
+ \"model\": \"$MODEL\",
+ \"prompt\": \"$HOT_PROMPT\",
+ \"max_tokens\": 2,
+ \"nvext\": {
+ \"annotations\": [
+ \"prefix_id:hot-demo-prompt\",
+ \"backend:frequency_multi_lru\"
+ ]
+ }
+ }" > /dev/null
+ echo -n "🔥"
+done
+echo ""
+
+AFTER_HOT=$(get_hits)
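+# Prometheus exposes counters as floats; "${VAR%.*}" strips the fractional part so bash integer arithmetic works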
+HOT_HITS=$((${AFTER_HOT%.*} - ${BASELINE%.*}))
+echo " Cache hits from HOT prompt: $HOT_HITS tokens"
+echo " → HOT blocks now in VeryHot pool (protected)"
+echo ""
+
+# ============================================================
+# STEP 2: Fill cache with COLD prompts (forces eviction)
+# ============================================================
+echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+echo "❄️ STEP 2: Fill cache with 20 unique COLD prompts"
+echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+
+for i in {1..20}; do
+ # Each COLD prompt is unique and fills 2+ blocks
+ COLD="COLD_$i: This is unique cold prompt number $i designed to fill the KV cache and trigger eviction. The quick brown fox jumps over the lazy dog. First unique jump $i. Second unique jump $i. Third unique jump $i. Fourth unique jump $i. Fifth unique jump $i. Sixth unique jump $i. Adding more padding text to ensure this prompt fills at least two complete cache blocks. Extra content for block filling: $i $i $i $i $i $i $i $i."
+ curl -s "$API" -H "Content-Type: application/json" -d "{
+ \"model\": \"$MODEL\",
+ \"prompt\": \"$COLD\",
+ \"max_tokens\": 2,
+ \"nvext\": {
+ \"annotations\": [
+ \"prefix_id:cold-$i\",
+ \"backend:frequency_multi_lru\"
+ ]
+ }
+ }" > /dev/null
+ echo -n "❄️"
+done
+echo ""
+
+AFTER_COLD=$(get_hits)
+echo " Cold prompts added (each unique, no cache hits expected)"
+echo " → Eviction should have occurred (cache overflow)"
+echo ""
+
+# ============================================================
+# STEP 3: Test HOT prompt - should still get cache hits!
+# ============================================================
+echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+echo "🎯 STEP 3: Access HOT prompt again (was it protected?)"
+echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+
+for i in {1..5}; do
+ curl -s "$API" -H "Content-Type: application/json" -d "{
+ \"model\": \"$MODEL\",
+ \"prompt\": \"$HOT_PROMPT\",
+ \"max_tokens\": 2,
+ \"nvext\": {
+ \"annotations\": [
+ \"prefix_id:hot-demo-prompt\",
+ \"backend:frequency_multi_lru\"
+ ]
+ }
+ }" > /dev/null
+ echo -n "🎯"
+done
+echo ""
+
+FINAL=$(get_hits)
+FINAL_HITS=$((${FINAL%.*} - ${AFTER_COLD%.*}))
+echo ""
+
+# ============================================================
+# RESULTS
+# ============================================================
+echo "╔══════════════════════════════════════════════════════════════╗"
+echo "║ RESULTS ║"
+echo "╠══════════════════════════════════════════════════════════════╣"
+printf "║ HOT prompt initial cache hits: %6d tokens ║\n" "$HOT_HITS"
+printf "║ HOT prompt hits AFTER eviction: %6d tokens ║\n" "$FINAL_HITS"
+echo "╠══════════════════════════════════════════════════════════════╣"
+
+if [ "$FINAL_HITS" -gt 0 ]; then
+ echo "║ ✅ SUCCESS: Hot blocks PROTECTED from eviction! ║"
+ echo "║ ║"
+ echo "║ MultiLRU frequency-based eviction kept the frequently ║"
+ echo "║ accessed blocks while evicting cold (single-access) ones. ║"
+else
+ echo "║ ❌ Hot blocks were evicted (no protection) ║"
+fi
+echo "╚══════════════════════════════════════════════════════════════╝"
+
diff --git a/external/dynamo/generalized/processor.py b/external/dynamo/generalized/processor.py
index 7403000a55..c984842442 100644
--- a/external/dynamo/generalized/processor.py
+++ b/external/dynamo/generalized/processor.py
@@ -426,7 +426,7 @@ def parse_args():
async def worker(runtime: DistributedRuntime):
args = parse_args()
component = runtime.namespace("dynamo").component("processor")
- await component.create_service()
+ # NOTE: create_service() was removed in Dynamo 0.8.x - endpoint creation handles registration
handler = ProcessorRequestHandler(runtime, model_name=args.model, enable_router=args.enable_router)
await handler.initialize()
diff --git a/external/dynamo/generalized/router.py b/external/dynamo/generalized/router.py
index 96274617b7..eb8bd04eb0 100644
--- a/external/dynamo/generalized/router.py
+++ b/external/dynamo/generalized/router.py
@@ -1067,7 +1067,7 @@ async def worker(runtime: DistributedRuntime):
args = parse_args()
component = runtime.namespace("dynamo").component("router")
- await component.create_service()
+ # NOTE: create_service() was removed in Dynamo 0.8.x - endpoint creation handles registration
logger.info("Initializing WorkloadAwareRouter (LinTS + feedback + timeout + traces)")
router = WorkloadAwareRouter(
diff --git a/external/dynamo/monitor_dynamo.sh b/external/dynamo/monitor_dynamo.sh
index 156323f698..f3dffbfba2 100755
--- a/external/dynamo/monitor_dynamo.sh
+++ b/external/dynamo/monitor_dynamo.sh
@@ -124,9 +124,9 @@ case $option in
;;
6)
print_header "Health Check"
- echo "Testing: http://localhost:8099/health"
+ echo "Testing: http://localhost:8000/health"
echo ""
- http_code=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:8099/health 2>&1)
+ http_code=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:8000/health 2>&1)
if [ "$http_code" == "200" ]; then
print_status "ok" "Health check passed (HTTP $http_code)"
else
@@ -135,9 +135,9 @@ case $option in
;;
7)
print_header "Test Basic Inference"
- echo "Sending test request to http://localhost:8099/v1/chat/completions"
+ echo "Sending test request to http://localhost:8000/v1/chat/completions"
echo ""
- response=$(curl -s http://localhost:8099/v1/chat/completions \
+ response=$(curl -s http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama-3.1-8b",
diff --git a/external/dynamo/monitoring/README.md b/external/dynamo/monitoring/README.md
new file mode 100644
index 0000000000..060ce892aa
--- /dev/null
+++ b/external/dynamo/monitoring/README.md
@@ -0,0 +1,982 @@
+# Dynamo Monitoring Stack
+
+This directory contains a Prometheus + Grafana monitoring setup for the Dynamo LLM inference stack with the Thompson Sampling router. Metrics are scraped at **2-second resolution** directly from the ai-dynamo components' Prometheus `/metrics` endpoints, giving per-request granularity.
+
+## Supported Backends
+
+The monitoring stack supports both **SGLang** and **vLLM** backends:
+
+| Backend | Metric Prefix | Startup Script | Features |
+|---------|---------------|----------------|----------|
+| SGLang | `sglang:` | `start_dynamo_optimized_thompson_hints_sglang.sh` | Fast inference |
+| vLLM | `vllm:` | `start_dynamo_optimized_thompson_hints_vllm.sh` | Native KVBM support |
+
+The Grafana dashboard includes a **Backend** dropdown selector to switch between SGLang and vLLM metrics dynamically.
+
+## Quick Start
+
+The monitoring stack starts **automatically** when you run the Dynamo startup script:
+
+```bash
+# Start Dynamo (monitoring starts automatically)
+bash start_dynamo_optimized_thompson_hints_vllm.sh
+
+# Or start monitoring manually if needed
+cd monitoring
+docker compose up -d
+```
+
+**Access the dashboards:**
+- **Grafana**: http://localhost:3000 (no login required)
+- **Prometheus**: http://localhost:9090
+
+**Direct dashboard link:**
+```
+http://localhost:3000/d/dynamo-overview/dynamo-llm-overview
+```
+
+In Grafana, use the **Backend** dropdown to select `sglang` or `vllm` based on your deployment.
+
+## Prerequisites
+
+- Docker and Docker Compose
+- Dynamo stack running (see `../start_dynamo_optimized_thompson_hints_sglang.sh` or `../start_dynamo_optimized_thompson_hints_vllm.sh`)
+
+## Accessing Grafana Dashboard
+
+### Local Access
+
+If running on your local machine:
+
+1. Open your browser
+2. Navigate to: **http://localhost:3000/d/dynamo-overview/dynamo-llm-overview**
+3. No login required (anonymous access enabled)
+4. Use the **Backend** dropdown (top left) to select `sglang` or `vllm`
+5. Use the **time filter** (top right) to adjust the time range
+
+### Remote Access via SSH Tunnel
+
+If Dynamo and monitoring are running on a remote server (for example, a GPU cluster), use SSH port forwarding:
+
+**Step 1: Create SSH tunnel**
+```bash
+# Replace <user> and <server-ip> with your credentials
+ssh -L 3000:localhost:3000 <user>@<server-ip>
+
+# Example with VPN-accessible server:
+ssh -L 3000:localhost:3000 myuser@10.57.201.5
+```
+
+**Step 2: Open browser**
+Navigate to: **http://localhost:3000/d/dynamo-overview/dynamo-llm-overview**
+
+**Step 3: Set time filter**
+- Click the time picker in the top-right corner of Grafana
+- Select a preset range (Last 1 hour, Last 6 hours, Last 24 hours)
+- Or set a custom range to view historical data from previous benchmark runs
+
+> **Tip**: Data persists across restarts. Zoom out to the last 12-24 hours to see multiple benchmark intervals.
+
+### Viewing Historical Data
+
+Prometheus stores metrics data persistently. To view data from previous runs:
+
+1. Open the Grafana dashboard
+2. Use the time picker (top right) to expand the time range
+3. Look for intervals of activity separated by gaps
+4. Compare KV Efficiency scores across different runs
+
+**Example observation**: With a tool-calling agent (20 tools) on 4xH100 with 2 workers, you might see:
+- Worker 18081: 25.4% average KV Efficiency
+- Worker 18082: 16.4% average KV Efficiency
+
+### Sharing Dashboard Access
+
+Anyone with SSH access to the remote server can view the same data:
+
+1. Share the SSH tunnel command with team members
+2. They can connect and view real-time or historical metrics
+3. Useful for collaborative debugging and performance analysis
+
+## Architecture
+
+The monitoring stack collects metrics from all Dynamo components. The architecture uses **model name isolation** to ensure all requests flow through the Thompson Sampling router.
+
+### Request Flow (Model Name Isolation)
+
+```
+Client Request (with nvext.annotations)
+ ↓
+┌─────────────────────────────────────────────────────────────────────────┐
+│ Default Dynamo Frontend (:8000) │
+│ - Tokenization + nvext parsing │
+│ - ETCD ModelWatcher (namespace=dynamo) │
+│ - Routes to processor ONLY (workers use internal model name) │
+└─────────────────────────────────────────────────────────────────────────┘
+ ↓ discovers processor (model: llama-3.3-70b)
+┌─────────────────────────────────────────────────────────────────────────┐
+│ Custom Processor (:18091/metrics) │
+│ - Extracts hints: prefix_id, total_requests, osl, iat │
+│ - Queries Thompson Sampling router │
+│ - Registered at: dynamo.backend.generate (namespace=dynamo) │
+└─────────────────────────────────────────────────────────────────────────┘
+ ↓ queries router
+┌─────────────────────────────────────────────────────────────────────────┐
+│ Custom Router (:18090/metrics) │
+│ - Thompson Sampling + KV overlap scoring │
+│ - Returns optimal worker_id │
+│ - Registered at: dynamo.router.{find_worker,feedback} │
+└─────────────────────────────────────────────────────────────────────────┘
+ ↓ returns worker_id
+┌─────────────────────────────────────────────────────────────────────────┐
+│ vLLM/SGLang Workers (:18081, :18082, ... /metrics) │
+│ - Registered at: workers.worker.generate (namespace=workers) │
+│ - Model: llama-3.3-70b-internal (hidden from frontend) │
+│ - Each worker uses TP_SIZE GPUs │
+└─────────────────────────────────────────────────────────────────────────┘
+ ↓
+Response + Feedback to Router
+```
+
+### Metrics Collection
+
+```
+┌──────────────────────────────────────────────────────────────────────────────┐
+│ Dynamo Stack │
+│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
+│ │ Frontend │ │ Workers │ │ Router │ │ Processor │ │
+│ │ :8000 │ │ :18081-180xx│ │ :18090 │ │ :18091 │ │
+│ │ /metrics │ │ /metrics │ │ /metrics │ │ /metrics │ │
+│ │ (latency) │ │ (KV cache) │ │ (routing) │ │ (KVE) │ │
+│ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ │
+└─────────┼────────────────┼────────────────┼────────────────┼─────────────────┘
+ │ │ │ │
+ ▼ ▼ ▼ ▼
+┌──────────────────────────────────────────────────────────────────────────────┐
+│ Monitoring Stack │
+│ ┌────────────────────────────────────────────────────────────────────────┐ │
+│ │ Prometheus :9090 │ │
+│ │ Scrapes all endpoints every 2 seconds for per-request granularity: │ │
+│ │ - Frontend (:8000) - latency, throughput, tokens │ │
+│ │ - Workers (:18081-180xx) - KV cache, backend stats (per-worker) │ │
+│ │ - Router (:18090) - Thompson Sampling routing metrics │ │
+│ │ - Processor (:18091) - Thompson Sampling KVE metrics │ │
+│ └────────────────────────────────┬───────────────────────────────────────┘ │
+│ │ │
+│ ▼ │
+│ ┌────────────────────────────────────────────────────────────────────────┐ │
+│ │ Grafana :3000 │ │
+│ │ Dashboard: "Dynamo LLM Overview" │ │
+│ │ URL: /d/dynamo-overview/dynamo-llm-overview │ │
+│ │ Access: Anonymous (no login required) │ │
+│ └────────────────────────────────────────────────────────────────────────┘ │
+└──────────────────────────────────────────────────────────────────────────────┘
+```
+
+### Model Name Isolation Explained
+
+| Component | Model Name | Namespace | Purpose |
+|-----------|------------|-----------|---------|
+| Workers | `llama-3.3-70b-internal` | `workers` | Hidden from frontend discovery |
+| Processor | `llama-3.3-70b` | `dynamo` | Discovered by frontend |
+| Router | N/A | `dynamo` | Internal routing service |
+
+This isolation ensures **ALL requests** go through the Thompson Sampling router, enabling:
+- KV overlap-aware worker selection
+- Workload hint extraction (prefix_id, osl, iat), as shown in the example request below
+- Per-request feedback for router learning
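+
+For example, a client can attach these hints through `nvext.annotations` when calling the frontend (a hypothetical request; the hint values here are purely illustrative):
+
+```bash
+# One completion through the frontend with workload hints attached.
+# prefix_id/osl/iat are the hint keys listed above; the values are made up.
+curl -s http://localhost:8000/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "llama-3.3-70b",
+    "prompt": "Hello",
+    "max_tokens": 8,
+    "nvext": {"annotations": ["prefix_id:demo-prefix", "osl:128", "iat:0.5"]}
+  }'
+```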
+
+## Metrics Endpoints
+
+| Component | Port(s) | URL | Description |
+|-----------|---------|-----|-------------|
+| Frontend | 8000 | `http://localhost:8000/metrics` | User-facing metrics (latency, throughput, tokens) |
+| Workers | 18081+ | `http://localhost:18081/metrics` | KV cache, backend stats - one port per worker |
+| Router | 18090 | `http://localhost:18090/metrics` | Thompson Sampling routing decisions |
+| Processor | 18091 | `http://localhost:18091/metrics` | Thompson Sampling KVE (KV Efficiency) metrics |
+
+### Worker Port Allocation
+
+Worker metrics ports are sequential starting at `DYNAMO_WORKER_METRICS_PORT` (default: 18081):
+
+| Configuration | Workers | GPU Allocation | Metrics Ports |
+|---------------|---------|----------------|---------------|
+| 8 GPUs, TP=4 | 2 | GPUs 0-3, 4-7 | 18081, 18082 |
+| 8 GPUs, TP=2 | 4 | GPUs 0-1, 2-3, 4-5, 6-7 | 18081-18084 |
+| 4 GPUs, TP=2 | 2 | GPUs 0-1, 2-3 | 18081, 18082 |
+
+Each worker is identified in Grafana by its metrics port (for example, `instance="localhost:18081"`).
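+
+To confirm which worker metrics ports are live on a given machine, a small probe loop works (a sketch; widen the port range to match your worker count):
+
+```bash
+# Probe sequential worker metrics ports starting at the default base (18081)
+for port in 18081 18082 18083 18084; do
+  if curl -sf "http://localhost:${port}/metrics" > /dev/null; then
+    echo "worker metrics live on :${port}"
+  fi
+done
+```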
+
+## Key Metrics
+
+### Frontend Metrics (`:8000/metrics`)
+
+User-facing HTTP API metrics for latency, throughput, and token statistics.
+
+| Prefix | Full Metric Name | Type | Description |
+|--------|------------------|------|-------------|
+| `dynamo_frontend_` | `dynamo_frontend_requests_total` | Counter | Total requests processed |
+| `dynamo_frontend_` | `dynamo_frontend_inflight_requests` | Gauge | Currently processing requests |
+| `dynamo_frontend_` | `dynamo_frontend_queued_requests` | Gauge | Requests waiting in queue |
+| `dynamo_frontend_` | `dynamo_frontend_disconnected_clients` | Counter | Client disconnections |
+| `dynamo_frontend_` | `dynamo_frontend_time_to_first_token_seconds` | Histogram | Time until first token generated |
+| `dynamo_frontend_` | `dynamo_frontend_inter_token_latency_seconds` | Histogram | Time between consecutive tokens |
+| `dynamo_frontend_` | `dynamo_frontend_request_duration_seconds` | Histogram | Total request duration |
+| `dynamo_frontend_` | `dynamo_frontend_input_sequence_tokens` | Histogram | Input prompt length distribution |
+| `dynamo_frontend_` | `dynamo_frontend_output_sequence_tokens` | Histogram | Output length distribution |
+| `dynamo_frontend_` | `dynamo_frontend_output_tokens_total` | Counter | Total output tokens generated |
+| `dynamo_frontend_` | `dynamo_frontend_model_context_length` | Gauge | Model context window size |
+| `dynamo_frontend_` | `dynamo_frontend_model_kv_cache_block_size` | Gauge | KV cache block size |
+
+### Worker Metrics (`:18081+/metrics`)
+
+Backend worker metrics cover KV cache, scheduling, and internal statistics. Both SGLang and vLLM expose similar metrics under different prefixes (a quick way to check which prefix your deployment exposes follows this list):
+- **SGLang**: Metrics prefixed with `sglang:` (e.g., `sglang:cache_hit_rate`)
+- **vLLM**: Metrics prefixed with `vllm:` (e.g., `vllm:cache_hit_rate`)
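+
+To check which prefix your deployment actually exposes, one option is to list the backend-native metric names from a worker endpoint (assuming worker 0 on :18081):
+
+```bash
+# List the backend-native metric names exported by worker 0
+curl -s http://localhost:18081/metrics | grep -Eo '^(sglang|vllm):[a-zA-Z_]+' | sort -u
+```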
+
+#### Dynamo Component Metrics
+
+| Prefix | Full Metric Name | Type | Description |
+|--------|------------------|------|-------------|
+| `dynamo_component_kvstats_` | `dynamo_component_kvstats_gpu_cache_usage_percent` | Gauge | KV cache memory utilization (0-100) |
+| `dynamo_component_kvstats_` | `dynamo_component_kvstats_gpu_prefix_cache_hit_rate` | Gauge | Prefix cache hit rate (0-1) |
+| `dynamo_component_kvstats_` | `dynamo_component_kvstats_active_blocks` | Gauge | Active KV cache blocks |
+| `dynamo_component_kvstats_` | `dynamo_component_kvstats_total_blocks` | Gauge | Total KV cache blocks |
+| `dynamo_component_` | `dynamo_component_request_duration_seconds` | Histogram | Backend request processing time |
+| `dynamo_component_` | `dynamo_component_requests_total` | Counter | Total requests to worker |
+| `dynamo_component_` | `dynamo_component_inflight_requests` | Gauge | Requests currently in worker |
+| `dynamo_component_` | `dynamo_component_uptime_seconds` | Gauge | Worker uptime |
+
+#### Backend Native Metrics
+
+Both SGLang and vLLM expose similar native metrics with their respective prefixes. Use the `${backend}` variable in the Grafana dashboard to switch between them.
+
+**Common metrics across both backends:**
+
+| Metric (use `${backend}:` prefix) | Type | Description |
+|-----------------------------------|------|-------------|
+| `cache_hit_rate` | Gauge | Prefix cache hit rate |
+| `token_usage` | Gauge | Current token usage |
+| `num_running_reqs` | Gauge | Currently running requests |
+| `num_queue_reqs` | Gauge | Queued requests |
+| `num_used_tokens` | Gauge | Tokens currently in use |
+| `gen_throughput` | Gauge | Generation throughput |
+
+**SGLang-specific metrics:**
+
+| Prefix | Full Metric Name | Type | Description |
+|--------|------------------|------|-------------|
+| `sglang:` | `sglang:utilization` | Gauge | GPU utilization |
+| `sglang:` | `sglang:queue_time_seconds` | Histogram | Time spent in queue |
+| `sglang:` | `sglang:per_stage_req_latency_seconds` | Histogram | Per-stage request latency |
+| `sglang:` | `sglang:kv_transfer_latency_ms` | Gauge | KV transfer latency |
+| `sglang:` | `sglang:kv_transfer_speed_gb_s` | Gauge | KV transfer speed |
+| `sglang:` | `sglang:engine_startup_time` | Gauge | Engine startup duration |
+| `sglang:` | `sglang:engine_load_weights_time` | Gauge | Model weight loading time |
+
+**vLLM-specific metrics:**
+
+| Prefix | Full Metric Name | Type | Description |
+|--------|------------------|------|-------------|
+| `vllm:` | `vllm:gpu_cache_usage_perc` | Gauge | GPU KV cache usage percentage |
+| `vllm:` | `vllm:cpu_cache_usage_perc` | Gauge | CPU KV cache usage percentage |
+| `vllm:` | `vllm:num_requests_running` | Gauge | Currently running requests |
+| `vllm:` | `vllm:num_requests_waiting` | Gauge | Waiting requests in queue |
+| `vllm:` | `vllm:generation_tokens_total` | Counter | Total generation tokens |
+| `vllm:` | `vllm:prompt_tokens_total` | Counter | Total prompt tokens |
+
+### Router Metrics (`:18090/metrics`)
+
+Dynamo component metrics for the Thompson Sampling router (uses standard `dynamo_component_*` prefix).
+
+| Prefix | Full Metric Name | Type | Description |
+|--------|------------------|------|-------------|
+| `dynamo_component_` | `dynamo_component_requests_total` | Counter | Total routing requests (labeled by endpoint) |
+| `dynamo_component_` | `dynamo_component_request_duration_seconds` | Histogram | Routing decision latency |
+| `dynamo_component_` | `dynamo_component_request_bytes_total` | Counter | Request payload bytes |
+| `dynamo_component_` | `dynamo_component_response_bytes_total` | Counter | Response payload bytes |
+| `dynamo_component_` | `dynamo_component_inflight_requests` | Gauge | In-flight routing requests |
+| `dynamo_component_` | `dynamo_component_uptime_seconds` | Gauge | Router uptime |
+| `dynamo_component_nats_` | `dynamo_component_nats_service_requests_total` | Gauge | NATS service requests |
+| `dynamo_component_nats_` | `dynamo_component_nats_service_processing_ms_avg` | Gauge | Average NATS processing time |
+| `dynamo_component_nats_` | `dynamo_component_nats_client_connection_state` | Gauge | NATS connection state (0=disconnected, 1=connected) |
+
+**Router Endpoints** (use the `dynamo_endpoint` label to filter; a query sketch follows the list):
+- `find_worker` - Worker selection requests
+- `feedback` - Feedback from completed requests
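+
+As a sketch (assuming Prometheus at localhost:9090 and `jq` installed), the split across those two endpoints can be pulled from the Prometheus HTTP API:
+
+```bash
+# Per-endpoint router request rate over the last minute, via /api/v1/query
+curl -sG 'http://localhost:9090/api/v1/query' \
+  --data-urlencode 'query=sum by (dynamo_endpoint) (rate(dynamo_component_requests_total{dynamo_component="router"}[1m]))' \
+  | jq -r '.data.result[] | "\(.metric.dynamo_endpoint): \(.value[1]) req/s"'
+```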
+
+### Thompson Sampling Processor Metrics (`:18091/metrics`)
+
+Custom Thompson Sampling KV Efficiency (KVE) metrics from the processor component.
+
+| Prefix | Full Metric Name | Type | Description |
+|--------|------------------|------|-------------|
+| `dynamo_component_thompson_` | `dynamo_component_thompson_requests_total` | Counter | Total requests processed |
+| `dynamo_component_thompson_` | `dynamo_component_thompson_request_latency_seconds` | Histogram | End-to-end request latency |
+| `dynamo_component_thompson_` | `dynamo_component_thompson_tokens_in_total` | Counter | Total input tokens |
+| `dynamo_component_thompson_` | `dynamo_component_thompson_tokens_out_total` | Counter | Total output tokens |
+| `dynamo_component_thompson_` | `dynamo_component_thompson_routing_decisions_total` | Counter | Routing decisions made |
+| `dynamo_component_thompson_` | `dynamo_component_thompson_active_requests` | Gauge | Currently processing requests |
+| `dynamo_component_thompson_` | `dynamo_component_thompson_router_errors_total` | Counter | Router communication errors |
+| `dynamo_component_thompson_` | `dynamo_component_thompson_engine_errors_total` | Counter | Engine/worker errors |
+| `dynamo_component_thompson_kve_` | `dynamo_component_thompson_kve_prompt_tokens_total` | Counter | Total prompt tokens (KVE denominator) |
+| `dynamo_component_thompson_kve_` | `dynamo_component_thompson_kve_cached_tokens_total` | Counter | Cached tokens hit (KVE numerator) |
+| `dynamo_component_thompson_kve_` | `dynamo_component_thompson_kve_device_blocks_total` | Counter | KV blocks from GPU memory |
+| `dynamo_component_thompson_kve_` | `dynamo_component_thompson_kve_host_blocks_total` | Counter | KV blocks from CPU memory |
+| `dynamo_component_thompson_kve_` | `dynamo_component_thompson_kve_disk_blocks_total` | Counter | KV blocks from disk |
+
+**KV Cache Efficiency Score (KVES) Calculation:**
+
+The full KVES formula is:
+```
+KVES = (TotalWork - ActualWork) / TotalWork ∈ [0,1]
+ where 0 = no cache benefit, 1 = full reuse
+
+TotalWork  = w_compute * total_prompt_blocks * block_size
+ActualWork = sum_over_sources(w_hit[source] * hit_blocks[source] * block_size)
+             + w_compute * recomputed_prefill_blocks * block_size
+w_hit = (w_gpu_hit, w_cpu_hit, w_disk_hit) # weights per hit source (GPU/CPU/disk)
+```
+
+Since the full KVES requires GPU/CPU/disk hit breakdowns, we use a **simplified KVES proxy** based on the cache hit rate (note that vLLM with KVBM enabled provides richer KV cache metrics than SGLang):
+
+```promql
+# KVES Proxy (using SGLang native metric - RECOMMENDED)
+sglang:cache_hit_rate
+
+# As percentage
+sglang:cache_hit_rate * 100
+```
+
+> **Why use SGLang's native metric?** SGLang computes cache hit rate internally but doesn't include
+> `cached_tokens` in its API responses. The processor's `thompson_kve_*` counters will show 0
+> unless the underlying engine provides `usage.prompt_tokens_details.cached_tokens`.
+
+> **Note on Full KVES**: To implement the full KVES equation with CPU/disk hit weights, use
+> vLLM with KVBM enabled, which provides GPU→CPU→Disk tiered caching with proper metrics.
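+
+To check whether your engine surfaces `cached_tokens` at all, inspect a single completion response directly (a quick sanity check, assuming `jq` is installed; the field comes back `null` when the engine omits it):
+
+```bash
+# Does the engine report cached tokens in the OpenAI-style usage block?
+curl -s http://localhost:8000/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{"model": "llama-3.3-70b", "prompt": "Hello", "max_tokens": 4}' \
+  | jq '.usage.prompt_tokens_details.cached_tokens'
+```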
+
+## KV Cache Metrics Status
+
+This section documents the working status of all KV cache related metrics across the Dynamo stack.
+
+**Backend Selection**: The Grafana dashboard uses a `${backend}` template variable. Select `sglang` or `vllm` from the dropdown to switch all backend-specific queries.
+
+### Working Metrics ✓
+
+| Prefix | Full Metric Name | Status | Description |
+|--------|------------------|--------|-------------|
+| `sglang:` | `sglang:token_usage` | ✓ **WORKING** | KV cache memory usage as ratio (0-1). Multiply by 100 for percentage. |
+| `sglang:` | `sglang:num_used_tokens` | ✓ **WORKING** | Absolute number of tokens currently stored in KV cache. |
+| `dynamo_component_kvstats_` | `dynamo_component_kvstats_total_blocks` | ✓ **WORKING** | Total KV cache blocks available (capacity). |
+| `sglang:` | `sglang:gen_throughput` | ✓ **WORKING** | Token generation throughput (tokens/sec). |
+
+### Conditionally Working Metrics ⚠
+
+| Prefix | Full Metric Name | Status | Notes |
+|--------|------------------|--------|-------|
+| `sglang:` | `sglang:cache_hit_rate` | ⚠ **CONDITIONAL** | Shows prefix cache hit rate (0-1). Requires repeated queries with shared prefixes to see non-zero values. May stay at 0 if prefix caching is not effective for workload. |
+
+### Not Implemented / Always Zero Metrics
+
+| Prefix | Full Metric Name | Status | Notes |
+|--------|------------------|--------|-------|
+| `sglang:` | `sglang:utilization` | ✗ **ALWAYS 0** | Exported but not populated in unified engine mode. Use `sglang:num_running_reqs` and `sglang:gen_throughput` instead to gauge worker activity. |
+| `sglang:` | `sglang:is_cuda_graph` | ✗ **ALWAYS 0** | CUDA graph optimization not enabled in current configuration. |
+| `sglang:` | `sglang:spec_accept_*` | ✗ **ALWAYS 0** | Speculative decoding metrics - not applicable without draft model. |
+
+### Non-Working Metrics ✗
+
+| Prefix | Full Metric Name | Status | Reason |
+|--------|------------------|--------|--------|
+| `dynamo_component_kvstats_` | `dynamo_component_kvstats_gpu_cache_usage_percent` | ✗ **NOT WORKING** | Dynamo's internal metric not populated by SGLang backend. Use `sglang:token_usage * 100` instead. |
+| `dynamo_component_kvstats_` | `dynamo_component_kvstats_gpu_prefix_cache_hit_rate` | ✗ **NOT WORKING** | Dynamo's internal metric not populated. Use `sglang:cache_hit_rate` instead. |
+| `dynamo_component_kvstats_` | `dynamo_component_kvstats_active_blocks` | ✗ **NOT WORKING** | Dynamo's internal metric not populated by SGLang backend. |
+| `dynamo_component_thompson_kve_` | `dynamo_component_thompson_kve_cached_tokens_total` | ✗ **NOT WORKING** | SGLang API doesn't return `cached_tokens` in response. |
+| `dynamo_component_thompson_kve_` | `dynamo_component_thompson_kve_prompt_tokens_total` | ✗ **NOT WORKING** | Counter stays at 0 due to API limitation. |
+| `dynamo_component_thompson_kve_` | `dynamo_component_thompson_kve_*_blocks_total` | ✗ **NOT WORKING** | Block-level KVE metrics not populated. |
+
+### Architecture-Specific Metrics (Always Zero for Llama)
+
+| Prefix | Full Metric Name | Status | Reason |
+|--------|------------------|--------|--------|
+| `sglang:` | `sglang:swa_token_usage` | N/A | Sliding Window Attention - not used by Llama architecture. |
+| `sglang:` | `sglang:mamba_usage` | N/A | Mamba architecture metric - not applicable to Llama. |
+| `sglang:` | `sglang:kv_transfer_*` | N/A | KV transfer metrics only used in disaggregated prefill/decode mode. |
+| `sglang:` | `sglang:pending_prealloc_token_usage` | N/A | Preallocation metric - typically 0 in standard operation. |
+
+### Recommended KV Cache Queries
+
+The following queries use the `${backend}` variable (set to `sglang` or `vllm` in Grafana):
+
+```promql
+# KV Cache Memory Usage % (RECOMMENDED - works with both backends!)
+${backend}:token_usage * 100
+
+# Absolute tokens in KV cache
+${backend}:num_used_tokens
+
+# Total KV cache capacity (blocks)
+dynamo_component_kvstats_total_blocks
+
+# Prefix Cache Hit Rate % (may be 0 without repeated prefix queries)
+${backend}:cache_hit_rate * 100
+
+# Token throughput
+${backend}:gen_throughput
+```
+
+**Direct queries** (without variable):
+```promql
+# SGLang specific
+sglang:token_usage * 100
+sglang:cache_hit_rate * 100
+
+# vLLM specific
+vllm:token_usage * 100
+vllm:cache_hit_rate * 100
+```
+
+## Grafana Dashboard
+
+### Dashboard Access
+
+| Property | Value |
+|----------|-------|
+| Dashboard Name | Dynamo LLM Overview |
+| Direct URL | `http://localhost:3000/d/dynamo-overview/dynamo-llm-overview` |
+| Authentication | None required (anonymous access enabled) |
+| Data Refresh | Every 2 seconds (configurable) |
+| Data Retention | Persistent (survives restarts) |
+
+### Backend Selector
+
+The dashboard includes a **Backend** dropdown variable at the top. Select:
+- **sglang** - For SGLang workers (metrics prefixed with `sglang:`)
+- **vllm** - For vLLM workers (metrics prefixed with `vllm:`)
+
+All backend-specific panels automatically update based on your selection.
+
+### Time Controls
+
+Use the time picker (top right) to:
+- Select preset ranges: Last 5 minutes, Last 1 hour, Last 6 hours, Last 24 hours
+- Set custom absolute time ranges for specific benchmark intervals
+- Use the refresh dropdown to control auto-refresh frequency
+
+### Dashboard Panels
+
+1. **Inflight Requests** - Current load across all components
+2. **Requests/min** - Throughput
+3. **Time to First Token (P95)** - Latency to start generating
+4. **KVES Proxy (Cache Hit Rate %)** - KV Efficiency Score proxy using prefix cache hit rate
+5. **TTFT Over Time** - P50/P95/P99 latency trends
+6. **ITL Over Time** - Inter-token latency trends
+7. **Token Throughput** - Tokens generated per second
+8. **KV Cache Usage** - Memory usage % and prefix cache hit rate % over time
+9. **KV Cache Tokens & Throughput** - Absolute token count and generation throughput
+10. **KV Cache Details (Per-Worker)** - Detailed per-worker metrics including:
+ - KVES: Prefix hit rate (%) - `avg_over_time(${backend}:cache_hit_rate[1m]) * 100`
+ - KV Usage (%) - `avg_over_time(${backend}:token_usage[1m]) * 100`
+ - KV Tokens Used - `last_over_time(${backend}:num_used_tokens[1m])`
+ - KV Capacity (blocks) - `last_over_time(dynamo_component_kvstats_total_blocks[1m])`
+ - Frontend Block Size - `last_over_time(dynamo_frontend_model_kv_cache_block_size[5m])`
+11. **KVES Proxy by Worker** - Color-coded efficiency score per worker (0-1 scale)
+12. **KV Cache Memory Usage % by Worker** - Per-worker memory utilization
+
+### Thompson Sampling Panels (Included)
+
+The dashboard includes these Thompson Sampling and worker monitoring panels:
+
+- **Routing Decisions/sec** - `rate(dynamo_component_thompson_routing_decisions_total[5m])`
+- **Worker Queue Depth** - `${backend}:num_queue_reqs`
+- **Worker Activity** - `${backend}:num_running_reqs`
+
+> **Note on KV Cache Metrics**: The dashboard uses backend-native metrics (`${backend}:token_usage`,
+> `${backend}:cache_hit_rate`, `${backend}:num_used_tokens`) which are reliably populated by both
+> SGLang and vLLM. The Dynamo-specific `dynamo_component_kvstats_*` metrics may not be populated
+> depending on your backend configuration. See the "KV Cache Metrics Status" section above for details.
+
+## Files
+
+```
+monitoring/
+├── docker-compose.yml # Prometheus + Grafana services
+├── prometheus.yml # Prometheus scrape configuration
+├── README.md # This file
+└── grafana/
+ └── provisioning/
+ ├── datasources/
+ │ └── datasources.yml # Prometheus datasource config
+ └── dashboards/
+ ├── dashboards.yml # Dashboard provider config
+ └── json/
+ └── dynamo-overview.json # Pre-built dashboard
+```
+
+## Usage
+
+### Automatic Startup (Recommended)
+
+The monitoring stack starts **automatically** when you run the Dynamo startup script:
+
+```bash
+# Start Dynamo with monitoring (vLLM backend)
+bash start_dynamo_optimized_thompson_hints_vllm.sh
+
+# Or SGLang backend
+bash start_dynamo_optimized_thompson_hints_sglang.sh
+```
+
+The script will:
+1. Start ETCD and NATS infrastructure
+2. Start Prometheus and Grafana containers
+3. Wait for monitoring services to be ready
+4. Start Dynamo components (workers, router, processor, frontend)
+
+### Manual Startup
+
+If you need to start monitoring separately:
+
+```bash
+cd monitoring
+docker compose up -d
+```
+
+### Stop Monitoring
+
+```bash
+docker compose down
+```
+
+### View Logs
+
+```bash
+docker compose logs -f prometheus
+docker compose logs -f grafana
+```
+
+### Reset Data (Start Fresh)
+
+```bash
+docker compose down -v # Removes ALL volumes (Prometheus + Grafana data)
+docker compose up -d
+```
+
+### Clear Prometheus Data Only
+
+If you're seeing duplicate labels in Grafana (for example, after restarting workers with new IDs), you can clear just the Prometheus data while keeping Grafana settings:
+
+```bash
+# Stop the monitoring containers
+docker stop dynamo-prometheus dynamo-grafana
+docker rm dynamo-prometheus dynamo-grafana
+
+# Remove just the Prometheus data volume (clears all historical metrics)
+docker volume rm monitoring_prometheus_data && echo "Prometheus data volume removed (old metrics cleared)"
+
+# Restart the monitoring stack with fresh data
+docker compose up -d
+```
+
+Alternatively, use the stop script with the `--kill-metrics` flag:
+
+```bash
+# From the dynamo directory
+bash stop_dynamo.sh --kill-metrics
+
+# Then remove the Prometheus volume
+docker volume rm monitoring_prometheus_data
+
+# Restart everything (monitoring will start automatically)
+bash start_dynamo_optimized_thompson_hints_vllm.sh
+```
+
+## Remote Access via SSH Port Forwarding
+
+If the monitoring stack is running on a remote GPU server (for example, a leased cluster node), use SSH port forwarding to access Grafana and Prometheus from your local machine.
+
+### Step-by-Step Remote Access
+
+**1. Create SSH tunnel to the remote server:**
+
+```bash
+# General syntax
+ssh -L 3000:localhost:3000 <user>@<server-ip>
+
+# Example with VPN-accessible server
+ssh -L 3000:localhost:3000 myuser@10.57.201.5
+```
+
+**2. Open the Grafana dashboard in your browser:**
+
+```
+http://localhost:3000/d/dynamo-overview/dynamo-llm-overview
+```
+
+**3. Configure the time range:**
+- Click the time picker (top right corner of Grafana UI)
+- Select a preset: Last 1 hour, Last 6 hours, Last 12 hours, Last 24 hours
+- Or set a custom absolute time range to view specific benchmark intervals
+
+**4. Select your backend:**
+- Use the **Backend** dropdown (top left) to choose `sglang` or `vllm`
+- All panels will automatically update to show backend-specific metrics
+
+### Sharing Data with Team Members
+
+Anyone with SSH access to the same server can view the monitoring data:
+
+```bash
+# Team member creates their own tunnel
+ssh -L 3000:localhost:3000 <user>@<server-ip>
+
+# Then opens the same dashboard URL
+# http://localhost:3000/d/dynamo-overview/dynamo-llm-overview
+```
+
+This enables collaborative analysis - multiple people can view the same data simultaneously to focus on specific signals.
+
+### Forward Multiple Ports
+
+To access both Grafana and Prometheus simultaneously:
+
+```bash
+ssh -L 3000:localhost:3000 -L 9090:localhost:9090 <user>@<server-ip>
+```
+
+Access:
+- Grafana: http://localhost:3000/d/dynamo-overview/dynamo-llm-overview
+- Prometheus: http://localhost:9090
+
+### Background SSH Tunnel
+
+To run the tunnel in the background (stays open after terminal closes):
+
+```bash
+ssh -f -N -L 3000:localhost:3000 -L 9090:localhost:9090 <user>@<server-ip>
+```
+
+- `-f`: Run in background after authentication
+- `-N`: Don't execute remote commands (tunnel only)
+
+To kill a background tunnel:
+```bash
+# Find the SSH process
+ps aux | grep "ssh -f -N -L 3000"
+
+# Kill it
+kill <PID>
+```
+
+### Viewing Historical Benchmark Data
+
+Prometheus persists all metrics data. To view historical benchmarks:
+
+1. Open the Grafana dashboard
+2. Expand the time range using the time picker (top right)
+3. Zoom out to 12-24 hours to see multiple benchmark intervals
+4. Gaps between data intervals indicate periods when Dynamo was stopped
+
+**Example**: After running multiple benchmark sessions, you might see:
+- Interval 1: Baseline configuration
+- Interval 2: Optimized parameters (small gap)
+- Interval 3: Best KV Efficiency (for example, Worker 18081: 25.4%, Worker 18082: 16.4%)
+
+## Manual Metrics Queries
+
+### Prometheus UI (http://localhost:9090)
+
+Example queries:
+
+```promql
+# Request rate (requests/second)
+rate(dynamo_frontend_requests_total[1m])
+
+# P95 Time to First Token
+histogram_quantile(0.95, rate(dynamo_frontend_time_to_first_token_seconds_bucket[5m]))
+
+# P99 Inter-Token Latency
+histogram_quantile(0.99, rate(dynamo_frontend_inter_token_latency_seconds_bucket[5m]))
+
+# Token throughput
+rate(dynamo_frontend_output_tokens_total[1m])
+
+# KV cache hit rate (Dynamo)
+dynamo_component_kvstats_gpu_prefix_cache_hit_rate
+
+# KV cache hit rate (SGLang native)
+sglang:cache_hit_rate
+
+# KV cache usage percentage
+dynamo_component_kvstats_gpu_cache_usage_percent
+
+# Thompson routing decisions rate
+rate(dynamo_component_thompson_routing_decisions_total[5m])
+
+# KV Efficiency / Cache Hit Rate (using SGLang native - RECOMMENDED)
+sglang:cache_hit_rate * 100
+
+# Router endpoint request rate
+rate(dynamo_component_requests_total{dynamo_component="router"}[5m])
+
+# Worker queue depth
+sglang:num_queue_reqs
+```
+
+### curl
+
+```bash
+# All frontend metrics
+curl -s http://localhost:8000/metrics
+
+# All worker metrics (Worker 0)
+curl -s http://localhost:18081/metrics
+
+# All worker metrics (Worker 1, if running multiple workers)
+curl -s http://localhost:18082/metrics
+
+# All router metrics
+curl -s http://localhost:18090/metrics
+
+# All processor metrics (Thompson Sampling)
+curl -s http://localhost:18091/metrics
+
+# Filter specific metrics
+curl -s http://localhost:8000/metrics | grep time_to_first_token
+curl -s http://localhost:18081/metrics | grep kvstats
+curl -s http://localhost:18081/metrics | grep "sglang:" # SGLang backend
+curl -s http://localhost:18081/metrics | grep "vllm:" # vLLM backend
+curl -s http://localhost:18091/metrics | grep thompson
+```
+
+## Troubleshooting
+
+### Prometheus can't scrape targets
+
+Check if Dynamo is running:
+```bash
+# Check frontend health
+curl http://localhost:8000/health
+
+# Check worker metrics (Worker 0)
+curl http://localhost:18081/metrics
+
+# Check router metrics
+curl http://localhost:18090/metrics
+
+# Check processor metrics
+curl http://localhost:18091/metrics
+```
+
+### Grafana shows "No data"
+
+1. **Verify Prometheus is scraping**: http://localhost:9090/targets
+ - All targets should show "UP" state
+ - Check for scrape errors in the "Error" column
+
+2. **Check if metrics exist**: http://localhost:9090/graph
+ - Query a metric name (for example, `dynamo_frontend_requests_total`)
+ - If no data, Dynamo may not be running or generating traffic
+
+3. **Ensure time range is correct in Grafana**:
+ - Click the time picker (top right)
+ - Select "Last 1 hour" or expand to see historical data
+ - If you just started, wait 30-60 seconds for initial data
+
+4. **Check backend selector**:
+ - Make sure the Backend dropdown matches your deployment (sglang vs vllm)
+ - Backend mismatch will result in empty panels
+
+### SSH tunnel issues
+
+If you can't access Grafana via SSH tunnel:
+
+```bash
+# Verify the tunnel is active
+ps aux | grep "ssh -L 3000"
+
+# Test if port 3000 is accessible locally
+curl -s http://localhost:3000/api/health
+
+# If "connection refused", recreate the tunnel
+ssh -L 3000:localhost:3000 <user>@<server-ip>
+```
+
+### Port conflicts
+
+If ports 9090 or 3000 are in use, modify `docker-compose.yml`:
+```yaml
+# Change Prometheus port
+command:
+ - '--web.listen-address=:9091' # Different port
+
+# Change Grafana port
+environment:
+ - GF_SERVER_HTTP_PORT=3001 # Different port
+```
+
+### Stale metrics after restart
+
+If you see old worker instances in Grafana after restarting Dynamo:
+
+```bash
+# Clear Prometheus data and restart
+docker stop dynamo-prometheus
+docker rm dynamo-prometheus
+docker volume rm monitoring_prometheus_data
+cd monitoring && docker compose up -d
+```
+
+## Alternative: File-Based Collection
+
+If you don't want to run Prometheus/Grafana, use the collection script:
+
+```bash
+cd external/dynamo
+./collect_metrics.sh ./metrics_output 30 # Collect every 30s
+```
+
+This creates timestamped `.prom` files that can be analyzed later or imported into Prometheus.
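+
+For a quick look at a snapshot without importing it anywhere, plain `grep` over the output directory works (assuming the default `./metrics_output` path from the command above):
+
+```bash
+# Show the most recent cache-hit-rate samples from the collected snapshots
+grep -h "cache_hit_rate" ./metrics_output/*.prom | tail -n 5
+```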
+
+## Complete Metrics Reference
+
+### Summary by Component
+
+| Component | Port(s) | Metric Count | Key Prefixes |
+|-----------|---------|--------------|--------------|
+| Frontend | 8000 | ~22 | `dynamo_frontend_*` |
+| Workers | 18081+ | ~50 | `dynamo_component_kvstats_*`, `sglang:*` or `vllm:*` |
+| Router | 18090 | ~20 | `dynamo_component_*` (labeled `router`) |
+| Processor | 18091 | ~35 | `dynamo_component_thompson_*` |
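+
+A rough way to sanity-check those counts against a live stack (a sketch; this counts exposition lines, not unique metric names):
+
+```bash
+# Approximate per-component metric volume using the ports from the table above
+for port in 8000 18081 18090 18091; do
+  count=$(curl -s "http://localhost:${port}/metrics" | grep -cv '^#')
+  echo "port ${port}: ${count} metric lines"
+done
+```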
+
+### All Metric Names by Component
+
+#### Frontend (port 8000) - 22 metrics
+
+```
+dynamo_frontend_disconnected_clients
+dynamo_frontend_inflight_requests
+dynamo_frontend_input_sequence_tokens_{bucket,count,sum}
+dynamo_frontend_inter_token_latency_seconds_{bucket,count,sum}
+dynamo_frontend_model_context_length
+dynamo_frontend_model_kv_cache_block_size
+dynamo_frontend_model_migration_limit
+dynamo_frontend_output_sequence_tokens_{bucket,count,sum}
+dynamo_frontend_output_tokens_total
+dynamo_frontend_queued_requests
+dynamo_frontend_request_duration_seconds_{bucket,count,sum}
+dynamo_frontend_requests_total
+dynamo_frontend_time_to_first_token_seconds_{bucket,count,sum}
+```
+
+#### Worker (ports 18081+) - 50 metrics per worker
+
+**Dynamo Component Metrics:**
+```
+dynamo_component_inflight_requests
+dynamo_component_kvstats_active_blocks
+dynamo_component_kvstats_gpu_cache_usage_percent
+dynamo_component_kvstats_gpu_prefix_cache_hit_rate
+dynamo_component_kvstats_total_blocks
+dynamo_component_nats_client_*
+dynamo_component_nats_service_*
+dynamo_component_request_bytes_total
+dynamo_component_request_duration_seconds_{bucket,count,sum}
+dynamo_component_requests_total
+dynamo_component_response_bytes_total
+dynamo_component_uptime_seconds
+```
+
+**SGLang Native Metrics:**
+```
+sglang:cache_hit_rate
+sglang:engine_load_weights_time
+sglang:engine_startup_time
+sglang:gen_throughput
+sglang:is_cuda_graph
+sglang:kv_transfer_*
+sglang:mamba_usage
+sglang:num_decode_prealloc_queue_reqs
+sglang:num_decode_transfer_queue_reqs
+sglang:num_grammar_queue_reqs
+sglang:num_paused_reqs
+sglang:num_prefill_inflight_queue_reqs
+sglang:num_prefill_prealloc_queue_reqs
+sglang:num_queue_reqs
+sglang:num_retracted_reqs
+sglang:num_running_reqs
+sglang:num_running_reqs_offline_batch
+sglang:num_used_tokens
+sglang:pending_prealloc_token_usage
+sglang:per_stage_req_latency_seconds_{bucket,count,sum}
+sglang:queue_time_seconds_{bucket,count,sum}
+sglang:spec_accept_length
+sglang:spec_accept_rate
+sglang:swa_token_usage
+sglang:token_usage
+sglang:utilization
+```
+
+#### Router (port 18090) - 20 metrics
+
+```
+dynamo_component_inflight_requests{dynamo_component="router"}
+dynamo_component_nats_client_connection_state
+dynamo_component_nats_client_current_connections
+dynamo_component_nats_client_in_messages
+dynamo_component_nats_client_in_total_bytes
+dynamo_component_nats_client_out_messages
+dynamo_component_nats_client_out_overhead_bytes
+dynamo_component_nats_service_active_endpoints
+dynamo_component_nats_service_active_services
+dynamo_component_nats_service_errors_total
+dynamo_component_nats_service_processing_ms_avg
+dynamo_component_nats_service_processing_ms_total
+dynamo_component_nats_service_requests_total
+dynamo_component_request_bytes_total{dynamo_endpoint="find_worker|feedback"}
+dynamo_component_request_duration_seconds_{bucket,count,sum}
+dynamo_component_requests_total
+dynamo_component_response_bytes_total
+dynamo_component_uptime_seconds
+```
+
+#### Processor (port 18091) - 35 metrics
+
+**Standard Dynamo Component Metrics:**
+```
+dynamo_component_inflight_requests
+dynamo_component_nats_client_*
+dynamo_component_nats_service_*
+dynamo_component_request_bytes_total
+dynamo_component_request_duration_seconds_{bucket,count,sum}
+dynamo_component_requests_total
+dynamo_component_response_bytes_total
+dynamo_component_uptime_seconds
+```
+
+**Thompson Sampling Custom Metrics:**
+```
+dynamo_component_thompson_active_requests
+dynamo_component_thompson_engine_errors_total
+dynamo_component_thompson_kve_cached_tokens_total
+dynamo_component_thompson_kve_device_blocks_total
+dynamo_component_thompson_kve_disk_blocks_total
+dynamo_component_thompson_kve_host_blocks_total
+dynamo_component_thompson_kve_prompt_tokens_total
+dynamo_component_thompson_request_latency_seconds_{bucket,count,sum}
+dynamo_component_thompson_requests_total
+dynamo_component_thompson_router_errors_total
+dynamo_component_thompson_routing_decisions_total
+dynamo_component_thompson_tokens_in_total
+dynamo_component_thompson_tokens_out_total
+```
+
diff --git a/external/dynamo/monitoring/docker-compose.yml b/external/dynamo/monitoring/docker-compose.yml
new file mode 100644
index 0000000000..c66355919b
--- /dev/null
+++ b/external/dynamo/monitoring/docker-compose.yml
@@ -0,0 +1,55 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Prometheus + Grafana monitoring stack for Dynamo
+#
+# Usage:
+# cd monitoring
+# docker compose up -d
+#
+# Access:
+# Prometheus: http://localhost:9090
+# Grafana: http://localhost:3000 (admin/admin)
+
+services:
+ prometheus:
+ image: prom/prometheus:v2.48.0
+ container_name: dynamo-prometheus
+ network_mode: host
+ volumes:
+ - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
+ - ./rules:/etc/prometheus/rules:ro
+ - prometheus_data:/prometheus
+ command:
+ - '--config.file=/etc/prometheus/prometheus.yml'
+ - '--storage.tsdb.path=/prometheus'
+ - '--web.listen-address=:9090'
+ - '--storage.tsdb.retention.time=7d'
+ restart: unless-stopped
+
+ grafana:
+ image: grafana/grafana:10.2.2
+ container_name: dynamo-grafana
+ network_mode: host
+ environment:
+ - GF_SERVER_HTTP_PORT=3000
+ # Disable authentication for local development
+ - GF_AUTH_ANONYMOUS_ENABLED=true
+ - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
+ - GF_AUTH_DISABLE_LOGIN_FORM=true
+      # Keep these in case you re-enable login later
+ - GF_SECURITY_ADMIN_USER=admin
+ - GF_SECURITY_ADMIN_PASSWORD=admin
+ - GF_USERS_ALLOW_SIGN_UP=false
+ volumes:
+ - grafana_data:/var/lib/grafana
+ - ./grafana/provisioning:/etc/grafana/provisioning:ro
+ restart: unless-stopped
+ depends_on:
+ - prometheus
+
+volumes:
+ prometheus_data:
+ grafana_data:
+
+
diff --git a/external/dynamo/monitoring/grafana/provisioning/dashboards/dashboards.yml b/external/dynamo/monitoring/grafana/provisioning/dashboards/dashboards.yml
new file mode 100644
index 0000000000..08c8673e8d
--- /dev/null
+++ b/external/dynamo/monitoring/grafana/provisioning/dashboards/dashboards.yml
@@ -0,0 +1,24 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: 1
+
+providers:
+ - name: 'Dynamo Dashboards'
+ orgId: 1
+ folder: 'Dynamo'
+ folderUid: 'dynamo'
+ type: file
+ disableDeletion: false
+ updateIntervalSeconds: 30
+ allowUiUpdates: true
+ options:
+ path: /etc/grafana/provisioning/dashboards/json
+
+
+
+
+
+
+
+
diff --git a/external/dynamo/monitoring/grafana/provisioning/dashboards/json/dynamo-overview.json b/external/dynamo/monitoring/grafana/provisioning/dashboards/json/dynamo-overview.json
new file mode 100644
index 0000000000..8852d819f1
--- /dev/null
+++ b/external/dynamo/monitoring/grafana/provisioning/dashboards/json/dynamo-overview.json
@@ -0,0 +1,986 @@
+{
+ "annotations": {
+ "list": []
+ },
+ "editable": true,
+ "fiscalYearStartMonth": 0,
+ "graphTooltip": 0,
+ "id": null,
+ "links": [],
+ "liveNow": false,
+ "panels": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {"color": "green", "value": null},
+ {"color": "yellow", "value": 5},
+ {"color": "red", "value": 10}
+ ]
+ },
+ "unit": "none"
+ },
+ "overrides": []
+ },
+ "gridPos": {"h": 4, "w": 6, "x": 0, "y": 0},
+ "id": 1,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": ["lastNotNull"],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "10.2.2",
+ "targets": [
+ {
+ "expr": "dynamo_frontend_inflight_requests",
+ "legendFormat": "Inflight Requests",
+ "refId": "A"
+ }
+ ],
+ "title": "Inflight Requests",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {"color": "green", "value": null}
+ ]
+ },
+ "unit": "reqpm"
+ },
+ "overrides": []
+ },
+ "gridPos": {"h": 4, "w": 6, "x": 6, "y": 0},
+ "id": 2,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": ["lastNotNull"],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "10.2.2",
+ "targets": [
+ {
+ "expr": "sum(increase(dynamo_frontend_requests_total[5s]))",
+ "legendFormat": "Total Requests/min",
+ "refId": "A"
+ }
+ ],
+ "title": "Requests (1m)",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {"color": "green", "value": null},
+ {"color": "yellow", "value": 1},
+ {"color": "red", "value": 5}
+ ]
+ },
+ "unit": "s"
+ },
+ "overrides": []
+ },
+ "gridPos": {"h": 4, "w": 6, "x": 12, "y": 0},
+ "id": 3,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": ["lastNotNull"],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "10.2.2",
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.95, rate(dynamo_frontend_time_to_first_token_seconds_bucket[5s]))",
+ "legendFormat": "P95 TTFT",
+ "refId": "A"
+ }
+ ],
+ "title": "Time to First Token (P95)",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {"color": "green", "value": null}
+ ]
+ },
+ "unit": "percent"
+ },
+ "overrides": []
+ },
+ "gridPos": {"h": 4, "w": 6, "x": 18, "y": 0},
+ "id": 4,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": ["mean"],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "10.2.2",
+ "targets": [
+ {
+ "expr": "${backend}:cache_hit_rate * 100",
+ "legendFormat": "Cache Hit Rate ({{instance}})",
+ "refId": "A"
+ }
+ ],
+ "title": "KVES Proxy (Cache Hit Rate %)",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {"legend": false, "tooltip": false, "viz": false},
+ "lineInterpolation": "smooth",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "scaleDistribution": {"type": "linear"},
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {"group": "A", "mode": "none"},
+ "thresholdsStyle": {"mode": "off"}
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [{"color": "green", "value": null}]
+ },
+ "unit": "s"
+ },
+ "overrides": []
+ },
+ "gridPos": {"h": 8, "w": 12, "x": 0, "y": 4},
+ "id": 5,
+ "options": {
+ "legend": {"calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true},
+ "tooltip": {"mode": "multi", "sort": "none"}
+ },
+ "pluginVersion": "10.2.2",
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.5, rate(dynamo_frontend_time_to_first_token_seconds_bucket[5s]))",
+ "legendFormat": "P50",
+ "refId": "A"
+ },
+ {
+ "expr": "histogram_quantile(0.95, rate(dynamo_frontend_time_to_first_token_seconds_bucket[5s]))",
+ "legendFormat": "P95",
+ "refId": "B"
+ },
+ {
+ "expr": "histogram_quantile(0.99, rate(dynamo_frontend_time_to_first_token_seconds_bucket[5s]))",
+ "legendFormat": "P99",
+ "refId": "C"
+ }
+ ],
+ "title": "Time to First Token (TTFT)",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {"legend": false, "tooltip": false, "viz": false},
+ "lineInterpolation": "smooth",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "scaleDistribution": {"type": "linear"},
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {"group": "A", "mode": "none"},
+ "thresholdsStyle": {"mode": "off"}
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [{"color": "green", "value": null}]
+ },
+ "unit": "s"
+ },
+ "overrides": []
+ },
+ "gridPos": {"h": 8, "w": 12, "x": 12, "y": 4},
+ "id": 6,
+ "options": {
+ "legend": {"calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true},
+ "tooltip": {"mode": "multi", "sort": "none"}
+ },
+ "pluginVersion": "10.2.2",
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.5, rate(dynamo_frontend_inter_token_latency_seconds_bucket[5s]))",
+ "legendFormat": "P50",
+ "refId": "A"
+ },
+ {
+ "expr": "histogram_quantile(0.95, rate(dynamo_frontend_inter_token_latency_seconds_bucket[5s]))",
+ "legendFormat": "P95",
+ "refId": "B"
+ },
+ {
+ "expr": "histogram_quantile(0.99, rate(dynamo_frontend_inter_token_latency_seconds_bucket[5s]))",
+ "legendFormat": "P99",
+ "refId": "C"
+ }
+ ],
+ "title": "Inter-Token Latency (ITL)",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {"legend": false, "tooltip": false, "viz": false},
+ "lineInterpolation": "smooth",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "scaleDistribution": {"type": "linear"},
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {"group": "A", "mode": "none"},
+ "thresholdsStyle": {"mode": "off"}
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [{"color": "green", "value": null}]
+ },
+ "unit": "tps"
+ },
+ "overrides": []
+ },
+ "gridPos": {"h": 8, "w": 12, "x": 0, "y": 12},
+ "id": 7,
+ "options": {
+ "legend": {"calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true},
+ "tooltip": {"mode": "multi", "sort": "none"}
+ },
+ "pluginVersion": "10.2.2",
+ "targets": [
+ {
+ "expr": "${backend}:gen_throughput",
+ "legendFormat": "Worker ({{instance}}) [gen_throughput]",
+ "refId": "A"
+ },
+ {
+ "expr": "sum(${backend}:gen_throughput)",
+ "legendFormat": "Total Workers (sum)",
+ "refId": "C"
+ },
+ {
+ "expr": "rate(dynamo_frontend_output_tokens_total{job=\"dynamo-frontend\"}[5s])",
+ "legendFormat": "Frontend Output (delivered)",
+ "refId": "B"
+ }
+ ],
+ "title": "Token Throughput",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {"legend": false, "tooltip": false, "viz": false},
+ "lineInterpolation": "smooth",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "scaleDistribution": {"type": "linear"},
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {"group": "A", "mode": "none"},
+ "thresholdsStyle": {"mode": "off"}
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [{"color": "green", "value": null}]
+ },
+ "unit": "percent"
+ },
+ "overrides": []
+ },
+ "gridPos": {"h": 8, "w": 12, "x": 12, "y": 12},
+ "id": 8,
+ "options": {
+ "legend": {"calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true},
+ "tooltip": {"mode": "multi", "sort": "none"}
+ },
+ "pluginVersion": "10.2.2",
+ "targets": [
+ {
+ "expr": "${backend}:token_usage * 100",
+ "legendFormat": "KV Cache % ({{instance}}) [token_usage]",
+ "refId": "A"
+ }
+ ],
+ "title": "KV Cache Usage",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {"legend": false, "tooltip": false, "viz": false},
+ "lineInterpolation": "smooth",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "scaleDistribution": {"type": "linear"},
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {"group": "A", "mode": "none"},
+ "thresholdsStyle": {"mode": "off"}
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [{"color": "green", "value": null}]
+ },
+ "unit": "none"
+ },
+ "overrides": []
+ },
+ "gridPos": {"h": 8, "w": 12, "x": 12, "y": 20},
+ "id": 9,
+ "options": {
+ "legend": {"calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true},
+ "tooltip": {"mode": "multi", "sort": "none"}
+ },
+ "pluginVersion": "10.2.2",
+ "targets": [
+ {
+ "expr": "${backend}:num_used_tokens",
+ "legendFormat": "Tokens in KV Cache ({{instance}}) [num_used_tokens]",
+ "refId": "A"
+ }
+ ],
+ "title": "KV Cache Tokens",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {"legend": false, "tooltip": false, "viz": false},
+ "lineInterpolation": "smooth",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "scaleDistribution": {"type": "linear"},
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {"group": "A", "mode": "none"},
+ "thresholdsStyle": {"mode": "off"}
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [{"color": "green", "value": null}]
+ },
+ "unit": "ops"
+ },
+ "overrides": []
+ },
+ "gridPos": {"h": 8, "w": 12, "x": 0, "y": 20},
+ "id": 10,
+ "options": {
+ "legend": {"calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true},
+ "tooltip": {"mode": "multi", "sort": "none"}
+ },
+ "pluginVersion": "10.2.2",
+ "targets": [
+ {
+ "expr": "sum(rate(dynamo_frontend_requests_total[5s]))",
+ "legendFormat": "1. Frontend (total)",
+ "refId": "A"
+ },
+ {
+ "expr": "sum(rate(dynamo_component_requests_total{dynamo_namespace=\"dynamo\",dynamo_component=\"backend\"}[5s]))",
+ "legendFormat": "2. Processor (backend)",
+ "refId": "B"
+ },
+ {
+ "expr": "sum(rate(dynamo_component_requests_total{dynamo_namespace=\"dynamo\",dynamo_component=\"router\",dynamo_endpoint=\"find_worker\"}[5s]))",
+ "legendFormat": "3. Router (find_worker)",
+ "refId": "C"
+ },
+ {
+ "expr": "rate(dynamo_component_requests_total{dynamo_namespace=\"workers\",dynamo_component=\"worker\",instance=\"localhost:18081\"}[5s])",
+ "legendFormat": "4. Worker 0 (18081)",
+ "refId": "D"
+ },
+ {
+ "expr": "rate(dynamo_component_requests_total{dynamo_namespace=\"workers\",dynamo_component=\"worker\",instance=\"localhost:18082\"}[5s])",
+ "legendFormat": "4. Worker 1 (18082)",
+ "refId": "E"
+ },
+ {
+ "expr": "sum(rate(dynamo_component_requests_total{dynamo_namespace=\"workers\",dynamo_component=\"worker\"}[5s]))",
+ "legendFormat": "4. Workers (total)",
+ "refId": "F"
+ }
+ ],
+ "title": "Request Flow (Frontend → Processor → Router → Workers)",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {"legend": false, "tooltip": false, "viz": false},
+ "lineInterpolation": "smooth",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "scaleDistribution": {"type": "linear"},
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {"group": "A", "mode": "none"},
+ "thresholdsStyle": {"mode": "off"}
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {"color": "green", "value": null},
+ {"color": "yellow", "value": 10},
+ {"color": "red", "value": 50}
+ ]
+ },
+ "unit": "none"
+ },
+ "overrides": []
+ },
+ "gridPos": {"h": 8, "w": 12, "x": 0, "y": 28},
+ "id": 11,
+ "options": {
+ "legend": {"calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true},
+ "tooltip": {"mode": "multi", "sort": "none"}
+ },
+ "pluginVersion": "10.2.2",
+ "targets": [
+ {
+ "expr": "${backend}:num_queue_reqs",
+ "legendFormat": "Queue Depth ({{instance}}) [num_queue_reqs]",
+ "refId": "A"
+ }
+ ],
+ "title": "Worker Queue Depth",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {"legend": false, "tooltip": false, "viz": false},
+ "lineInterpolation": "smooth",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "scaleDistribution": {"type": "linear"},
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {"group": "A", "mode": "none"},
+ "thresholdsStyle": {"mode": "off"}
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [{"color": "green", "value": null}]
+ },
+ "unit": "none"
+ },
+ "overrides": []
+ },
+ "gridPos": {"h": 8, "w": 12, "x": 12, "y": 28},
+ "id": 12,
+ "options": {
+ "legend": {"calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true},
+ "tooltip": {"mode": "multi", "sort": "none"}
+ },
+ "pluginVersion": "10.2.2",
+ "targets": [
+ {
+ "expr": "${backend}:num_running_reqs",
+ "legendFormat": "Running Requests ({{instance}}) [num_running_reqs]",
+ "refId": "A"
+ }
+ ],
+ "title": "Worker Activity (Running Requests)",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "description": "Detailed KV cache metrics per worker including KVES proxy (prefix hit rate), memory usage, token counts, and capacity.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {"legend": false, "tooltip": false, "viz": false},
+ "lineInterpolation": "smooth",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "scaleDistribution": {"type": "linear"},
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {"group": "A", "mode": "none"},
+ "thresholdsStyle": {"mode": "off"}
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [{"color": "green", "value": null}]
+ },
+ "unit": "percent"
+ },
+ "overrides": [
+ {
+ "matcher": {"id": "byRegexp", "options": ".*Tokens.*"},
+ "properties": [
+ {"id": "unit", "value": "none"},
+ {"id": "custom.axisPlacement", "value": "right"}
+ ]
+ },
+ {
+ "matcher": {"id": "byRegexp", "options": ".*Blocks.*"},
+ "properties": [
+ {"id": "unit", "value": "none"},
+ {"id": "custom.axisPlacement", "value": "right"},
+ {"id": "custom.drawStyle", "value": "bars"},
+ {"id": "custom.fillOpacity", "value": 30}
+ ]
+ },
+ {
+ "matcher": {"id": "byRegexp", "options": ".*Block Size.*"},
+ "properties": [
+ {"id": "unit", "value": "none"},
+ {"id": "custom.axisPlacement", "value": "hidden"},
+ {"id": "custom.drawStyle", "value": "points"},
+ {"id": "custom.pointSize", "value": 8}
+ ]
+ }
+ ]
+ },
+ "gridPos": {"h": 10, "w": 24, "x": 0, "y": 36},
+ "id": 13,
+ "options": {
+ "legend": {"calcs": ["mean", "last", "max"], "displayMode": "table", "placement": "right", "showLegend": true},
+ "tooltip": {"mode": "multi", "sort": "desc"}
+ },
+ "pluginVersion": "10.2.2",
+ "targets": [
+ {
+ "expr": "avg_over_time(${backend}:cache_hit_rate[1m]) * 100",
+ "legendFormat": "KVES: Prefix Hit Rate % ({{instance}})",
+ "refId": "A"
+ },
+ {
+ "expr": "avg_over_time(${backend}:token_usage[1m]) * 100",
+ "legendFormat": "KV Usage % ({{instance}})",
+ "refId": "B"
+ },
+ {
+ "expr": "last_over_time(${backend}:num_used_tokens[1m])",
+ "legendFormat": "KV Tokens Used ({{instance}})",
+ "refId": "C"
+ },
+ {
+ "expr": "last_over_time(dynamo_component_kvstats_total_blocks[1m])",
+ "legendFormat": "KV Capacity Blocks ({{instance}})",
+ "refId": "D"
+ },
+ {
+ "expr": "max(dynamo_frontend_model_kv_cache_block_size{job=\"dynamo-frontend\"})",
+ "legendFormat": "Frontend Block Size (tokens)",
+ "refId": "E"
+ }
+ ],
+ "title": "KV Cache Details (Per-Worker)",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "description": "KV Cache Efficiency Score (KVES) proxy using the backend's native prefix cache hit rate. KVES ∈ [0,1]: 0 = no cache benefit, 1 = full reuse. This is a simplified proxy for the full KVES equation (which requires CPU/disk hit metrics not currently available in all backends).",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {"legend": false, "tooltip": false, "viz": false},
+ "lineInterpolation": "smooth",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "scaleDistribution": {"type": "linear"},
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {"group": "A", "mode": "none"},
+ "thresholdsStyle": {"mode": "off"}
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {"color": "red", "value": null},
+ {"color": "orange", "value": 0.2},
+ {"color": "yellow", "value": 0.4},
+ {"color": "light-green", "value": 0.6},
+ {"color": "green", "value": 0.8}
+ ]
+ },
+ "unit": "percentunit",
+ "min": 0,
+ "max": 1
+ },
+ "overrides": []
+ },
+ "gridPos": {"h": 8, "w": 12, "x": 0, "y": 46},
+ "id": 14,
+ "options": {
+ "legend": {
+ "calcs": ["mean", "lastNotNull"],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {"mode": "multi", "sort": "none"}
+ },
+ "pluginVersion": "10.2.2",
+ "targets": [
+ {
+ "expr": "${backend}:cache_hit_rate",
+ "legendFormat": "Worker ({{instance}})",
+ "refId": "A"
+ }
+ ],
+ "title": "KVES Proxy by Worker",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "description": "KV cache memory utilization per worker. Shows how much of the allocated KV cache memory is currently in use.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {"legend": false, "tooltip": false, "viz": false},
+ "lineInterpolation": "smooth",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "scaleDistribution": {"type": "linear"},
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {"group": "A", "mode": "none"},
+ "thresholdsStyle": {"mode": "off"}
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {"color": "green", "value": null},
+ {"color": "yellow", "value": 70},
+ {"color": "orange", "value": 85},
+ {"color": "red", "value": 95}
+ ]
+ },
+ "unit": "percent",
+ "min": 0,
+ "max": 100
+ },
+ "overrides": []
+ },
+ "gridPos": {"h": 8, "w": 12, "x": 12, "y": 46},
+ "id": 15,
+ "options": {
+ "legend": {
+ "calcs": ["mean", "lastNotNull"],
+ "displayMode": "table",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {"mode": "multi", "sort": "none"}
+ },
+ "pluginVersion": "10.2.2",
+ "targets": [
+ {
+ "expr": "${backend}:token_usage * 100",
+ "legendFormat": "Worker ({{instance}})",
+ "refId": "A"
+ }
+ ],
+ "title": "KV Cache Memory Usage % by Worker",
+ "type": "timeseries"
+ }
+ ],
+ "refresh": "2s",
+ "schemaVersion": 38,
+ "style": "dark",
+ "tags": ["dynamo", "llm", "inference", "sglang", "vllm"],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "selected": true,
+ "text": "vllm",
+ "value": "vllm"
+ },
+ "description": "Backend inference engine (sglang or vllm). Metrics are prefixed with this value.",
+ "hide": 0,
+ "includeAll": false,
+ "label": "Backend",
+ "multi": false,
+ "name": "backend",
+ "options": [
+ {
+ "selected": false,
+ "text": "sglang",
+ "value": "sglang"
+ },
+ {
+ "selected": true,
+ "text": "vllm",
+ "value": "vllm"
+ }
+ ],
+ "query": "vllm,sglang",
+ "queryValue": "",
+ "skipUrlSync": false,
+ "type": "custom"
+ }
+ ]
+ },
+ "time": {
+ "from": "now-15m",
+ "to": "now"
+ },
+ "timepicker": {},
+ "timezone": "",
+ "title": "Dynamo LLM Overview",
+ "uid": "dynamo-overview",
+ "version": 1,
+ "weekStart": ""
+}
diff --git a/external/dynamo/monitoring/grafana/provisioning/datasources/datasources.yml b/external/dynamo/monitoring/grafana/provisioning/datasources/datasources.yml
new file mode 100644
index 0000000000..0c065f282c
--- /dev/null
+++ b/external/dynamo/monitoring/grafana/provisioning/datasources/datasources.yml
@@ -0,0 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: 1
+
+datasources:
+ - name: Prometheus
+ type: prometheus
+ uid: prometheus
+ access: proxy
+ url: http://localhost:9090
+ isDefault: true
+ editable: true
+ jsonData:
+ timeInterval: 2s
+
+
diff --git a/external/dynamo/monitoring/prometheus.yml b/external/dynamo/monitoring/prometheus.yml
new file mode 100644
index 0000000000..1d2bf4be3a
--- /dev/null
+++ b/external/dynamo/monitoring/prometheus.yml
@@ -0,0 +1,74 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Prometheus configuration for Dynamo metrics collection
+#
+# Supports both SGLang and vLLM backends:
+# - SGLang metrics use 'sglang:' prefix (e.g., sglang:cache_hit_rate)
+# - vLLM metrics use 'vllm:' prefix (e.g., vllm:cache_hit_rate)
+# - Grafana dashboard uses ${backend} variable to switch between them
+#
+# Metrics Endpoints (workers, router, and processor use the 18xxx range to avoid port conflicts):
+#   - Frontend (8000): User-facing latency, throughput, tokens
+#   - Workers (18081+): KV cache stats, NATS metrics, internal stats (one per worker)
+# - Router (18090): Thompson Sampling routing metrics
+# - Processor (18091): Thompson Sampling KVE metrics
+#
+# Note: Worker ports are sequential starting at 18081. With 2 workers: 18081, 18082.
+# With 4 workers: 18081, 18082, 18083, 18084. With 8 workers: 18081-18088.
+# Add more targets below if you run more than 8 workers.
+
+global:
+ scrape_interval: 2s
+ evaluation_interval: 2s
+
+# Recording rules to create vLLM metric aliases that match the dashboard expectations
+# This allows the same dashboard queries to work for both SGLang and vLLM backends
+rule_files:
+ - /etc/prometheus/rules/*.yml
+
+scrape_configs:
+ # Dynamo Frontend metrics (user-facing latency, throughput)
+ - job_name: 'dynamo-frontend'
+ static_configs:
+ - targets: ['localhost:8000']
+ metrics_path: /metrics
+ scrape_interval: 2s
+
+ # Dynamo Worker metrics (KV cache, internal stats)
+ # Works for both SGLang and vLLM backends - same ports, different metric prefixes
+ # Multiple workers use sequential ports starting at 18081
+ # Add/remove targets based on your NUM_WORKERS setting
+ - job_name: 'dynamo-worker'
+ static_configs:
+ - targets:
+ - 'localhost:18081'
+ - 'localhost:18082'
+ - 'localhost:18083'
+ - 'localhost:18084'
+ - 'localhost:18085'
+ - 'localhost:18086'
+ - 'localhost:18087'
+ - 'localhost:18088'
+ metrics_path: /metrics
+ scrape_interval: 2s
+
+ # Thompson Sampling Router metrics
+ - job_name: 'dynamo-router'
+ static_configs:
+ - targets: ['localhost:18090']
+ metrics_path: /metrics
+ scrape_interval: 2s
+
+ # Thompson Sampling Processor metrics (KVE)
+ - job_name: 'dynamo-processor'
+ static_configs:
+ - targets: ['localhost:18091']
+ metrics_path: /metrics
+ scrape_interval: 2s
+
+ # Prometheus self-monitoring
+ - job_name: 'prometheus'
+ static_configs:
+ - targets: ['localhost:9090']
+
diff --git a/external/dynamo/monitoring/rules/vllm-aliases.yml b/external/dynamo/monitoring/rules/vllm-aliases.yml
new file mode 100644
index 0000000000..075f5c1d55
--- /dev/null
+++ b/external/dynamo/monitoring/rules/vllm-aliases.yml
@@ -0,0 +1,70 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Prometheus Recording Rules for vLLM Metric Aliases
+# ===================================================
+#
+# vLLM metrics have different names than SGLang metrics.
+# These recording rules create vLLM metrics with names that match
+# what the Grafana dashboard expects, enabling a single parameterized
+# dashboard to work with both backends.
+#
+# Dashboard variable: ${backend} = "sglang" or "vllm"
+# Dashboard queries: ${backend}:metric_name
+#
+# Metric Mapping:
+# Dashboard Query | vLLM Source Metric(s)
+# -----------------------|---------------------------------------
+# vllm:cache_hit_rate | prefix_cache_hits / prefix_cache_queries
+# vllm:gen_throughput | rate(generation_tokens_total)
+# vllm:token_usage | kv_cache_usage_perc
+# vllm:num_running_reqs | num_requests_running (same name!)
+# vllm:num_queue_reqs | num_requests_waiting
+#   vllm:num_used_tokens  | (estimated: kv_cache_usage_perc * fixed capacity constant; see rule below)
+
+groups:
+ - name: vllm_metric_aliases
+ interval: 2s
+ rules:
+ # Cache hit rate: prefix_cache_hits / prefix_cache_queries
+ # This matches SGLang's cache_hit_rate metric
+ - record: "vllm:cache_hit_rate"
+ expr: |
+ (
+ sum by (instance, model_name) (vllm:prefix_cache_hits_total)
+ /
+ clamp_min(sum by (instance, model_name) (vllm:prefix_cache_queries_total), 1)
+ )
+
+ # Generation throughput: rate of generation tokens
+ # This matches SGLang's gen_throughput metric
+ - record: "vllm:gen_throughput"
+ expr: |
+ rate(vllm:generation_tokens_total[5s])
+
+ # Token usage percentage: direct alias for kv_cache_usage_perc
+ # This matches SGLang's token_usage metric
+ - record: "vllm:token_usage"
+ expr: |
+ vllm:kv_cache_usage_perc
+
+ # Number of requests in queue: alias for num_requests_waiting
+ # This matches SGLang's num_queue_reqs metric
+ - record: "vllm:num_queue_reqs"
+ expr: |
+ vllm:num_requests_waiting
+
+ # Note: vllm:num_requests_running is already the correct name,
+ # but SGLang uses num_running_reqs. Create an alias.
+ - record: "vllm:num_running_reqs"
+ expr: |
+ vllm:num_requests_running
+
+      # Number of used tokens: estimated from the cache usage percentage
+      # Note: This is an approximation (vLLM doesn't expose an exact used-token
+      # count), so the dashboard may need adjustment for accurate display
+ - record: "vllm:num_used_tokens"
+ expr: |
+ vllm:kv_cache_usage_perc * 100000
+
+
diff --git a/external/dynamo/monitoring/scripts/kv_event_observer.py b/external/dynamo/monitoring/scripts/kv_event_observer.py
new file mode 100755
index 0000000000..93627e980e
--- /dev/null
+++ b/external/dynamo/monitoring/scripts/kv_event_observer.py
@@ -0,0 +1,515 @@
+#!/usr/bin/env python3
+"""
+KV Cache Event Observer for Dynamo vLLM Workers
+
+Subscribes to vLLM's ZMQ KV event publisher and logs/monitors block-level
+events (stored, evicted) in real-time. Also polls Prometheus metrics to
+detect cache hits (which don't generate ZMQ events).
+
+vLLM publishes events in msgpack format via ZMQ multipart messages:
+ - Part 0: Topic (bytes, usually empty)
+ - Part 1: Sequence number (8 bytes, big-endian int64)
+ - Part 2: Payload (msgpack-encoded KVEventBatch)
+
+KVEventBatch structure (msgpack):
+ [timestamp, events_list, dp_rank]
+
+Event types (from ZMQ):
+ - BlockStored: A new block was committed to prefix cache
+ - BlockRemoved: A block was evicted from prefix cache
+ - AllBlocksCleared: Entire cache was cleared
+
+Metrics polling (for cache hits):
+ - vllm:prefix_cache_hits_total: Cumulative cache hit tokens
+ - vllm:prefix_cache_queries_total: Cumulative cache query tokens
+
+Usage:
+ # Inside container:
+ python /workspace/monitoring/scripts/kv_event_observer.py --port 20080 --verbose
+
+ # With cache hit tracking (polls metrics endpoint):
+ python /workspace/monitoring/scripts/kv_event_observer.py -p 20080 -v --metrics-port 18081
+
+ # Output to file:
+ python kv_event_observer.py --port 20080 --verbose --output kv_events.jsonl
+"""
+
+import argparse
+import json
+import re
+import signal
+import sys
+import threading
+import time
+import urllib.request
+from collections import defaultdict
+from dataclasses import dataclass
+from dataclasses import field
+from datetime import UTC
+from datetime import datetime
+from typing import Any
+
+try:
+ import zmq
+except ImportError:
+ print("ERROR: pyzmq not installed. Run: pip install pyzmq")
+ sys.exit(1)
+
+try:
+ import msgpack
+except ImportError:
+ print("ERROR: msgpack not installed. Run: pip install msgpack")
+ sys.exit(1)
+
+
+def format_hash(block_hash: Any) -> str:
+ """Format a block hash for display."""
+ if isinstance(block_hash, bytes):
+ return block_hash.hex()[:16]
+ elif isinstance(block_hash, int):
+ return f"{block_hash:016x}"[:16]
+ return str(block_hash)[:16]
+
+
+@dataclass
+class KVCacheStats:
+ """Aggregated statistics for KV cache events."""
+ stored_blocks: int = 0
+ evicted_blocks: int = 0
+ cleared_count: int = 0
+ cache_hit_tokens: int = 0 # Tokens served from cache (from metrics)
+ cache_query_tokens: int = 0 # Total tokens queried (from metrics)
+ unique_hashes: set = field(default_factory=set)
+ hash_to_blocks: dict = field(default_factory=lambda: defaultdict(list))
+ last_event_time: float = 0.0
+ last_seq: int = -1
+
+ def record_stored(self, block_hashes: list[Any], parent_hash: Any = None):
+ """Record BlockStored event."""
+ self.last_event_time = time.time()
+ for bh in block_hashes:
+ h = format_hash(bh)
+ self.stored_blocks += 1
+ self.unique_hashes.add(h)
+
+ def record_removed(self, block_hashes: list[Any]):
+ """Record BlockRemoved event."""
+ self.last_event_time = time.time()
+ for bh in block_hashes:
+ h = format_hash(bh)
+ self.evicted_blocks += 1
+ self.unique_hashes.discard(h)
+
+ def record_cleared(self):
+ """Record AllBlocksCleared event."""
+ self.last_event_time = time.time()
+ self.cleared_count += 1
+ self.unique_hashes.clear()
+
+ def record_cache_hit(self, hit_tokens: int, query_tokens: int):
+ """Record cache hit from metrics delta."""
+ self.cache_hit_tokens += hit_tokens
+ self.cache_query_tokens += query_tokens
+
+ def summary(self) -> dict:
+ """Return summary statistics."""
+ hit_rate = (self.cache_hit_tokens / self.cache_query_tokens * 100) if self.cache_query_tokens > 0 else 0
+ return {
+ "stored_blocks": self.stored_blocks,
+ "evicted_blocks": self.evicted_blocks,
+ "net_blocks": self.stored_blocks - self.evicted_blocks,
+ "cleared_count": self.cleared_count,
+ "unique_hashes_current": len(self.unique_hashes),
+ "cache_hit_tokens": self.cache_hit_tokens,
+ "cache_query_tokens": self.cache_query_tokens,
+ "cache_hit_rate": f"{hit_rate:.1f}%",
+ "last_seq": self.last_seq,
+ }
+
+
+class KVEventObserver:
+ """Observes KV cache events from a vLLM worker via ZMQ.
+
+ Also optionally polls Prometheus metrics to detect cache hits,
+ which don't generate ZMQ events.
+ """
+
+ def __init__(
+ self,
+ host: str = "localhost",
+ port: int = 20080,
+ verbose: bool = False,
+ output_file: str | None = None,
+ metrics_port: int | None = None,
+ ):
+ self.host = host
+ self.port = port
+ self.verbose = verbose
+ self.output_file = output_file
+ self.metrics_port = metrics_port
+ self.stats = KVCacheStats()
+ self.running = False
+ self._output_handle = None
+
+ # Metrics polling state
+ self._last_hits = 0.0
+ self._last_queries = 0.0
+ self._metrics_thread = None
+
+ self.context = zmq.Context()
+ self.socket = self.context.socket(zmq.SUB)
+
+ def _parse_metric(self, metrics_text: str, metric_name: str) -> float:
+ """Extract a metric value from Prometheus text format."""
+ pattern = rf'^{re.escape(metric_name)}\{{[^}}]*\}}\s+([0-9.e+-]+)'
+ for line in metrics_text.split('\n'):
+ match = re.match(pattern, line)
+ if match:
+ return float(match.group(1))
+ return 0.0
+
+ def _poll_metrics(self):
+ """Background thread to poll Prometheus metrics for cache hits."""
+ metrics_url = f"http://{self.host}:{self.metrics_port}/metrics"
+
+ while self.running:
+ try:
+ with urllib.request.urlopen(metrics_url, timeout=2) as resp:
+ metrics_text = resp.read().decode('utf-8')
+
+ hits = self._parse_metric(metrics_text, 'vllm:prefix_cache_hits_total')
+ queries = self._parse_metric(metrics_text, 'vllm:prefix_cache_queries_total')
+
+ # Calculate deltas
+ hit_delta = hits - self._last_hits
+ query_delta = queries - self._last_queries
+
+ if hit_delta > 0:
+ # Cache hit detected!
+ self.stats.record_cache_hit(int(hit_delta), int(query_delta))
+ if self.verbose:
+ hit_rate = (hit_delta / query_delta * 100) if query_delta > 0 else 0
+ print(
+ f"✅ [CACHE HIT] tokens={int(hit_delta):4d} queried={int(query_delta):4d} hit_rate={hit_rate:.0f}%"
+ )
+ elif query_delta > 0:
+ # Queries happened but no hits (cache miss)
+ self.stats.record_cache_hit(0, int(query_delta))
+
+ self._last_hits = hits
+ self._last_queries = queries
+
+ except Exception as e:
+ if self.verbose:
+ print(f"[Metrics] Poll error: {e}")
+
+ time.sleep(0.5) # Poll every 500ms
+
+ def connect(self):
+ """Connect to the vLLM KV event publisher."""
+ endpoint = f"tcp://{self.host}:{self.port}"
+ print(f"[KV Observer] Connecting to {endpoint}...")
+ self.socket.connect(endpoint)
+ # Subscribe to all topics (empty string = all)
+ self.socket.setsockopt_string(zmq.SUBSCRIBE, "")
+ self.socket.setsockopt(zmq.RCVTIMEO, 1000)
+ print("[KV Observer] ✓ Connected and subscribed")
+
+ if self.output_file:
+ self._output_handle = open(self.output_file, "a")
+ print(f"[KV Observer] Writing events to: {self.output_file}")
+
+ if self.metrics_port:
+ print(f"[KV Observer] Polling metrics at http://{self.host}:{self.metrics_port}/metrics")
+ # Initialize baseline metrics
+ try:
+ metrics_url = f"http://{self.host}:{self.metrics_port}/metrics"
+ with urllib.request.urlopen(metrics_url, timeout=2) as resp:
+ metrics_text = resp.read().decode('utf-8')
+ self._last_hits = self._parse_metric(metrics_text, 'vllm:prefix_cache_hits_total')
+ self._last_queries = self._parse_metric(metrics_text, 'vllm:prefix_cache_queries_total')
+ print(f"[KV Observer] ✓ Baseline: hits={self._last_hits:.0f} queries={self._last_queries:.0f}")
+ except Exception as e:
+ print(f"[KV Observer] ⚠ Could not get baseline metrics: {e}")
+
+ def parse_multipart(self, parts: list[bytes]) -> dict | None:
+ """Parse a ZMQ multipart message from vLLM.
+
+ Format: [topic, sequence, payload]
+ Payload is msgpack-encoded KVEventBatch: [timestamp, events_list, dp_rank]
+
+ Note: The order is [ts, events, dp_rank], NOT [ts, dp_rank, events]!
+ """
+ if len(parts) < 3:
+ if self.verbose:
+ print(f"[KV Observer] Warning: Expected 3 parts, got {len(parts)}")
+ return None
+
+ topic, seq_bytes, payload = parts[0], parts[1], parts[2]
+
+ try:
+ seq = int.from_bytes(seq_bytes, "big", signed=True)
+ self.stats.last_seq = seq
+ except Exception:
+ seq = -1
+
+ try:
+ # Decode msgpack payload
+ batch = msgpack.unpackb(payload, raw=False, strict_map_key=False)
+
+ # vLLM KVEventBatch format: [timestamp, events_list, dp_rank]
+ # Note: events is at index 1, dp_rank at index 2!
+ if isinstance(batch, (list, tuple)) and len(batch) >= 3:
+ ts = batch[0]
+ events = batch[1] # Events are at index 1
+ dp_rank = batch[2] # dp_rank is at index 2
+ elif isinstance(batch, dict):
+ ts = batch.get("ts", time.time())
+ dp_rank = batch.get("data_parallel_rank", 0)
+ events = batch.get("events", [])
+ else:
+ events = [batch] if batch else []
+ ts = time.time()
+ dp_rank = 0
+
+ # Ensure events is a list
+ if not isinstance(events, list):
+ events = [events] if events else []
+
+ return {
+ "seq": seq,
+ "timestamp": ts,
+ "dp_rank": dp_rank,
+ "events": events,
+ "topic": topic.decode("utf-8", errors="replace") if topic else "",
+ }
+ except Exception as e:
+ if self.verbose:
+ print(f"[KV Observer] Parse error: {e}")
+ print(f"[KV Observer] Raw payload: {payload[:100]}...")
+ return None
+
+ def handle_event(self, event_data: dict):
+ """Handle a parsed event batch."""
+ seq = event_data.get("seq", -1)
+ ts = event_data.get("timestamp", 0)
+ dp_rank = event_data.get("dp_rank", 0)
+ events = event_data.get("events", [])
+
+ for event in events:
+ # Events can be dicts or tuples/lists
+ # vLLM format (list):
+ # BlockRemoved: ['BlockRemoved', [hash_list], medium]
+ # BlockStored: ['BlockStored', [hash_list], parent_hash, token_ids, block_size, lora_id, medium]
+ # AllBlocksCleared: ['AllBlocksCleared']
+ if isinstance(event, dict):
+ event_type = event.get("type", event.get("event_type", "unknown"))
+ block_hashes = event.get("block_hashes", [])
+ parent_hash = event.get("parent_block_hash")
+ medium = event.get("medium", "GPU")
+ token_ids = event.get("token_ids", [])
+ block_size = event.get("block_size", 0)
+ elif isinstance(event, (list, tuple)) and len(event) >= 1:
+ event_type = str(event[0]) if event else "unknown"
+
+ if event_type == "BlockRemoved" and len(event) >= 2:
+ # ['BlockRemoved', [hashes], medium]
+ block_hashes = event[1] if isinstance(event[1], list) else [event[1]]
+ medium = event[2] if len(event) > 2 else "GPU"
+ parent_hash = None
+ token_ids = []
+ block_size = 0
+ elif event_type == "BlockStored" and len(event) >= 2:
+ # ['BlockStored', [hashes], parent_hash, token_ids, block_size, lora_id, medium]
+ block_hashes = event[1] if isinstance(event[1], list) else [event[1]]
+ parent_hash = event[2] if len(event) > 2 else None
+ token_ids = event[3] if len(event) > 3 else []
+ block_size = event[4] if len(event) > 4 else 0
+ medium = event[6] if len(event) > 6 else "GPU"
+ elif event_type == "AllBlocksCleared":
+ block_hashes = []
+ parent_hash = None
+ medium = "GPU"
+ token_ids = []
+ block_size = 0
+ else:
+ block_hashes = event[1] if len(event) > 1 and isinstance(event[1], list) else []
+ parent_hash = None
+ medium = event[-1] if len(event) > 2 and isinstance(event[-1], str) else "GPU"
+ token_ids = []
+ block_size = 0
+ else:
+ event_type = str(type(event).__name__)
+ block_hashes = []
+ parent_hash = None
+ medium = "GPU"
+ token_ids = []
+ block_size = 0
+
+ # Normalize event type (vLLM uses class names like "BlockStored")
+ event_type_lower = event_type.lower()
+
+ if "stored" in event_type_lower or "blockstored" in event_type_lower:
+ self.stats.record_stored(block_hashes, parent_hash)
+ if self.verbose:
+ num_tokens = len(token_ids) if token_ids else block_size
+ for bh in block_hashes:
+ print(
+ f"📦 [STORED ] seq={seq:6d} hash={format_hash(bh)} tokens={num_tokens:3d} medium={medium}")
+ elif "removed" in event_type_lower or "blockremoved" in event_type_lower:
+ self.stats.record_removed(block_hashes)
+ if self.verbose:
+ for bh in block_hashes:
+ print(f"🗑️ [REMOVED ] seq={seq:6d} hash={format_hash(bh)} medium={medium}")
+ elif "cleared" in event_type_lower or "allblockscleared" in event_type_lower:
+ self.stats.record_cleared()
+ if self.verbose:
+ print(f"🧹 [CLEARED ] seq={seq:6d} All blocks cleared")
+ elif self.verbose:
+ print(
+ f"❓ [UNKNOWN ] seq={seq:6d} type={event_type} data={event[:3] if isinstance(event, (list, tuple)) else event}"
+ )
+
+ # Write to output file
+ if self._output_handle:
+
+ def get_event_type(e):
+ if isinstance(e, dict):
+ return str(e.get("type", "unknown"))
+ elif isinstance(e, (list, tuple)) and len(e) > 0:
+ return str(e[0])
+ else:
+ return str(e)
+
+ output = {
+ "_timestamp": datetime.now(UTC).isoformat(),
+ "seq": seq,
+ "ts": ts,
+ "dp_rank": dp_rank,
+ "events": [{
+ "type": get_event_type(e)
+ } for e in events],
+ }
+ self._output_handle.write(json.dumps(output) + "\n")
+ self._output_handle.flush()
+
+ def run(self, duration: float | None = None):
+ """Run the observer loop."""
+ self.running = True
+ start_time = time.time()
+ batches_received = 0
+
+ # Start metrics polling thread if configured
+ if self.metrics_port:
+ self._metrics_thread = threading.Thread(target=self._poll_metrics, daemon=True, name="metrics-poller")
+ self._metrics_thread.start()
+
+ print("[KV Observer] Listening for KV events (msgpack multipart)...")
+ if self.metrics_port:
+ print("[KV Observer] Cache hits will show as ✅ [CACHE HIT]")
+ print("[KV Observer] Press Ctrl+C to stop")
+ print("-" * 60)
+
+ try:
+ while self.running:
+ if duration and (time.time() - start_time) >= duration:
+ print(f"\n[KV Observer] Duration limit reached ({duration}s)")
+ break
+
+ try:
+ # Receive multipart message
+ parts = self.socket.recv_multipart()
+ event_data = self.parse_multipart(parts)
+
+ if event_data:
+ self.handle_event(event_data)
+ batches_received += 1
+
+ if batches_received % 20 == 0 and not self.verbose:
+ summary = self.stats.summary()
+ print(f"[{batches_received:5d} batches] "
+ f"Stored: {summary['stored_blocks']:4d} | "
+ f"Removed: {summary['evicted_blocks']:4d} | "
+ f"Net: {summary['net_blocks']:4d} | "
+ f"Hashes: {summary['unique_hashes_current']} | "
+ f"Seq: {summary['last_seq']}")
+ except zmq.Again:
+ # Timeout, continue loop
+ continue
+
+ except KeyboardInterrupt:
+ print("\n[KV Observer] Interrupted")
+ finally:
+ self.stop()
+
+ def stop(self):
+ """Stop and print final statistics."""
+ self.running = False
+
+ print("-" * 60)
+ print("[KV Observer] Final Statistics:")
+ for key, value in self.stats.summary().items():
+ print(f" {key}: {value}")
+
+ if self._output_handle:
+ self._output_handle.close()
+
+ self.socket.close()
+ self.context.term()
+ print("[KV Observer] Stopped")
+
+
+def main():
+ parser = argparse.ArgumentParser(description="Observe KV cache events from vLLM workers",
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog="""
+Examples:
+ # Monitor worker 0 (ZMQ events only):
+ python kv_event_observer.py -p 20080 -v
+
+ # Monitor with cache hit detection (polls Prometheus metrics):
+ python kv_event_observer.py -p 20080 -v -m 18081
+
+ # Monitor worker 1:
+ python kv_event_observer.py -p 20081 -v -m 18082
+
+ # Save events to file:
+ python kv_event_observer.py -p 20080 -o events.jsonl
+
+ # Run for 60 seconds:
+ python kv_event_observer.py -p 20080 -d 60
+
+Event types:
+ 📦 STORED - Block committed to prefix cache (ZMQ)
+ 🗑️ REMOVED - Block evicted from cache (ZMQ)
+ ✅ CACHE HIT - Tokens served from cache (metrics polling)
+""")
+ parser.add_argument("--host", "-H", default="localhost", help="Worker host (default: localhost)")
+ parser.add_argument("--port", "-p", type=int, default=20080, help="KV event ZMQ port (default: 20080)")
+ parser.add_argument("--metrics-port",
+ "-m",
+ type=int,
+ help="Prometheus metrics port for cache hit detection (e.g., 18081)")
+ parser.add_argument("--verbose", "-v", action="store_true", help="Print each event")
+ parser.add_argument("--output", "-o", help="Output file (JSONL format)")
+ parser.add_argument("--duration", "-d", type=float, help="Run duration in seconds")
+
+ args = parser.parse_args()
+
+ observer = KVEventObserver(
+ host=args.host,
+ port=args.port,
+ verbose=args.verbose,
+ output_file=args.output,
+ metrics_port=args.metrics_port,
+ )
+
+ signal.signal(signal.SIGINT, lambda s, f: setattr(observer, 'running', False))
+ signal.signal(signal.SIGTERM, lambda s, f: setattr(observer, 'running', False))
+
+ observer.connect()
+ observer.run(duration=args.duration)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/external/dynamo/optimized/ARCHITECTURE.md b/external/dynamo/optimized/ARCHITECTURE.md
new file mode 100644
index 0000000000..86f8ba2206
--- /dev/null
+++ b/external/dynamo/optimized/ARCHITECTURE.md
@@ -0,0 +1,414 @@
+# Optimized Thompson Sampling Router Architecture
+
+## Overview
+
+This architecture uses the **default Dynamo frontend** with custom **Processor** and **Router** components to implement Thompson Sampling-based intelligent worker selection with KV cache locality awareness.
+
+### Processor-as-Backend Pattern
+
+**Key insight**: The default Dynamo frontend has its own built-in router (`DYN_ROUTER_MODE`) and routes directly to `dynamo.backend.generate`. To intercept requests and apply custom Thompson Sampling routing:
+
+1. **Processor registers as `dynamo.backend.generate`** - The frontend discovers our processor as the "backend"
+2. **SGLang Worker registers as `dynamo.worker.generate`** - Our processor forwards to actual workers after routing
+3. **Frontend's built-in router becomes irrelevant** - The frontend routes to `dynamo.backend.generate` which is our processor
+
+```
+Frontend (built-in router: round-robin)
+ → routes to dynamo.backend.generate
+ → OUR PROCESSOR (intercepts!)
+ → queries Thompson Sampling router
+ → forwards to dynamo.worker.generate (actual SGLang workers)
+```
+
+```
+┌─────────────────────────────────────────────────────────────────────────────────┐
+│ CLIENT │
+│ │
+│ POST /v1/chat/completions │
+│ { │
+│ "model": "llama-3.3-70b", │
+│ "messages": [...], │
+│ "nvext": { │
+│ "annotations": [ │
+│ "prefix_id:my-session-001", │
+│ "total_requests:10", │
+│ "osl:MEDIUM", │
+│ "iat:LOW" │
+│ ] │
+│ } │
+│ } │
+└─────────────────────────────────────────────────────────────────────────────────┘
+ │
+ ▼
+┌─────────────────────────────────────────────────────────────────────────────────┐
+│ DEFAULT DYNAMO FRONTEND │
+│ (python -m dynamo.frontend) │
+│ │
+│ ┌─────────────────────────────────────────────────────────────────────────┐ │
+│ │ OpenAI HTTP Server (port 8000) │ │
+│ │ • /v1/chat/completions │ │
+│ │ • /v1/models │ │
+│ │ • /health │ │
+│ │ • /metrics (Prometheus) │ │
+│ └─────────────────────────────────────────────────────────────────────────┘ │
+│ │ │
+│ ┌─────────────────────────────────────────────────────────────────────────┐ │
+│ │ Preprocessor │ │
+│ │ • Tokenization (chat template applied) │ │
+│ │ • NVExt parsing → PreprocessedRequest │ │
+│ │ • Annotations preserved: prefix_id, total_requests, osl, iat │ │
+│ └─────────────────────────────────────────────────────────────────────────┘ │
+│ │ │
+│ │ PreprocessedRequest │
+│ │ (tokens + annotations + extra_args) │
+└────────────────────────────────────────┼────────────────────────────────────────┘
+ │
+ ▼
+┌─────────────────────────────────────────────────────────────────────────────────┐
+│ CUSTOM PROCESSOR │
+│ (registers as: dynamo.backend.generate) │
+│ (intercepts frontend requests!) │
+│ │
+│ ┌─────────────────────────────────────────────────────────────────────────┐ │
+│ │ 1. Receive PreprocessedRequest from frontend │ │
+│ │ • Extract annotations: prefix_id, total_requests, osl, iat │ │
+│ │ • Compute reuse_budget = total_requests - processed_for_prefix │ │
+│ └─────────────────────────────────────────────────────────────────────────┘ │
+│ │ │
+│ ┌─────────────────────────────────────────────────────────────────────────┐ │
+│ │ 2. Query Router (find_worker endpoint) │ │
+│ │ RouterRequest { │ │
+│ │ tokens: [...], │ │
+│ │ prefix_id: "my-session-001", │ │
+│ │ reuse_budget: 9, │ │
+│ │ expected_osl: "MEDIUM", │ │
+│ │ interarrival: "LOW" │ │
+│ │ } │ │
+│ └─────────────────────────────────────────────────────────────────────────┘ │
+│ │ │
+│ ┌─────────────────────────────────────────────────────────────────────────┐ │
+│ │ 3. Route to Selected Backend Worker │ │
+│ │ • Use worker_id from router to direct request │ │
+│ │ • Stream response tokens back to frontend │ │
+│ └─────────────────────────────────────────────────────────────────────────┘ │
+│ │ │
+│ ┌─────────────────────────────────────────────────────────────────────────┐ │
+│ │ 4. Send Feedback to Router │ │
+│ │ RouterFeedbackRequest { │ │
+│ │ decision_id: "abc123", │ │
+│ │ latency_ms: 245.5, │ │
+│ │ success: true, │ │
+│ │ tokens_in: 128, │ │
+│ │ tokens_out: 64 │ │
+│ │ } │ │
+│ └─────────────────────────────────────────────────────────────────────────┘ │
+│ │
+│ Prometheus Metrics (port 8081): │
+│ • thompson_processor_requests_total │
+│ • thompson_processor_request_latency_seconds │
+│ • thompson_processor_tokens_processed_total │
+└─────────────────────────────────────────────────────────────────────────────────┘
+ │
+ ▼
+┌─────────────────────────────────────────────────────────────────────────────────┐
+│ CUSTOM ROUTER │
+│ (dynamo/router component) │
+│ │
+│ Endpoints: │
+│ • find_worker: Select optimal worker for request │
+│ • feedback: Receive latency feedback to update bandits │
+│ │
+│ ┌─────────────────────────────────────────────────────────────────────────┐ │
+│ │ Thompson Sampling Algorithm │ │
+│ │ │ │
+│ │ Score(worker) = LinTS(features) + Beta_TS(worker) │
+│ │ + Affinity(prefix_sticky) │ │
+│ │ - SwitchCost(if switching) │ │
+│ │ × LoadModifier(queue, GPU, outstanding) │ │
+│ │ │ │
+│ │ Features (9-dim): │ │
+│ │ [1, inv_load, kv_overlap, affinity, outstanding, │ │
+│ │ decode_cost, prefill_cost, iat_factor, reuse_budget] │ │
+│ └─────────────────────────────────────────────────────────────────────────┘ │
+│ │ │
+│ ┌─────────────────────────────────────────────────────────────────────────┐ │
+│ │ KV Cache Indexer │ │
+│ │ • Tracks KV cache blocks per worker │ │
+│ │ • Computes overlap scores for routing decisions │ │
+│ └─────────────────────────────────────────────────────────────────────────┘ │
+│ │ │
+│ ┌─────────────────────────────────────────────────────────────────────────┐ │
+│ │ Bandit State │ │
+│ │ • Beta bandits: (α, β) per worker │ │
+│ │ • LinTS: A matrix, b vector per worker │ │
+│ │ • Pending decisions awaiting feedback │ │
+│ │ • Latency EMA baselines (global, per-worker, per-bucket) │ │
+│ └─────────────────────────────────────────────────────────────────────────┘ │
+│ │
+│ Prometheus Metrics (port 8081): │
+│ • thompson_router_decisions_total{worker_id} │
+│ • thompson_router_kv_overlap{worker_id} │
+│ • thompson_router_feedback_latency_seconds │
+│ • thompson_router_reward{worker_id} │
+│ • thompson_router_pending_decisions │
+└─────────────────────────────────────────────────────────────────────────────────┘
+ │
+ ▼
+┌─────────────────────────────────────────────────────────────────────────────────┐
+│ BACKEND WORKER (Unified Mode) │
+│ (python -m dynamo.sglang) │
+│ (registers as: dynamo.worker.generate) │
+│ (NOT backend.generate - that's our processor!) │
+│ │
+│ Default Configuration (start_dynamo_optimized_thompson_hints.sh): │
+│ │
+│ ┌───────────────────────────────────────────────────────────────────────────┐ │
+│ │ Unified Worker │ │
+│ │ GPUs: 0,1,2,3 (DYNAMO_GPU_DEVICES) │ │
+│ │ TP: 4 (DYNAMO_TP_SIZE) │ │
+│ │ Endpoint: dynamo.worker.generate (--endpoint flag) │ │
+│ │ │ │
+│ │ • KV Cache (shared across TP ranks) │ │
+│ │ • SGLang Engine │ │
+│ │ • Prometheus Metrics (port 8081) │ │
+│ └───────────────────────────────────────────────────────────────────────────┘ │
+│ │
+│ Environment Variables for GPU Configuration: │
+│ DYNAMO_GPU_DEVICES="0,1,2,3" # Which GPUs to use (default: 0,1,2,3) │
+│ DYNAMO_TP_SIZE=4 # Tensor parallelism degree (default: 4) │
+│ │
+│ Metrics exposed: │
+│ • sglang:* metrics on port 8081 │
+│ • dynamo_component_* metrics │
+└─────────────────────────────────────────────────────────────────────────────────┘
+```
+
+## Scaling to Multiple Workers (8-GPU Example)
+
+For systems with more GPUs, you can run multiple workers. The current startup script
+runs a **single unified worker** by default. To scale to multiple workers:
+
+### Option A: Two Workers with TP=4 (8 GPUs total)
+```bash
+# Worker 1: GPUs 0-3
+export DYNAMO_GPU_DEVICES="0,1,2,3"
+export DYNAMO_TP_SIZE=4
+# (start first worker)
+
+# Worker 2: GPUs 4-7
+export DYNAMO_GPU_DEVICES="4,5,6,7"
+export DYNAMO_TP_SIZE=4
+# (start second worker)
+```
+
+### Option B: One Worker with TP=8 (8 GPUs, single worker)
+```bash
+export DYNAMO_GPU_DEVICES="0,1,2,3,4,5,6,7"
+export DYNAMO_TP_SIZE=8
+```
+
+> **Note**: The Thompson Sampling router benefits most from multiple workers,
+> as it can learn optimal routing between them. With a single worker, the router
+> still tracks KV cache overlap but cannot make routing decisions between workers.
+
+## Key Differences from Generalized Architecture
+
+| Aspect | Generalized | Optimized |
+|--------|-------------|-----------|
+| Frontend | Custom `frontend.py` with HTTP headers | Default `dynamo.frontend` with nvext |
+| Hint Passing | HTTP headers (`x-prefix-*`) | `nvext.annotations` in request body |
+| Tokenization | Custom (in frontend) | Handled by Dynamo preprocessor |
+| Metrics | CSV files | Prometheus (`/metrics` endpoint) |
+| Model Mapping | Custom `FRONTEND_MODEL_MAPPING` | Dynamo's `--model-name`/`--model-path` |
+| **Processor Registration** | `dynamo.processor.process` | **`dynamo.backend.generate`** (intercepts frontend) |
+| **Worker Registration** | `dynamo.backend.generate` | **`dynamo.worker.generate`** (processor forwards to) |
+
+### Why "Processor-as-Backend"?
+
+The default Dynamo frontend has a built-in router (`DYN_ROUTER_MODE=round-robin|random|kv`) that routes directly to `dynamo.backend.generate`. To inject our custom Thompson Sampling routing:
+
+1. **Processor claims `backend.generate`** - Frontend thinks it's talking to the backend
+2. **Processor queries custom router** - Thompson Sampling selects best worker
+3. **Processor forwards to `worker.generate`** - Actual SGLang workers
+4. **Frontend's built-in router is irrelevant** - We've intercepted the request pipeline
+
+## NVExt Annotations
+
+The client passes routing hints via the `nvext.annotations` field in the request:
+
+```json
+{
+ "model": "llama-3.3-70b",
+ "messages": [{"role": "user", "content": "Hello!"}],
+ "nvext": {
+ "annotations": [
+ "prefix_id:session-12345",
+ "total_requests:10",
+ "osl:MEDIUM",
+ "iat:LOW"
+ ]
+ }
+}
+```
+
+### Annotation Keys
+
+| Key | Type | Description | Values |
+|-----|------|-------------|--------|
+| `prefix_id` | string | Unique identifier for request prefix/session | Any string |
+| `total_requests` | int | Total expected requests for this prefix | Positive integer |
+| `osl` | enum | Output Sequence Length expectation | `LOW`, `MEDIUM`, `HIGH` |
+| `iat` | enum | Inter-Arrival Time (request frequency) | `LOW`, `MEDIUM`, `HIGH` |
+
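+A processor can recover these hints with a few lines of string handling. The sketch below is illustrative only (the `parse_annotations` helper and its defaults are not part of the Dynamo API); it simply applies the `key:value` format documented in the table above.
+
+```python
+# Hypothetical helper, not part of the Dynamo API: splits each
+# "key:value" annotation on the first colon and coerces types.
+DEFAULTS = {"prefix_id": None, "total_requests": 1, "osl": "MEDIUM", "iat": "MEDIUM"}
+
+def parse_annotations(annotations: list[str]) -> dict:
+    hints = dict(DEFAULTS)
+    for item in annotations:
+        key, _, value = item.partition(":")
+        if key == "total_requests":
+            hints[key] = int(value)
+        elif key in hints:
+            hints[key] = value
+    return hints
+
+# The annotations from the request body above:
+print(parse_annotations(
+    ["prefix_id:session-12345", "total_requests:10", "osl:MEDIUM", "iat:LOW"]))
+# {'prefix_id': 'session-12345', 'total_requests': 10, 'osl': 'MEDIUM', 'iat': 'LOW'}
+```
+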
+## Quick Start
+
+```bash
+# Required: Set path to your model
+export DYNAMO_MODEL_DIR="/path/to/Llama-3.3-70B-Instruct"
+
+# Optional: Configure GPU devices (default: 0,1,2,3)
+export DYNAMO_GPU_DEVICES="0,1,2,3"
+export DYNAMO_TP_SIZE=4
+
+# Optional: Set model name (default: llama-3.3-70b)
+export DYNAMO_MODEL_NAME="llama-3.3-70b"
+
+# Start the system
+bash start_dynamo_optimized_thompson_hints.sh
+```
+
+## Component Startup Order
+
+1. **ETCD** - Service discovery and metadata
+2. **NATS** - Message queue for KV events (if using kv router mode)
+3. **Backend Worker** - SGLang GPU worker → registers at `dynamo.worker.generate`
+4. **Router** - Thompson Sampling router → registers at `dynamo.router.{find_worker,feedback}`
+5. **Processor** - Request orchestrator → **registers at `dynamo.backend.generate`** (intercepts frontend!)
+6. **Frontend** - HTTP API server → routes to `dynamo.backend.generate` (our processor)
+
+> **Important**: The Processor must register as `backend.generate` before the Frontend starts;
+> otherwise, the Frontend could discover and route to an SGLang worker directly, should one
+> have registered as `backend.generate`.
+
+## Prometheus Metrics
+
+All components expose metrics on port 8081 by default (`DYN_SYSTEM_PORT`):
+
+### Router Metrics
+```
+thompson_router_decisions_total{worker_id="0"} 1234
+thompson_router_kv_overlap{worker_id="0"} 0.75
+thompson_router_feedback_latency_seconds_bucket{le="0.1"} 100
+thompson_router_reward{worker_id="0"} 0.65
+thompson_router_pending_decisions 5
+thompson_router_timeout_penalties_total 2
+```
+
+### Processor Metrics
+```
+thompson_processor_requests_total 5000
+thompson_processor_request_latency_seconds_bucket{le="1.0"} 4500
+thompson_processor_tokens_in_total 128000
+thompson_processor_tokens_out_total 64000
+thompson_processor_routing_decisions_total{worker_id="0"} 1234
+```
+
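+To spot-check these counters programmatically, you can hit Prometheus's standard instant-query HTTP API (`/api/v1/query`). The sketch below assumes a Prometheus instance at `localhost:9090` that is scraping these components; the query string is just an example.
+
+```python
+import json
+import urllib.parse
+import urllib.request
+
+# Example instant query: per-worker routing decision counts.
+query = "sum by (worker_id) (thompson_router_decisions_total)"
+url = ("http://localhost:9090/api/v1/query?"
+       + urllib.parse.urlencode({"query": query}))
+
+with urllib.request.urlopen(url, timeout=5) as resp:
+    data = json.loads(resp.read())
+
+# Prometheus returns {"data": {"result": [{"metric": ..., "value": [ts, val]}]}}
+for series in data["data"]["result"]:
+    print(series["metric"].get("worker_id"), series["value"][1])
+```
+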
+## Environment Variables
+
+### GPU and Worker Configuration
+
+These variables control how the backend worker uses GPUs. **Modify these to scale your deployment.**
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `DYNAMO_GPU_DEVICES` | `0,1,2,3` | Comma-separated list of GPU device IDs to use |
+| `DYNAMO_TP_SIZE` | `4` | Tensor parallelism degree (must match number of GPUs) |
+| `DYNAMO_MODEL_DIR` | (required) | Path to the model directory on the host |
+| `DYNAMO_MODEL_NAME` | `llama-3.3-70b` | Model name exposed to clients |
+| `DYNAMO_SHM_SIZE` | `16g` | Shared memory size for the container |
+| `DYNAMO_WORKER_INIT_TIMEOUT_S` | `600` | Timeout (seconds) for worker initialization |
+
+### Example GPU Configurations
+
+```bash
+# Default: Single worker using GPUs 0-3 with TP=4
+export DYNAMO_GPU_DEVICES="0,1,2,3"
+export DYNAMO_TP_SIZE=4
+
+# 8-GPU system: Single worker using all 8 GPUs with TP=8
+export DYNAMO_GPU_DEVICES="0,1,2,3,4,5,6,7"
+export DYNAMO_TP_SIZE=8
+
+# 8-GPU system: Use only GPUs 4-7 with TP=4
+export DYNAMO_GPU_DEVICES="4,5,6,7"
+export DYNAMO_TP_SIZE=4
+
+# 2-GPU system: Use GPUs 0-1 with TP=2
+export DYNAMO_GPU_DEVICES="0,1"
+export DYNAMO_TP_SIZE=2
+```
+
+### Network and Metrics Configuration
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `DYNAMO_HTTP_PORT` | `8000` | Frontend HTTP API port |
+| `DYNAMO_METRICS_PORT` | `8081` | Prometheus metrics port |
+| `DYN_HTTP_PORT` | `8000` | Dynamo frontend HTTP port (same as above) |
+| `DYN_SYSTEM_PORT` | `8081` | Dynamo system/metrics port |
+| `DYNAMO_ROUTER_WAIT_FOR_WORKERS_TIMEOUT_S` | `600` | Worker discovery timeout |
+
+### Backend-Specific Configuration (REQUIRED)
+
+| Variable | Values | Description |
+|----------|--------|-------------|
+| `DYNAMO_WORKER_COMPONENT` | `worker` or `backend` | **REQUIRED.** Component name where workers register. SGLang uses `worker` (via `--endpoint workers.worker.generate`). vLLM uses `backend` (hardcoded in `dynamo.vllm`). |
+
+> **Important**: `DYNAMO_WORKER_COMPONENT` must be set for the router and processor to find
+> the backend workers. Without this variable, startup will fail with an error.
+
+> **Note on `DYN_ROUTER_MODE`**: The startup script passes `--router-mode round-robin` to the
+> default frontend, but this is **irrelevant** in our architecture. The frontend's built-in
+> router routes to `dynamo.backend.generate`, which is our Processor (not a real backend).
+> Our Processor intercepts the request and uses our custom Thompson Sampling router instead.
+
+## Sample Client Request
+
+```bash
+curl -X POST http://localhost:8000/v1/chat/completions \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "llama-3.3-70b",
+ "messages": [{"role": "user", "content": "What is 2+2?"}],
+ "max_tokens": 100,
+ "stream": true,
+ "nvext": {
+ "annotations": [
+ "prefix_id:math-session-001",
+ "total_requests:5",
+ "osl:LOW",
+ "iat:HIGH"
+ ]
+ }
+ }'
+```
+
+## Request Flow (Detailed)
+
+1. **Client → Frontend**: HTTP POST with nvext annotations
+2. **Frontend (Preprocessor)**: Tokenizes messages, creates `PreprocessedRequest` with annotations
+3. **Frontend (Built-in Router)**: Routes to `dynamo.backend.generate` (round-robin, but only one "backend" - our processor!)
+4. **Processor (as backend.generate)**: Receives request, extracts hints from annotations
+5. **Processor → Router**: Queries Thompson Sampling router for worker selection
+6. **Router**: Computes Thompson Sampling scores, returns worker_id
+7. **Processor → Worker**: Sends request to `dynamo.worker.generate` via `engine_client.direct(worker_id)`
+8. **Worker → Processor**: Streams response tokens
+9. **Processor → Router**: Sends latency feedback for bandit update
+10. **Processor → Frontend**: Streams response
+11. **Frontend → Client**: SSE stream or JSON response
+
+## Files
+
+- `processor.py` - Custom processor with nvext annotation extraction
+- `router.py` - Thompson Sampling router with Prometheus metrics
+- `ARCHITECTURE.md` - This document
+
diff --git a/external/dynamo/optimized/PARAMETERS.md b/external/dynamo/optimized/PARAMETERS.md
new file mode 100644
index 0000000000..0d747f9fe6
--- /dev/null
+++ b/external/dynamo/optimized/PARAMETERS.md
@@ -0,0 +1,181 @@
+# Thompson Sampling Router Parameters
+
+This document describes all configurable parameters for the `WorkloadAwareRouter` in `router.py`.
+
+## Configuration Methods
+
+Parameters can be set via:
+
+1. **YAML Config File** (`config.yaml`) - All 31 parameters
+2. **CLI Flags** - 5 flags for common operations:
+ - `--config` - Path to YAML config file
+ - `--affinity-base` - Primary stickiness control
+ - `--temp-base` - Primary exploration control
+ - `--lints-v` - Exploration variance
+ - `--override` - Override any config value (repeatable)
+
+**Precedence:** CLI flags override config file values.
+
+## Usage Examples
+
+```bash
+# Use config file only
+python router.py --config config.yaml
+
+# Override specific values
+python router.py --config config.yaml --affinity-base 0.5 --temp-base 1.5
+
+# Override nested values
+python router.py --config config.yaml --override load_balancing.gpu_penalty_weight=2.0
+
+# Multiple overrides
+python router.py --config config.yaml \
+ --override switching_cost.base=0.3 \
+ --override feedback.timeout_seconds=60
+```
+
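+The `--override` mechanic is just a dotted-path walk over the nested config. A minimal sketch of how such a flag could be applied (the `apply_override` helper is illustrative, not the actual `router.py` implementation):
+
+```python
+def apply_override(config: dict, assignment: str) -> None:
+    """Apply 'a.b.c=value' to a nested dict, coercing bools and numbers."""
+    path, _, raw = assignment.partition("=")
+    keys = path.split(".")
+    node = config
+    for key in keys[:-1]:
+        node = node.setdefault(key, {})
+    if raw.lower() in ("true", "false"):       # bool first,
+        value = raw.lower() == "true"
+    else:
+        try:
+            value = int(raw)                   # then int,
+        except ValueError:
+            try:
+                value = float(raw)             # then float,
+            except ValueError:
+                value = raw                    # else keep the string.
+    node[keys[-1]] = value
+
+config = {"load_balancing": {"gpu_penalty_weight": 1.0}}
+apply_override(config, "load_balancing.gpu_penalty_weight=2.0")
+assert config["load_balancing"]["gpu_penalty_weight"] == 2.0
+```
+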
+---
+
+## Parameter Reference
+
+### Infrastructure
+
+| Parameter | Config Path | Default | Type | Description |
+|-----------|-------------|---------|------|-------------|
+| Block Size | `infrastructure.block_size` | 64 | int | KV cache block size for overlap computation |
+| Router Type | `infrastructure.router_type` | "kv" | str | Router mode: "kv" (KV-aware) or "kv_load" (load-based only) |
+| Min Workers | `infrastructure.min_workers` | 1 | int | Minimum workers required before routing starts |
+
+### Affinity (Stickiness)
+
+Controls how strongly the router prefers keeping requests on the same worker for KV cache reuse.
+
+| Parameter | Config Path | CLI Flag | Default | Type | Description |
+|-----------|-------------|----------|---------|------|-------------|
+| Base | `affinity.base` | `--affinity-base` | 0.30 | float | Base bonus when staying on same worker. Higher = more sticky. |
+| Reuse Weight | `affinity.reuse_weight` | `--override` | 0.15 | float | Additional bonus per remaining request in session |
+| IAT Weight | `affinity.iat_weight` | `--override` | 0.20 | float | Bonus scaling based on inter-arrival time hint |
+| Sticky Load Floor | `affinity.sticky_load_floor` | `--override` | 0.70 | float | Minimum load modifier for sticky decisions (prevents load from overriding stickiness) |
+
+**Tuning Guide:**
+- **High affinity (0.4-0.6):** Prioritize KV cache hits, good for multi-turn conversations
+- **Low affinity (0.1-0.2):** Prioritize load balancing, good for independent requests
+
+### Exploration (Temperature)
+
+Controls the explore vs exploit tradeoff in worker selection.
+
+| Parameter | Config Path | CLI Flag | Default | Type | Description |
+|-----------|-------------|----------|---------|------|-------------|
+| Base TS Weight | `exploration.base_ts_weight` | `--override` | 0.10 | float | Weight for Thompson Sampling exploration term |
+| Temp Base | `exploration.temperature.base` | `--temp-base` | 1.0 | float | Base softmax temperature |
+| Temp Min | `exploration.temperature.min` | `--override` | 0.15 | float | Minimum temperature (more greedy selection) |
+| Temp Max | `exploration.temperature.max` | `--override` | 2.0 | float | Maximum temperature (more random selection) |
+
+**Tuning Guide:**
+- **Low temperature (0.2-0.5):** Greedy, always pick best-scored worker
+- **High temperature (1.5-2.0):** More exploration, useful early or with changing workloads
+
+### Switching Cost
+
+Penalty applied when moving a prefix session to a different worker.
+
+| Parameter | Config Path | CLI Flag | Default | Type | Description |
+|-----------|-------------|----------|---------|------|-------------|
+| Base | `switching_cost.base` | `--override` | 0.20 | float | Base penalty for switching workers |
+| Reuse Penalty | `switching_cost.reuse_penalty` | `--override` | 0.08 | float | Additional penalty per remaining request in session |
+| IAT Penalty | `switching_cost.iat_penalty` | `--override` | 0.05 | float | Additional penalty based on inter-arrival time |
+
+**Tuning Guide:**
+- **High switching cost (0.3-0.5):** Strongly discourage worker changes mid-session
+- **Low switching cost (0.05-0.1):** Allow flexible rebalancing
+
+### Load Balancing
+
+Controls how much to penalize workers with high load.
+
+| Parameter | Config Path | CLI Flag | Default | Type | Description |
+|-----------|-------------|----------|---------|------|-------------|
+| Queue Penalty | `load_balancing.queue_penalty_weight` | `--override` | 0.50 | float | Weight for pending request queue depth |
+| GPU Penalty | `load_balancing.gpu_penalty_weight` | `--override` | 1.00 | float | Weight for GPU KV cache memory usage |
+| Outstanding Work | `load_balancing.outstanding_work_weight` | `--override` | 0.45 | float | Weight for outstanding work (expected future load) |
+| Job-GPU Coupling | `load_balancing.job_gpu_coupling_weight` | `--override` | 0.40 | float | How much job cost amplifies GPU load penalty |
+| Job-Queue Coupling | `load_balancing.job_queue_coupling_weight` | `--override` | 0.20 | float | How much job cost amplifies queue penalty |
+
+**Tuning Guide:**
+- **High GPU penalty (1.5-2.0):** Aggressively avoid memory-constrained workers
+- **High queue penalty (0.8-1.0):** Prioritize low-latency routing
+
+### Prefill Cost Model
+
+How input sequence length (ISL) contributes to job cost estimation.
+
+| Parameter | Config Path | CLI Flag | Default | Type | Description |
+|-----------|-------------|----------|---------|------|-------------|
+| Token Scale | `prefill.token_scale` | `--override` | 1024.0 | float | Normalization denominator for token count |
+| Weight | `prefill.weight` | `--override` | 1.0 | float | Weight of prefill cost in total job cost |
+
+### LinTS Learner
+
+Parameters controlling the Linear Thompson Sampling algorithm.
+
+| Parameter | Config Path | CLI Flag | Default | Type | Description |
+|-----------|-------------|----------|---------|------|-------------|
+| Lambda | `lints.lambda` | `--override` | 1.0 | float | Ridge regression regularization strength |
+| V | `lints.v` | `--lints-v` | 0.25 | float | Exploration variance in posterior sampling |
+| Forget Rate | `lints.forget_rate` | `--override` | 0.995 | float | Exponential decay for old observations |
+
+**Tuning Guide:**
+- **High V (0.4-0.6):** More exploration, keeps trying alternatives
+- **Low V (0.05-0.15):** More exploitation, trusts learned model
+- **High forget rate (0.999):** Long memory, slow adaptation
+- **Low forget rate (0.95):** Short memory, fast adaptation to changes
+
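+For readers unfamiliar with Linear Thompson Sampling, the sketch below shows the textbook sample/update cycle these three parameters control. It is illustrative only: the names and structure are not taken from `router.py`; only the defaults (`lambda=1.0`, `v=0.25`, `forget_rate=0.995`) come from this document.
+
+```python
+import numpy as np
+
+d = 9                              # feature dimension (see Feature Vector below)
+lam, v, forget = 1.0, 0.25, 0.995  # defaults documented above
+
+A = lam * np.eye(d)                # ridge prior: precision matrix
+b = np.zeros(d)                    # reward-weighted feature sum
+
+def sample_score(x: np.ndarray, rng: np.random.Generator) -> float:
+    """Draw theta from the posterior N(A^-1 b, v^2 A^-1) and score x."""
+    A_inv = np.linalg.inv(A)
+    theta = rng.multivariate_normal(A_inv @ b, (v ** 2) * A_inv)
+    return float(theta @ x)
+
+def update(x: np.ndarray, reward: float) -> None:
+    """Decay old evidence by forget_rate, then add the new observation."""
+    global A, b
+    A = forget * A + np.outer(x, x)
+    b = forget * b + reward * x
+
+rng = np.random.default_rng(0)
+x = np.full(d, 0.5)                # toy feature vector
+print(sample_score(x, rng))        # sampled routing score for one worker
+update(x, reward=0.8)              # feedback shifts the posterior
+```
+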
+### Feedback Handling
+
+Controls the delayed reward mechanism.
+
+| Parameter | Config Path | CLI Flag | Default | Type | Description |
+|-----------|-------------|----------|---------|------|-------------|
+| Timeout Seconds | `feedback.timeout_seconds` | `--override` | 120.0 | float | Max wait time for feedback before applying timeout penalty |
+| Sweep Interval | `feedback.sweep_interval_seconds` | `--override` | 5.0 | float | How often to check for timed-out decisions |
+| Timeout Reward | `feedback.timeout_reward` | `--override` | 0.0 | float | Reward for timed-out requests (0.0 = treat as failure) |
+| Latency EMA Alpha | `feedback.latency_ema_alpha` | `--override` | 0.2 | float | Smoothing factor for latency baseline EMA |
+
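+The delayed-reward mechanic these parameters drive can be pictured as a periodic sweep over pending decisions. A hypothetical sketch (`pending` and `apply_reward` are illustrative names, not `router.py` internals):
+
+```python
+import time
+
+TIMEOUT_S, SWEEP_S, TIMEOUT_REWARD = 120.0, 5.0, 0.0   # defaults above
+
+pending = {}  # decision_id -> (worker_id, features, issued_at)
+
+def sweep(apply_reward) -> None:
+    """Run every SWEEP_S seconds: decisions with no feedback after
+    TIMEOUT_S are treated as failures by applying TIMEOUT_REWARD."""
+    now = time.monotonic()
+    for decision_id, (worker_id, feats, issued_at) in list(pending.items()):
+        if now - issued_at >= TIMEOUT_S:
+            apply_reward(worker_id, feats, TIMEOUT_REWARD)
+            del pending[decision_id]
+```
+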
+### Debug
+
+| Parameter | Config Path | CLI Flag | Default | Type | Description |
+|-----------|-------------|----------|---------|------|-------------|
+| Traces Enabled | `debug.traces_enabled` | `--override` | false | bool | Enable detailed decision trace logging |
+| Trace Dir | `debug.trace_dir` | `--override` | /tmp/dynamo_router_traces | str | Directory for trace output files |
+| Buffer Size | `debug.buffer_size` | `--override` | 2000 | int | In-memory trace buffer size |
+
+---
+
+## Feature Vector (LinTS Input)
+
+The router uses a 9-dimensional feature vector as input to the LinTS learner:
+
+| Index | Feature | Source | Description |
+|-------|---------|--------|-------------|
+| 0 | Bias | Constant | Always 1.0 (intercept term) |
+| 1 | Inverse Load | Computed | 1/(1 + gpu_penalty + queue_penalty) |
+| 2 | Overlap | KV Indexer | KV cache overlap score [0, 1] |
+| 3 | Affinity | State | 1 if same worker as last request, else 0 |
+| 4 | Outstanding | State | tanh(0.1 × outstanding_work) |
+| 5 | Decode Cost | OSL Hint | decode_cost / 3.0 |
+| 6 | Prefill Cost | Computed | tanh(prefill_cost) |
+| 7 | IAT Factor | IAT Hint | iat_factor / 1.5 |
+| 8 | Reuse Budget | Hint | tanh(0.25 × reuse_budget) |
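+
+The table translates directly into code; a reconstruction (not the router's source) for reference:
+
+```python
+import math
+
+def build_features(gpu_penalty, queue_penalty, overlap, same_worker,
+                   outstanding, decode_cost, prefill_cost, iat_factor,
+                   reuse_budget):
+    """Assemble the 9-dimensional LinTS feature vector from the table above."""
+    return [
+        1.0,                                        # 0: bias
+        1.0 / (1.0 + gpu_penalty + queue_penalty),  # 1: inverse load
+        overlap,                                    # 2: KV overlap in [0, 1]
+        1.0 if same_worker else 0.0,                # 3: affinity
+        math.tanh(0.1 * outstanding),               # 4: outstanding work
+        decode_cost / 3.0,                          # 5: decode cost (OSL hint)
+        math.tanh(prefill_cost),                    # 6: prefill cost
+        iat_factor / 1.5,                           # 7: IAT factor
+        math.tanh(0.25 * reuse_budget),             # 8: reuse budget
+    ]
+```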
+
+---
+
+## Learned vs Fixed Parameters
+
+| Category | Updated At | Examples |
+|----------|-----------|----------|
+| **Learned (runtime)** | Every request | `linA`, `linb` matrices, Beta(α,β) bandits, latency EMAs |
+| **Fixed (startup)** | Never | All 31 config parameters above |
+
+The config parameters are **hyperparameters** that control *how* the router learns, not *what* it learns.
+
diff --git a/external/dynamo/optimized/README.md b/external/dynamo/optimized/README.md
new file mode 100644
index 0000000000..fdb1443939
--- /dev/null
+++ b/external/dynamo/optimized/README.md
@@ -0,0 +1,273 @@
+# Optimized Thompson Sampling Router Architecture
+
+This directory contains the optimized implementation of the Thompson Sampling router for Dynamo, using the "Processor-as-Backend" pattern with **Dynamic Discovery** to intercept requests from the default Dynamo frontend.
+
+## Architecture Overview (Dynamic Discovery Mode)
+
+```
+┌─────────────────────────────────────────────────────────────────────────┐
+│ Client Request (with nvext.annotations) │
+│ ↓ │
+│ Default Dynamo Frontend (port 8000) │
+│ ↓ tokenization + nvext parsing │
+│ ↓ discovers backends via ETCD ModelWatcher │
+│ ↓ finds Processor's model card! │
+│ │
+│ Custom Processor (dynamo.backend.generate-{instance_id}) │
+│ ↓ extracts: prefix_id, total_requests, osl, iat │
+│ ↓ queries Thompson Sampling router │
+│ │
+│ Custom Router (dynamo.router.find_worker) │
+│ ↓ KV overlap + workload-aware selection │
+│ ↓ returns worker_id │
+│ │
+│ Processor forwards to dynamo.worker.generate (with worker_id) │
+│ ↓ │
+│ SGLang Worker (actual inference) │
+│ ↓ │
+│ Response + Feedback to Router │
+└─────────────────────────────────────────────────────────────────────────┘
+```
+
+## Components
+
+| Component | File | Endpoint | Purpose |
+|-----------|------|----------|---------|
+| Processor | `processor.py` | `dynamo.backend.generate` + ETCD model card | Intercepts frontend requests, extracts hints, coordinates routing |
+| Router | `router.py` | `dynamo.router.find_worker` | Thompson Sampling + KV overlap worker selection |
+| Config | `config.yaml` | - | Router configuration parameters |
+
+## Dynamic Discovery Pattern (Forward-Compatible)
+
+Instead of using the deprecated `--static-endpoint` flag on the frontend, this processor uses **dynamic discovery** via ETCD:
+
+1. **Processor** registers as `dynamo.backend.generate` (dynamic mode with instance ID)
+2. **Processor** calls `register_llm()` to advertise a model card in ETCD
+3. **Frontend's ModelWatcher** discovers the processor's model card
+4. **Frontend** routes requests to the discovered processor endpoint
+5. **SGLang Worker** registers as `dynamo.worker.generate` (also dynamic)
+
+### Why Dynamic Discovery?
+
+The `--static-endpoint` flag is **deprecated** and will be removed in future Dynamo versions. Dynamic discovery provides:
+
+- Forward compatibility with future Dynamo releases
+- Support for multiple processor instances (load balancing)
+- Standard Dynamo discovery patterns
+- Dynamic scaling capabilities
+
+## Processor Registration
+
+The processor uses `register_llm()` to advertise itself in ETCD:
+
+```python
+@dynamo_worker(static=False) # Dynamic mode for ETCD discovery
+async def worker(runtime: DistributedRuntime):
+ component = runtime.namespace("dynamo").component("backend")
+ await component.create_service()
+
+ endpoint = component.endpoint("generate")
+
+ # Register model card so frontend can discover us
+ await register_llm(
+ model_input=ModelInput.Tokens,
+ model_type=ModelType.Chat | ModelType.Completions,
+ endpoint=endpoint,
+ model_path=args.model_path,
+ model_name=args.model_name,
+ )
+
+ handler = ProcessorRequestHandler(runtime, ...)
+ await endpoint.serve_endpoint(handler.generate)
+```
+
+### Required Arguments
+
+In dynamic discovery mode, the processor requires:
+- `--model-path`: Path to the model directory (for tokenizer and model card)
+- `--model-name`: Served model name (must match frontend's expected model)
+
+## Usage
+
+### Starting the System
+
+```bash
+# Set required environment variable
+export DYNAMO_MODEL_DIR="/path/to/Llama-3.3-70B-Instruct"
+
+# Start all components
+bash ../start_dynamo_optimized_thompson_hints.sh
+```
+
+### Making Requests
+
+```bash
+# Basic request (no routing hints)
+curl http://localhost:8000/v1/chat/completions \
+ -H 'Content-Type: application/json' \
+ -d '{
+ "model": "llama-3.3-70b",
+ "messages": [{"role": "user", "content": "Hello!"}],
+ "max_tokens": 50
+ }'
+
+# Request with nvext annotations (routing hints)
+curl http://localhost:8000/v1/chat/completions \
+ -H 'Content-Type: application/json' \
+ -d '{
+ "model": "llama-3.3-70b",
+ "messages": [{"role": "user", "content": "Hello!"}],
+ "max_tokens": 50,
+ "nvext": {
+ "annotations": [
+ "prefix_id:my-session-001",
+ "total_requests:10",
+ "osl:MEDIUM",
+ "iat:LOW"
+ ]
+ }
+ }'
+```
+
+### Routing Hint Annotations
+
+| Annotation | Format | Description |
+|------------|--------|-------------|
+| `prefix_id` | `prefix_id:<string>` | Unique identifier for prefix reuse across requests |
+| `total_requests` | `total_requests:<int>` | Expected total requests in this prefix group |
+| `osl` | `osl:LOW\|MEDIUM\|HIGH` | Expected output sequence length |
+| `iat` | `iat:LOW\|MEDIUM\|HIGH` | Inter-arrival time hint |
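+
+The same hints can be sent from Python; a sketch equivalent to the curl example above (assumes the frontend at `localhost:8000` accepts the `nvext` extension shown):
+
+```python
+import requests
+
+resp = requests.post(
+    "http://localhost:8000/v1/chat/completions",
+    json={
+        "model": "llama-3.3-70b",
+        "messages": [{"role": "user", "content": "Hello!"}],
+        "max_tokens": 50,
+        "nvext": {
+            "annotations": [
+                "prefix_id:my-session-001",
+                "total_requests:10",
+                "osl:MEDIUM",
+                "iat:LOW",
+            ]
+        },
+    },
+    timeout=60,
+)
+print(resp.json()["choices"][0]["message"]["content"])
+```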
+
+---
+
+## Troubleshooting
+
+### Verifying Processor Interception
+
+To confirm that requests are flowing through the processor (not directly to workers), run:
+
+```bash
+docker logs dynamo-sglang-optimized 2>&1 | grep -E "(Processor|processor|Processing request|Routing decision|dynamo.backend|backend.generate|find_worker)" | tail -50
+```
+
+### Expected Output (Nominal Operation)
+
+When the system is working correctly, you should see output similar to:
+
+```
+Step 3: Starting Custom Processor (Registers as backend.generate)...
+Processor PID: 3735
+Registered at: dynamo.backend.generate (intercepts frontend requests)
+
+INFO processor._init_prometheus_metrics: Prometheus metrics initialized for processor
+INFO processor.initialize: Router clients created, waiting for instances...
+INFO dynamo_runtime::component::client: wait_for_instances: Found 1 instance(s) for endpoint: dynamo/router/_endpoint_/find_worker
+INFO processor.initialize: Router clients initialized successfully
+INFO processor.initialize: Engine client created, waiting for worker instances...
+INFO processor.initialize: Processor initialized successfully (routing to dynamo.worker.generate)
+
+INFO processor.generate: Processing request: prefix=auto-3f0519ac1cc442d2... total=1 osl=MEDIUM iat=MEDIUM tokens=37
+INFO processor.generate: Routing decision: worker=7587892168930944779 decision=bcc5180740ed44c6... reuse_budget=0
+
+INFO processor.generate: Processing request: prefix=auto-2593032a6cf843ce... total=1 osl=MEDIUM iat=MEDIUM tokens=37
+INFO processor.generate: Routing decision: worker=7587892168930944779 decision=ba4440fd3a144822... reuse_budget=0
+```
+
+### Key Indicators of Success
+
+| Log Message | Meaning |
+|-------------|---------|
+| `Registering model card: model_name=...` | Processor registering with ETCD |
+| `Model card registered successfully` | Frontend can now discover the processor |
+| `Router clients initialized successfully` | Connected to Thompson Sampling router |
+| `Processor initialized successfully` | Ready to process requests |
+| `Processing request: prefix=... tokens=N` | Request received and being processed |
+| `Routing decision: worker=... decision=...` | Router selected a worker |
+
+### Common Issues
+
+#### 1. Frontend Not Finding Processor
+
+**Symptom:** Requests fail or go directly to workers, bypassing the processor.
+
+**Cause:** Model card not registered or model name mismatch.
+
+**Verification:**
+```bash
+# Check if processor registered its model card
+docker logs dynamo-sglang-optimized 2>&1 | grep -i "model card"
+
+# Check ETCD for registered models
+curl -s http://localhost:2379/v3/kv/range -X POST \
+ -H "Content-Type: application/json" \
+ -d '{"key":"ZHluYW1v","range_end":"ZHluYW1w"}' | jq .
+```
+
+**Solution:**
+1. Ensure `--model-name` matches between processor and frontend
+2. Ensure `--model-path` points to a valid model directory
+3. Processor must start BEFORE frontend
+
+#### 2. "missing field `token_ids`" Error
+
+**Cause:** Processor couldn't reach workers.
+
+**Solution:** Ensure workers are registered and running:
+```bash
+docker logs dynamo-sglang-optimized 2>&1 | grep "worker.generate"
+```
+
+#### 3. Requests Bypassing Processor
+
+**Symptom:** No "Processing request" logs, but responses still work.
+
+**Cause:** Frontend is routing directly to workers instead of through the processor.
+
+**Verification:**
+```bash
+# Check if processor is receiving requests
+docker logs dynamo-sglang-optimized 2>&1 | grep "Processing request"
+```
+
+**Solution:**
+1. Ensure processor's `--model-name` matches frontend's `--model-name` exactly
+2. Processor must register BEFORE frontend starts
+3. Check that processor's model card is in ETCD
+
+#### 4. Router Not Found
+
+**Symptom:** `Router stream ended without worker_id; falling back to engine load balancing`
+
+**Cause:** Router not started or not registered.
+
+**Solution:** Check router logs:
+```bash
+docker logs dynamo-sglang-optimized 2>&1 | grep -i router
+```
+
+---
+
+## Prometheus Metrics
+
+| Metric | Description |
+|--------|-------------|
+| `thompson_processor_requests_total` | Total requests processed |
+| `thompson_processor_request_latency_seconds` | Request latency histogram |
+| `thompson_processor_tokens_in_total` | Total input tokens |
+| `thompson_processor_tokens_out_total` | Total output tokens |
+| `thompson_processor_routing_decisions_total` | Routing decisions by worker |
+| `thompson_processor_router_errors_total` | Router communication errors |
+| `thompson_processor_engine_errors_total` | Backend engine errors |
+| `thompson_processor_active_requests` | Currently active requests |
+
+Access metrics:
+```bash
+curl http://localhost:8081/metrics | grep thompson_processor
+```
+
+---
+
+## Configuration
+
+See `config.yaml` for router configuration options and `PARAMETERS.md` for detailed parameter documentation.
+
diff --git a/external/dynamo/optimized/__init__.py b/external/dynamo/optimized/__init__.py
new file mode 100644
index 0000000000..98c3816c75
--- /dev/null
+++ b/external/dynamo/optimized/__init__.py
@@ -0,0 +1,27 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Optimized Thompson Sampling Router Architecture.
+
+This package contains custom Dynamo components that work with the default
+Dynamo frontend, using nvext.annotations for routing hints and Prometheus
+for metrics.
+
+Components:
+ - processor.py: Custom processor with nvext annotation extraction
+ - router.py: Thompson Sampling router with Prometheus metrics
+
+See ARCHITECTURE.md for detailed documentation.
+"""
diff --git a/external/dynamo/optimized/config.yaml b/external/dynamo/optimized/config.yaml
new file mode 100644
index 0000000000..ae496265e6
--- /dev/null
+++ b/external/dynamo/optimized/config.yaml
@@ -0,0 +1,78 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Thompson Sampling Router Configuration
+# ======================================
+#
+# This file contains all configurable parameters for the WorkloadAwareRouter.
+# Parameters can be overridden via CLI flags (see PARAMETERS.md for details).
+#
+# REQUIRED ENVIRONMENT VARIABLE:
+# DYNAMO_WORKER_COMPONENT - Component name where backend workers register.
+# - SGLang: "worker" (workers register at workers.worker.generate)
+# - vLLM: "backend" (workers register at workers.backend.generate)
+#
+# CLI Override Examples:
+# python router.py --config config.yaml --affinity-base 0.5
+# python router.py --config config.yaml --override affinity.reuse_weight=0.2
+#
+
+# Infrastructure settings
+infrastructure:
+ block_size: 64 # KV cache block size for overlap computation
+ router_type: kv # Router type: "kv" (KV-aware) or "kv_load" (load-based)
+ min_workers: 1 # Minimum workers required before routing starts
+
+# Affinity settings - controls stickiness to same worker for prefix reuse
+affinity:
+ base: 0.30 # Base affinity bonus when staying on same worker (CLI: --affinity-base)
+ reuse_weight: 0.15 # Additional bonus per remaining request in session
+ iat_weight: 0.20 # Bonus scaling based on inter-arrival time
+ sticky_load_floor: 0.70 # Minimum load modifier for sticky decisions
+
+# Exploration settings - controls explore vs exploit tradeoff
+exploration:
+ base_ts_weight: 0.10 # Weight for Thompson Sampling exploration term
+ temperature:
+ base: 1.0 # Base softmax temperature (CLI: --temp-base)
+ min: 0.15 # Minimum temperature (more greedy)
+ max: 2.0 # Maximum temperature (more random)
+
+# Switching cost - penalty for moving prefix to different worker
+switching_cost:
+ base: 0.20 # Base penalty for switching workers
+ reuse_penalty: 0.08 # Additional penalty per remaining request
+ iat_penalty: 0.05 # Additional penalty based on inter-arrival time
+
+# Load balancing - how much to penalize busy workers
+load_balancing:
+ queue_penalty_weight: 0.50 # Weight for queue depth penalty
+ gpu_penalty_weight: 1.00 # Weight for GPU memory usage penalty
+ outstanding_work_weight: 0.45 # Weight for outstanding work penalty
+ job_gpu_coupling_weight: 0.40 # Coupling between job cost and GPU load
+ job_queue_coupling_weight: 0.20 # Coupling between job cost and queue depth
+
+# Prefill cost model - how to weight input sequence length
+prefill:
+ token_scale: 1024.0 # Normalization scale for token count
+ weight: 1.0 # Weight of prefill cost in total job cost
+
+# LinTS (Linear Thompson Sampling) learner parameters
+lints:
+ lambda: 1.0 # Ridge regression regularization strength
+ v: 0.25 # Exploration variance in posterior sampling (CLI: --lints-v)
+ forget_rate: 0.995 # Exponential decay for old observations (0.995 = slow forget)
+
+# Feedback handling - delayed reward processing
+feedback:
+ timeout_seconds: 120.0 # Seconds to wait for feedback before timeout penalty
+ sweep_interval_seconds: 5.0 # How often to check for timed-out decisions
+ timeout_reward: 0.0 # Reward assigned to timed-out decisions (0.0 = bad)
+ latency_ema_alpha: 0.2 # EMA smoothing factor for latency baselines
+
+# Debug settings
+debug:
+ traces_enabled: false # Enable debug trace logging
+ trace_dir: /tmp/dynamo_router_traces # Directory for trace files
+ buffer_size: 2000 # In-memory trace buffer size
+
diff --git a/external/dynamo/optimized/processor.py b/external/dynamo/optimized/processor.py
new file mode 100644
index 0000000000..c814b04e26
--- /dev/null
+++ b/external/dynamo/optimized/processor.py
@@ -0,0 +1,818 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Optimized Processor for Thompson Sampling Router Architecture.
+
+This processor uses the "Processor-as-Backend" pattern with DYNAMIC DISCOVERY
+to intercept requests from the default Dynamo frontend and apply custom Thompson
+Sampling routing.
+
+## Dynamic Discovery Mode (Forward-Compatible)
+
+Instead of using the deprecated `--static-endpoint` flag on the frontend, this
+processor registers a model card in ETCD so the frontend can discover it via
+its ModelWatcher. This is the forward-compatible approach.
+
+### Requirements:
+- Processor must be started with `--model-path` and `--model-name` arguments
+- Model path must point to a valid model directory with tokenizer files
+- Model name must match what the frontend expects (e.g., "llama-3.3-70b")
+
+### Endpoint Registration Pattern
+
+1. **This Processor registers as `dynamo.backend.generate`** - Dynamically with instance ID
+2. **Processor calls `register_llm()`** - Advertises model card in ETCD
+3. **Frontend's ModelWatcher discovers us** - Routes requests to our endpoint
+4. **SGLang Worker registers as `workers.worker.generate`** - We forward to actual workers
+
+## Request Flow
+
+```
+Frontend (discovers backends via ETCD ModelWatcher)
+ → routes to dynamo.backend.generate-{instance_id}
+ → THIS PROCESSOR (discovered via model card!)
+ → extracts hints from nvext annotations
+ → queries Thompson Sampling router → worker_id
+ → forwards to workers.worker.generate (actual SGLang workers)
+```
+
+Key differences from generalized/processor.py:
+- Uses dynamic discovery (no --static-endpoint on frontend)
+- Registers model card via register_llm() for ETCD discovery
+- Registers as `dynamo.backend.generate` (not `dynamo.processor.process`)
+- Forwards to `workers.worker.generate` (workers in separate namespace)
+- Receives PreprocessedRequest instead of ChatCompletionRequest
+- Extracts hints from nvext annotations (prefix_id:value format)
+- Uses Dynamo metrics API for Prometheus integration (auto-exposed at /metrics)
+- No tokenization (handled by frontend preprocessor)
+
+## Metrics
+
+All metrics are exposed via Dynamo's `/metrics` endpoint (requires DYN_SYSTEM_PORT).
+Metrics use the `dynamo_component_` prefix and include standard Dynamo labels:
+- `dynamo_namespace`, `dynamo_component`, `dynamo_endpoint`
+
+Custom metrics for Thompson Sampling routing:
+- `requests_total` - Total requests processed
+- `request_latency_seconds` - End-to-end request latency histogram
+- `tokens_in_total` / `tokens_out_total` - Token throughput counters
+- `routing_decisions_total` - Per-worker routing decision counter
+- `router_errors_total` / `engine_errors_total` - Error counters
+- `active_requests` - Current in-flight request gauge
+
+KV Cache Efficiency (KVE) metrics:
+- `kve_prompt_tokens_total` - Total prompt tokens (efficiency denominator)
+- `kve_cached_tokens_total` - Total cached tokens hit (efficiency numerator)
+- `kve_device_blocks_total` - Cache hits from device (GPU) memory
+- `kve_host_blocks_total` - Cache hits from host (CPU) memory
+- `kve_disk_blocks_total` - Cache hits from disk
+
+## Grafana Integration
+
+Metrics are exposed at `/metrics` in Prometheus format. Enable with:
+ DYN_SYSTEM_PORT=8081 python processor.py --model-path ... --model-name ...
+
+Full metric names include the `dynamo_component_` prefix:
+ dynamo_component_requests_total{dynamo_namespace="dynamo",dynamo_component="backend",dynamo_endpoint="generate"}
+
+Example PromQL queries for Grafana dashboards:
+ # KV Cache Efficiency (%)
+ rate(dynamo_component_kve_cached_tokens_total[5m]) / rate(dynamo_component_kve_prompt_tokens_total[5m]) * 100
+
+ # Request latency p99
+ histogram_quantile(0.99, rate(dynamo_component_request_latency_seconds_bucket[5m]))
+
+## Data Source Requirements
+
+KVE metrics require the underlying engine to return cache efficiency data:
+- `usage.prompt_tokens_details.cached_tokens` - Standard OpenAI field (should work with prefix caching enabled)
+- `nvext.cache_hit_breakdown` - Engine-specific extension (NOT standard Dynamo NvExt)
+"""
+
+import argparse
+import asyncio
+import logging
+import os
+import time
+import uuid
+from collections.abc import AsyncIterator
+from typing import Any
+
+import uvloop
+from dynamo.llm import ModelInput
+from dynamo.llm import ModelType
+from dynamo.llm import register_llm
+from dynamo.runtime import DistributedRuntime
+from dynamo.runtime import dynamo_worker
+from dynamo.runtime.logging import configure_dynamo_logging
+from pydantic import BaseModel
+
+configure_dynamo_logging()
+logger = logging.getLogger(__name__)
+
+
+# ----------------------- request / response models ----------------------- #
+class RouterRequest(BaseModel):
+ """Request to the Thompson Sampling router."""
+
+ tokens: list[int]
+ prefix_id: str = ""
+ reuse_budget: int = 0 # remaining *after this request*
+ expected_osl: str | None = "MEDIUM"
+ interarrival: str | None = "MEDIUM"
+
+
+class RouterFeedbackRequest(BaseModel):
+ """Feedback to the router after request completion."""
+
+ decision_id: str
+ latency_ms: float
+ success: bool | None = True
+ tokens_in: int | None = None
+ tokens_out: int | None = None
+ finish_reason: str | None = None
+
+
+# ----------------------- KV efficiency data ----------------------- #
+class KVEfficiencyData:
+ """
+ Container for KV cache efficiency data extracted from worker responses.
+
+ This data is used to compute and publish KVE metrics asynchronously,
+ ensuring zero impact on routing throughput.
+ """
+
+ __slots__ = ("prompt_tokens", "cached_tokens", "device_blocks", "host_blocks", "disk_blocks")
+
+ def __init__(self):
+ self.prompt_tokens: int = 0
+ self.cached_tokens: int = 0
+ self.device_blocks: int = 0
+ self.host_blocks: int = 0
+ self.disk_blocks: int = 0
+
+ def has_data(self) -> bool:
+ """Check if any KVE data was collected."""
+ return self.prompt_tokens > 0
+
+ @classmethod
+ def from_response(cls, data: dict[str, Any]) -> "KVEfficiencyData":
+ """
+ Extract KVE data from a worker response chunk.
+
+ Expected fields in response (OpenAI-compatible):
+ - usage.prompt_tokens: Total prompt tokens
+ - usage.prompt_tokens_details.cached_tokens: Cached token count
+
+ Optional engine-specific fields (may not be present):
+ - nvext.cache_hit_breakdown.{device,host,disk}_blocks: Per-tier hits
+
+ Note: cache_hit_breakdown is NOT a standard Dynamo NvExt field.
+ It must be enabled/configured in the underlying engine (vLLM/SGLang).
+ """
+ kve = cls()
+
+ # Extract from usage field (OpenAI-compatible, should always work)
+ usage = data.get("usage")
+ if isinstance(usage, dict):
+ kve.prompt_tokens = usage.get("prompt_tokens", 0) or 0
+ prompt_details = usage.get("prompt_tokens_details")
+ if isinstance(prompt_details, dict):
+ kve.cached_tokens = prompt_details.get("cached_tokens", 0) or 0
+
+ # Extract cache breakdown from nvext (engine-specific, may not be present)
+ # This is NOT a standard Dynamo NvExt field - requires engine configuration
+ nvext = data.get("nvext")
+ if isinstance(nvext, dict):
+ breakdown = nvext.get("cache_hit_breakdown")
+ if isinstance(breakdown, dict):
+ kve.device_blocks = breakdown.get("device_blocks", 0) or 0
+ kve.host_blocks = breakdown.get("host_blocks", 0) or 0
+ kve.disk_blocks = breakdown.get("disk_blocks", 0) or 0
+
+ return kve
+
+
+# ----------------------- metrics dataclass ----------------------- #
+class ProcessorMetrics:
+ """
+ Container for Thompson Sampling processor metrics.
+
+ All metrics are created via Dynamo's metrics API, which:
+ - Automatically exposes them at /metrics in Prometheus format
+ - Adds standard labels (dynamo_namespace, dynamo_component, dynamo_endpoint)
+ - Integrates with Dynamo's Grafana dashboards
+ """
+
+ def __init__(self, endpoint):
+ """
+ Initialize metrics using Dynamo's metrics API.
+
+ Args:
+ endpoint: Dynamo endpoint object providing the metrics interface.
+ """
+ # Request throughput (prefixed with thompson_ to avoid conflicts with
+ # serve_endpoint's built-in work handler metrics)
+ self.requests_total = endpoint.metrics.create_intcounter(
+ "thompson_requests_total",
+ "Total requests processed by the Thompson Sampling processor",
+ )
+
+ # Latency histogram (uses default Prometheus buckets since Python binding
+ # doesn't expose custom bucket configuration in Dynamo 0.7.1)
+ self.request_latency_seconds = endpoint.metrics.create_histogram(
+ "thompson_request_latency_seconds",
+ "End-to-end request latency in seconds",
+ )
+
+ # Token throughput
+ self.tokens_in_total = endpoint.metrics.create_intcounter(
+ "thompson_tokens_in_total",
+ "Total input tokens processed",
+ )
+ self.tokens_out_total = endpoint.metrics.create_intcounter(
+ "thompson_tokens_out_total",
+ "Total output tokens generated",
+ )
+
+ # Routing decisions by worker (for analyzing load distribution)
+ self.routing_decisions_total = endpoint.metrics.create_intcountervec(
+ "thompson_routing_decisions_total",
+ "Routing decisions by worker",
+ ["worker_id"],
+ )
+
+ # Error tracking
+ self.router_errors_total = endpoint.metrics.create_intcounter(
+ "thompson_router_errors_total",
+ "Router communication errors (failed to pick worker)",
+ )
+ self.engine_errors_total = endpoint.metrics.create_intcounter(
+ "thompson_engine_errors_total",
+ "Backend engine errors (failed during streaming)",
+ )
+
+ # Active request gauge
+ self.active_requests = endpoint.metrics.create_intgauge(
+ "thompson_active_requests",
+ "Currently active requests being processed",
+ )
+
+ # -----------------------------------------------------------------
+ # KV Cache Efficiency (KVE) metrics
+ # These track cache hit rates for analyzing routing effectiveness.
+ # Efficiency = kve_cached_tokens_total / kve_prompt_tokens_total
+ # -----------------------------------------------------------------
+ self.kve_prompt_tokens_total = endpoint.metrics.create_intcounter(
+ "thompson_kve_prompt_tokens_total",
+ "Total prompt tokens processed (KV efficiency denominator)",
+ )
+ self.kve_cached_tokens_total = endpoint.metrics.create_intcounter(
+ "thompson_kve_cached_tokens_total",
+ "Total cached tokens hit (KV efficiency numerator)",
+ )
+
+ # Cache hit breakdown by memory tier (for analyzing cache hierarchy)
+ self.kve_device_blocks_total = endpoint.metrics.create_intcounter(
+ "thompson_kve_device_blocks_total",
+ "KV cache blocks hit from device (GPU) memory",
+ )
+ self.kve_host_blocks_total = endpoint.metrics.create_intcounter(
+ "thompson_kve_host_blocks_total",
+ "KV cache blocks hit from host (CPU) memory",
+ )
+ self.kve_disk_blocks_total = endpoint.metrics.create_intcounter(
+ "thompson_kve_disk_blocks_total",
+ "KV cache blocks hit from disk storage",
+ )
+
+ logger.info("Processor metrics initialized via Dynamo metrics API")
+
+
+# -------------------------- processor handler -------------------------- #
+class ProcessorRequestHandler:
+ """
+ Processor that receives PreprocessedRequest from the default Dynamo frontend,
+ extracts routing hints from nvext annotations, and coordinates with the
+ Thompson Sampling router for intelligent worker selection.
+ """
+
+ def __init__(
+ self,
+ runtime: DistributedRuntime,
+ endpoint,
+ enable_router: bool = True,
+ ):
+ """
+ Initialize the processor request handler.
+
+ Args:
+ runtime: Dynamo distributed runtime for client connections.
+ endpoint: Dynamo endpoint for metrics registration.
+ enable_router: Whether to use Thompson Sampling router (default: True).
+ """
+ self.runtime = runtime
+ self.endpoint = endpoint
+ self.enable_router = enable_router
+
+ # Client connections (initialized in initialize())
+ self.router_pick_client = None
+ self.router_feedback_client = None
+ self.engine_client = None
+
+ # Prefix-level state: {prefix_id: {"total": int, "processed": int}}
+ self._prefix_state: dict[str, dict[str, int]] = {}
+ self._prefix_lock = asyncio.Lock()
+
+ # Metrics (initialized in initialize())
+ self._metrics: ProcessorMetrics | None = None
+
+ async def initialize(self):
+ """Initialize processor by setting up metrics and connecting to services."""
+ # Initialize metrics using Dynamo's metrics API
+ self._metrics = ProcessorMetrics(self.endpoint)
+
+ # Connect to Thompson Sampling router
+ if self.enable_router:
+ router_component = self.runtime.namespace("dynamo").component("router")
+ self.router_pick_client = await router_component.endpoint("find_worker").client()
+ self.router_feedback_client = await router_component.endpoint("feedback").client()
+ logger.info("Router clients created, waiting for instances...")
+ await self.router_pick_client.wait_for_instances()
+ logger.info("Router clients initialized successfully")
+
+ # Connect to actual workers at workers.{component}.generate
+ # Workers are in the "workers" namespace (hidden from frontend discovery)
+ # while this processor is in "dynamo" namespace (frontend discovers us)
+ # Component name varies by backend (REQUIRED - no default):
+ # - SGLang: uses "worker" (set via --endpoint workers.worker.generate)
+ # - vLLM: uses "backend" (hardcoded in dynamo.vllm)
+ worker_component_name = os.environ.get("DYNAMO_WORKER_COMPONENT")
+ if not worker_component_name:
+ raise ValueError("DYNAMO_WORKER_COMPONENT environment variable is required. "
+ "Set to 'worker' for SGLang or 'backend' for vLLM.")
+ worker_component = self.runtime.namespace("workers").component(worker_component_name)
+ self.engine_client = await worker_component.endpoint("generate").client()
+ logger.info("Engine client created for workers/%s/generate, waiting for worker instances...",
+ worker_component_name)
+ await self.engine_client.wait_for_instances()
+ logger.info("Processor initialized successfully (routing to workers/%s/generate)", worker_component_name)
+
+ # ---- annotation extraction ----
+ @staticmethod
+ def _extract_annotation(annotations: list[str], key: str, default: str | None = None) -> str | None:
+ """Extract value from annotations list (format: 'key:value')."""
+ prefix = f"{key}:"
+ for ann in annotations:
+ if ann.startswith(prefix):
+ return ann[len(prefix):]
+ return default
+
+ def _extract_hints(self, request: dict[str, Any]) -> tuple[str, int, str, str]:
+ """
+ Extract routing hints from PreprocessedRequest annotations.
+
+ Returns: (prefix_id, total_requests, osl, iat)
+ """
+ annotations = request.get("annotations", [])
+ if not isinstance(annotations, list):
+ annotations = []
+
+ # Extract prefix_id (generate one if not provided)
+ prefix_id = self._extract_annotation(annotations, "prefix_id")
+ if not prefix_id:
+ prefix_id = f"auto-{uuid.uuid4().hex}"
+
+ # Extract total_requests count
+ total_str = self._extract_annotation(annotations, "total_requests", "1")
+ try:
+ total_requests = max(1, int(total_str))
+ except (ValueError, TypeError):
+ total_requests = 1
+
+ # Extract expected output sequence length category
+ osl = self._extract_annotation(annotations, "osl", "MEDIUM")
+ osl = osl.upper() if osl else "MEDIUM"
+ if osl not in ("LOW", "MEDIUM", "HIGH"):
+ osl = "MEDIUM"
+
+ # Extract interarrival time category
+ iat = self._extract_annotation(annotations, "iat", "MEDIUM")
+ iat = iat.upper() if iat else "MEDIUM"
+ if iat not in ("LOW", "MEDIUM", "HIGH"):
+ iat = "MEDIUM"
+
+ return prefix_id, total_requests, osl, iat
+
+ async def _update_prefix_state(self, prefix_id: str, total_requests: int) -> int:
+ """
+ Update prefix counters and return remaining_after (reuse_budget).
+
+ This tracks how many requests remain for a given prefix, allowing the
+ router to make informed decisions about KV cache placement.
+ """
+ async with self._prefix_lock:
+ state = self._prefix_state.get(prefix_id)
+ if state is None:
+ state = {"total": total_requests, "processed": 0}
+ self._prefix_state[prefix_id] = state
+ else:
+ # Update total if a higher count is reported
+ state["total"] = max(state["total"], total_requests)
+
+ state["processed"] += 1
+ remaining_after = max(state["total"] - state["processed"], 0)
+
+ # Clean up completed prefixes immediately
+ if remaining_after == 0:
+ self._prefix_state.pop(prefix_id, None)
+
+ return remaining_after
+
+ async def _pick_worker(
+ self,
+ token_ids: list[int],
+ prefix_id: str,
+ reuse_budget: int,
+ osl: str,
+ iat: str,
+ ) -> tuple[int | None, str | None]:
+ """
+ Pick a worker via the Thompson Sampling router.
+
+ Returns: (worker_id, decision_id) or (None, None) if routing fails.
+ """
+ if not self.router_pick_client:
+ return None, None
+
+ req = RouterRequest(
+ tokens=token_ids,
+ prefix_id=prefix_id,
+ reuse_budget=max(int(reuse_budget), 0),
+ expected_osl=osl,
+ interarrival=iat,
+ )
+
+ try:
+ stream = await self.router_pick_client.generate(req.model_dump())
+
+ worker_id: int | None = None
+ decision_id: str | None = None
+
+ async for chunk in stream:
+ data = chunk.data()
+ if "error" in data:
+ logger.error("Router error: %s", data["error"])
+ self._metrics.router_errors_total.inc()
+ break
+
+ wid = data.get("worker_id", -1)
+ if wid == -1:
+ break
+
+ worker_id = int(wid)
+ decision_id = data.get("decision_id")
+ break
+
+ # Record routing decision
+ if worker_id is not None:
+ self._metrics.routing_decisions_total.inc({"worker_id": str(worker_id)})
+ else:
+ logger.warning("Router stream ended without worker_id; falling back to engine load balancing.")
+
+ return worker_id, decision_id
+
+ except Exception as e:
+ logger.error("Failed to pick worker: %s", e)
+ self._metrics.router_errors_total.inc()
+ return None, None
+
+ async def _send_feedback_safely(
+ self,
+ decision_id: str | None,
+ latency_ms: float,
+ success: bool,
+ tokens_in: int,
+ tokens_out: int,
+ finish_reason: str | None,
+ ):
+ """
+ Send feedback to router (fire-and-forget style).
+
+ This feedback is used by the Thompson Sampling algorithm to update
+ its model of worker performance.
+ """
+ if not decision_id or not self.router_feedback_client:
+ return
+
+ try:
+ feedback = RouterFeedbackRequest(
+ decision_id=decision_id,
+ latency_ms=float(latency_ms),
+ success=bool(success),
+ tokens_in=int(tokens_in),
+ tokens_out=int(tokens_out),
+ finish_reason=finish_reason or "",
+ )
+ stream = await self.router_feedback_client.generate(feedback.model_dump())
+ async for _ in stream:
+ pass
+ except Exception:
+ logger.exception("Failed to send router feedback")
+
+ def _update_kve_metrics_sync(self, kve: KVEfficiencyData) -> None:
+ """
+ Update KV cache efficiency metrics (synchronous, called from background task).
+
+ This is intentionally synchronous - counter increments are atomic and
+ extremely fast (microseconds). The async wrapper exists only to allow
+ fire-and-forget scheduling via create_task().
+ """
+ if not kve.has_data():
+ return
+
+ # Update counters - these are atomic operations
+ self._metrics.kve_prompt_tokens_total.inc_by(kve.prompt_tokens)
+ self._metrics.kve_cached_tokens_total.inc_by(kve.cached_tokens)
+ self._metrics.kve_device_blocks_total.inc_by(kve.device_blocks)
+ self._metrics.kve_host_blocks_total.inc_by(kve.host_blocks)
+ self._metrics.kve_disk_blocks_total.inc_by(kve.disk_blocks)
+
+ # Log efficiency for debugging (only if we have meaningful data)
+ if kve.prompt_tokens > 0:
+ efficiency = kve.cached_tokens / kve.prompt_tokens * 100
+ logger.debug(
+ "KVE update: prompt=%d cached=%d eff=%.1f%% (dev=%d host=%d disk=%d)",
+ kve.prompt_tokens,
+ kve.cached_tokens,
+ efficiency,
+ kve.device_blocks,
+ kve.host_blocks,
+ kve.disk_blocks,
+ )
+
+ async def _update_kve_metrics_async(self, kve: KVEfficiencyData) -> None:
+ """
+ Async wrapper for KVE metric updates (fire-and-forget via create_task).
+
+ This allows the main streaming path to continue without waiting for
+ metric updates, ensuring zero impact on routing throughput.
+ """
+ try:
+ self._update_kve_metrics_sync(kve)
+ except Exception:
+ # Never let metric updates crash the system
+ logger.exception("Failed to update KVE metrics")
+
+ async def _stream_from_engine(
+ self,
+ request: dict[str, Any],
+ worker_id: int | None,
+ decision_id: str | None,
+ tokens_in: int,
+ ) -> AsyncIterator[dict[str, Any]]:
+ """
+ Stream response from the backend engine.
+
+ Yields response chunks and sends feedback to the router on completion.
+ Also updates Prometheus metrics for latency and token throughput.
+
+ KV cache efficiency (KVE) metrics are updated asynchronously via
+ create_task() to ensure zero impact on routing throughput.
+ """
+ t0 = time.perf_counter()
+ tokens_out = 0
+ finish_reason: str | None = None
+ kve_data: KVEfficiencyData | None = None # Collected from response
+
+ try:
+ # Route to specific worker or use engine's load balancing
+ if worker_id is not None:
+ stream = await self.engine_client.direct(request, worker_id)
+ else:
+ stream = await self.engine_client.generate(request)
+
+ async for chunk in stream:
+ data = chunk.data()
+
+ # Handle engine errors
+ if "error" in data:
+ latency_ms = (time.perf_counter() - t0) * 1000.0
+ await self._send_feedback_safely(decision_id, latency_ms, False, tokens_in, tokens_out, "error")
+ self._metrics.engine_errors_total.inc()
+ yield {"error": data["error"]}
+ return
+
+ # Count output tokens
+ if "token_ids" in data and isinstance(data["token_ids"], list):
+ tokens_out += len(data["token_ids"])
+
+ # Extract KVE data if present (typically in final chunk or usage chunk)
+ # We check for 'usage' field which contains cache efficiency info
+ if "usage" in data or "nvext" in data:
+ extracted = KVEfficiencyData.from_response(data)
+ if extracted.has_data():
+ kve_data = extracted
+
+ # Pass through the chunk
+ yield data
+
+ # Handle completion
+ if "finish_reason" in data and data["finish_reason"] is not None:
+ finish_reason = data["finish_reason"]
+ latency_seconds = time.perf_counter() - t0
+ latency_ms = latency_seconds * 1000.0
+
+ # Send feedback to router (this is already fire-and-forget)
+ await self._send_feedback_safely(decision_id,
+ latency_ms,
+ True,
+ tokens_in,
+ tokens_out,
+ finish_reason)
+
+ # Update core Prometheus metrics (fast atomic operations)
+ self._metrics.request_latency_seconds.observe(latency_seconds)
+ self._metrics.tokens_in_total.inc_by(tokens_in)
+ self._metrics.tokens_out_total.inc_by(tokens_out)
+
+ # Fire-and-forget KVE metric update (async, non-blocking)
+ # This ensures KVE computation has ZERO impact on routing throughput
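+            # Note: the Task reference is not retained; the event loop keeps
+            # only a weak reference, so a long-running service may want to
+            # hold one to guard against premature garbage collection.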
+ if kve_data is not None:
+ asyncio.create_task(self._update_kve_metrics_async(kve_data))
+
+ return
+
+ except Exception as e:
+ latency_ms = (time.perf_counter() - t0) * 1000.0
+ await self._send_feedback_safely(decision_id, latency_ms, False, tokens_in, tokens_out, "exception")
+ self._metrics.engine_errors_total.inc()
+ logger.exception("Engine stream exception")
+ yield {"error": str(e)}
+ return
+
+ # ---- main generation endpoint ----
+ async def generate(self, raw: dict[str, Any]):
+ """
+ Processor endpoint: receives PreprocessedRequest from frontend.
+
+ Expected format (from Dynamo preprocessor):
+ {
+ "token_ids": [...],
+ "annotations": ["prefix_id:xyz", "total_requests:10", ...],
+ "sampling_options": {...},
+ "stop_conditions": {...},
+ ...
+ }
+ """
+ # Track active requests
+ self._metrics.active_requests.inc()
+
+ try:
+ # Increment request counter
+ self._metrics.requests_total.inc()
+
+ # Extract routing hints from annotations
+ prefix_id, total_requests, osl, iat = self._extract_hints(raw)
+
+ # Get token IDs from preprocessed request
+ token_ids = raw.get("token_ids", [])
+ if not isinstance(token_ids, list):
+ token_ids = []
+
+ tokens_in = len(token_ids)
+ logger.info(
+ "Processing request: prefix=%s total=%d osl=%s iat=%s tokens=%d",
+ prefix_id,
+ total_requests,
+ osl,
+ iat,
+ tokens_in,
+ )
+
+ # Compute reuse_budget := remaining AFTER this request
+ reuse_budget = await self._update_prefix_state(prefix_id, total_requests)
+
+ # Pick worker via Thompson Sampling router
+ worker_id, decision_id = await self._pick_worker(token_ids, prefix_id, reuse_budget, osl, iat)
+
+ logger.info(
+ "Routing decision: worker=%s decision=%s reuse_budget=%d",
+ worker_id,
+ decision_id,
+ reuse_budget,
+ )
+
+ # Stream response from engine
+ async for resp in self._stream_from_engine(raw, worker_id, decision_id, tokens_in):
+ yield resp
+
+ finally:
+ self._metrics.active_requests.dec()
+
+
+# -------------------------- worker entry point -------------------------- #
+def parse_args():
+ """Parse command-line arguments for the processor."""
+ parser = argparse.ArgumentParser(description="Optimized Thompson Sampling Processor")
+ parser.add_argument(
+ "--enable-router",
+ action="store_true",
+ default=True,
+ help="Enable Thompson Sampling router integration",
+ )
+ parser.add_argument(
+ "--no-router",
+ action="store_false",
+ dest="enable_router",
+ help="Disable router (use engine load balancing only)",
+ )
+ parser.add_argument(
+ "--model-path",
+ type=str,
+ required=True,
+ help="Path to the model directory (for loading tokenizer and model card)",
+ )
+ parser.add_argument(
+ "--model-name",
+ type=str,
+ required=True,
+ help="Served model name (must match frontend's --model-name)",
+ )
+ return parser.parse_args()
+
+
+@dynamo_worker(static=False) # Dynamic mode - required to call router/workers which are also dynamic
+async def worker(runtime: DistributedRuntime):
+ """
+ Main worker entry point for the Thompson Sampling processor.
+
+ This processor registers as a backend that the frontend can discover via ETCD,
+ then forwards requests to actual workers after applying Thompson Sampling routing.
+ """
+ args = parse_args()
+
+ # DYNAMIC DISCOVERY MODE:
+ # Instead of using --static-endpoint on the frontend, we register a model card
+ # in ETCD so the frontend can discover us via its ModelWatcher.
+ #
+ # This is the forward-compatible approach since --static-endpoint is deprecated.
+ #
+ # Flow:
+ # 1. We register as dynamo.backend.generate (dynamically with instance ID)
+ # 2. We call register_llm() to advertise ourselves in ETCD
+ # 3. Frontend's ModelWatcher discovers us and routes requests to us
+ # 4. We forward to actual workers at workers.worker.generate
+
+ component = runtime.namespace("dynamo").component("backend")
+ await component.create_service()
+
+ # Create the endpoint FIRST (needed for register_llm and metrics)
+ endpoint = component.endpoint("generate")
+
+ # Register the model card with ETCD so the frontend can discover us
+ # We accept preprocessed tokens (ModelInput.Tokens) and serve chat/completions
+ logger.info(
+ "Registering model card: model_name=%s, model_path=%s",
+ args.model_name,
+ args.model_path,
+ )
+ # IMPORTANT: kv_cache_block_size must match what workers use (default page_size=1)
+ # Otherwise checksums will differ and frontend will reject the processor's model card
+ await register_llm(
+ model_input=ModelInput.Tokens, # We accept tokenized input from frontend
+ model_type=ModelType.Chat | ModelType.Completions, # Chat and completions endpoints
+ endpoint=endpoint,
+ model_path=args.model_path,
+ model_name=args.model_name,
+ kv_cache_block_size=1, # Must match worker page_size to ensure same checksum
+ )
+ logger.info("Model card registered successfully - frontend can now discover us via ETCD")
+
+ # Initialize the request handler with the endpoint for metrics
+ handler = ProcessorRequestHandler(
+ runtime=runtime,
+ endpoint=endpoint,
+ enable_router=args.enable_router,
+ )
+ await handler.initialize()
+
+ # Serve as "backend.generate" - frontend will route to us after ETCD discovery
+ await endpoint.serve_endpoint(handler.generate)
+
+
+if __name__ == "__main__":
+ uvloop.install()
+ asyncio.run(worker()) # pylint: disable=no-value-for-parameter
diff --git a/external/dynamo/optimized/processor_multilru.py b/external/dynamo/optimized/processor_multilru.py
new file mode 100644
index 0000000000..e3a0c7f412
--- /dev/null
+++ b/external/dynamo/optimized/processor_multilru.py
@@ -0,0 +1,833 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Optimized Processor for Thompson Sampling Router Architecture.
+
+This processor uses the "Processor-as-Backend" pattern with DYNAMIC DISCOVERY
+to intercept requests from the default Dynamo frontend and apply custom Thompson
+Sampling routing.
+
+## Dynamic Discovery Mode (Forward-Compatible)
+
+Instead of using the deprecated `--static-endpoint` flag on the frontend, this
+processor registers a model card in ETCD so the frontend can discover it via
+its ModelWatcher. This is the forward-compatible approach.
+
+### Requirements:
+- Processor must be started with `--model-path` and `--model-name` arguments
+- Model path must point to a valid model directory with tokenizer files
+- Model name must match what the frontend expects (e.g., "llama-3.3-70b")
+
+### Endpoint Registration Pattern
+
+1. **This Processor registers as `dynamo.backend.generate`** - Dynamically with instance ID
+2. **Processor calls `register_llm()`** - Advertises model card in ETCD
+3. **Frontend's ModelWatcher discovers us** - Routes requests to our endpoint
+4. **SGLang Worker registers as `workers.worker.generate`** - We forward to actual workers
+
+## Request Flow
+
+```
+Frontend (discovers backends via ETCD ModelWatcher)
+ → routes to dynamo.backend.generate-{instance_id}
+ → THIS PROCESSOR (discovered via model card!)
+ → extracts hints from nvext annotations
+ → queries Thompson Sampling router → worker_id
+ → forwards to workers.worker.generate (actual SGLang workers)
+```
+
+Key differences from generalized/processor.py:
+- Uses dynamic discovery (no --static-endpoint on frontend)
+- Registers model card via register_llm() for ETCD discovery
+- Registers as `dynamo.backend.generate` (not `dynamo.processor.process`)
+- Forwards to `workers.worker.generate` (workers in separate namespace)
+- Receives PreprocessedRequest instead of ChatCompletionRequest
+- Extracts hints from nvext annotations (prefix_id:value format)
+- Uses Dynamo metrics API for Prometheus integration (auto-exposed at /metrics)
+- No tokenization (handled by frontend preprocessor)
+
+## Metrics
+
+All metrics are exposed via Dynamo's `/metrics` endpoint (requires DYN_SYSTEM_PORT).
+Metrics use the `dynamo_component_` prefix and include standard Dynamo labels:
+- `dynamo_namespace`, `dynamo_component`, `dynamo_endpoint`
+
+Custom metrics for Thompson Sampling routing:
+- `requests_total` - Total requests processed
+- `request_latency_seconds` - End-to-end request latency histogram
+- `tokens_in_total` / `tokens_out_total` - Token throughput counters
+- `routing_decisions_total` - Per-worker routing decision counter
+- `router_errors_total` / `engine_errors_total` - Error counters
+- `active_requests` - Current in-flight request gauge
+
+KV Cache Efficiency (KVE) metrics:
+- `kve_prompt_tokens_total` - Total prompt tokens (efficiency denominator)
+- `kve_cached_tokens_total` - Total cached tokens hit (efficiency numerator)
+- `kve_device_blocks_total` - Cache hits from device (GPU) memory
+- `kve_host_blocks_total` - Cache hits from host (CPU) memory
+- `kve_disk_blocks_total` - Cache hits from disk
+
+## Grafana Integration
+
+Metrics are exposed at `/metrics` in Prometheus format. Enable with:
+    DYN_SYSTEM_PORT=8081 python processor_multilru.py --model-path ... --model-name ...
+
+Full metric names include the `dynamo_component_` prefix:
+ dynamo_component_requests_total{dynamo_namespace="dynamo",dynamo_component="backend",dynamo_endpoint="generate"}
+
+Example PromQL queries for Grafana dashboards:
+ # KV Cache Efficiency (%)
+ rate(dynamo_component_kve_cached_tokens_total[5m]) / rate(dynamo_component_kve_prompt_tokens_total[5m]) * 100
+
+ # Request latency p99
+ histogram_quantile(0.99, rate(dynamo_component_request_latency_seconds_bucket[5m]))
+
+## Data Source Requirements
+
+KVE metrics require the underlying engine to return cache efficiency data:
+- `usage.prompt_tokens_details.cached_tokens` - Standard OpenAI field (should work with prefix caching enabled)
+- `nvext.cache_hit_breakdown` - Engine-specific extension (NOT standard Dynamo NvExt)
+"""
+
+import argparse
+import asyncio
+import logging
+import os
+import time
+import uuid
+from collections.abc import AsyncIterator
+from typing import Any
+
+import uvloop
+from dynamo.llm import ModelInput
+from dynamo.llm import ModelType
+from dynamo.llm import register_llm
+from dynamo.runtime import DistributedRuntime
+from dynamo.runtime import dynamo_worker
+from dynamo.runtime.logging import configure_dynamo_logging
+from pydantic import BaseModel
+
+configure_dynamo_logging()
+logger = logging.getLogger(__name__)
+
+
+# ----------------------- request / response models ----------------------- #
+class RouterRequest(BaseModel):
+ """Request to the Thompson Sampling router."""
+
+ tokens: list[int]
+ prefix_id: str = ""
+ reuse_budget: int = 0 # remaining *after this request*
+ expected_osl: str | None = "MEDIUM"
+ interarrival: str | None = "MEDIUM"
+
+
+class RouterFeedbackRequest(BaseModel):
+ """Feedback to the router after request completion."""
+
+ decision_id: str
+ latency_ms: float
+ success: bool | None = True
+ tokens_in: int | None = None
+ tokens_out: int | None = None
+ finish_reason: str | None = None
+
+
+# ----------------------- KV efficiency data ----------------------- #
+class KVEfficiencyData:
+ """
+ Container for KV cache efficiency data extracted from worker responses.
+
+ This data is used to compute and publish KVE metrics asynchronously,
+ ensuring zero impact on routing throughput.
+ """
+
+ __slots__ = ("prompt_tokens", "cached_tokens", "device_blocks", "host_blocks", "disk_blocks")
+
+ def __init__(self):
+ self.prompt_tokens: int = 0
+ self.cached_tokens: int = 0
+ self.device_blocks: int = 0
+ self.host_blocks: int = 0
+ self.disk_blocks: int = 0
+
+ def has_data(self) -> bool:
+ """Check if any KVE data was collected."""
+ return self.prompt_tokens > 0
+
+ @classmethod
+ def from_response(cls, data: dict[str, Any]) -> "KVEfficiencyData":
+ """
+ Extract KVE data from a worker response chunk.
+
+ Expected fields in response (OpenAI-compatible):
+ - usage.prompt_tokens: Total prompt tokens
+ - usage.prompt_tokens_details.cached_tokens: Cached token count
+
+ Optional engine-specific fields (may not be present):
+ - nvext.cache_hit_breakdown.{device,host,disk}_blocks: Per-tier hits
+
+ Note: cache_hit_breakdown is NOT a standard Dynamo NvExt field.
+ It must be enabled/configured in the underlying engine (vLLM/SGLang).
+ """
+ kve = cls()
+
+ # Extract from usage field (OpenAI-compatible, should always work)
+ usage = data.get("usage")
+ if isinstance(usage, dict):
+ kve.prompt_tokens = usage.get("prompt_tokens", 0) or 0
+ prompt_details = usage.get("prompt_tokens_details")
+ if isinstance(prompt_details, dict):
+ kve.cached_tokens = prompt_details.get("cached_tokens", 0) or 0
+
+ # Extract cache breakdown from nvext (engine-specific, may not be present)
+ # This is NOT a standard Dynamo NvExt field - requires engine configuration
+ nvext = data.get("nvext")
+ if isinstance(nvext, dict):
+ breakdown = nvext.get("cache_hit_breakdown")
+ if isinstance(breakdown, dict):
+ kve.device_blocks = breakdown.get("device_blocks", 0) or 0
+ kve.host_blocks = breakdown.get("host_blocks", 0) or 0
+ kve.disk_blocks = breakdown.get("disk_blocks", 0) or 0
+
+ return kve
+
+
+# ----------------------- metrics dataclass ----------------------- #
+class ProcessorMetrics:
+ """
+ Container for Thompson Sampling processor metrics.
+
+ Uses prometheus_client directly (compatible with ryan/kvbm-next branch).
+    Metrics are registered with Dynamo's endpoint via register_engine_metrics_callback.
+ """
+
+ def __init__(self, endpoint):
+ """
+ Initialize metrics using prometheus_client directly.
+
+ Args:
+ endpoint: Dynamo endpoint object for registering metrics callback.
+ """
+ from prometheus_client import REGISTRY
+ from prometheus_client import Counter
+ from prometheus_client import Gauge
+ from prometheus_client import Histogram
+
+ # Request throughput (prefixed with thompson_ to avoid conflicts with
+ # serve_endpoint's built-in work handler metrics)
+ self.requests_total = Counter(
+ "thompson_requests_total",
+ "Total requests processed by the Thompson Sampling processor",
+ )
+
+ # Latency histogram
+ self.request_latency_seconds = Histogram(
+ "thompson_request_latency_seconds",
+ "End-to-end request latency in seconds",
+ )
+
+ # Token throughput
+ self.tokens_in_total = Counter(
+ "thompson_tokens_in_total",
+ "Total input tokens processed",
+ )
+ self.tokens_out_total = Counter(
+ "thompson_tokens_out_total",
+ "Total output tokens generated",
+ )
+
+ # Routing decisions by worker (for analyzing load distribution)
+ self.routing_decisions_total = Counter(
+ "thompson_routing_decisions_total",
+ "Routing decisions by worker",
+ ["worker_id"],
+ )
+
+ # Error tracking
+ self.router_errors_total = Counter(
+ "thompson_router_errors_total",
+ "Router communication errors (failed to pick worker)",
+ )
+ self.engine_errors_total = Counter(
+ "thompson_engine_errors_total",
+ "Backend engine errors (failed during streaming)",
+ )
+
+ # Active request gauge
+ self.active_requests = Gauge(
+ "thompson_active_requests",
+ "Currently active requests being processed",
+ )
+
+ # -----------------------------------------------------------------
+ # KV Cache Efficiency (KVE) metrics
+ # These track cache hit rates for analyzing routing effectiveness.
+ # Efficiency = kve_cached_tokens_total / kve_prompt_tokens_total
+ # -----------------------------------------------------------------
+ self.kve_prompt_tokens_total = Counter(
+ "thompson_kve_prompt_tokens_total",
+ "Total prompt tokens processed (KV efficiency denominator)",
+ )
+ self.kve_cached_tokens_total = Counter(
+ "thompson_kve_cached_tokens_total",
+ "Total cached tokens hit (KV efficiency numerator)",
+ )
+
+ # Cache hit breakdown by memory tier (for analyzing cache hierarchy)
+ self.kve_device_blocks_total = Counter(
+ "thompson_kve_device_blocks_total",
+ "KV cache blocks hit from device (GPU) memory",
+ )
+ self.kve_host_blocks_total = Counter(
+ "thompson_kve_host_blocks_total",
+ "KV cache blocks hit from host (CPU) memory",
+ )
+ self.kve_disk_blocks_total = Counter(
+ "thompson_kve_disk_blocks_total",
+ "KV cache blocks hit from disk storage",
+ )
+
+ # Register metrics with Dynamo's endpoint for /metrics exposure
+ from dynamo.common.utils.prometheus import register_engine_metrics_callback
+ register_engine_metrics_callback(endpoint, REGISTRY, metric_prefix_filters=["thompson_"])
+
+ logger.info("Processor metrics initialized via prometheus_client")
+
+
+# -------------------------- processor handler -------------------------- #
+class ProcessorRequestHandler:
+ """
+ Processor that receives PreprocessedRequest from the default Dynamo frontend,
+ extracts routing hints from nvext annotations, and coordinates with the
+ Thompson Sampling router for intelligent worker selection.
+ """
+
+ def __init__(
+ self,
+ runtime: DistributedRuntime,
+ endpoint,
+ enable_router: bool = True,
+ ):
+ """
+ Initialize the processor request handler.
+
+ Args:
+ runtime: Dynamo distributed runtime for client connections.
+ endpoint: Dynamo endpoint for metrics registration.
+ enable_router: Whether to use Thompson Sampling router (default: True).
+ """
+ self.runtime = runtime
+ self.endpoint = endpoint
+ self.enable_router = enable_router
+
+ # Client connections (initialized in initialize())
+ self.router_pick_client = None
+ self.router_feedback_client = None
+ self.engine_client = None
+
+ # Prefix-level state: {prefix_id: {"total": int, "processed": int}}
+ self._prefix_state: dict[str, dict[str, int]] = {}
+ self._prefix_lock = asyncio.Lock()
+
+ # Metrics (initialized in initialize())
+ self._metrics: ProcessorMetrics | None = None
+
+ async def initialize(self):
+ """Initialize processor by setting up metrics and connecting to services."""
+ # Initialize metrics using Dynamo's metrics API
+ self._metrics = ProcessorMetrics(self.endpoint)
+
+ # Connect to Thompson Sampling router
+ if self.enable_router:
+ router_component = self.runtime.namespace("dynamo").component("router")
+ self.router_pick_client = await router_component.endpoint("find_worker").client()
+ self.router_feedback_client = await router_component.endpoint("feedback").client()
+ logger.info("Router clients created, waiting for instances...")
+ await self.router_pick_client.wait_for_instances()
+ logger.info("Router clients initialized successfully")
+
+ # Connect to actual workers at workers.{component}.generate
+ # Workers are in the "workers" namespace (hidden from frontend discovery)
+ # while this processor is in "dynamo" namespace (frontend discovers us)
+ # Component name varies by backend (REQUIRED - no default):
+ # - SGLang: uses "worker" (set via --endpoint workers.worker.generate)
+ # - vLLM: uses "backend" (hardcoded in dynamo.vllm)
+ worker_component_name = os.environ.get("DYNAMO_WORKER_COMPONENT")
+ if not worker_component_name:
+ raise ValueError("DYNAMO_WORKER_COMPONENT environment variable is required. "
+ "Set to 'worker' for SGLang or 'backend' for vLLM.")
+ worker_component = self.runtime.namespace("workers").component(worker_component_name)
+ self.engine_client = await worker_component.endpoint("generate").client()
+ logger.info("Engine client created for workers/%s/generate, waiting for worker instances...",
+ worker_component_name)
+ await self.engine_client.wait_for_instances()
+ logger.info("Processor initialized successfully (routing to workers/%s/generate)", worker_component_name)
+
+ # ---- annotation extraction ----
+ @staticmethod
+ def _extract_annotation(annotations: list[str], key: str, default: str | None = None) -> str | None:
+ """Extract value from annotations list (format: 'key:value')."""
+ prefix = f"{key}:"
+ for ann in annotations:
+ if ann.startswith(prefix):
+ return ann[len(prefix):]
+ return default
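+
+    # Example: _extract_annotation(["prefix_id:abc", "osl:HIGH"], "osl") -> "HIGH";
+    # a missing key returns the default:
+    # _extract_annotation(["prefix_id:abc"], "iat", "MEDIUM") -> "MEDIUM".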
+
+    def _extract_hints(self, request: dict[str, Any]) -> tuple[str, int, str, str, bool]:
+ """
+ Extract routing hints from PreprocessedRequest annotations.
+
+ Returns: (prefix_id, total_requests, osl, iat, use_frequency_backend)
+ """
+ annotations = request.get("annotations", [])
+ if not isinstance(annotations, list):
+ annotations = []
+
+ # Extract prefix_id (generate one if not provided)
+ prefix_id = self._extract_annotation(annotations, "prefix_id")
+ if not prefix_id:
+ prefix_id = f"auto-{uuid.uuid4().hex}"
+
+ # Extract total_requests count
+ total_str = self._extract_annotation(annotations, "total_requests", "1")
+ try:
+ total_requests = max(1, int(total_str))
+ except (ValueError, TypeError):
+ total_requests = 1
+
+ # Extract expected output sequence length category
+ osl = self._extract_annotation(annotations, "osl", "MEDIUM")
+ osl = osl.upper() if osl else "MEDIUM"
+ if osl not in ("LOW", "MEDIUM", "HIGH"):
+ osl = "MEDIUM"
+
+ # Extract interarrival time category
+ iat = self._extract_annotation(annotations, "iat", "MEDIUM")
+ iat = iat.upper() if iat else "MEDIUM"
+ if iat not in ("LOW", "MEDIUM", "HIGH"):
+ iat = "MEDIUM"
+
+ # Extract backend selection (determines v1 vs v2 routing)
+ backend_selector = self._extract_annotation(annotations, "backend")
+        use_frequency_backend = backend_selector == "frequency_multi_lru"
+
+ return prefix_id, total_requests, osl, iat, use_frequency_backend
+
+ async def _update_prefix_state(self, prefix_id: str, total_requests: int) -> int:
+ """
+ Update prefix counters and return remaining_after (reuse_budget).
+
+ This tracks how many requests remain for a given prefix, allowing the
+ router to make informed decisions about KV cache placement.
+ """
+ async with self._prefix_lock:
+ state = self._prefix_state.get(prefix_id)
+ if state is None:
+ state = {"total": total_requests, "processed": 0}
+ self._prefix_state[prefix_id] = state
+ else:
+ # Update total if a higher count is reported
+ state["total"] = max(state["total"], total_requests)
+
+ state["processed"] += 1
+ remaining_after = max(state["total"] - state["processed"], 0)
+
+ # Clean up completed prefixes immediately
+ if remaining_after == 0:
+ self._prefix_state.pop(prefix_id, None)
+
+ return remaining_after
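+
+    # Example: with total_requests=3 for one prefix, three successive calls
+    # return reuse_budget 2, 1, 0; the third call also evicts the prefix entry.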
+
+ async def _pick_worker(
+ self,
+ token_ids: list[int],
+ prefix_id: str,
+ reuse_budget: int,
+ osl: str,
+ iat: str,
+ ) -> tuple[int | None, str | None]:
+ """
+ Pick a worker via the Thompson Sampling router.
+
+ Returns: (worker_id, decision_id) or (None, None) if routing fails.
+ """
+ if not self.router_pick_client:
+ return None, None
+
+ req = RouterRequest(
+ tokens=token_ids,
+ prefix_id=prefix_id,
+ reuse_budget=max(int(reuse_budget), 0),
+ expected_osl=osl,
+ interarrival=iat,
+ )
+
+ try:
+ stream = await self.router_pick_client.generate(req.model_dump())
+
+ worker_id: int | None = None
+ decision_id: str | None = None
+
+ async for chunk in stream:
+ data = chunk.data()
+ if "error" in data:
+ logger.error("Router error: %s", data["error"])
+ self._metrics.router_errors_total.inc()
+ break
+
+ wid = data.get("worker_id", -1)
+ if wid == -1:
+ break
+
+ worker_id = int(wid)
+ decision_id = data.get("decision_id")
+ break
+
+ # Record routing decision
+ if worker_id is not None:
+ self._metrics.routing_decisions_total.labels(worker_id=str(worker_id)).inc()
+ else:
+ logger.warning("Router stream ended without worker_id; falling back to engine load balancing.")
+
+ return worker_id, decision_id
+
+ except Exception as e:
+ logger.error("Failed to pick worker: %s", e)
+ self._metrics.router_errors_total.inc()
+ return None, None
+
+ async def _send_feedback_safely(
+ self,
+ decision_id: str | None,
+ latency_ms: float,
+ success: bool,
+ tokens_in: int,
+ tokens_out: int,
+ finish_reason: str | None,
+ ):
+ """
+ Send feedback to router (fire-and-forget style).
+
+ This feedback is used by the Thompson Sampling algorithm to update
+ its model of worker performance.
+ """
+ if not decision_id or not self.router_feedback_client:
+ return
+
+ try:
+ feedback = RouterFeedbackRequest(
+ decision_id=decision_id,
+ latency_ms=float(latency_ms),
+ success=bool(success),
+ tokens_in=int(tokens_in),
+ tokens_out=int(tokens_out),
+ finish_reason=finish_reason or "",
+ )
+ stream = await self.router_feedback_client.generate(feedback.model_dump())
+ async for _ in stream:
+ pass
+ except Exception:
+ logger.exception("Failed to send router feedback")
+
+ def _update_kve_metrics_sync(self, kve: KVEfficiencyData) -> None:
+ """
+ Update KV cache efficiency metrics (synchronous, called from background task).
+
+ This is intentionally synchronous - counter increments are atomic and
+ extremely fast (microseconds). The async wrapper exists only to allow
+ fire-and-forget scheduling via create_task().
+ """
+ if not kve.has_data():
+ return
+
+ # Update counters - these are atomic operations
+ self._metrics.kve_prompt_tokens_total.inc(kve.prompt_tokens)
+ self._metrics.kve_cached_tokens_total.inc(kve.cached_tokens)
+ self._metrics.kve_device_blocks_total.inc(kve.device_blocks)
+ self._metrics.kve_host_blocks_total.inc(kve.host_blocks)
+ self._metrics.kve_disk_blocks_total.inc(kve.disk_blocks)
+
+ # Log efficiency for debugging (only if we have meaningful data)
+ if kve.prompt_tokens > 0:
+ efficiency = kve.cached_tokens / kve.prompt_tokens * 100
+ logger.debug(
+ "KVE update: prompt=%d cached=%d eff=%.1f%% (dev=%d host=%d disk=%d)",
+ kve.prompt_tokens,
+ kve.cached_tokens,
+ efficiency,
+ kve.device_blocks,
+ kve.host_blocks,
+ kve.disk_blocks,
+ )
+
+ async def _update_kve_metrics_async(self, kve: KVEfficiencyData) -> None:
+ """
+ Async wrapper for KVE metric updates (fire-and-forget via create_task).
+
+ This allows the main streaming path to continue without waiting for
+ metric updates, ensuring zero impact on routing throughput.
+ """
+ try:
+ self._update_kve_metrics_sync(kve)
+ except Exception:
+ # Never let metric updates crash the system
+ logger.exception("Failed to update KVE metrics")
+
+ async def _stream_from_engine(
+ self,
+ request: dict[str, Any],
+ worker_id: int | None,
+ decision_id: str | None,
+ tokens_in: int,
+ ) -> AsyncIterator[dict[str, Any]]:
+ """
+ Stream response from the backend engine.
+
+ Yields response chunks and sends feedback to the router on completion.
+ Also updates Prometheus metrics for latency and token throughput.
+
+ KV cache efficiency (KVE) metrics are updated asynchronously via
+ create_task() to ensure zero impact on routing throughput.
+ """
+ t0 = time.perf_counter()
+ tokens_out = 0
+ finish_reason: str | None = None
+ kve_data: KVEfficiencyData | None = None # Collected from response
+
+ try:
+ # Route to specific worker or use engine's load balancing
+ if worker_id is not None:
+ stream = await self.engine_client.direct(request, worker_id)
+ else:
+ stream = await self.engine_client.generate(request)
+
+ async for chunk in stream:
+ data = chunk.data()
+
+ # Handle engine errors
+ if "error" in data:
+ latency_ms = (time.perf_counter() - t0) * 1000.0
+ await self._send_feedback_safely(decision_id, latency_ms, False, tokens_in, tokens_out, "error")
+ self._metrics.engine_errors_total.inc()
+ yield {"error": data["error"]}
+ return
+
+ # Count output tokens
+ if "token_ids" in data and isinstance(data["token_ids"], list):
+ tokens_out += len(data["token_ids"])
+
+ # Extract KVE data if present (typically in final chunk or usage chunk)
+ # We check for 'usage' field which contains cache efficiency info
+ if "usage" in data or "nvext" in data:
+ extracted = KVEfficiencyData.from_response(data)
+ if extracted.has_data():
+ kve_data = extracted
+
+ # Pass through the chunk
+ yield data
+
+ # Handle completion
+ if "finish_reason" in data and data["finish_reason"] is not None:
+ finish_reason = data["finish_reason"]
+ latency_seconds = time.perf_counter() - t0
+ latency_ms = latency_seconds * 1000.0
+
+ # Send feedback to router (this is already fire-and-forget)
+ await self._send_feedback_safely(decision_id,
+ latency_ms,
+ True,
+ tokens_in,
+ tokens_out,
+ finish_reason)
+
+ # Update core Prometheus metrics (fast atomic operations)
+ self._metrics.request_latency_seconds.observe(latency_seconds)
+ self._metrics.tokens_in_total.inc(tokens_in)
+ self._metrics.tokens_out_total.inc(tokens_out)
+
+ # Fire-and-forget KVE metric update (async, non-blocking)
+ # This ensures KVE computation has ZERO impact on routing throughput
+ if kve_data is not None:
+ asyncio.create_task(self._update_kve_metrics_async(kve_data))
+
+ return
+
+ except Exception as e:
+ latency_ms = (time.perf_counter() - t0) * 1000.0
+ await self._send_feedback_safely(decision_id, latency_ms, False, tokens_in, tokens_out, "exception")
+ self._metrics.engine_errors_total.inc()
+ logger.exception("Engine stream exception")
+ yield {"error": str(e)}
+ return
+
+ # ---- main generation endpoint ----
+ async def generate(self, raw: dict[str, Any]):
+ """
+ Processor endpoint: receives PreprocessedRequest from frontend.
+
+ Expected format (from Dynamo preprocessor):
+ {
+ "token_ids": [...],
+ "annotations": ["prefix_id:xyz", "total_requests:10", ...],
+ "sampling_options": {...},
+ "stop_conditions": {...},
+ ...
+ }
+ """
+ # Track active requests
+ self._metrics.active_requests.inc()
+
+ try:
+ # Increment request counter
+ self._metrics.requests_total.inc()
+
+ # Extract routing hints from annotations
+ prefix_id, total_requests, osl, iat, use_frequency_backend = self._extract_hints(raw)
+
+ # Determine KVBM routing path based on backend selection
+ kvbm_version = "v2" if use_frequency_backend else "v1"
+
+ # Get token IDs from preprocessed request
+ token_ids = raw.get("token_ids", [])
+ if not isinstance(token_ids, list):
+ token_ids = []
+
+ tokens_in = len(token_ids)
+ logger.info(
+ "Processing request: prefix=%s total=%d osl=%s iat=%s tokens=%d kvbm=%s backend=%s",
+ prefix_id,
+ total_requests,
+ osl,
+ iat,
+ tokens_in,
+ kvbm_version,
+ "frequency_multi_lru" if use_frequency_backend else "default_3pool",
+ )
+
+ # Compute reuse_budget := remaining AFTER this request
+ reuse_budget = await self._update_prefix_state(prefix_id, total_requests)
+
+ # Pick worker via Thompson Sampling router
+ worker_id, decision_id = await self._pick_worker(token_ids, prefix_id, reuse_budget, osl, iat)
+
+ logger.info(
+ "Routing decision: worker=%s decision=%s reuse_budget=%d",
+ worker_id,
+ decision_id,
+ reuse_budget,
+ )
+
+ # Stream response from engine
+ async for resp in self._stream_from_engine(raw, worker_id, decision_id, tokens_in):
+ yield resp
+
+ finally:
+ self._metrics.active_requests.dec()
+
+
+# -------------------------- worker entry point -------------------------- #
+def parse_args():
+ """Parse command-line arguments for the processor."""
+ parser = argparse.ArgumentParser(description="Optimized Thompson Sampling Processor")
+ parser.add_argument(
+ "--enable-router",
+ action="store_true",
+ default=True,
+ help="Enable Thompson Sampling router integration",
+ )
+ parser.add_argument(
+ "--no-router",
+ action="store_false",
+ dest="enable_router",
+ help="Disable router (use engine load balancing only)",
+ )
+ parser.add_argument(
+ "--model-path",
+ type=str,
+ required=True,
+ help="Path to the model directory (for loading tokenizer and model card)",
+ )
+ parser.add_argument(
+ "--model-name",
+ type=str,
+ required=True,
+ help="Served model name (must match frontend's --model-name)",
+ )
+ return parser.parse_args()
+
+
+@dynamo_worker() # Dynamic mode - required to call router/workers which are also dynamic
+async def worker(runtime: DistributedRuntime):
+ """
+ Main worker entry point for the Thompson Sampling processor.
+
+ This processor registers as a backend that the frontend can discover via ETCD,
+ then forwards requests to actual workers after applying Thompson Sampling routing.
+ """
+ args = parse_args()
+
+ # DYNAMIC DISCOVERY MODE:
+ # Instead of using --static-endpoint on the frontend, we register a model card
+ # in ETCD so the frontend can discover us via its ModelWatcher.
+ #
+ # This is the forward-compatible approach since --static-endpoint is deprecated.
+ #
+ # Flow:
+ # 1. We register as dynamo.backend.generate (dynamically with instance ID)
+ # 2. We call register_llm() to advertise ourselves in ETCD
+ # 3. Frontend's ModelWatcher discovers us and routes requests to us
+ # 4. We forward to actual workers at workers.worker.generate
+
+ component = runtime.namespace("dynamo").component("backend")
+ # NOTE: create_service() was removed in Dynamo 0.8.x - endpoint creation handles registration
+
+ # Create the endpoint FIRST (needed for register_llm and metrics)
+ endpoint = component.endpoint("generate")
+
+ # Register the model card with ETCD so the frontend can discover us
+ # We accept preprocessed tokens (ModelInput.Tokens) and serve chat/completions
+ logger.info(
+ "Registering model card: model_name=%s, model_path=%s",
+ args.model_name,
+ args.model_path,
+ )
+ # IMPORTANT: kv_cache_block_size must match what workers use (default page_size=1)
+ # Otherwise checksums will differ and frontend will reject the processor's model card
+ await register_llm(
+ model_input=ModelInput.Tokens, # We accept tokenized input from frontend
+ model_type=ModelType.Chat | ModelType.Completions, # Chat and completions endpoints
+ endpoint=endpoint,
+ model_path=args.model_path,
+ model_name=args.model_name,
+ kv_cache_block_size=1, # Must match worker page_size to ensure same checksum
+ )
+ logger.info("Model card registered successfully - frontend can now discover us via ETCD")
+
+ # Initialize the request handler with the endpoint for metrics
+ handler = ProcessorRequestHandler(
+ runtime=runtime,
+ endpoint=endpoint,
+ enable_router=args.enable_router,
+ )
+ await handler.initialize()
+
+ # Serve as "backend.generate" - frontend will route to us after ETCD discovery
+ await endpoint.serve_endpoint(handler.generate)
+
+
+if __name__ == "__main__":
+ uvloop.install()
+ asyncio.run(worker()) # pylint: disable=no-value-for-parameter
diff --git a/external/dynamo/optimized/router.py b/external/dynamo/optimized/router.py
new file mode 100644
index 0000000000..24684a5714
--- /dev/null
+++ b/external/dynamo/optimized/router.py
@@ -0,0 +1,1402 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Optimized Thompson Sampling Router with Prometheus Metrics.
+
+This router implements Contextual Thompson Sampling with:
+ - KV overlap locality
+ - Remaining per-prefix requests (reuse_budget)
+ - OSL-based decode cost, ISL/prefill cost per worker
+ - IAT-based stickiness/opportunity weighting
+ - Instant & outstanding load (no TTL decay)
+ - Delayed bandit update using observed latency via `feedback` endpoint
+ - Timeout penalty for missing feedback
+ - Prometheus metrics (instead of CSV)
+ - Debug traces for offline analysis
+
+Key differences from generalized/router.py:
+ - Uses Prometheus metrics instead of CSV logging
+ - Removed CSV file I/O
+ - Added comprehensive Prometheus gauges, counters, and histograms
+"""
+
+import argparse
+import asyncio
+import json
+import logging
+import math
+import os
+import random
+import threading
+import time
+import uuid
+from collections import deque
+from functools import wraps
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import uvloop
+import yaml
+from dynamo.runtime import DistributedRuntime
+from dynamo.runtime import dynamo_worker
+from dynamo.runtime.logging import configure_dynamo_logging
+from pydantic import BaseModel
+
+# Try to import KV routing classes from dynamo.llm, fallback to stubs if unavailable
+try:
+ from dynamo.llm import KvIndexer
+ from dynamo.llm import OverlapScores
+except ImportError:
+ logger_init = logging.getLogger(__name__)
+ logger_init.warning("dynamo.llm KV classes not available, using fallback implementations")
+
+ class OverlapScores:
+ """Fallback: KV cache overlap scores between a request and workers."""
+
+ def __init__(self, scores: dict[int, float] | None = None):
+ self.scores = scores if scores is not None else {}
+
+ class KvIndexer:
+ """Fallback: KV cache indexer for finding overlap between requests and workers."""
+
+ def __init__(self, engine: Any, block_size: int):
+ self.engine = engine
+ self.block_size = block_size
+
+        async def find_matches_for_request(self, tokens: list[int], min_overlap: int) -> OverlapScores:
+            """Return empty overlap scores (no KV-overlap signal; selection falls back to load and bandit terms only)."""
+            return OverlapScores({})
+
+
+configure_dynamo_logging()
+logger = logging.getLogger(__name__)
+
+WorkerId = int
+
+
+# ---------------------- config loading ---------------------- #
+def get_default_config_path() -> Path:
+ """Get path to default config.yaml in the same directory as this script."""
+ return Path(__file__).parent / "config.yaml"
+
+
+def load_config(config_path: str | Path | None = None) -> dict[str, Any]:
+ """Load configuration from YAML file.
+
+ Args:
+ config_path: Path to YAML config file. If None, uses default config.yaml.
+
+ Returns:
+ Configuration dictionary with nested structure.
+ """
+ if config_path is None:
+ config_path = get_default_config_path()
+
+ config_path = Path(config_path)
+ if not config_path.exists():
+ logger.warning("Config file not found: %s, using built-in defaults", config_path)
+ return get_builtin_defaults()
+
+ with open(config_path, encoding="utf-8") as f:
+ config = yaml.safe_load(f)
+
+ logger.info("Loaded config from: %s", config_path)
+ return config
+
+
+def get_builtin_defaults() -> dict[str, Any]:
+ """Return built-in default configuration (matches config.yaml)."""
+ return {
+ "infrastructure": {
+ "block_size": 64,
+ "router_type": "kv",
+ "min_workers": 1,
+ },
+ "affinity": {
+ "base": 0.30,
+ "reuse_weight": 0.15,
+ "iat_weight": 0.20,
+ "sticky_load_floor": 0.70,
+ },
+ "exploration": {
+ "base_ts_weight": 0.10,
+ "temperature": {
+ "base": 1.0,
+ "min": 0.15,
+ "max": 2.0,
+ },
+ },
+ "switching_cost": {
+ "base": 0.20,
+ "reuse_penalty": 0.08,
+ "iat_penalty": 0.05,
+ },
+ "load_balancing": {
+ "queue_penalty_weight": 0.50,
+ "gpu_penalty_weight": 1.00,
+ "outstanding_work_weight": 0.45,
+ "job_gpu_coupling_weight": 0.40,
+ "job_queue_coupling_weight": 0.20,
+ },
+ "prefill": {
+ "token_scale": 1024.0,
+ "weight": 1.0,
+ },
+ "lints": {
+ "lambda": 1.0,
+ "v": 0.25,
+ "forget_rate": 0.995,
+ },
+ "feedback": {
+ "timeout_seconds": 120.0,
+ "sweep_interval_seconds": 5.0,
+ "timeout_reward": 0.0,
+ "latency_ema_alpha": 0.2,
+ },
+ "debug": {
+ "traces_enabled": False,
+ "trace_dir": "/tmp/dynamo_router_traces",
+ "buffer_size": 2000,
+ },
+ }
+
+
+def get_nested(config: dict, dotted_key: str, default: Any = None) -> Any:
+ """Get a nested value from config using dot notation.
+
+ Args:
+ config: Configuration dictionary
+ dotted_key: Key in dot notation, e.g., "affinity.base"
+ default: Default value if key not found
+
+ Returns:
+ Value at the nested key, or default if not found.
+ """
+ keys = dotted_key.split(".")
+ obj = config
+ for k in keys:
+ if not isinstance(obj, dict) or k not in obj:
+ return default
+ obj = obj[k]
+ return obj
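+
+
+# Quick examples (values illustrative):
+#   get_nested({"affinity": {"base": 0.3}}, "affinity.base")  -> 0.3
+#   get_nested({"affinity": {}}, "affinity.base", 0.25)       -> 0.25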
+
+
+def set_nested(config: dict, dotted_key: str, value: Any) -> None:
+ """Set a nested value in config using dot notation.
+
+ Args:
+ config: Configuration dictionary (modified in place)
+ dotted_key: Key in dot notation, e.g., "affinity.base"
+ value: Value to set
+ """
+ keys = dotted_key.split(".")
+ obj = config
+ for k in keys[:-1]:
+ if k not in obj:
+ obj[k] = {}
+ obj = obj[k]
+ obj[keys[-1]] = value
+
+
+def auto_cast(value_str: str) -> Any:
+ """Auto-cast a string value to appropriate type.
+
+ Args:
+ value_str: String value from CLI
+
+ Returns:
+ Value cast to int, float, bool, or str as appropriate.
+ """
+    # Boolean ("1"/"0" are left to the integer branch below so that numeric
+    # overrides keep their numeric type)
+    if value_str.lower() in ("true", "yes"):
+        return True
+    if value_str.lower() in ("false", "no"):
+        return False
+
+ # Integer
+ try:
+ return int(value_str)
+ except ValueError:
+ pass
+
+ # Float
+ try:
+ return float(value_str)
+ except ValueError:
+ pass
+
+ # String
+ return value_str
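+
+
+# auto_cast examples: "true" -> True, "no" -> False, "2" -> 2, "0.5" -> 0.5,
+# and anything else (e.g. "kv") passes through unchanged as a string.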
+
+
+def apply_cli_overrides(config: dict, args: argparse.Namespace) -> dict:
+ """Apply CLI argument overrides to configuration.
+
+ Args:
+ config: Base configuration dictionary
+ args: Parsed CLI arguments
+
+ Returns:
+ Configuration with CLI overrides applied.
+ """
+ # Apply explicit CLI flags
+ if args.affinity_base is not None:
+ set_nested(config, "affinity.base", args.affinity_base)
+ logger.info("CLI override: affinity.base = %s", args.affinity_base)
+
+ if args.temp_base is not None:
+ set_nested(config, "exploration.temperature.base", args.temp_base)
+ logger.info("CLI override: exploration.temperature.base = %s", args.temp_base)
+
+ if args.lints_v is not None:
+ set_nested(config, "lints.v", args.lints_v)
+ logger.info("CLI override: lints.v = %s", args.lints_v)
+
+ # Apply generic --override flags
+ if args.override:
+ for override in args.override:
+ if "=" not in override:
+ logger.warning("Invalid override format (expected key=value): %s", override)
+ continue
+ key, value_str = override.split("=", 1)
+ value = auto_cast(value_str)
+ set_nested(config, key, value)
+ logger.info("CLI override: %s = %s", key, value)
+
+ return config
+
+
+_ROUTER_METRICS: dict | None = None
+
+
+def _init_prometheus_metrics():
+    """Initialize Prometheus metrics lazily, at most once per process.
+
+    Collectors may only be registered with the global REGISTRY once; a second
+    registration of the same metric names raises a duplicate-timeseries error.
+    The result is therefore memoized at module level (an lru_cache on a nested
+    function would be rebuilt on every call and never actually cache).
+    """
+    global _ROUTER_METRICS
+    if _ROUTER_METRICS is not None:
+        return _ROUTER_METRICS
+
+    def _init() -> dict:
+ metrics: dict = {}
+ try:
+ from prometheus_client import REGISTRY
+ from prometheus_client import Counter
+ from prometheus_client import Gauge
+ from prometheus_client import Histogram
+
+ metrics["decisions_total"] = Counter(
+ "thompson_router_decisions_total",
+ "Total routing decisions by worker",
+ ["worker_id"],
+ registry=REGISTRY,
+ )
+ metrics["kv_overlap"] = Gauge(
+ "thompson_router_kv_overlap",
+ "KV cache overlap score for last decision by worker",
+ ["worker_id"],
+ registry=REGISTRY,
+ )
+ metrics["feedback_latency"] = Histogram(
+ "thompson_router_feedback_latency_seconds",
+ "Latency from feedback by worker",
+ ["worker_id"],
+ buckets=[0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, 120.0],
+ registry=REGISTRY,
+ )
+ metrics["reward"] = Gauge(
+ "thompson_router_reward",
+ "Last computed reward by worker",
+ ["worker_id"],
+ registry=REGISTRY,
+ )
+ metrics["pending_decisions"] = Gauge(
+ "thompson_router_pending_decisions",
+ "Number of pending decisions awaiting feedback",
+ registry=REGISTRY,
+ )
+ metrics["timeout_penalties"] = Counter(
+ "thompson_router_timeout_penalties_total",
+ "Total timeout penalties applied",
+ registry=REGISTRY,
+ )
+ metrics["sticky_decisions"] = Counter(
+ "thompson_router_sticky_decisions_total",
+ "Decisions that stayed on the same worker (sticky)",
+ registry=REGISTRY,
+ )
+ metrics["switch_decisions"] = Counter(
+ "thompson_router_switch_decisions_total",
+ "Decisions that switched to a different worker",
+ registry=REGISTRY,
+ )
+ metrics["beta_alpha"] = Gauge(
+ "thompson_router_beta_alpha",
+ "Beta distribution alpha parameter by worker",
+ ["worker_id"],
+ registry=REGISTRY,
+ )
+ metrics["beta_beta"] = Gauge(
+ "thompson_router_beta_beta",
+ "Beta distribution beta parameter by worker",
+ ["worker_id"],
+ registry=REGISTRY,
+ )
+ metrics["prefix_state_size"] = Gauge(
+ "thompson_router_prefix_state_size",
+ "Number of active prefix states",
+ registry=REGISTRY,
+ )
+ metrics["reuse_budget"] = Histogram(
+ "thompson_router_reuse_budget",
+ "Distribution of reuse_budget values",
+ buckets=[0, 1, 2, 5, 10, 20, 50, 100],
+ registry=REGISTRY,
+ )
+ metrics["tokens_per_request"] = Histogram(
+ "thompson_router_tokens_per_request",
+ "Distribution of input token counts",
+ buckets=[32, 64, 128, 256, 512, 1024, 2048, 4096, 8192],
+ registry=REGISTRY,
+ )
+ logger.info("Prometheus metrics initialized for router")
+ except ImportError:
+ logger.warning("prometheus_client not available, metrics disabled")
+
+ return metrics
+
+    _ROUTER_METRICS = _init()
+    return _ROUTER_METRICS
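+
+
+# Example (PromQL, a hedged sketch assuming Prometheus scrapes this process):
+# lifetime fraction of decisions that stayed on the previous worker:
+#   thompson_router_sticky_decisions_total
+#     / (thompson_router_sticky_decisions_total
+#        + thompson_router_switch_decisions_total)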
+
+
+# ---------------------- request / response models ---------------------- #
+class RouterRequest(BaseModel):
+ tokens: list[int]
+ prefix_id: str = ""
+ reuse_budget: int = 0 # remaining *after this request*
+ expected_osl: str | None = "MEDIUM"
+ interarrival: str | None = "MEDIUM"
+
+
+class RouterResponse(BaseModel):
+ worker_id: int
+ prefix_hit_rate: float
+ decision_id: str | None = None
+
+
+class FeedbackRequest(BaseModel):
+ decision_id: str
+ latency_ms: float
+ success: bool | None = True
+ tokens_in: int | None = None
+ tokens_out: int | None = None
+ finish_reason: str | None = None
+
+
+class FeedbackAck(BaseModel):
+ ok: bool
+ used_baseline: float
+ reward: float
+ worker_id: int | None = None
+ error: str | None = None
+
+
+# ---------------------- helper decorator ---------------------- #
+def safe_update(lock_name: str):
+    """Decorator factory: run the wrapped method while holding self.<lock_name>.
+
+    Example: @safe_update("_prefix_lock") serializes prefix-state updates
+    across threads.
+    """
+
+    def decorator(fn):
+
+ @wraps(fn)
+ def wrapper(self, *args, **kwargs):
+ lock = getattr(self, lock_name)
+ with lock:
+ return fn(self, *args, **kwargs)
+
+ return wrapper
+
+ return decorator
+
+
+# ---------------------- router implementation ---------------------- #
+class WorkloadAwareRouter:
+ """
+ Contextual Thompson Sampling router with Prometheus metrics.
+ """
+
+ def __init__(
+ self,
+ runtime: DistributedRuntime,
+ block_size: int = 64,
+ router_type: str = "kv",
+ min_workers: int = 1,
+ # Affinity / exploration
+ affinity_base: float = 0.30,
+ affinity_reuse_weight: float = 0.15,
+ affinity_iat_weight: float = 0.20,
+ base_ts_weight: float = 0.10,
+ sticky_load_floor: float = 0.70,
+ # Softmax temperature
+ temp_base: float = 1.0,
+ temp_min: float = 0.15,
+ temp_max: float = 2.0,
+ # Switching cost
+ switch_cost_base: float = 0.20,
+ switch_cost_reuse: float = 0.08,
+ switch_cost_iat: float = 0.05,
+ # Load / opportunity cost
+ queue_penalty_weight: float = 0.50,
+ gpu_penalty_weight: float = 1.00,
+ outstanding_work_weight: float = 0.45,
+ job_gpu_coupling_weight: float = 0.40,
+ job_queue_coupling_weight: float = 0.20,
+ # Prefill / ISL
+ prefill_token_scale: float = 1024.0,
+ prefill_weight: float = 1.0,
+ # LinTS
+ lints_lambda: float = 1.0,
+ lints_v: float = 0.25,
+ lints_forget: float = 0.995,
+ # ---------- Feedback timeout / sweep ----------
+ feedback_timeout_seconds: float = 120.0,
+ pending_sweep_interval_seconds: float = 5.0,
+ timeout_reward: float = 0.0,
+ # ---------- Latency EMA (reward normalization) ----------
+ latency_ema_alpha: float = 0.2,
+ # ---------- Debug traces ----------
+ debug_traces: bool = False,
+ debug_trace_dir: str = "/tmp/dynamo_router_traces",
+ debug_buffer_size: int = 2000,
+ ):
+ self.runtime = runtime
+ self.block_size = block_size
+ self.router_type = router_type
+ self.min_workers = min_workers
+
+ # clients / helpers (initialized later)
+ self.engine_client = None
+ self.indexer: KvIndexer | None = None
+
+ # concurrency primitives
+ self._init_lock = threading.Lock()
+ self._bandit_lock = threading.Lock()
+ self._prefix_lock = threading.Lock()
+ self._lin_lock = threading.Lock()
+ self._pending_lock = threading.Lock()
+
+ # prefix state: pid -> {"worker": int|None, "reuse_remaining": int}
+ self.prefix_cache_state: dict[str, dict[str, int | None]] = {}
+ # pid -> {"decode_cost","prefill_cost","iat_factor"}
+ self.prefix_meta: dict[str, dict[str, float]] = {}
+
+ # Beta bandits and LinTS params
+ self.worker_bandits: dict[int, tuple[float, float]] = {}
+ self.feature_dim = 9
+ self.lin_lambda = float(lints_lambda)
+ self.lin_v = float(lints_v)
+ self.lin_forget = float(lints_forget)
+ self.lin_forget = max(1e-6, min(self.lin_forget, 0.999999))
+ self.linA: dict[int, np.ndarray] = {}
+ self.linb: dict[int, np.ndarray] = {}
+
+ # knobs
+ self.affinity_base = float(affinity_base)
+ self.affinity_reuse_weight = float(affinity_reuse_weight)
+ self.affinity_iat_weight = float(affinity_iat_weight)
+ self.base_ts_weight = float(base_ts_weight)
+ self.sticky_load_floor = float(sticky_load_floor)
+ self.temp_base = float(temp_base)
+ self.temp_min = float(temp_min)
+ self.temp_max = float(temp_max)
+ self.switch_cost_base = float(switch_cost_base)
+ self.switch_cost_reuse = float(switch_cost_reuse)
+ self.switch_cost_iat = float(switch_cost_iat)
+ self.queue_penalty_weight = float(queue_penalty_weight)
+ self.gpu_penalty_weight = float(gpu_penalty_weight)
+ self.outstanding_work_weight = float(outstanding_work_weight)
+ self.job_gpu_coupling_weight = float(job_gpu_coupling_weight)
+ self.job_queue_coupling_weight = float(job_queue_coupling_weight)
+ self.prefill_token_scale = float(prefill_token_scale)
+ self.prefill_weight = float(prefill_weight)
+
+ # LinTS numerics
+ self._jt_base = 1e-9
+ self._jt_mult = 10.0
+ self._jt_max = 1e-3
+ self._eig_floor = 1e-10
+
+ # Feedback timeout / sweep
+ self.feedback_timeout_seconds = float(feedback_timeout_seconds)
+ self.pending_sweep_interval_seconds = float(pending_sweep_interval_seconds)
+ self.timeout_reward = float(max(0.0, min(1.0, timeout_reward)))
+ self._last_pending_sweep = 0.0
+
+ # Latency EMA baselines (two modes: raw ms, or ms/token)
+ self.latency_ema_alpha = float(latency_ema_alpha)
+ # Global (per-mode)
+ self.lat_ema_global: dict[bool, float | None] = {False: None, True: None}
+ # Per worker (per-mode)
+ self.lat_ema_worker: dict[tuple[int, bool], float] = {}
+ # Per bucket (per-mode): (wid, osl, prefill_bin, per_tok) -> value
+ self.lat_ema_bucket: dict[tuple[int, str, str, bool], float] = {}
+
+ # Pending decisions waiting for feedback
+ self.pending: dict[str, dict[str, Any]] = {}
+
+ # Debug traces
+ self.debug_traces = bool(debug_traces)
+ self.debug_trace_dir = str(debug_trace_dir)
+ self.recent_traces: deque = deque(maxlen=int(debug_buffer_size))
+ if self.debug_traces:
+ os.makedirs(self.debug_trace_dir, exist_ok=True)
+ logger.info("Router debug traces enabled -> %s", self.debug_trace_dir)
+
+ # Prometheus metrics
+ self._metrics = {}
+
+ # --------------------- tracing --------------------- #
+ def _emit_trace(self, kind: str, payload: dict[str, Any]):
+ if not self.debug_traces:
+ return
+ item = {"ts": time.time(), "kind": kind, **payload}
+ self.recent_traces.append(item)
+ try:
+ path = os.path.join(self.debug_trace_dir, "router_traces.jsonl")
+ with open(path, "a", encoding="utf-8") as f:
+ f.write(json.dumps(item, separators=(",", ":")) + "\n")
+ except Exception as e:
+ logger.debug("Trace write failed: %s", e)
+
+ # --------------------- level mappings --------------------- #
+ @staticmethod
+ def _norm_level(s: str | None, default: str = "MEDIUM") -> str:
+ if not s:
+ return default
+ s = str(s).strip().upper()
+ return s if s in ("LOW", "MEDIUM", "HIGH") else default
+
+ @staticmethod
+ def _decode_cost(osl: str) -> float:
+ return {"LOW": 1.0, "MEDIUM": 2.0, "HIGH": 3.0}[osl]
+
+ @staticmethod
+ def _iat_factor(iat: str) -> float:
+ return {"LOW": 1.5, "MEDIUM": 1.0, "HIGH": 0.6}[iat]
+
+ # --------------------- init --------------------- #
+ async def initialize(self):
+ """Initialize router by polling for backend workers."""
+ # Initialize Prometheus metrics
+ self._metrics = _init_prometheus_metrics()
+
+ # Connect to actual workers at workers.{component}.generate
+ # Workers are in the "workers" namespace (hidden from frontend discovery)
+ # Component name varies by backend (REQUIRED - no default):
+ # - SGLang: uses "worker" (set via --endpoint workers.worker.generate)
+ # - vLLM: uses "backend" (hardcoded in dynamo.vllm)
+ worker_component = os.environ.get("DYNAMO_WORKER_COMPONENT")
+ if not worker_component:
+ raise ValueError("DYNAMO_WORKER_COMPONENT environment variable is required. "
+ "Set to 'worker' for SGLang or 'backend' for vLLM.")
+ engine = self.runtime.namespace("workers").component(worker_component)
+ logger.info("Getting engine client for workers/%s/generate", worker_component)
+ self.engine_client = await engine.endpoint("generate").client()
+
+ min_workers = int(self.min_workers)
+ if min_workers < 0:
+ raise ValueError(f"min_workers must be >= 0, got {min_workers}")
+
+ timeout_s = float(os.environ.get("DYNAMO_ROUTER_WAIT_FOR_WORKERS_TIMEOUT_S", "600"))
+ if not math.isfinite(timeout_s) or timeout_s <= 0:
+ raise ValueError("DYNAMO_ROUTER_WAIT_FOR_WORKERS_TIMEOUT_S must be a finite number > 0")
+
+ deadline = time.monotonic() + timeout_s
+ backoff_s = 0.5
+
+ logger.info("Waiting for backend workers (min_workers=%d, timeout_s=%.1f)...", min_workers, timeout_s)
+
+ if min_workers == 0:
+ instance_ids_raw = list(self.engine_client.instance_ids())
+ logger.info("Backend workers discovered (min_workers=0): %s", instance_ids_raw)
+ else:
+ while True:
+ remaining = deadline - time.monotonic()
+ if remaining <= 0:
+ raise TimeoutError(f"Timed out after {timeout_s}s waiting for >= {min_workers} backend worker(s)")
+
+ try:
+ await asyncio.wait_for(
+ self.engine_client.wait_for_instances(),
+ timeout=min(remaining, 10.0),
+ )
+ except TimeoutError:
+ pass
+
+ instance_ids_raw = list(self.engine_client.instance_ids())
+ if len(instance_ids_raw) >= min_workers:
+ try:
+ instance_ids = [int(w) for w in instance_ids_raw]
+ except Exception:
+ instance_ids = instance_ids_raw
+ logger.info("Backend workers discovered: %s", instance_ids)
+ break
+
+ await asyncio.sleep(backoff_s)
+ backoff_s = min(backoff_s * 1.5, 5.0)
+
+ self.indexer = KvIndexer(engine, self.block_size)
+
+ self._initialize_bandits()
+ self._initialize_contextual()
+ logger.info("WorkloadAwareRouter initialized with %d backend worker(s)",
+ len(list(self.engine_client.instance_ids())))
+
+ @safe_update("_init_lock")
+ def _initialize_bandits(self):
+ for wid in self.engine_client.instance_ids():
+ wid = int(wid)
+ self.worker_bandits.setdefault(wid, (1.0, 1.0))
+ # Update Prometheus metrics
+ if self._metrics.get("beta_alpha"):
+ self._metrics["beta_alpha"].labels(worker_id=str(wid)).set(1.0)
+ if self._metrics.get("beta_beta"):
+ self._metrics["beta_beta"].labels(worker_id=str(wid)).set(1.0)
+
+ @safe_update("_init_lock")
+ def _initialize_contextual(self):
+ for wid in self.engine_client.instance_ids():
+ wid = int(wid)
+ if wid not in self.linA:
+ self.linA[wid] = self.lin_lambda * np.eye(self.feature_dim, dtype=np.float64)
+ self.linb[wid] = np.zeros(self.feature_dim, dtype=np.float64)
+
+ def _ensure_worker_context(self, worker_id: int):
+ if worker_id not in self.linA:
+ with self._lin_lock:
+ if worker_id not in self.linA:
+ self.linA[worker_id] = self.lin_lambda * np.eye(self.feature_dim, dtype=np.float64)
+ self.linb[worker_id] = np.zeros(self.feature_dim, dtype=np.float64)
+
+ # --------------------- prefix state --------------------- #
+ @safe_update("_prefix_lock")
+ def _get_prefix(self, pid: str) -> tuple[int | None, int]:
+ info = self.prefix_cache_state.get(pid)
+ if info:
+ return info.get("worker"), int(info.get("reuse_remaining") or 0)
+ return None, 0
+
+ @safe_update("_prefix_lock")
+ def _set_prefix(
+ self,
+ pid: str,
+ wid: int,
+ reuse_remaining: int,
+ decode_cost: float,
+ prefill_cost: float,
+ iat_factor: float,
+ ):
+ """Record/refresh prefix assignment."""
+ if reuse_remaining <= 0:
+ self.prefix_cache_state.pop(pid, None)
+ self.prefix_meta.pop(pid, None)
+ else:
+ self.prefix_cache_state[pid] = {"worker": wid, "reuse_remaining": max(0, int(reuse_remaining))}
+ self.prefix_meta[pid] = {
+ "decode_cost": float(decode_cost),
+ "prefill_cost": float(max(prefill_cost, 0.0)),
+ "iat_factor": float(iat_factor),
+ }
+
+ # Update prefix state size metric
+ if self._metrics.get("prefix_state_size"):
+ self._metrics["prefix_state_size"].set(len(self.prefix_cache_state))
+
+ def _worker_outstanding(self, wid: int) -> tuple[int, float]:
+ """Returns (reuse_total, work_total) for a worker."""
+ reuse_total = 0
+ work_total = 0.0
+ for pid, info in self.prefix_cache_state.items():
+ if info.get("worker") != wid:
+ continue
+ r = int(info.get("reuse_remaining") or 0)
+ reuse_total += r
+ meta = self.prefix_meta.get(pid)
+ if meta:
+ work_total += float(r) * (float(meta.get("decode_cost", 2.0)) +
+ float(meta.get("prefill_cost", 0.0))) * float(meta.get("iat_factor", 1.0))
+ return reuse_total, work_total
+
+ # --------------------- bandits --------------------- #
+ def _linTS_sample(self, wid: int, x: np.ndarray) -> float:
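+        """Sample a LinTS score: draw theta ~ N(A^{-1} b, v^2 A^{-1}), return theta @ x.
+
+        Solves via Cholesky with escalating jitter; if A stays ill-conditioned
+        it falls back to an eigendecomposition with floored eigenvalues.
+        """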
+ self._ensure_worker_context(wid)
+ with self._lin_lock:
+ A = np.array(self.linA[wid], dtype=np.float64, copy=True)
+ b = np.array(self.linb[wid], dtype=np.float64, copy=True)
+
+ A = 0.5 * (A + A.T)
+ eye = np.eye(self.feature_dim, dtype=np.float64)
+ jitter = self._jt_base
+ L = None
+ while True:
+ try:
+ L = np.linalg.cholesky(A + jitter * eye)
+ break
+ except np.linalg.LinAlgError:
+ jitter = jitter * self._jt_mult if jitter > 0 else self._jt_base
+ if jitter > self._jt_max:
+ vals, vecs = np.linalg.eigh(A)
+ vals = np.maximum(vals, self._eig_floor)
+ A_inv = vecs @ (np.diag(1.0 / vals)) @ vecs.T
+ mu = A_inv @ b
+ z = np.random.normal(size=self.feature_dim)
+ noise = vecs @ (z / np.sqrt(vals))
+ theta = mu + (self.lin_v * noise)
+ return float(theta @ x)
+
+ y = np.linalg.solve(L, b)
+ mu = np.linalg.solve(L.T, y)
+ z = np.random.normal(size=self.feature_dim)
+ noise = np.linalg.solve(L.T, z)
+ theta = mu + (self.lin_v * noise)
+ return float(theta @ x)
+
+ def _update_contextual(self, wid: int, x: np.ndarray, reward: float):
+ r = float(max(0.0, min(1.0, reward)))
+ with self._lin_lock:
+ A = self.linA[wid]
+ b = self.linb[wid]
+ A *= self.lin_forget
+ b *= self.lin_forget
+ A += np.outer(x, x)
+ ridge = (1.0 - self.lin_forget) * self.lin_lambda
+ if ridge > 0.0:
+ A += ridge * np.eye(self.feature_dim, dtype=np.float64)
+ self.linA[wid] = 0.5 * (A + A.T)
+ self.linb[wid] = b + x * r
+
+ def _ts_sample(self, worker_id: int) -> float:
+ with self._bandit_lock:
+ alpha, beta = self.worker_bandits.get(worker_id, (1.0, 1.0))
+ return np.random.beta(alpha, beta)
+
+ def _update_bandit(self, worker_id: int, reward: float):
+ with self._bandit_lock:
+ alpha, beta = self.worker_bandits.get(worker_id, (1.0, 1.0))
+ r = float(max(0.0, min(1.0, reward)))
+ new_alpha = alpha + r
+ new_beta = beta + 1.0 - r
+ self.worker_bandits[worker_id] = (new_alpha, new_beta)
+
+ # Update Prometheus metrics
+ if self._metrics.get("beta_alpha"):
+ self._metrics["beta_alpha"].labels(worker_id=str(worker_id)).set(new_alpha)
+ if self._metrics.get("beta_beta"):
+ self._metrics["beta_beta"].labels(worker_id=str(worker_id)).set(new_beta)
+
+ # --------------------- features / scores --------------------- #
+ def _prefill_cost_for_worker(self, tokens: list[int], overlap: float) -> float:
+ isl = max(0, len(tokens))
+ frac = min(max(float(overlap), 0.0), 1.0)
+ uncached = max(0.0, float(isl) * (1.0 - frac))
+ return (uncached / self.prefill_token_scale) * self.prefill_weight
+
+ @staticmethod
+ def _prefill_bin(prefill_cost: float) -> str:
+ if prefill_cost < 0.25:
+ return "LOW"
+ if prefill_cost < 0.75:
+ return "MEDIUM"
+ return "HIGH"
+
+ def _feature_vector(
+ self,
+ wid: int,
+ metrics: dict[str, Any] | None,
+ scores: "OverlapScores",
+ last_w: int | None,
+ reuse_after: int,
+ decode_cost: float,
+ prefill_cost: float,
+ iat_factor: float,
+ ) -> np.ndarray:
+ gpu = 0.0
+ queue = 0.0
+ if metrics and isinstance(metrics, dict) and "endpoints" in metrics:
+ for ep in metrics["endpoints"]:
+ if ep.get("worker_id") == wid:
+ gpu = float(ep.get("gpu_cache_usage_perc", 0.0))
+ queue = float(ep.get("num_requests_waiting", 0.0))
+ break
+ inv_load = 1.0 / (1.0 + self.gpu_penalty_weight * max(0.0, gpu) + self.queue_penalty_weight * max(0.0, queue))
+
+ overlap = float(scores.scores.get(wid, 0.0))
+ affinity = 1.0 if (last_w is not None and wid == last_w) else 0.0
+ _, work_out = self._worker_outstanding(wid)
+
+ decode_norm = decode_cost / 3.0
+ prefill_norm = math.tanh(prefill_cost)
+ iat_norm = iat_factor / 1.5
+ outstanding_norm = math.tanh(0.1 * work_out)
+ reuse_norm = math.tanh(0.25 * float(max(reuse_after, 0)))
+
+ return np.array([
+ 1.0,
+ inv_load,
+ overlap,
+ affinity,
+ outstanding_norm,
+ decode_norm,
+ prefill_norm,
+ iat_norm,
+ reuse_norm,
+ ],
+ dtype=np.float64)
+
+ def _load_score(self, wid: int, metrics: dict[str, Any] | None, job_cost_total: float) -> float:
+ gpu = 0.0
+ queue = 0.0
+ if metrics and isinstance(metrics, dict) and "endpoints" in metrics:
+ for ep in metrics["endpoints"]:
+ if ep.get("worker_id") == wid:
+ gpu = float(ep.get("gpu_cache_usage_perc", 0.0))
+ queue = float(ep.get("num_requests_waiting", 0.0))
+ break
+ _, work_out = self._worker_outstanding(wid)
+ penalty = (self.gpu_penalty_weight * gpu + self.queue_penalty_weight * queue +
+ self.outstanding_work_weight * max(0.0, work_out) +
+ self.job_gpu_coupling_weight * job_cost_total * gpu +
+ self.job_queue_coupling_weight * job_cost_total * queue)
+ return 1.0 / (1.0 + max(0.0, penalty))
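+
+    # Worked example with the default weights: gpu=0.5, queue=2, work_out=1,
+    # job_cost_total=2.0 gives penalty = 0.5 + 1.0 + 0.45 + 0.40 + 0.80 = 3.15,
+    # so the load score is 1 / 4.15 ≈ 0.241.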
+
+ def _softmax(self, scores: list[float], temp: float) -> list[float]:
+ t = float(min(max(temp, self.temp_min), self.temp_max))
+ m = float(np.max(scores))
+ exps = np.exp((np.array(scores) - m) / max(1e-6, t))
+ s = float(np.sum(exps))
+ if s <= 0.0 or not np.isfinite(s):
+ return [1.0 / len(scores)] * len(scores)
+ return list((exps / s).astype(float))
+
+ # --------------------- selection --------------------- #
+ def _select_worker(
+ self,
+ worker_ids,
+ req: RouterRequest,
+ metrics: dict[str, Any] | None,
+ scores: OverlapScores,
+ ) -> tuple[int, dict[str, float], dict[int, dict[str, float]], list[float], list[float]]:
+ osl = self._norm_level(req.expected_osl, "MEDIUM")
+ iat = self._norm_level(req.interarrival, "MEDIUM")
+ last_w, _ = self._get_prefix(req.prefix_id)
+
+ reuse_after = max(int(req.reuse_budget), 0)
+ decode_cost = self._decode_cost(osl)
+ iat_factor = self._iat_factor(iat)
+
+ temp = self.temp_base / (1.0 + float(reuse_after) * iat_factor)
+ temp = min(max(temp, self.temp_min), self.temp_max)
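+        # e.g. reuse_after=4, iat=LOW (factor 1.5), temp_base=1.0:
+        # temp = 1 / (1 + 6) ≈ 0.143, clamped up to temp_min=0.15 ->
+        # near-greedy selection while a prefix still has many reuses left.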
+
+ raw_scores: list[float] = []
+ worker_list: list[int] = [int(w) for w in worker_ids]
+ per_worker_ctx: dict[int, dict[str, float]] = {}
+ load_mods: list[float] = []
+ overlaps: list[float] = []
+
+ for wid in worker_list:
+ overlap = float(scores.scores.get(wid, 0.0))
+ prefill_cost = self._prefill_cost_for_worker(req.tokens, overlap)
+ job_cost_total = decode_cost + prefill_cost
+
+ x = self._feature_vector(
+ wid=wid,
+ metrics=metrics,
+ scores=scores,
+ last_w=last_w,
+ reuse_after=reuse_after,
+ decode_cost=decode_cost,
+ prefill_cost=prefill_cost,
+ iat_factor=iat_factor,
+ )
+
+ val = self._linTS_sample(wid, x)
+ explore_w = self.base_ts_weight / (1.0 + float(reuse_after) * iat_factor)
+ val += explore_w * self._ts_sample(wid)
+
+ if last_w == wid and (reuse_after > 0):
+ val += (self.affinity_base + self.affinity_reuse_weight * float(reuse_after) +
+ self.affinity_iat_weight * iat_factor) * (0.5 + 0.5 * overlap)
+
+ if last_w is not None and wid != last_w and (reuse_after > 0):
+ val -= (self.switch_cost_base + self.switch_cost_reuse * float(reuse_after) +
+ self.switch_cost_iat * iat_factor)
+
+ load_mod = self._load_score(wid, metrics, job_cost_total=job_cost_total)
+ if last_w == wid and reuse_after > 0:
+ load_mod = max(load_mod, self.sticky_load_floor)
+ val *= load_mod
+
+ if np.isnan(val) or np.isinf(val):
+ val = -1e9
+
+ raw_scores.append(float(val))
+ load_mods.append(float(load_mod))
+ overlaps.append(float(overlap))
+ per_worker_ctx[wid] = {
+ "decode_cost": decode_cost,
+ "prefill_cost": prefill_cost,
+ "iat_factor": iat_factor,
+ "overlap": overlap,
+ "reuse_after": float(reuse_after),
+ "load_mod": load_mod,
+ }
+
+ probs = self._softmax(raw_scores, temp)
+ r = random.random()
+ cum = 0.0
+        idx = len(probs) - 1  # default to the last worker if rounding leaves r > sum(probs)
+ for i, p in enumerate(probs):
+ cum += p
+ if r <= cum:
+ idx = i
+ break
+ chosen = int(worker_list[idx])
+
+ return chosen, per_worker_ctx[chosen], per_worker_ctx, raw_scores, probs
+
+ # --------------------- latency baselines & reward --------------------- #
+ def _ema_update(self, old: float | None, new: float) -> float:
+ a = self.latency_ema_alpha
+ return new if old is None else (a * new + (1.0 - a) * old)
+
+ def _get_latency_baseline(self, wid: int, osl: str, prefill_bin: str, per_tok: bool, fallback: float) -> float:
+ key_b = (wid, osl, prefill_bin, per_tok)
+ key_w = (wid, per_tok)
+ if key_b in self.lat_ema_bucket:
+ return self.lat_ema_bucket[key_b]
+ if key_w in self.lat_ema_worker:
+ return self.lat_ema_worker[key_w]
+ if self.lat_ema_global[per_tok] is not None:
+ return self.lat_ema_global[per_tok] # type: ignore
+ return max(1.0, float(fallback))
+
+ def _update_latency_baselines(self, wid: int, osl: str, prefill_bin: str, metric: float, per_tok: bool) -> float:
+ self.lat_ema_global[per_tok] = self._ema_update(self.lat_ema_global[per_tok], metric)
+ key_w = (wid, per_tok)
+ self.lat_ema_worker[key_w] = self._ema_update(self.lat_ema_worker.get(key_w), metric)
+ key_b = (wid, osl, prefill_bin, per_tok)
+ self.lat_ema_bucket[key_b] = self._ema_update(self.lat_ema_bucket.get(key_b), metric)
+ return self.lat_ema_bucket[key_b]
+
+ @staticmethod
+ def _latency_metric(latency_ms: float, tokens_out: int | None) -> tuple[float, bool]:
+ if tokens_out is not None and int(tokens_out) > 0:
+ return float(latency_ms) / float(max(1, int(tokens_out))), True
+ return float(latency_ms), False
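+
+    # e.g. latency_ms=1200 with tokens_out=60 -> (20.0 ms/token, per_tok=True);
+    # with tokens_out=None -> (1200.0 ms, per_tok=False).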
+
+ @staticmethod
+ def _metric_to_reward(metric: float, baseline: float, success: bool) -> float:
+ if not success:
+ return 0.0
+ denom = max(1e-3, baseline)
+ ratio = metric / denom
+ return float(1.0 / (1.0 + ratio))
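+
+    # e.g. metric == baseline -> reward 0.5; metric = baseline/2 -> ~0.667;
+    # metric = 2x baseline -> ~0.333. Failed requests always score 0.0.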
+
+ # --------------------- timeout sweep --------------------- #
+ def _sweep_pending(self, now: float):
+ if now - self._last_pending_sweep < self.pending_sweep_interval_seconds:
+ return
+ self._last_pending_sweep = now
+ expired: list[tuple[str, dict[str, Any]]] = []
+ with self._pending_lock:
+ for did, rec in list(self.pending.items()):
+ if now - float(rec.get("start_ts", now)) >= self.feedback_timeout_seconds:
+ expired.append((did, rec))
+ self.pending.pop(did, None)
+
+ # Update pending count metric
+ if self._metrics.get("pending_decisions"):
+ self._metrics["pending_decisions"].set(len(self.pending))
+
+ for did, rec in expired:
+ wid = int(rec["wid"])
+ x = rec["x"]
+ reward = float(self.timeout_reward)
+ self._update_bandit(wid, reward)
+ self._update_contextual(wid, x, reward)
+
+ if self._metrics.get("timeout_penalties"):
+ self._metrics["timeout_penalties"].inc()
+
+ self._emit_trace(
+ "timeout",
+ {
+ "decision_id": did,
+ "wid": wid,
+ "reward": reward,
+ "age": self.feedback_timeout_seconds,
+ "prefix_id": rec.get("prefix_id"),
+ "osl": rec.get("osl"),
+ "prefill_bin": rec.get("prefill_bin"),
+ })
+ logger.warning("Timeout feedback: wid=%s decision=%s reward=%.3f", wid, did, reward)
+
+ # --------------------- main endpoint: find_worker --------------------- #
+ async def generate(self, request: dict):
+ req = RouterRequest(**request)
+
+ worker_ids = [int(w) for w in self.engine_client.instance_ids()]
+ if not worker_ids:
+ yield RouterResponse(worker_id=-1, prefix_hit_rate=0.0).model_dump()
+ return
+
+ now = time.time()
+ self._sweep_pending(now)
+
+ # Track tokens per request
+ if self._metrics.get("tokens_per_request"):
+ self._metrics["tokens_per_request"].observe(len(req.tokens))
+ if self._metrics.get("reuse_budget"):
+ self._metrics["reuse_budget"].observe(req.reuse_budget)
+
+ metrics = None # TODO: Replace with proper metrics query when API is available
+ if self.router_type == "kv_load":
+ wid, _ = self._get_underloaded(metrics)
+ yield RouterResponse(worker_id=wid, prefix_hit_rate=0.0).model_dump()
+ return
+
+ scores: OverlapScores = await self.indexer.find_matches_for_request(req.tokens, 0)
+ chosen, chosen_ctx, all_ctx, raw_scores, probs = self._select_worker(worker_ids, req, metrics, scores)
+
+ last_w, _ = self._get_prefix(req.prefix_id)
+
+ osl = self._norm_level(req.expected_osl, "MEDIUM")
+ iat = self._norm_level(req.interarrival, "MEDIUM")
+ decode_cost = self._decode_cost(osl)
+ overlap_chosen = float(scores.scores.get(chosen, 0.0))
+ prefill_cost_chosen = self._prefill_cost_for_worker(req.tokens, overlap_chosen)
+ iat_factor = self._iat_factor(iat)
+
+ # Update prefix state
+ self._set_prefix(
+ req.prefix_id,
+ chosen,
+ reuse_remaining=max(int(req.reuse_budget), 0),
+ decode_cost=decode_cost,
+ prefill_cost=prefill_cost_chosen,
+ iat_factor=iat_factor,
+ )
+
+ # Build feature x for chosen & store pending decision
+ x = self._feature_vector(
+ wid=chosen,
+ metrics=metrics,
+ scores=scores,
+ last_w=last_w,
+ reuse_after=max(int(req.reuse_budget), 0),
+ decode_cost=decode_cost,
+ prefill_cost=prefill_cost_chosen,
+ iat_factor=iat_factor,
+ )
+ decision_id = uuid.uuid4().hex
+ with self._pending_lock:
+ self.pending[decision_id] = {
+ "wid": int(chosen),
+ "x": x,
+ "osl": osl,
+ "prefill_bin": self._prefill_bin(prefill_cost_chosen),
+ "start_ts": now,
+ "prefix_id": req.prefix_id,
+ "tokens_in": len(req.tokens),
+ "reuse_after": int(req.reuse_budget),
+ "overlap": overlap_chosen,
+ "prefill_cost": float(prefill_cost_chosen),
+ "decode_cost": float(decode_cost),
+ }
+ # Update pending count metric
+ if self._metrics.get("pending_decisions"):
+ self._metrics["pending_decisions"].set(len(self.pending))
+
+ # Update Prometheus metrics
+ if self._metrics.get("decisions_total"):
+ self._metrics["decisions_total"].labels(worker_id=str(chosen)).inc()
+ if self._metrics.get("kv_overlap"):
+ self._metrics["kv_overlap"].labels(worker_id=str(chosen)).set(overlap_chosen)
+
+ # Track sticky vs switch decisions
+ if last_w is not None:
+ if chosen == last_w:
+ if self._metrics.get("sticky_decisions"):
+ self._metrics["sticky_decisions"].inc()
+ elif self._metrics.get("switch_decisions"):
+ self._metrics["switch_decisions"].inc()
+
+ # Decision trace
+ if self.debug_traces:
+ worker_list = [int(w) for w in worker_ids]
+ details = {
+ wid: {
+ "score": float(raw_scores[i]),
+ "prob": float(probs[i]),
+ **all_ctx[wid],
+ }
+ for i, wid in enumerate(worker_list)
+ }
+ self._emit_trace("decision",
+ {
+ "decision_id": decision_id,
+ "prefix_id": req.prefix_id,
+ "chosen": int(chosen),
+ "workers": details,
+ })
+
+ logger.info(
+ "Router picked worker=%s decision=%s prefix=%s (last=%s reuse_after=%s osl=%s "
+ "prefill_cost=%.3f iat=%s overlap=%.3f)",
+ chosen,
+ decision_id,
+ req.prefix_id,
+ last_w,
+ req.reuse_budget,
+ osl,
+ prefill_cost_chosen,
+ iat,
+ overlap_chosen,
+ )
+
+ resp = RouterResponse(worker_id=chosen, prefix_hit_rate=overlap_chosen, decision_id=decision_id)
+ yield resp.model_dump()
+ return
+
+ # --------------------- feedback endpoint --------------------- #
+ async def feedback(self, request: dict):
+ """Ex-post reward update from processor with observed latency."""
+ try:
+ fb = FeedbackRequest(**request)
+ except Exception as e:
+ ack = FeedbackAck(ok=False, used_baseline=0.0, reward=0.0, error=str(e))
+ yield ack.model_dump()
+ return
+
+ with self._pending_lock:
+ decision = self.pending.pop(fb.decision_id, None)
+ # Update pending count metric
+ if self._metrics.get("pending_decisions"):
+ self._metrics["pending_decisions"].set(len(self.pending))
+
+ if not decision:
+ ack = FeedbackAck(ok=False, used_baseline=0.0, reward=0.0, error="unknown_decision")
+ yield ack.model_dump()
+ return
+
+ wid: int = int(decision["wid"])
+ x: np.ndarray = decision["x"]
+ osl: str = str(decision["osl"])
+ prefill_bin: str = str(decision["prefill_bin"])
+ tokens_out = None if fb.tokens_out is None else int(fb.tokens_out)
+ metric, per_tok = self._latency_metric(float(fb.latency_ms), tokens_out)
+
+ # Baseline lookup (hierarchical)
+ baseline_before = self._get_latency_baseline(wid, osl, prefill_bin, per_tok, fallback=metric)
+ reward = self._metric_to_reward(metric, baseline_before, bool(fb.success))
+
+ # Update EMAs only on successes
+ if fb.success:
+ baseline_after = self._update_latency_baselines(wid, osl, prefill_bin, metric, per_tok)
+ else:
+ baseline_after = baseline_before
+
+ # Update bandits with ex-post reward
+ self._update_bandit(wid, reward)
+ self._update_contextual(wid, x, reward)
+
+ # Update Prometheus metrics
+ if self._metrics.get("feedback_latency"):
+ self._metrics["feedback_latency"].labels(worker_id=str(wid)).observe(fb.latency_ms / 1000.0)
+ if self._metrics.get("reward"):
+ self._metrics["reward"].labels(worker_id=str(wid)).set(reward)
+
+ self._emit_trace(
+ "feedback",
+ {
+ "decision_id": fb.decision_id,
+ "wid": wid,
+ "latency_ms": float(fb.latency_ms),
+ "tokens_out": tokens_out,
+ "metric": metric,
+ "per_tok": per_tok,
+ "baseline_used": baseline_before,
+ "baseline_after": baseline_after,
+ "reward": reward,
+ "success": bool(fb.success),
+ "finish_reason": fb.finish_reason or "",
+ })
+
+ logger.info(
+ "Feedback: wid=%s decision=%s metric=%.3f%s baseline=%.3f reward=%.3f success=%s",
+ wid,
+ fb.decision_id,
+ metric,
+ " ms/tok" if per_tok else " ms",
+ baseline_before,
+ reward,
+ fb.success,
+ )
+
+ ack = FeedbackAck(ok=True, used_baseline=float(baseline_before), reward=float(reward), worker_id=wid)
+ yield ack.model_dump()
+ return
+
+ # --------------------- helpers --------------------- #
+ def _get_underloaded(self, metrics: dict[str, Any] | None):
+ if not metrics or not metrics.get("endpoints"):
+ wid = int(random.choice(list(self.engine_client.instance_ids())))
+ return wid, 0.0
+ loads = {ep.get("worker_id"): ep.get("gpu_cache_usage_perc", 0.0) for ep in metrics["endpoints"]}
+ min_val = min(loads.values())
+ candidates = [wid for wid, v in loads.items() if v == min_val]
+ return random.choice(candidates), min_val
+
+
+# ---------------------- worker entry point ---------------------- #
+def parse_args():
+ """Parse minimal CLI arguments.
+
+ The router uses a YAML config file for most parameters.
+ Only frequently-tuned parameters have dedicated CLI flags.
+ Use --override for any other parameter.
+
+ See PARAMETERS.md for full documentation.
+ """
+ parser = argparse.ArgumentParser(
+ description="Optimized Thompson Sampling Router with Prometheus Metrics",
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog="""
+Examples:
+ # Use default config
+ python router.py
+
+ # Use custom config file
+ python router.py --config /path/to/config.yaml
+
+ # Override specific values
+ python router.py --config config.yaml --affinity-base 0.5 --temp-base 1.5
+
+ # Override any config value
+ python router.py --config config.yaml --override load_balancing.gpu_penalty_weight=2.0
+
+See PARAMETERS.md for full parameter documentation.
+ """,
+ )
+
+ # Config file
+ parser.add_argument(
+ "--config",
+ type=str,
+ default=None,
+ help="Path to YAML config file (default: config.yaml in script directory)",
+ )
+
+ # Primary tuning knobs (explicit CLI flags)
+ parser.add_argument(
+ "--affinity-base",
+ type=float,
+ default=None,
+ help="Primary stickiness control [0.0-1.0] (overrides config)",
+ )
+ parser.add_argument(
+ "--temp-base",
+ type=float,
+ default=None,
+ help="Primary exploration control [0.15-2.0] (overrides config)",
+ )
+ parser.add_argument(
+ "--lints-v",
+ type=float,
+ default=None,
+ help="LinTS exploration variance [0.0-1.0] (overrides config)",
+ )
+
+ # Generic override for any config value
+ parser.add_argument(
+ "--override",
+ action="append",
+ default=[],
+ metavar="KEY=VALUE",
+ help="Override any config value using dot notation (repeatable)",
+ )
+
+ return parser.parse_args()
+
+
+@dynamo_worker(static=False)
+async def worker(runtime: DistributedRuntime):
+ # Parse CLI and load config
+ args = parse_args()
+ config = load_config(args.config)
+ config = apply_cli_overrides(config, args)
+
+ component = runtime.namespace("dynamo").component("router")
+ await component.create_service()
+ logger.info("Initializing Optimized Thompson Sampling Router (Prometheus metrics)")
+
+ # Extract config values with nested access
+ router = WorkloadAwareRouter(
+ runtime,
+ # Infrastructure
+ block_size=get_nested(config, "infrastructure.block_size", 64),
+ router_type=str(get_nested(config, "infrastructure.router_type", "kv")).lower(),
+ min_workers=get_nested(config, "infrastructure.min_workers", 1),
+ # Affinity
+ affinity_base=get_nested(config, "affinity.base", 0.30),
+ affinity_reuse_weight=get_nested(config, "affinity.reuse_weight", 0.15),
+ affinity_iat_weight=get_nested(config, "affinity.iat_weight", 0.20),
+ sticky_load_floor=get_nested(config, "affinity.sticky_load_floor", 0.70),
+ # Exploration
+ base_ts_weight=get_nested(config, "exploration.base_ts_weight", 0.10),
+ temp_base=get_nested(config, "exploration.temperature.base", 1.0),
+ temp_min=get_nested(config, "exploration.temperature.min", 0.15),
+ temp_max=get_nested(config, "exploration.temperature.max", 2.0),
+ # Switching cost
+ switch_cost_base=get_nested(config, "switching_cost.base", 0.20),
+ switch_cost_reuse=get_nested(config, "switching_cost.reuse_penalty", 0.08),
+ switch_cost_iat=get_nested(config, "switching_cost.iat_penalty", 0.05),
+ # Load balancing
+ queue_penalty_weight=get_nested(config, "load_balancing.queue_penalty_weight", 0.50),
+ gpu_penalty_weight=get_nested(config, "load_balancing.gpu_penalty_weight", 1.00),
+ outstanding_work_weight=get_nested(config, "load_balancing.outstanding_work_weight", 0.45),
+ job_gpu_coupling_weight=get_nested(config, "load_balancing.job_gpu_coupling_weight", 0.40),
+ job_queue_coupling_weight=get_nested(config, "load_balancing.job_queue_coupling_weight", 0.20),
+ # Prefill
+ prefill_token_scale=get_nested(config, "prefill.token_scale", 1024.0),
+ prefill_weight=get_nested(config, "prefill.weight", 1.0),
+ # LinTS
+ lints_lambda=get_nested(config, "lints.lambda", 1.0),
+ lints_v=get_nested(config, "lints.v", 0.25),
+ lints_forget=get_nested(config, "lints.forget_rate", 0.995),
+ # Feedback
+ feedback_timeout_seconds=get_nested(config, "feedback.timeout_seconds", 120.0),
+ pending_sweep_interval_seconds=get_nested(config, "feedback.sweep_interval_seconds", 5.0),
+ timeout_reward=get_nested(config, "feedback.timeout_reward", 0.0),
+ latency_ema_alpha=get_nested(config, "feedback.latency_ema_alpha", 0.2),
+ # Debug
+ debug_traces=get_nested(config, "debug.traces_enabled", False),
+ debug_trace_dir=get_nested(config, "debug.trace_dir", "/tmp/dynamo_router_traces"),
+ debug_buffer_size=get_nested(config, "debug.buffer_size", 2000),
+ )
+ await router.initialize()
+
+ # Serve both endpoints
+ await asyncio.gather(
+ component.endpoint("find_worker").serve_endpoint(router.generate),
+ component.endpoint("feedback").serve_endpoint(router.feedback),
+ )
+
+
+if __name__ == "__main__":
+ uvloop.install()
+ asyncio.run(worker())
diff --git a/external/dynamo/optimized/router_multilru.py b/external/dynamo/optimized/router_multilru.py
new file mode 100644
index 0000000000..bbeb9ef6e8
--- /dev/null
+++ b/external/dynamo/optimized/router_multilru.py
@@ -0,0 +1,1402 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Optimized Thompson Sampling Router with Prometheus Metrics.
+
+This router implements Contextual Thompson Sampling with:
+ - KV overlap locality
+ - Remaining per-prefix requests (reuse_budget)
+ - OSL-based decode cost, ISL/prefill cost per worker
+ - IAT-based stickiness/opportunity weighting
+ - Instant & outstanding load (no TTL decay)
+ - Delayed bandit update using observed latency via `feedback` endpoint
+ - Timeout penalty for missing feedback
+ - Prometheus metrics (instead of CSV)
+ - Debug traces for offline analysis
+
+Key differences from generalized/router.py:
+ - Uses Prometheus metrics instead of CSV logging
+ - Removed CSV file I/O
+ - Added comprehensive Prometheus gauges, counters, and histograms
+"""
+
+import argparse
+import asyncio
+import json
+import logging
+import math
+import os
+import random
+import threading
+import time
+import uuid
+from collections import deque
+from functools import wraps
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import uvloop
+import yaml
+from dynamo.runtime import DistributedRuntime
+from dynamo.runtime import dynamo_worker
+from dynamo.runtime.logging import configure_dynamo_logging
+from pydantic import BaseModel
+
+# Try to import KV routing classes from dynamo.llm, fallback to stubs if unavailable
+try:
+ from dynamo.llm import KvIndexer
+ from dynamo.llm import OverlapScores
+except ImportError:
+ logger_init = logging.getLogger(__name__)
+ logger_init.warning("dynamo.llm KV classes not available, using fallback implementations")
+
+ class OverlapScores:
+ """Fallback: KV cache overlap scores between a request and workers."""
+
+ def __init__(self, scores: dict[int, float] | None = None):
+ self.scores = scores if scores is not None else {}
+
+ class KvIndexer:
+ """Fallback: KV cache indexer for finding overlap between requests and workers."""
+
+ def __init__(self, engine: Any, block_size: int):
+ self.engine = engine
+ self.block_size = block_size
+
+        async def find_matches_for_request(self, tokens: list[int], min_overlap: int) -> OverlapScores:
+            """Return empty scores: every worker sees zero KV overlap, so
+            routing falls back to load and bandit signals only."""
+            return OverlapScores({})
+
+
+configure_dynamo_logging()
+logger = logging.getLogger(__name__)
+
+WorkerId = int
+
+
+# ---------------------- config loading ---------------------- #
+def get_default_config_path() -> Path:
+ """Get path to default config.yaml in the same directory as this script."""
+ return Path(__file__).parent / "config.yaml"
+
+
+def load_config(config_path: str | Path | None = None) -> dict[str, Any]:
+ """Load configuration from YAML file.
+
+ Args:
+ config_path: Path to YAML config file. If None, uses default config.yaml.
+
+ Returns:
+ Configuration dictionary with nested structure.
+ """
+ if config_path is None:
+ config_path = get_default_config_path()
+
+ config_path = Path(config_path)
+ if not config_path.exists():
+ logger.warning("Config file not found: %s, using built-in defaults", config_path)
+ return get_builtin_defaults()
+
+ with open(config_path, encoding="utf-8") as f:
+ config = yaml.safe_load(f)
+
+ logger.info("Loaded config from: %s", config_path)
+ return config
+
+
+def get_builtin_defaults() -> dict[str, Any]:
+ """Return built-in default configuration (matches config.yaml)."""
+ return {
+ "infrastructure": {
+ "block_size": 64,
+ "router_type": "kv",
+ "min_workers": 1,
+ },
+ "affinity": {
+ "base": 0.30,
+ "reuse_weight": 0.15,
+ "iat_weight": 0.20,
+ "sticky_load_floor": 0.70,
+ },
+ "exploration": {
+ "base_ts_weight": 0.10,
+ "temperature": {
+ "base": 1.0,
+ "min": 0.15,
+ "max": 2.0,
+ },
+ },
+ "switching_cost": {
+ "base": 0.20,
+ "reuse_penalty": 0.08,
+ "iat_penalty": 0.05,
+ },
+ "load_balancing": {
+ "queue_penalty_weight": 0.50,
+ "gpu_penalty_weight": 1.00,
+ "outstanding_work_weight": 0.45,
+ "job_gpu_coupling_weight": 0.40,
+ "job_queue_coupling_weight": 0.20,
+ },
+ "prefill": {
+ "token_scale": 1024.0,
+ "weight": 1.0,
+ },
+ "lints": {
+ "lambda": 1.0,
+ "v": 0.25,
+ "forget_rate": 0.995,
+ },
+ "feedback": {
+ "timeout_seconds": 120.0,
+ "sweep_interval_seconds": 5.0,
+ "timeout_reward": 0.0,
+ "latency_ema_alpha": 0.2,
+ },
+ "debug": {
+ "traces_enabled": False,
+ "trace_dir": "/tmp/dynamo_router_traces",
+ "buffer_size": 2000,
+ },
+ }
+
+
+def get_nested(config: dict, dotted_key: str, default: Any = None) -> Any:
+ """Get a nested value from config using dot notation.
+
+ Args:
+ config: Configuration dictionary
+ dotted_key: Key in dot notation, e.g., "affinity.base"
+ default: Default value if key not found
+
+ Returns:
+ Value at the nested key, or default if not found.
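+
+    Example:
+        >>> get_nested({"affinity": {"base": 0.5}}, "affinity.base")
+        0.5
+        >>> get_nested({"affinity": {"base": 0.5}}, "lints.v", 0.25)
+        0.25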
+ """
+ keys = dotted_key.split(".")
+ obj = config
+ for k in keys:
+ if not isinstance(obj, dict) or k not in obj:
+ return default
+ obj = obj[k]
+ return obj
+
+
+def set_nested(config: dict, dotted_key: str, value: Any) -> None:
+ """Set a nested value in config using dot notation.
+
+ Args:
+ config: Configuration dictionary (modified in place)
+ dotted_key: Key in dot notation, e.g., "affinity.base"
+ value: Value to set
+ """
+ keys = dotted_key.split(".")
+ obj = config
+ for k in keys[:-1]:
+ if k not in obj:
+ obj[k] = {}
+ obj = obj[k]
+ obj[keys[-1]] = value
+
+
+def auto_cast(value_str: str) -> Any:
+ """Auto-cast a string value to appropriate type.
+
+ Args:
+ value_str: String value from CLI
+
+ Returns:
+ Value cast to int, float, bool, or str as appropriate.
+ """
+ # Boolean
+ if value_str.lower() in ("true", "yes", "1"):
+ return True
+ if value_str.lower() in ("false", "no", "0"):
+ return False
+
+ # Integer
+ try:
+ return int(value_str)
+ except ValueError:
+ pass
+
+ # Float
+ try:
+ return float(value_str)
+ except ValueError:
+ pass
+
+ # String
+ return value_str
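+
+# Note: "1"/"0" (like "true"/"false" and "yes"/"no") are matched by the boolean
+# branch before integer parsing, so auto_cast("1") returns True rather than 1.
+# Since bool subclasses int in Python, numeric overrides like "...=1" still
+# compare and compute as expected.
+# Examples: auto_cast("2.5") -> 2.5, auto_cast("42") -> 42, auto_cast("kv") -> "kv".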
+
+
+def apply_cli_overrides(config: dict, args: argparse.Namespace) -> dict:
+ """Apply CLI argument overrides to configuration.
+
+ Args:
+ config: Base configuration dictionary
+ args: Parsed CLI arguments
+
+ Returns:
+ Configuration with CLI overrides applied.
+ """
+ # Apply explicit CLI flags
+ if args.affinity_base is not None:
+ set_nested(config, "affinity.base", args.affinity_base)
+ logger.info("CLI override: affinity.base = %s", args.affinity_base)
+
+ if args.temp_base is not None:
+ set_nested(config, "exploration.temperature.base", args.temp_base)
+ logger.info("CLI override: exploration.temperature.base = %s", args.temp_base)
+
+ if args.lints_v is not None:
+ set_nested(config, "lints.v", args.lints_v)
+ logger.info("CLI override: lints.v = %s", args.lints_v)
+
+ # Apply generic --override flags
+ if args.override:
+ for override in args.override:
+ if "=" not in override:
+ logger.warning("Invalid override format (expected key=value): %s", override)
+ continue
+ key, value_str = override.split("=", 1)
+ value = auto_cast(value_str)
+ set_nested(config, key, value)
+ logger.info("CLI override: %s = %s", key, value)
+
+ return config
+
+
+def _init_prometheus_metrics():
+    """Initialize Prometheus metrics lazily.
+
+    The result is cached on the function object so repeated calls return the
+    same collectors instead of re-registering them; a per-call ``lru_cache``
+    on the nested function would be recreated on every call, and a second
+    call would then raise on the duplicate metric names.
+    """
+    cached = getattr(_init_prometheus_metrics, "_cache", None)
+    if cached is not None:
+        return cached
+
+    def _init() -> dict:
+ metrics: dict = {}
+ try:
+ from prometheus_client import REGISTRY
+ from prometheus_client import Counter
+ from prometheus_client import Gauge
+ from prometheus_client import Histogram
+
+ metrics["decisions_total"] = Counter(
+ "thompson_router_decisions_total",
+ "Total routing decisions by worker",
+ ["worker_id"],
+ registry=REGISTRY,
+ )
+ metrics["kv_overlap"] = Gauge(
+ "thompson_router_kv_overlap",
+ "KV cache overlap score for last decision by worker",
+ ["worker_id"],
+ registry=REGISTRY,
+ )
+ metrics["feedback_latency"] = Histogram(
+ "thompson_router_feedback_latency_seconds",
+ "Latency from feedback by worker",
+ ["worker_id"],
+ buckets=[0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, 120.0],
+ registry=REGISTRY,
+ )
+ metrics["reward"] = Gauge(
+ "thompson_router_reward",
+ "Last computed reward by worker",
+ ["worker_id"],
+ registry=REGISTRY,
+ )
+ metrics["pending_decisions"] = Gauge(
+ "thompson_router_pending_decisions",
+ "Number of pending decisions awaiting feedback",
+ registry=REGISTRY,
+ )
+ metrics["timeout_penalties"] = Counter(
+ "thompson_router_timeout_penalties_total",
+ "Total timeout penalties applied",
+ registry=REGISTRY,
+ )
+ metrics["sticky_decisions"] = Counter(
+ "thompson_router_sticky_decisions_total",
+ "Decisions that stayed on the same worker (sticky)",
+ registry=REGISTRY,
+ )
+ metrics["switch_decisions"] = Counter(
+ "thompson_router_switch_decisions_total",
+ "Decisions that switched to a different worker",
+ registry=REGISTRY,
+ )
+ metrics["beta_alpha"] = Gauge(
+ "thompson_router_beta_alpha",
+ "Beta distribution alpha parameter by worker",
+ ["worker_id"],
+ registry=REGISTRY,
+ )
+ metrics["beta_beta"] = Gauge(
+ "thompson_router_beta_beta",
+ "Beta distribution beta parameter by worker",
+ ["worker_id"],
+ registry=REGISTRY,
+ )
+ metrics["prefix_state_size"] = Gauge(
+ "thompson_router_prefix_state_size",
+ "Number of active prefix states",
+ registry=REGISTRY,
+ )
+ metrics["reuse_budget"] = Histogram(
+ "thompson_router_reuse_budget",
+ "Distribution of reuse_budget values",
+ buckets=[0, 1, 2, 5, 10, 20, 50, 100],
+ registry=REGISTRY,
+ )
+ metrics["tokens_per_request"] = Histogram(
+ "thompson_router_tokens_per_request",
+ "Distribution of input token counts",
+ buckets=[32, 64, 128, 256, 512, 1024, 2048, 4096, 8192],
+ registry=REGISTRY,
+ )
+ logger.info("Prometheus metrics initialized for router")
+ except ImportError:
+ logger.warning("prometheus_client not available, metrics disabled")
+
+ return metrics
+
+    metrics = _init()
+    _init_prometheus_metrics._cache = metrics
+    return metrics
+
+
+# ---------------------- request / response models ---------------------- #
+class RouterRequest(BaseModel):
+ tokens: list[int]
+ prefix_id: str = ""
+ reuse_budget: int = 0 # remaining *after this request*
+ expected_osl: str | None = "MEDIUM"
+ interarrival: str | None = "MEDIUM"
+
+
+class RouterResponse(BaseModel):
+ worker_id: int
+ prefix_hit_rate: float
+ decision_id: str | None = None
+
+
+class FeedbackRequest(BaseModel):
+ decision_id: str
+ latency_ms: float
+ success: bool | None = True
+ tokens_in: int | None = None
+ tokens_out: int | None = None
+ finish_reason: str | None = None
+
+
+class FeedbackAck(BaseModel):
+ ok: bool
+ used_baseline: float
+ reward: float
+ worker_id: int | None = None
+ error: str | None = None
+
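+# Illustrative payloads for the models above (field names come from the
+# models; the values are invented for the example):
+#   find_worker request: {"tokens": [1, 2, 3], "prefix_id": "doc-42",
+#                         "reuse_budget": 4, "expected_osl": "HIGH",
+#                         "interarrival": "LOW"}
+#   router response:     {"worker_id": 0, "prefix_hit_rate": 0.5,
+#                         "decision_id": "3f2a..."}
+#   feedback request:    {"decision_id": "3f2a...", "latency_ms": 830.0,
+#                         "success": true, "tokens_out": 256}
+#   feedback ack:        {"ok": true, "used_baseline": 3.2, "reward": 0.5,
+#                         "worker_id": 0}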
+
+# ---------------------- helper decorator ---------------------- #
+def safe_update(lock_name: str):
+    """Decorator factory: run the wrapped method while holding the named
+    lock attribute on ``self`` (e.g. ``@safe_update("_init_lock")``)."""
+
+ def decorator(fn):
+
+ @wraps(fn)
+ def wrapper(self, *args, **kwargs):
+ lock = getattr(self, lock_name)
+ with lock:
+ return fn(self, *args, **kwargs)
+
+ return wrapper
+
+ return decorator
+
+
+# ---------------------- router implementation ---------------------- #
+class WorkloadAwareRouter:
+ """
+ Contextual Thompson Sampling router with Prometheus metrics.
+ """
+
+ def __init__(
+ self,
+ runtime: DistributedRuntime,
+ block_size: int = 64,
+ router_type: str = "kv",
+ min_workers: int = 1,
+ # Affinity / exploration
+ affinity_base: float = 0.30,
+ affinity_reuse_weight: float = 0.15,
+ affinity_iat_weight: float = 0.20,
+ base_ts_weight: float = 0.10,
+ sticky_load_floor: float = 0.70,
+ # Softmax temperature
+ temp_base: float = 1.0,
+ temp_min: float = 0.15,
+ temp_max: float = 2.0,
+ # Switching cost
+ switch_cost_base: float = 0.20,
+ switch_cost_reuse: float = 0.08,
+ switch_cost_iat: float = 0.05,
+ # Load / opportunity cost
+ queue_penalty_weight: float = 0.50,
+ gpu_penalty_weight: float = 1.00,
+ outstanding_work_weight: float = 0.45,
+ job_gpu_coupling_weight: float = 0.40,
+ job_queue_coupling_weight: float = 0.20,
+ # Prefill / ISL
+ prefill_token_scale: float = 1024.0,
+ prefill_weight: float = 1.0,
+ # LinTS
+ lints_lambda: float = 1.0,
+ lints_v: float = 0.25,
+ lints_forget: float = 0.995,
+ # ---------- Feedback timeout / sweep ----------
+ feedback_timeout_seconds: float = 120.0,
+ pending_sweep_interval_seconds: float = 5.0,
+ timeout_reward: float = 0.0,
+ # ---------- Latency EMA (reward normalization) ----------
+ latency_ema_alpha: float = 0.2,
+ # ---------- Debug traces ----------
+ debug_traces: bool = False,
+ debug_trace_dir: str = "/tmp/dynamo_router_traces",
+ debug_buffer_size: int = 2000,
+ ):
+ self.runtime = runtime
+ self.block_size = block_size
+ self.router_type = router_type
+ self.min_workers = min_workers
+
+ # clients / helpers (initialized later)
+ self.engine_client = None
+ self.indexer: KvIndexer | None = None
+
+ # concurrency primitives
+ self._init_lock = threading.Lock()
+ self._bandit_lock = threading.Lock()
+ self._prefix_lock = threading.Lock()
+ self._lin_lock = threading.Lock()
+ self._pending_lock = threading.Lock()
+
+ # prefix state: pid -> {"worker": int|None, "reuse_remaining": int}
+ self.prefix_cache_state: dict[str, dict[str, int | None]] = {}
+ # pid -> {"decode_cost","prefill_cost","iat_factor"}
+ self.prefix_meta: dict[str, dict[str, float]] = {}
+
+ # Beta bandits and LinTS params
+ self.worker_bandits: dict[int, tuple[float, float]] = {}
+ self.feature_dim = 9
+ self.lin_lambda = float(lints_lambda)
+ self.lin_v = float(lints_v)
+        # Clamp the forgetting factor to (0, 1) so updates stay stable.
+        self.lin_forget = max(1e-6, min(float(lints_forget), 0.999999))
+ self.linA: dict[int, np.ndarray] = {}
+ self.linb: dict[int, np.ndarray] = {}
+
+ # knobs
+ self.affinity_base = float(affinity_base)
+ self.affinity_reuse_weight = float(affinity_reuse_weight)
+ self.affinity_iat_weight = float(affinity_iat_weight)
+ self.base_ts_weight = float(base_ts_weight)
+ self.sticky_load_floor = float(sticky_load_floor)
+ self.temp_base = float(temp_base)
+ self.temp_min = float(temp_min)
+ self.temp_max = float(temp_max)
+ self.switch_cost_base = float(switch_cost_base)
+ self.switch_cost_reuse = float(switch_cost_reuse)
+ self.switch_cost_iat = float(switch_cost_iat)
+ self.queue_penalty_weight = float(queue_penalty_weight)
+ self.gpu_penalty_weight = float(gpu_penalty_weight)
+ self.outstanding_work_weight = float(outstanding_work_weight)
+ self.job_gpu_coupling_weight = float(job_gpu_coupling_weight)
+ self.job_queue_coupling_weight = float(job_queue_coupling_weight)
+ self.prefill_token_scale = float(prefill_token_scale)
+ self.prefill_weight = float(prefill_weight)
+
+ # LinTS numerics
+ self._jt_base = 1e-9
+ self._jt_mult = 10.0
+ self._jt_max = 1e-3
+ self._eig_floor = 1e-10
+
+ # Feedback timeout / sweep
+ self.feedback_timeout_seconds = float(feedback_timeout_seconds)
+ self.pending_sweep_interval_seconds = float(pending_sweep_interval_seconds)
+ self.timeout_reward = float(max(0.0, min(1.0, timeout_reward)))
+ self._last_pending_sweep = 0.0
+
+ # Latency EMA baselines (two modes: raw ms, or ms/token)
+ self.latency_ema_alpha = float(latency_ema_alpha)
+ # Global (per-mode)
+ self.lat_ema_global: dict[bool, float | None] = {False: None, True: None}
+ # Per worker (per-mode)
+ self.lat_ema_worker: dict[tuple[int, bool], float] = {}
+ # Per bucket (per-mode): (wid, osl, prefill_bin, per_tok) -> value
+ self.lat_ema_bucket: dict[tuple[int, str, str, bool], float] = {}
+
+ # Pending decisions waiting for feedback
+ self.pending: dict[str, dict[str, Any]] = {}
+
+ # Debug traces
+ self.debug_traces = bool(debug_traces)
+ self.debug_trace_dir = str(debug_trace_dir)
+ self.recent_traces: deque = deque(maxlen=int(debug_buffer_size))
+ if self.debug_traces:
+ os.makedirs(self.debug_trace_dir, exist_ok=True)
+ logger.info("Router debug traces enabled -> %s", self.debug_trace_dir)
+
+ # Prometheus metrics
+ self._metrics = {}
+
+ # --------------------- tracing --------------------- #
+ def _emit_trace(self, kind: str, payload: dict[str, Any]):
+ if not self.debug_traces:
+ return
+ item = {"ts": time.time(), "kind": kind, **payload}
+ self.recent_traces.append(item)
+ try:
+ path = os.path.join(self.debug_trace_dir, "router_traces.jsonl")
+ with open(path, "a", encoding="utf-8") as f:
+ f.write(json.dumps(item, separators=(",", ":")) + "\n")
+ except Exception as e:
+ logger.debug("Trace write failed: %s", e)
+
+ # --------------------- level mappings --------------------- #
+ @staticmethod
+ def _norm_level(s: str | None, default: str = "MEDIUM") -> str:
+ if not s:
+ return default
+ s = str(s).strip().upper()
+ return s if s in ("LOW", "MEDIUM", "HIGH") else default
+
+    @staticmethod
+    def _decode_cost(osl: str) -> float:
+        """Relative decode cost per output-sequence-length (OSL) bucket."""
+        return {"LOW": 1.0, "MEDIUM": 2.0, "HIGH": 3.0}[osl]
+
+    @staticmethod
+    def _iat_factor(iat: str) -> float:
+        """Stickiness/opportunity weight per inter-arrival-time bucket:
+        short gaps (LOW) weigh affinity and switching cost up, long gaps
+        (HIGH) weigh them down."""
+        return {"LOW": 1.5, "MEDIUM": 1.0, "HIGH": 0.6}[iat]
+
+ # --------------------- init --------------------- #
+ async def initialize(self):
+ """Initialize router by polling for backend workers."""
+ # Initialize Prometheus metrics
+ self._metrics = _init_prometheus_metrics()
+
+ # Connect to actual workers at workers.{component}.generate
+ # Workers are in the "workers" namespace (hidden from frontend discovery)
+ # Component name varies by backend (REQUIRED - no default):
+ # - SGLang: uses "worker" (set via --endpoint workers.worker.generate)
+ # - vLLM: uses "backend" (hardcoded in dynamo.vllm)
+ worker_component = os.environ.get("DYNAMO_WORKER_COMPONENT")
+ if not worker_component:
+ raise ValueError("DYNAMO_WORKER_COMPONENT environment variable is required. "
+ "Set to 'worker' for SGLang or 'backend' for vLLM.")
+ engine = self.runtime.namespace("workers").component(worker_component)
+ logger.info("Getting engine client for workers/%s/generate", worker_component)
+ self.engine_client = await engine.endpoint("generate").client()
+
+ min_workers = int(self.min_workers)
+ if min_workers < 0:
+ raise ValueError(f"min_workers must be >= 0, got {min_workers}")
+
+ timeout_s = float(os.environ.get("DYNAMO_ROUTER_WAIT_FOR_WORKERS_TIMEOUT_S", "600"))
+ if not math.isfinite(timeout_s) or timeout_s <= 0:
+ raise ValueError("DYNAMO_ROUTER_WAIT_FOR_WORKERS_TIMEOUT_S must be a finite number > 0")
+
+ deadline = time.monotonic() + timeout_s
+ backoff_s = 0.5
+
+ logger.info("Waiting for backend workers (min_workers=%d, timeout_s=%.1f)...", min_workers, timeout_s)
+
+ if min_workers == 0:
+ instance_ids_raw = list(self.engine_client.instance_ids())
+ logger.info("Backend workers discovered (min_workers=0): %s", instance_ids_raw)
+ else:
+ while True:
+ remaining = deadline - time.monotonic()
+ if remaining <= 0:
+ raise TimeoutError(f"Timed out after {timeout_s}s waiting for >= {min_workers} backend worker(s)")
+
+ try:
+ await asyncio.wait_for(
+ self.engine_client.wait_for_instances(),
+ timeout=min(remaining, 10.0),
+ )
+ except TimeoutError:
+ pass
+
+ instance_ids_raw = list(self.engine_client.instance_ids())
+ if len(instance_ids_raw) >= min_workers:
+ try:
+ instance_ids = [int(w) for w in instance_ids_raw]
+ except Exception:
+ instance_ids = instance_ids_raw
+ logger.info("Backend workers discovered: %s", instance_ids)
+ break
+
+ await asyncio.sleep(backoff_s)
+ backoff_s = min(backoff_s * 1.5, 5.0)
+
+ self.indexer = KvIndexer(engine, self.block_size)
+
+ self._initialize_bandits()
+ self._initialize_contextual()
+ logger.info("WorkloadAwareRouter initialized with %d backend worker(s)",
+ len(list(self.engine_client.instance_ids())))
+
+ @safe_update("_init_lock")
+ def _initialize_bandits(self):
+ for wid in self.engine_client.instance_ids():
+ wid = int(wid)
+ self.worker_bandits.setdefault(wid, (1.0, 1.0))
+ # Update Prometheus metrics
+ if self._metrics.get("beta_alpha"):
+ self._metrics["beta_alpha"].labels(worker_id=str(wid)).set(1.0)
+ if self._metrics.get("beta_beta"):
+ self._metrics["beta_beta"].labels(worker_id=str(wid)).set(1.0)
+
+ @safe_update("_init_lock")
+ def _initialize_contextual(self):
+ for wid in self.engine_client.instance_ids():
+ wid = int(wid)
+ if wid not in self.linA:
+ self.linA[wid] = self.lin_lambda * np.eye(self.feature_dim, dtype=np.float64)
+ self.linb[wid] = np.zeros(self.feature_dim, dtype=np.float64)
+
+ def _ensure_worker_context(self, worker_id: int):
+ if worker_id not in self.linA:
+ with self._lin_lock:
+ if worker_id not in self.linA:
+ self.linA[worker_id] = self.lin_lambda * np.eye(self.feature_dim, dtype=np.float64)
+ self.linb[worker_id] = np.zeros(self.feature_dim, dtype=np.float64)
+
+ # --------------------- prefix state --------------------- #
+ @safe_update("_prefix_lock")
+ def _get_prefix(self, pid: str) -> tuple[int | None, int]:
+ info = self.prefix_cache_state.get(pid)
+ if info:
+ return info.get("worker"), int(info.get("reuse_remaining") or 0)
+ return None, 0
+
+ @safe_update("_prefix_lock")
+ def _set_prefix(
+ self,
+ pid: str,
+ wid: int,
+ reuse_remaining: int,
+ decode_cost: float,
+ prefill_cost: float,
+ iat_factor: float,
+ ):
+ """Record/refresh prefix assignment."""
+ if reuse_remaining <= 0:
+ self.prefix_cache_state.pop(pid, None)
+ self.prefix_meta.pop(pid, None)
+ else:
+ self.prefix_cache_state[pid] = {"worker": wid, "reuse_remaining": max(0, int(reuse_remaining))}
+ self.prefix_meta[pid] = {
+ "decode_cost": float(decode_cost),
+ "prefill_cost": float(max(prefill_cost, 0.0)),
+ "iat_factor": float(iat_factor),
+ }
+
+ # Update prefix state size metric
+ if self._metrics.get("prefix_state_size"):
+ self._metrics["prefix_state_size"].set(len(self.prefix_cache_state))
+
+ def _worker_outstanding(self, wid: int) -> tuple[int, float]:
+ """Returns (reuse_total, work_total) for a worker."""
+ reuse_total = 0
+ work_total = 0.0
+ for pid, info in self.prefix_cache_state.items():
+ if info.get("worker") != wid:
+ continue
+ r = int(info.get("reuse_remaining") or 0)
+ reuse_total += r
+ meta = self.prefix_meta.get(pid)
+ if meta:
+ work_total += float(r) * (float(meta.get("decode_cost", 2.0)) +
+ float(meta.get("prefill_cost", 0.0))) * float(meta.get("iat_factor", 1.0))
+ return reuse_total, work_total
+
+ # --------------------- bandits --------------------- #
+ def _linTS_sample(self, wid: int, x: np.ndarray) -> float:
+ self._ensure_worker_context(wid)
+ with self._lin_lock:
+ A = np.array(self.linA[wid], dtype=np.float64, copy=True)
+ b = np.array(self.linb[wid], dtype=np.float64, copy=True)
+
+ A = 0.5 * (A + A.T)
+ eye = np.eye(self.feature_dim, dtype=np.float64)
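+        # Thompson sample: theta ~ N(mu, v^2 * A^{-1}) with mu = A^{-1} b.
+        # Try a Cholesky factorization with growing jitter; if A is still not
+        # positive definite, fall back to an eigendecomposition with floored
+        # eigenvalues.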
+ jitter = self._jt_base
+ L = None
+ while True:
+ try:
+ L = np.linalg.cholesky(A + jitter * eye)
+ break
+ except np.linalg.LinAlgError:
+ jitter = jitter * self._jt_mult if jitter > 0 else self._jt_base
+ if jitter > self._jt_max:
+ vals, vecs = np.linalg.eigh(A)
+ vals = np.maximum(vals, self._eig_floor)
+ A_inv = vecs @ (np.diag(1.0 / vals)) @ vecs.T
+ mu = A_inv @ b
+ z = np.random.normal(size=self.feature_dim)
+ noise = vecs @ (z / np.sqrt(vals))
+ theta = mu + (self.lin_v * noise)
+ return float(theta @ x)
+
+ y = np.linalg.solve(L, b)
+ mu = np.linalg.solve(L.T, y)
+ z = np.random.normal(size=self.feature_dim)
+ noise = np.linalg.solve(L.T, z)
+ theta = mu + (self.lin_v * noise)
+ return float(theta @ x)
+
+ def _update_contextual(self, wid: int, x: np.ndarray, reward: float):
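+        # Forgetful ridge update: A <- g*A + x x^T + (1-g)*lambda*I and
+        # b <- g*b + r*x, where g = lin_forget. Re-adding the ridge keeps A
+        # well-conditioned as old observations decay.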
+ r = float(max(0.0, min(1.0, reward)))
+ with self._lin_lock:
+ A = self.linA[wid]
+ b = self.linb[wid]
+ A *= self.lin_forget
+ b *= self.lin_forget
+ A += np.outer(x, x)
+ ridge = (1.0 - self.lin_forget) * self.lin_lambda
+ if ridge > 0.0:
+ A += ridge * np.eye(self.feature_dim, dtype=np.float64)
+ self.linA[wid] = 0.5 * (A + A.T)
+ self.linb[wid] = b + x * r
+
+ def _ts_sample(self, worker_id: int) -> float:
+ with self._bandit_lock:
+ alpha, beta = self.worker_bandits.get(worker_id, (1.0, 1.0))
+ return np.random.beta(alpha, beta)
+
+ def _update_bandit(self, worker_id: int, reward: float):
+ with self._bandit_lock:
+ alpha, beta = self.worker_bandits.get(worker_id, (1.0, 1.0))
+ r = float(max(0.0, min(1.0, reward)))
+ new_alpha = alpha + r
+ new_beta = beta + 1.0 - r
+ self.worker_bandits[worker_id] = (new_alpha, new_beta)
+
+ # Update Prometheus metrics
+ if self._metrics.get("beta_alpha"):
+ self._metrics["beta_alpha"].labels(worker_id=str(worker_id)).set(new_alpha)
+ if self._metrics.get("beta_beta"):
+ self._metrics["beta_beta"].labels(worker_id=str(worker_id)).set(new_beta)
+
+ # --------------------- features / scores --------------------- #
+ def _prefill_cost_for_worker(self, tokens: list[int], overlap: float) -> float:
+ isl = max(0, len(tokens))
+ frac = min(max(float(overlap), 0.0), 1.0)
+ uncached = max(0.0, float(isl) * (1.0 - frac))
+ return (uncached / self.prefill_token_scale) * self.prefill_weight
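+
+    # Example: with the defaults token_scale=1024 and weight=1.0, a
+    # 2048-token request with overlap=0.75 leaves 512 uncached tokens, so
+    # prefill_cost = 512 / 1024 * 1.0 = 0.5, which _prefill_bin maps to
+    # "MEDIUM".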
+
+ @staticmethod
+ def _prefill_bin(prefill_cost: float) -> str:
+ if prefill_cost < 0.25:
+ return "LOW"
+ if prefill_cost < 0.75:
+ return "MEDIUM"
+ return "HIGH"
+
+ def _feature_vector(
+ self,
+ wid: int,
+ metrics: dict[str, Any] | None,
+ scores: "OverlapScores",
+ last_w: int | None,
+ reuse_after: int,
+ decode_cost: float,
+ prefill_cost: float,
+ iat_factor: float,
+ ) -> np.ndarray:
+ gpu = 0.0
+ queue = 0.0
+ if metrics and isinstance(metrics, dict) and "endpoints" in metrics:
+ for ep in metrics["endpoints"]:
+ if ep.get("worker_id") == wid:
+ gpu = float(ep.get("gpu_cache_usage_perc", 0.0))
+ queue = float(ep.get("num_requests_waiting", 0.0))
+ break
+ inv_load = 1.0 / (1.0 + self.gpu_penalty_weight * max(0.0, gpu) + self.queue_penalty_weight * max(0.0, queue))
+
+ overlap = float(scores.scores.get(wid, 0.0))
+ affinity = 1.0 if (last_w is not None and wid == last_w) else 0.0
+ _, work_out = self._worker_outstanding(wid)
+
+ decode_norm = decode_cost / 3.0
+ prefill_norm = math.tanh(prefill_cost)
+ iat_norm = iat_factor / 1.5
+ outstanding_norm = math.tanh(0.1 * work_out)
+ reuse_norm = math.tanh(0.25 * float(max(reuse_after, 0)))
+
+ return np.array([
+ 1.0,
+ inv_load,
+ overlap,
+ affinity,
+ outstanding_norm,
+ decode_norm,
+ prefill_norm,
+ iat_norm,
+ reuse_norm,
+ ],
+ dtype=np.float64)
+
+ def _load_score(self, wid: int, metrics: dict[str, Any] | None, job_cost_total: float) -> float:
+ gpu = 0.0
+ queue = 0.0
+ if metrics and isinstance(metrics, dict) and "endpoints" in metrics:
+ for ep in metrics["endpoints"]:
+ if ep.get("worker_id") == wid:
+ gpu = float(ep.get("gpu_cache_usage_perc", 0.0))
+ queue = float(ep.get("num_requests_waiting", 0.0))
+ break
+ _, work_out = self._worker_outstanding(wid)
+ penalty = (self.gpu_penalty_weight * gpu + self.queue_penalty_weight * queue +
+ self.outstanding_work_weight * max(0.0, work_out) +
+ self.job_gpu_coupling_weight * job_cost_total * gpu +
+ self.job_queue_coupling_weight * job_cost_total * queue)
+ return 1.0 / (1.0 + max(0.0, penalty))
+
+ def _softmax(self, scores: list[float], temp: float) -> list[float]:
+ t = float(min(max(temp, self.temp_min), self.temp_max))
+ m = float(np.max(scores))
+ exps = np.exp((np.array(scores) - m) / max(1e-6, t))
+ s = float(np.sum(exps))
+ if s <= 0.0 or not np.isfinite(s):
+ return [1.0 / len(scores)] * len(scores)
+ return list((exps / s).astype(float))
+
+ # --------------------- selection --------------------- #
+ def _select_worker(
+ self,
+ worker_ids,
+ req: RouterRequest,
+ metrics: dict[str, Any] | None,
+ scores: OverlapScores,
+ ) -> tuple[int, dict[str, float], dict[int, dict[str, float]], list[float], list[float]]:
+ osl = self._norm_level(req.expected_osl, "MEDIUM")
+ iat = self._norm_level(req.interarrival, "MEDIUM")
+ last_w, _ = self._get_prefix(req.prefix_id)
+
+ reuse_after = max(int(req.reuse_budget), 0)
+ decode_cost = self._decode_cost(osl)
+ iat_factor = self._iat_factor(iat)
+
+ temp = self.temp_base / (1.0 + float(reuse_after) * iat_factor)
+ temp = min(max(temp, self.temp_min), self.temp_max)
+
+ raw_scores: list[float] = []
+ worker_list: list[int] = [int(w) for w in worker_ids]
+ per_worker_ctx: dict[int, dict[str, float]] = {}
+ load_mods: list[float] = []
+ overlaps: list[float] = []
+
+ for wid in worker_list:
+ overlap = float(scores.scores.get(wid, 0.0))
+ prefill_cost = self._prefill_cost_for_worker(req.tokens, overlap)
+ job_cost_total = decode_cost + prefill_cost
+
+ x = self._feature_vector(
+ wid=wid,
+ metrics=metrics,
+ scores=scores,
+ last_w=last_w,
+ reuse_after=reuse_after,
+ decode_cost=decode_cost,
+ prefill_cost=prefill_cost,
+ iat_factor=iat_factor,
+ )
+
+ val = self._linTS_sample(wid, x)
+ explore_w = self.base_ts_weight / (1.0 + float(reuse_after) * iat_factor)
+ val += explore_w * self._ts_sample(wid)
+
+ if last_w == wid and (reuse_after > 0):
+ val += (self.affinity_base + self.affinity_reuse_weight * float(reuse_after) +
+ self.affinity_iat_weight * iat_factor) * (0.5 + 0.5 * overlap)
+
+ if last_w is not None and wid != last_w and (reuse_after > 0):
+ val -= (self.switch_cost_base + self.switch_cost_reuse * float(reuse_after) +
+ self.switch_cost_iat * iat_factor)
+
+ load_mod = self._load_score(wid, metrics, job_cost_total=job_cost_total)
+ if last_w == wid and reuse_after > 0:
+ load_mod = max(load_mod, self.sticky_load_floor)
+ val *= load_mod
+
+ if np.isnan(val) or np.isinf(val):
+ val = -1e9
+
+ raw_scores.append(float(val))
+ load_mods.append(float(load_mod))
+ overlaps.append(float(overlap))
+ per_worker_ctx[wid] = {
+ "decode_cost": decode_cost,
+ "prefill_cost": prefill_cost,
+ "iat_factor": iat_factor,
+ "overlap": overlap,
+ "reuse_after": float(reuse_after),
+ "load_mod": load_mod,
+ }
+
+ probs = self._softmax(raw_scores, temp)
+ r = random.random()
+        # Inverse-CDF sampling over probs; default to the last index so that
+        # floating-point rounding in the cumulative sum cannot fall through
+        # the loop and silently pick worker 0.
+        cum = 0.0
+        idx = len(probs) - 1
+        for i, p in enumerate(probs):
+            cum += p
+            if r <= cum:
+                idx = i
+                break
+ chosen = int(worker_list[idx])
+
+ return chosen, per_worker_ctx[chosen], per_worker_ctx, raw_scores, probs
+
+ # --------------------- latency baselines & reward --------------------- #
+ def _ema_update(self, old: float | None, new: float) -> float:
+ a = self.latency_ema_alpha
+ return new if old is None else (a * new + (1.0 - a) * old)
+
+ def _get_latency_baseline(self, wid: int, osl: str, prefill_bin: str, per_tok: bool, fallback: float) -> float:
+ key_b = (wid, osl, prefill_bin, per_tok)
+ key_w = (wid, per_tok)
+ if key_b in self.lat_ema_bucket:
+ return self.lat_ema_bucket[key_b]
+ if key_w in self.lat_ema_worker:
+ return self.lat_ema_worker[key_w]
+ if self.lat_ema_global[per_tok] is not None:
+ return self.lat_ema_global[per_tok] # type: ignore
+ return max(1.0, float(fallback))
+
+ def _update_latency_baselines(self, wid: int, osl: str, prefill_bin: str, metric: float, per_tok: bool) -> float:
+ self.lat_ema_global[per_tok] = self._ema_update(self.lat_ema_global[per_tok], metric)
+ key_w = (wid, per_tok)
+ self.lat_ema_worker[key_w] = self._ema_update(self.lat_ema_worker.get(key_w), metric)
+ key_b = (wid, osl, prefill_bin, per_tok)
+ self.lat_ema_bucket[key_b] = self._ema_update(self.lat_ema_bucket.get(key_b), metric)
+ return self.lat_ema_bucket[key_b]
+
+ @staticmethod
+ def _latency_metric(latency_ms: float, tokens_out: int | None) -> tuple[float, bool]:
+ if tokens_out is not None and int(tokens_out) > 0:
+ return float(latency_ms) / float(max(1, int(tokens_out))), True
+ return float(latency_ms), False
+
+ @staticmethod
+ def _metric_to_reward(metric: float, baseline: float, success: bool) -> float:
+ if not success:
+ return 0.0
+ denom = max(1e-3, baseline)
+ ratio = metric / denom
+ return float(1.0 / (1.0 + ratio))
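+
+    # Reward shape: reward = 1 / (1 + metric/baseline). Matching the baseline
+    # yields 0.5, a 2x slowdown yields ~0.33, and latencies far below the
+    # baseline approach 1.0; failures always score 0.0.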
+
+ # --------------------- timeout sweep --------------------- #
+ def _sweep_pending(self, now: float):
+ if now - self._last_pending_sweep < self.pending_sweep_interval_seconds:
+ return
+ self._last_pending_sweep = now
+ expired: list[tuple[str, dict[str, Any]]] = []
+ with self._pending_lock:
+ for did, rec in list(self.pending.items()):
+ if now - float(rec.get("start_ts", now)) >= self.feedback_timeout_seconds:
+ expired.append((did, rec))
+ self.pending.pop(did, None)
+
+ # Update pending count metric
+ if self._metrics.get("pending_decisions"):
+ self._metrics["pending_decisions"].set(len(self.pending))
+
+ for did, rec in expired:
+ wid = int(rec["wid"])
+ x = rec["x"]
+ reward = float(self.timeout_reward)
+ self._update_bandit(wid, reward)
+ self._update_contextual(wid, x, reward)
+
+ if self._metrics.get("timeout_penalties"):
+ self._metrics["timeout_penalties"].inc()
+
+ self._emit_trace(
+ "timeout",
+ {
+ "decision_id": did,
+ "wid": wid,
+ "reward": reward,
+ "age": self.feedback_timeout_seconds,
+ "prefix_id": rec.get("prefix_id"),
+ "osl": rec.get("osl"),
+ "prefill_bin": rec.get("prefill_bin"),
+ })
+ logger.warning("Timeout feedback: wid=%s decision=%s reward=%.3f", wid, did, reward)
+
+ # --------------------- main endpoint: find_worker --------------------- #
+    async def generate(self, request: dict):
+        """Route a single request: score workers, sample one via softmax, and
+        record a pending decision that awaits latency feedback."""
+ req = RouterRequest(**request)
+
+ worker_ids = [int(w) for w in self.engine_client.instance_ids()]
+ if not worker_ids:
+ yield RouterResponse(worker_id=-1, prefix_hit_rate=0.0).model_dump()
+ return
+
+ now = time.time()
+ self._sweep_pending(now)
+
+ # Track tokens per request
+ if self._metrics.get("tokens_per_request"):
+ self._metrics["tokens_per_request"].observe(len(req.tokens))
+ if self._metrics.get("reuse_budget"):
+ self._metrics["reuse_budget"].observe(req.reuse_budget)
+
+ metrics = None # TODO: Replace with proper metrics query when API is available
+ if self.router_type == "kv_load":
+ wid, _ = self._get_underloaded(metrics)
+ yield RouterResponse(worker_id=wid, prefix_hit_rate=0.0).model_dump()
+ return
+
+ scores: OverlapScores = await self.indexer.find_matches_for_request(req.tokens, 0)
+ chosen, chosen_ctx, all_ctx, raw_scores, probs = self._select_worker(worker_ids, req, metrics, scores)
+
+ last_w, _ = self._get_prefix(req.prefix_id)
+
+ osl = self._norm_level(req.expected_osl, "MEDIUM")
+ iat = self._norm_level(req.interarrival, "MEDIUM")
+ decode_cost = self._decode_cost(osl)
+ overlap_chosen = float(scores.scores.get(chosen, 0.0))
+ prefill_cost_chosen = self._prefill_cost_for_worker(req.tokens, overlap_chosen)
+ iat_factor = self._iat_factor(iat)
+
+ # Update prefix state
+ self._set_prefix(
+ req.prefix_id,
+ chosen,
+ reuse_remaining=max(int(req.reuse_budget), 0),
+ decode_cost=decode_cost,
+ prefill_cost=prefill_cost_chosen,
+ iat_factor=iat_factor,
+ )
+
+ # Build feature x for chosen & store pending decision
+ x = self._feature_vector(
+ wid=chosen,
+ metrics=metrics,
+ scores=scores,
+ last_w=last_w,
+ reuse_after=max(int(req.reuse_budget), 0),
+ decode_cost=decode_cost,
+ prefill_cost=prefill_cost_chosen,
+ iat_factor=iat_factor,
+ )
+ decision_id = uuid.uuid4().hex
+ with self._pending_lock:
+ self.pending[decision_id] = {
+ "wid": int(chosen),
+ "x": x,
+ "osl": osl,
+ "prefill_bin": self._prefill_bin(prefill_cost_chosen),
+ "start_ts": now,
+ "prefix_id": req.prefix_id,
+ "tokens_in": len(req.tokens),
+ "reuse_after": int(req.reuse_budget),
+ "overlap": overlap_chosen,
+ "prefill_cost": float(prefill_cost_chosen),
+ "decode_cost": float(decode_cost),
+ }
+ # Update pending count metric
+ if self._metrics.get("pending_decisions"):
+ self._metrics["pending_decisions"].set(len(self.pending))
+
+ # Update Prometheus metrics
+ if self._metrics.get("decisions_total"):
+ self._metrics["decisions_total"].labels(worker_id=str(chosen)).inc()
+ if self._metrics.get("kv_overlap"):
+ self._metrics["kv_overlap"].labels(worker_id=str(chosen)).set(overlap_chosen)
+
+ # Track sticky vs switch decisions
+ if last_w is not None:
+ if chosen == last_w:
+ if self._metrics.get("sticky_decisions"):
+ self._metrics["sticky_decisions"].inc()
+ elif self._metrics.get("switch_decisions"):
+ self._metrics["switch_decisions"].inc()
+
+ # Decision trace
+ if self.debug_traces:
+ worker_list = [int(w) for w in worker_ids]
+ details = {
+ wid: {
+ "score": float(raw_scores[i]),
+ "prob": float(probs[i]),
+ **all_ctx[wid],
+ }
+ for i, wid in enumerate(worker_list)
+ }
+ self._emit_trace("decision",
+ {
+ "decision_id": decision_id,
+ "prefix_id": req.prefix_id,
+ "chosen": int(chosen),
+ "workers": details,
+ })
+
+ logger.info(
+ "Router picked worker=%s decision=%s prefix=%s (last=%s reuse_after=%s osl=%s "
+ "prefill_cost=%.3f iat=%s overlap=%.3f)",
+ chosen,
+ decision_id,
+ req.prefix_id,
+ last_w,
+ req.reuse_budget,
+ osl,
+ prefill_cost_chosen,
+ iat,
+ overlap_chosen,
+ )
+
+ resp = RouterResponse(worker_id=chosen, prefix_hit_rate=overlap_chosen, decision_id=decision_id)
+ yield resp.model_dump()
+ return
+
+ # --------------------- feedback endpoint --------------------- #
+ async def feedback(self, request: dict):
+ """Ex-post reward update from processor with observed latency."""
+ try:
+ fb = FeedbackRequest(**request)
+ except Exception as e:
+ ack = FeedbackAck(ok=False, used_baseline=0.0, reward=0.0, error=str(e))
+ yield ack.model_dump()
+ return
+
+ with self._pending_lock:
+ decision = self.pending.pop(fb.decision_id, None)
+ # Update pending count metric
+ if self._metrics.get("pending_decisions"):
+ self._metrics["pending_decisions"].set(len(self.pending))
+
+ if not decision:
+ ack = FeedbackAck(ok=False, used_baseline=0.0, reward=0.0, error="unknown_decision")
+ yield ack.model_dump()
+ return
+
+ wid: int = int(decision["wid"])
+ x: np.ndarray = decision["x"]
+ osl: str = str(decision["osl"])
+ prefill_bin: str = str(decision["prefill_bin"])
+ tokens_out = None if fb.tokens_out is None else int(fb.tokens_out)
+ metric, per_tok = self._latency_metric(float(fb.latency_ms), tokens_out)
+
+ # Baseline lookup (hierarchical)
+ baseline_before = self._get_latency_baseline(wid, osl, prefill_bin, per_tok, fallback=metric)
+ reward = self._metric_to_reward(metric, baseline_before, bool(fb.success))
+
+ # Update EMAs only on successes
+ if fb.success:
+ baseline_after = self._update_latency_baselines(wid, osl, prefill_bin, metric, per_tok)
+ else:
+ baseline_after = baseline_before
+
+ # Update bandits with ex-post reward
+ self._update_bandit(wid, reward)
+ self._update_contextual(wid, x, reward)
+
+ # Update Prometheus metrics
+ if self._metrics.get("feedback_latency"):
+ self._metrics["feedback_latency"].labels(worker_id=str(wid)).observe(fb.latency_ms / 1000.0)
+ if self._metrics.get("reward"):
+ self._metrics["reward"].labels(worker_id=str(wid)).set(reward)
+
+ self._emit_trace(
+ "feedback",
+ {
+ "decision_id": fb.decision_id,
+ "wid": wid,
+ "latency_ms": float(fb.latency_ms),
+ "tokens_out": tokens_out,
+ "metric": metric,
+ "per_tok": per_tok,
+ "baseline_used": baseline_before,
+ "baseline_after": baseline_after,
+ "reward": reward,
+ "success": bool(fb.success),
+ "finish_reason": fb.finish_reason or "",
+ })
+
+ logger.info(
+ "Feedback: wid=%s decision=%s metric=%.3f%s baseline=%.3f reward=%.3f success=%s",
+ wid,
+ fb.decision_id,
+ metric,
+ " ms/tok" if per_tok else " ms",
+ baseline_before,
+ reward,
+ fb.success,
+ )
+
+ ack = FeedbackAck(ok=True, used_baseline=float(baseline_before), reward=float(reward), worker_id=wid)
+ yield ack.model_dump()
+ return
+
+ # --------------------- helpers --------------------- #
+    def _get_underloaded(self, metrics: dict[str, Any] | None):
+        """Pick the worker with the lowest GPU cache usage (random among
+        ties), or a uniformly random worker when metrics are unavailable."""
+ if not metrics or not metrics.get("endpoints"):
+ wid = int(random.choice(list(self.engine_client.instance_ids())))
+ return wid, 0.0
+ loads = {ep.get("worker_id"): ep.get("gpu_cache_usage_perc", 0.0) for ep in metrics["endpoints"]}
+ min_val = min(loads.values())
+ candidates = [wid for wid, v in loads.items() if v == min_val]
+ return random.choice(candidates), min_val
+
+
+# ---------------------- worker entry point ---------------------- #
+def parse_args():
+ """Parse minimal CLI arguments.
+
+ The router uses a YAML config file for most parameters.
+ Only frequently-tuned parameters have dedicated CLI flags.
+ Use --override for any other parameter.
+
+ See PARAMETERS.md for full documentation.
+ """
+ parser = argparse.ArgumentParser(
+ description="Optimized Thompson Sampling Router with Prometheus Metrics",
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog="""
+Examples:
+ # Use default config
+ python router.py
+
+ # Use custom config file
+ python router.py --config /path/to/config.yaml
+
+ # Override specific values
+ python router.py --config config.yaml --affinity-base 0.5 --temp-base 1.5
+
+ # Override any config value
+ python router.py --config config.yaml --override load_balancing.gpu_penalty_weight=2.0
+
+See PARAMETERS.md for full parameter documentation.
+ """,
+ )
+
+ # Config file
+ parser.add_argument(
+ "--config",
+ type=str,
+ default=None,
+ help="Path to YAML config file (default: config.yaml in script directory)",
+ )
+
+ # Primary tuning knobs (explicit CLI flags)
+ parser.add_argument(
+ "--affinity-base",
+ type=float,
+ default=None,
+ help="Primary stickiness control [0.0-1.0] (overrides config)",
+ )
+ parser.add_argument(
+ "--temp-base",
+ type=float,
+ default=None,
+ help="Primary exploration control [0.15-2.0] (overrides config)",
+ )
+ parser.add_argument(
+ "--lints-v",
+ type=float,
+ default=None,
+ help="LinTS exploration variance [0.0-1.0] (overrides config)",
+ )
+
+ # Generic override for any config value
+ parser.add_argument(
+ "--override",
+ action="append",
+ default=[],
+ metavar="KEY=VALUE",
+ help="Override any config value using dot notation (repeatable)",
+ )
+
+ return parser.parse_args()
+
+
+@dynamo_worker()
+async def worker(runtime: DistributedRuntime):
+ # Parse CLI and load config
+ args = parse_args()
+ config = load_config(args.config)
+ config = apply_cli_overrides(config, args)
+
+ component = runtime.namespace("dynamo").component("router")
+ # NOTE: create_service() was removed in Dynamo 0.8.x - endpoint creation handles registration
+ logger.info("Initializing Optimized Thompson Sampling Router (Prometheus metrics)")
+
+ # Extract config values with nested access
+ router = WorkloadAwareRouter(
+ runtime,
+ # Infrastructure
+ block_size=get_nested(config, "infrastructure.block_size", 64),
+ router_type=str(get_nested(config, "infrastructure.router_type", "kv")).lower(),
+ min_workers=get_nested(config, "infrastructure.min_workers", 1),
+ # Affinity
+ affinity_base=get_nested(config, "affinity.base", 0.30),
+ affinity_reuse_weight=get_nested(config, "affinity.reuse_weight", 0.15),
+ affinity_iat_weight=get_nested(config, "affinity.iat_weight", 0.20),
+ sticky_load_floor=get_nested(config, "affinity.sticky_load_floor", 0.70),
+ # Exploration
+ base_ts_weight=get_nested(config, "exploration.base_ts_weight", 0.10),
+ temp_base=get_nested(config, "exploration.temperature.base", 1.0),
+ temp_min=get_nested(config, "exploration.temperature.min", 0.15),
+ temp_max=get_nested(config, "exploration.temperature.max", 2.0),
+ # Switching cost
+ switch_cost_base=get_nested(config, "switching_cost.base", 0.20),
+ switch_cost_reuse=get_nested(config, "switching_cost.reuse_penalty", 0.08),
+ switch_cost_iat=get_nested(config, "switching_cost.iat_penalty", 0.05),
+ # Load balancing
+ queue_penalty_weight=get_nested(config, "load_balancing.queue_penalty_weight", 0.50),
+ gpu_penalty_weight=get_nested(config, "load_balancing.gpu_penalty_weight", 1.00),
+ outstanding_work_weight=get_nested(config, "load_balancing.outstanding_work_weight", 0.45),
+ job_gpu_coupling_weight=get_nested(config, "load_balancing.job_gpu_coupling_weight", 0.40),
+ job_queue_coupling_weight=get_nested(config, "load_balancing.job_queue_coupling_weight", 0.20),
+ # Prefill
+ prefill_token_scale=get_nested(config, "prefill.token_scale", 1024.0),
+ prefill_weight=get_nested(config, "prefill.weight", 1.0),
+ # LinTS
+ lints_lambda=get_nested(config, "lints.lambda", 1.0),
+ lints_v=get_nested(config, "lints.v", 0.25),
+ lints_forget=get_nested(config, "lints.forget_rate", 0.995),
+ # Feedback
+ feedback_timeout_seconds=get_nested(config, "feedback.timeout_seconds", 120.0),
+ pending_sweep_interval_seconds=get_nested(config, "feedback.sweep_interval_seconds", 5.0),
+ timeout_reward=get_nested(config, "feedback.timeout_reward", 0.0),
+ latency_ema_alpha=get_nested(config, "feedback.latency_ema_alpha", 0.2),
+ # Debug
+ debug_traces=get_nested(config, "debug.traces_enabled", False),
+ debug_trace_dir=get_nested(config, "debug.trace_dir", "/tmp/dynamo_router_traces"),
+ debug_buffer_size=get_nested(config, "debug.buffer_size", 2000),
+ )
+ await router.initialize()
+
+ # Serve both endpoints
+ await asyncio.gather(
+ component.endpoint("find_worker").serve_endpoint(router.generate),
+ component.endpoint("feedback").serve_endpoint(router.feedback),
+ )
+
+
+if __name__ == "__main__":
+ uvloop.install()
+ asyncio.run(worker())
diff --git a/external/dynamo/start_dynamo_disagg.sh b/external/dynamo/start_dynamo_disagg.sh
index 8477a66a90..e9935afdfb 100755
--- a/external/dynamo/start_dynamo_disagg.sh
+++ b/external/dynamo/start_dynamo_disagg.sh
@@ -36,7 +36,7 @@ CONTAINER_NAME="dynamo-sglang"
PREFILL_GPUS="${DYNAMO_PREFILL_GPUS:-0,1}"
DECODE_GPUS="${DYNAMO_DECODE_GPUS:-2,3}"
TP_SIZE="${DYNAMO_TP_SIZE:-2}"
-HTTP_PORT="${DYNAMO_HTTP_PORT:-8099}"
+HTTP_PORT="${DYNAMO_HTTP_PORT:-8000}"
MODEL="/workspace/models/Llama-3.3-70B-Instruct"
SERVED_MODEL_NAME="${DYNAMO_MODEL_NAME:-llama-3.3-70b}"
IMAGE="nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.7.1"
diff --git a/external/dynamo/start_dynamo_optimized_thompson_hints_sglang.sh b/external/dynamo/start_dynamo_optimized_thompson_hints_sglang.sh
new file mode 100755
index 0000000000..820e04564b
--- /dev/null
+++ b/external/dynamo/start_dynamo_optimized_thompson_hints_sglang.sh
@@ -0,0 +1,868 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Dynamo SGLang with OPTIMIZED Thompson Sampling Router Architecture
+#
+# Key difference from generalized architecture:
+# - Uses DEFAULT Dynamo frontend (python -m dynamo.frontend)
+# - Custom Processor + Router components
+# - Routing hints passed via nvext.annotations instead of HTTP headers
+# - Prometheus metrics instead of CSV files
+#
+# Architecture:
+# Client → Default Dynamo Frontend (tokenization + nvext parsing)
+# ↓ PreprocessedRequest with annotations
+# Custom Processor (extracts hints, queries router)
+# ↓ RouterRequest
+# Custom Router (Thompson Sampling + KV overlap)
+# ↓ worker_id
+# SGLang Backend Worker
+# ↓ response tokens
+# Processor sends feedback to Router
+#
+# Components:
+# - ETCD (metadata and worker discovery)
+# - NATS (message queue for KV events)
+# - Default Dynamo Frontend (HTTP API on port 8000)
+# - Custom Router (Thompson Sampling + KV overlap)
+# - Custom Processor (hint extraction + routing)
+# - SGLang Workers (unified mode, multiple workers with TP=2 each)
+#
+# Prometheus Metrics (defaults; see the *_METRICS_PORT variables below):
+#   - Frontend: http://localhost:8000/metrics
+#   - Workers: http://localhost:18081/metrics (sequential ports per worker)
+#   - Router: http://localhost:18090/metrics
+#   - Processor: http://localhost:18091/metrics
+#
+# To stop all components: bash stop_dynamo.sh
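+#
+# Example request with routing hints (illustrative: the "key:value" annotation
+# strings below are an assumption about what the custom processor parses,
+# mirroring the RouterRequest fields):
+#
+#   curl -s http://localhost:8000/v1/chat/completions \
+#     -H 'Content-Type: application/json' \
+#     -d '{"model": "llama-3.3-70b",
+#          "messages": [{"role": "user", "content": "Hello"}],
+#          "nvext": {"annotations": ["prefix_id:doc-42", "reuse_budget:4",
+#                                    "expected_osl:HIGH", "interarrival:LOW"]}}'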
+
+set -euo pipefail
+
+# Configuration Variables (can be overridden via environment variables)
+# See env.example for documentation on each variable
+CONTAINER_NAME="dynamo-sglang"
+WORKER_GPUS="${DYNAMO_GPU_DEVICES:-0,1,2,3,4,5,6,7}"
+TP_SIZE="${DYNAMO_TP_SIZE:-2}"
+HTTP_PORT="${DYNAMO_HTTP_PORT:-8000}"
+# Metrics ports - each component gets its own port to avoid conflicts
+# Using 18xxx range to avoid conflicts with common services
+# Workers use sequential ports starting at WORKER_METRICS_PORT (18081, 18082, ...)
+# Router and Processor are offset to allow for many workers
+WORKER_METRICS_PORT="${DYNAMO_WORKER_METRICS_PORT:-18081}"
+ROUTER_METRICS_PORT="${DYNAMO_ROUTER_METRICS_PORT:-18090}"
+PROCESSOR_METRICS_PORT="${DYNAMO_PROCESSOR_METRICS_PORT:-18091}"
+MODEL="/workspace/models/Llama-3.3-70B-Instruct"
+SERVED_MODEL_NAME="${DYNAMO_MODEL_NAME:-llama-3.3-70b}"
+IMAGE="nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.7.1"
+SHM_SIZE="${DYNAMO_SHM_SIZE:-16g}"
+WORKER_INIT_TIMEOUT_S="${DYNAMO_WORKER_INIT_TIMEOUT_S:-1800}"
+
+# KV Cache Configuration
+# Block size in tokens - must match between SGLang (--page-size) and Frontend (--kv-cache-block-size)
+KV_BLOCK_SIZE="${DYNAMO_KV_BLOCK_SIZE:-64}"
+# Fraction of GPU memory for KV cache (0.0-1.0). Reduce to test cache pressure/degradation.
+MEM_FRACTION_STATIC="${DYNAMO_MEM_FRACTION_STATIC:-0.9}"
+
+# Compute container-internal GPU indices (GPUs are renumbered 0,1,2,... inside the container)
+NUM_GPUS=$(echo "$WORKER_GPUS" | tr ',' '\n' | wc -l)
+CONTAINER_GPU_INDICES=$(seq -s, 0 $((NUM_GPUS - 1)))
+
+# Calculate number of workers based on available GPUs and TP size
+NUM_WORKERS=$((NUM_GPUS / TP_SIZE))
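+# e.g., 8 GPUs with TP_SIZE=2 -> 4 workers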
+
+# Local paths - DYNAMO_MODEL_DIR must be set or script will error
+if [ -z "${DYNAMO_MODEL_DIR:-}" ]; then
+ echo "ERROR: DYNAMO_MODEL_DIR environment variable must be set"
+ echo ""
+ echo "Example:"
+ echo " export DYNAMO_MODEL_DIR=\"/path/to/your/models/Llama-3.3-70B-Instruct\""
+ echo ""
+ echo "Then run this script again."
+ exit 1
+fi
+
+# Validate model directory
+if [ -d "${DYNAMO_MODEL_DIR}" ]; then
+ if [ ! -f "${DYNAMO_MODEL_DIR}/config.json" ]; then
+ echo "ERROR: ${DYNAMO_MODEL_DIR} exists but is not a valid model directory"
+ echo ""
+ echo "Missing: config.json"
+ echo ""
+ echo "Find it: find ~/.cache/huggingface/hub -name config.json -path '*Llama-3.3-70B*'"
+ exit 1
+ fi
+
+ if ! grep -q '"model_type"' "${DYNAMO_MODEL_DIR}/config.json" 2>/dev/null; then
+ echo "ERROR: ${DYNAMO_MODEL_DIR}/config.json is missing 'model_type' field"
+ echo ""
+ echo "This usually means incomplete/corrupted download. Try:"
+ echo " rm -rf ${DYNAMO_MODEL_DIR}"
+ echo " hf download meta-llama/Llama-3.3-70B-Instruct --local-dir ${DYNAMO_MODEL_DIR}"
+ exit 1
+ fi
+fi
+LOCAL_MODEL_DIR="${DYNAMO_MODEL_DIR}"
+
+# Repository directory - auto-detect from script location
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CUSTOM_DYNAMO_DIR="${SCRIPT_DIR}/optimized"
+
+echo "========================================================="
+echo "Dynamo SGLang with OPTIMIZED Thompson Sampling Router"
+echo "========================================================="
+echo "Model: Llama-3.3-70B-Instruct"
+echo "Container: $CONTAINER_NAME"
+echo "HTTP Port: $HTTP_PORT (default Dynamo frontend)"
+echo "Metrics Ports:"
+echo " - Worker: $WORKER_METRICS_PORT (KV cache, internal)"
+echo " - Router: $ROUTER_METRICS_PORT (Thompson routing)"
+echo " - Processor: $PROCESSOR_METRICS_PORT (KVE metrics)"
+echo ""
+echo "Architecture Differences (vs generalized):"
+echo " - Default Dynamo frontend (not custom frontend.py)"
+echo " - Hints via nvext.annotations (not HTTP headers)"
+echo " - Prometheus metrics on separate ports per component"
+echo ""
+echo "Components:"
+echo " - ETCD (metadata and discovery)"
+echo " - NATS (message queue for KV events)"
+echo " - Default Frontend (HTTP API on port $HTTP_PORT)"
+echo " - Custom Router (Thompson Sampling + KV overlap)"
+echo " - Custom Processor (hint extraction + routing)"
+echo " - SGLang Worker (unified mode)"
+echo ""
+echo "Backend Workers:"
+echo " Workers: $NUM_WORKERS (GPUs: $NUM_GPUS, TP=$TP_SIZE per worker)"
+echo " GPUs: $WORKER_GPUS"
+echo " Mode: UNIFIED (no prefill/decode disaggregation)"
+echo ""
+echo "KV Cache Configuration:"
+echo " Block Size: $KV_BLOCK_SIZE tokens (--page-size / --kv-cache-block-size)"
+echo " GPU Mem Fraction: $MEM_FRACTION_STATIC (--mem-fraction-static)"
+echo ""
+echo "========================================================="
+
+# Verify custom components exist
+if [ ! -f "$CUSTOM_DYNAMO_DIR/router.py" ]; then
+ echo "✗ ERROR: Custom router.py not found at: $CUSTOM_DYNAMO_DIR/router.py"
+ exit 1
+fi
+if [ ! -f "$CUSTOM_DYNAMO_DIR/processor.py" ]; then
+ echo "✗ ERROR: Custom processor.py not found at: $CUSTOM_DYNAMO_DIR/processor.py"
+ exit 1
+fi
+echo "✓ Custom components found in: $CUSTOM_DYNAMO_DIR"
+echo ""
+
+# Start ETCD if not running
+if docker ps -a --format '{{.Names}}' | grep -q "^etcd-dynamo$"; then
+ echo "Removing existing ETCD container..."
+ docker rm -f etcd-dynamo
+fi
+
+echo "Starting ETCD container..."
+docker run -d \
+ --name etcd-dynamo \
+ --network host \
+ -e ALLOW_NONE_AUTHENTICATION=yes \
+ -e ETCD_LISTEN_CLIENT_URLS=http://0.0.0.0:2379 \
+ -e ETCD_ADVERTISE_CLIENT_URLS=http://localhost:2379 \
+ bitnamilegacy/etcd:3.6.1
+
+# Wait for ETCD to be ready
+echo "Waiting for ETCD to be ready..."
+for i in {1..30}; do
+ if curl -s http://localhost:2379/health > /dev/null 2>&1; then
+ echo "✓ ETCD is ready"
+ sleep 2
+ break
+ fi
+ if [ $i -eq 30 ]; then
+ echo "✗ ERROR: ETCD failed to start within 30 seconds"
+ docker logs etcd-dynamo
+ exit 1
+ fi
+ sleep 1
+done
+
+# Start NATS if not running
+if docker ps -a --format '{{.Names}}' | grep -q "^nats-dynamo$"; then
+ echo "Removing existing NATS container..."
+ docker rm -f nats-dynamo
+fi
+
+echo "Starting NATS container..."
+docker run -d \
+ --name nats-dynamo \
+ --network host \
+ nats:2.11.4 \
+ -js
+
+# Wait for NATS to be ready
+echo "Waiting for NATS to be ready..."
+for i in {1..30}; do
+ if timeout 2 bash -c 'cat < /dev/null > /dev/tcp/localhost/4222' 2>/dev/null; then
+ echo "✓ NATS is ready"
+ break
+ fi
+ if [ $i -eq 30 ]; then
+ echo "✗ ERROR: NATS failed to start within 30 seconds"
+ docker logs nats-dynamo
+ exit 1
+ fi
+ sleep 1
+done
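+# The /dev/tcp probe above is a bash builtin feature; an equivalent check with
+# netcat, if installed, would be: nc -z localhost 4222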
+echo ""
+
+# Start monitoring stack (Prometheus + Grafana) if not running
+MONITORING_DIR="${SCRIPT_DIR}/monitoring"
+if [ -f "$MONITORING_DIR/docker-compose.yml" ]; then
+ PROMETHEUS_RUNNING=$(docker ps --format '{{.Names}}' | grep -q "^dynamo-prometheus$" && echo "true" || echo "false")
+ GRAFANA_RUNNING=$(docker ps --format '{{.Names}}' | grep -q "^dynamo-grafana$" && echo "true" || echo "false")
+
+ if [ "$PROMETHEUS_RUNNING" = "false" ] || [ "$GRAFANA_RUNNING" = "false" ]; then
+ echo "Starting monitoring stack (Prometheus + Grafana)..."
+ cd "$MONITORING_DIR"
+ docker compose up -d
+ cd "$SCRIPT_DIR"
+
+ # Wait for Prometheus to be ready
+ echo "Waiting for Prometheus to be ready..."
+ for i in {1..30}; do
+ if curl -s http://localhost:9090/-/ready > /dev/null 2>&1; then
+ echo "✓ Prometheus is ready (http://localhost:9090)"
+ break
+ fi
+ if [ $i -eq 30 ]; then
+ echo "⚠ WARNING: Prometheus may not be fully ready yet"
+ fi
+ sleep 1
+ done
+
+ # Wait for Grafana to be ready
+ echo "Waiting for Grafana to be ready..."
+ for i in {1..30}; do
+ if curl -s http://localhost:3000/api/health > /dev/null 2>&1; then
+ echo "✓ Grafana is ready (http://localhost:3000)"
+ break
+ fi
+ if [ $i -eq 30 ]; then
+ echo "⚠ WARNING: Grafana may not be fully ready yet"
+ fi
+ sleep 1
+ done
+ echo ""
+ else
+ echo "✓ Monitoring stack already running"
+ echo " Prometheus: http://localhost:9090"
+ echo " Grafana: http://localhost:3000"
+ echo ""
+ fi
+else
+ echo "⚠ Monitoring docker-compose.yml not found at: $MONITORING_DIR"
+ echo " Skipping monitoring stack startup"
+ echo ""
+fi
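+# Optional spot check once Prometheus is up (standard Prometheus HTTP API; jq
+# is assumed available, as it is also used later in this script):
+#   curl -s http://localhost:9090/api/v1/targets | jq '.data.activeTargets[].health'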
+
+# Clean up existing Dynamo container if it exists
+if docker ps -a --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then
+ echo "Removing existing Dynamo container: $CONTAINER_NAME"
+ docker rm -f $CONTAINER_NAME
+fi
+
+# Verify HF_TOKEN is set
+if [ -z "${HF_TOKEN:-}" ]; then
+ echo ""
+ echo "⚠ HF_TOKEN environment variable is not set."
+ echo ""
+ if [ -d "$LOCAL_MODEL_DIR" ]; then
+ echo "✓ Local model found - proceeding without HF_TOKEN"
+ HF_TOKEN="dummy"
+ else
+ echo "✗ Local model NOT found and no HF_TOKEN to download it"
+ echo ""
+ read -p "Please enter your HuggingFace token (or press Enter to skip): " HF_TOKEN
+ if [ -z "$HF_TOKEN" ]; then
+ echo "WARNING: Proceeding without HF_TOKEN."
+ HF_TOKEN="dummy"
+ else
+ echo "✓ HuggingFace token received"
+ fi
+ fi
+else
+ echo "✓ HuggingFace token is set"
+fi
+echo ""
+
+# Verify model exists locally
+if [ ! -d "$LOCAL_MODEL_DIR" ]; then
+ echo "WARNING: Model directory not found at: $LOCAL_MODEL_DIR"
+ echo ""
+ echo "To download the model, run:"
+ echo " hf download meta-llama/Llama-3.3-70B-Instruct --local-dir $LOCAL_MODEL_DIR"
+ echo ""
+ read -p "Continue anyway (model will be downloaded from HuggingFace)? [y/N] " -n 1 -r
+ echo
+ if [[ ! $REPLY =~ ^[Yy]$ ]]; then
+ exit 1
+ fi
+fi
+
+# Start container with optimized Thompson Sampling components
+echo ""
+echo "Starting Dynamo container with OPTIMIZED Thompson Sampling components..."
+docker run -d \
+ --name $CONTAINER_NAME \
+ --gpus "\"device=${WORKER_GPUS}\"" \
+ --network host \
+ --ipc=host \
+ --shm-size=$SHM_SIZE \
+ --ulimit memlock=-1 \
+ --ulimit stack=67108864 \
+ -v $LOCAL_MODEL_DIR:$MODEL:ro \
+ -v $CUSTOM_DYNAMO_DIR:/workspace/custom_dynamo:ro \
+ -e HF_TOKEN="$HF_TOKEN" \
+ -e HUGGING_FACE_HUB_TOKEN="$HF_TOKEN" \
+ -e RUST_BACKTRACE=1 \
+ -e PYTHONUNBUFFERED=1 \
+ -e DYN_HTTP_PORT=$HTTP_PORT \
+ -e DYN_ROUTER_MODE=round-robin \
+ -e WORKER_METRICS_PORT=$WORKER_METRICS_PORT \
+ -e ROUTER_METRICS_PORT=$ROUTER_METRICS_PORT \
+ -e PROCESSOR_METRICS_PORT=$PROCESSOR_METRICS_PORT \
+ -e KV_BLOCK_SIZE=$KV_BLOCK_SIZE \
+ -e MEM_FRACTION_STATIC=$MEM_FRACTION_STATIC \
+ -e DYNAMO_WORKER_COMPONENT=worker \
+ $IMAGE \
+ bash -c "
+ set -e
+
+ echo '========================================================='
+ echo 'Verifying external infrastructure services...'
+ echo '========================================================='
+
+ # Verify ETCD is accessible
+ if curl -s http://localhost:2379/health > /dev/null 2>&1; then
+ echo '✓ ETCD accessible at localhost:2379'
+ else
+ echo '✗ ERROR: ETCD not accessible at localhost:2379'
+ exit 1
+ fi
+
+ # Verify NATS is accessible
+    if timeout 2 bash -c 'cat < /dev/null > /dev/tcp/localhost/4222' 2>/dev/null; then
+ echo '✓ NATS accessible at localhost:4222'
+ else
+ echo '✗ ERROR: NATS not accessible at localhost:4222'
+ exit 1
+ fi
+
+ echo ''
+
+ # Function to wait for worker initialization via ETCD registration
+ wait_for_worker() {
+ local worker_type=\$1
+ local pid=\$2
+ # Use WORKER_INIT_TIMEOUT_S (defaults to 1800s / 30 min)
+ local max_wait=$WORKER_INIT_TIMEOUT_S
+ local elapsed=0
+ local poll_interval=5
+
+ echo \"Waiting for \$worker_type worker (PID \$pid) to initialize...\"
+ echo \" Detection: ETCD worker registration\"
+ echo \" Timeout: \${max_wait}s\"
+
+ while [ \$elapsed -lt \$max_wait ]; do
+ if ! kill -0 \$pid 2>/dev/null; then
+ echo \"ERROR: \$worker_type worker process died!\"
+ return 1
+ fi
+
+ local etcd_response=\$(curl -s --max-time 2 http://localhost:2379/v3/kv/range \
+ -X POST \
+ -H \"Content-Type: application/json\" \
+ -d '{\"key\":\"AA==\",\"range_end\":\"AA==\",\"keys_only\":true}' 2>&1)
+
+ if [ \$((elapsed % 30)) -eq 0 ] && [ \$elapsed -gt 0 ]; then
+ echo \" [DEBUG] ETCD count: \$(echo \"\$etcd_response\" | grep -o '\"count\":\"[^\"]*\"')\"
+ fi
+
+ if echo \"\$etcd_response\" | grep -q '\"count\"' && \
+ ! echo \"\$etcd_response\" | grep -q '\"count\":\"0\"'; then
+ echo \"✓ \$worker_type worker is ready (registered with ETCD at \${elapsed}s)\"
+ return 0
+ fi
+
+ sleep \$poll_interval
+ elapsed=\$((elapsed + poll_interval))
+ if [ \$((elapsed % 30)) -eq 0 ]; then
+ echo \" ... \${elapsed}s / \${max_wait}s (waiting for ETCD registration)\"
+ fi
+ done
+
+ echo \"ERROR: \$worker_type worker failed to register with ETCD within \${max_wait}s\"
+ return 1
+ }
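+  # Note on the range query above: key/range_end AA== is base64 for a single
+  # NUL byte, which the etcd v3 range API treats as 'scan the entire keyspace'.
+  # Readiness therefore means at least one key exists in ETCD, not that this
+  # specific worker has registered.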
+
+ # =========================================================================
+ # STARTUP ORDER WITH MODEL NAME ISOLATION
+ # =========================================================================
+ # Using different model names to force ALL traffic through the processor.
+ # Workers register with internal model name (${SERVED_MODEL_NAME}-internal),
+ # while processor registers with public model name (${SERVED_MODEL_NAME}).
+ # Frontend only routes to backends matching the requested model name.
+ #
+ # Order:
+ # 1. Workers (model=${SERVED_MODEL_NAME}-internal, not discovered for public model)
+ # 2. Router (needs workers to be present)
+ # 3. Processor (model=${SERVED_MODEL_NAME}, frontend discovers this)
+ # 4. Frontend (routes ${SERVED_MODEL_NAME} requests to processor ONLY)
+ # =========================================================================
+
+ echo '========================================================='
+ echo 'Step 1: Starting $NUM_WORKERS Unified Worker(s) (Host GPUs $WORKER_GPUS -> Container GPUs $CONTAINER_GPU_INDICES)...'
+ echo '========================================================='
+ # Workers register at workers.worker.generate (in 'workers' namespace)
+ # They start first so the router can discover them during initialization
+ # DYN_SYSTEM_PORT sets the Prometheus metrics port for this component
+
+ # Start multiple workers, each using TP_SIZE GPUs
+ WORKER_PIDS=()
+ for i in \$(seq 0 \$(($NUM_WORKERS - 1))); do
+ # Calculate GPU range for this worker (e.g., worker 0: 0,1; worker 1: 2,3; etc.)
+ START_GPU=\$((i * $TP_SIZE))
+ END_GPU=\$(((i + 1) * $TP_SIZE - 1))
+ WORKER_GPU_LIST=\$(seq -s, \$START_GPU \$END_GPU)
+ WORKER_PORT=\$((30000 + i))
+
+ echo \"Starting Worker \$i: GPUs \$WORKER_GPU_LIST, Port \$WORKER_PORT (internal model name)\"
+ echo \" KV Block Size: $KV_BLOCK_SIZE tokens, Mem Fraction: $MEM_FRACTION_STATIC\"
+ CUDA_VISIBLE_DEVICES=\$WORKER_GPU_LIST \
+ DYN_SYSTEM_PORT=\$((WORKER_METRICS_PORT + i)) \
+ DYN_NAMESPACE=workers \
+ python3 -m dynamo.sglang \
+ --model-path $MODEL \
+ --served-model-name ${SERVED_MODEL_NAME}-internal \
+ --host 0.0.0.0 \
+ --port \$WORKER_PORT \
+ --tp $TP_SIZE \
+ --trust-remote-code \
+ --enable-metrics \
+ --page-size $KV_BLOCK_SIZE \
+ --mem-fraction-static $MEM_FRACTION_STATIC \
+ --endpoint workers.worker.generate &
+ WORKER_PIDS+=(\$!)
+ echo \" Worker \$i PID: \${WORKER_PIDS[\$i]}\"
+ done
+ echo \"\"
+ echo \"Total workers started: \${#WORKER_PIDS[@]}\"
+ echo \"Worker PIDs: \${WORKER_PIDS[*]}\"
+ echo \"Registered at: workers.worker.generate (model: ${SERVED_MODEL_NAME}-internal)\"
+ echo \"NOTE: Workers use internal model name so frontend only discovers processor\"
+ echo \"\"
+
+ # Wait for first worker to initialize (checks ETCD registration)
+ wait_for_worker \"Unified\" \${WORKER_PIDS[0]} || exit 1
+
+ # Give additional workers time to initialize
+ if [ \${#WORKER_PIDS[@]} -gt 1 ]; then
+ echo \"Waiting additional 30s for remaining workers to initialize...\"
+ sleep 30
+ fi
+
+ echo ''
+ echo '========================================================='
+ echo 'Step 2: Starting Custom Router (Thompson Sampling + Prometheus)...'
+ echo '========================================================='
+ # Router uses config.yaml for all parameters
+ # It needs workers to be present (started in Step 1)
+ # DYN_SYSTEM_PORT sets the Prometheus metrics port for this component
+ DYN_SYSTEM_PORT=\$ROUTER_METRICS_PORT \
+ python3 /workspace/custom_dynamo/router.py \
+ --config /workspace/custom_dynamo/config.yaml &
+ ROUTER_PID=\$!
+ echo \"Router PID: \$ROUTER_PID\"
+ echo \"Metrics at: http://localhost:\$ROUTER_METRICS_PORT/metrics\"
+ sleep 15
+ echo \"\"
+
+ echo ''
+ echo '========================================================='
+ echo 'Step 3: Starting Custom Processor (Static Mode)...'
+ echo '========================================================='
+ # STATIC MODE: Processor uses @dynamo_worker(static=True) so it registers
+ # at dynamo.backend.generate WITHOUT an instance ID. This is required for
+ # --static-endpoint on the frontend to find it.
+ # DYN_SYSTEM_PORT sets the Prometheus metrics port for this component
+ DYN_SYSTEM_PORT=\$PROCESSOR_METRICS_PORT \
+ python3 /workspace/custom_dynamo/processor.py \
+ --enable-router \
+ --model-path $MODEL \
+ --model-name $SERVED_MODEL_NAME &
+ PROCESSOR_PID=\$!
+ echo \"Processor PID: \$PROCESSOR_PID\"
+ echo \"Model: $SERVED_MODEL_NAME (from $MODEL)\"
+ echo \"Registered at: dynamo.backend.generate (namespace=dynamo)\"
+ echo \"Forwards to: workers.worker.generate (actual SGLang workers)\"
+ echo \"Metrics at: http://localhost:\$PROCESSOR_METRICS_PORT/metrics\"
+ sleep 15
+ echo \"\"
+
+ echo ''
+ echo '========================================================='
+ echo 'Step 4: Starting Default Dynamo Frontend (Namespace-Scoped Discovery)...'
+ echo '========================================================='
+ # NAMESPACE-SCOPED DISCOVERY: Frontend discovers backends via ETCD ModelWatcher,
+ # but only from the 'dynamo' namespace. Workers are in the 'workers' namespace,
+ # so the frontend will ONLY discover the processor (in 'dynamo' namespace).
+ # This ensures ALL requests go through the Thompson Sampling router.
+ echo \"Frontend KV Block Size: $KV_BLOCK_SIZE tokens (must match worker --page-size)\"
+ python3 -m dynamo.frontend \
+ --http-port $HTTP_PORT \
+ --model-name $SERVED_MODEL_NAME \
+ --model-path $MODEL \
+ --kv-cache-block-size $KV_BLOCK_SIZE \
+ --namespace dynamo &
+ FRONTEND_PID=\$!
+ echo \"Frontend PID: \$FRONTEND_PID\"
+ echo \"Discovery: ETCD ModelWatcher (namespace=dynamo, discovers processor ONLY)\"
+ sleep 15
+ echo \"\"
+
+ echo ''
+ echo '========================================================='
+ echo '✓ All components started successfully!'
+ echo '========================================================='
+ echo \"Infrastructure Services (External):\"
+ echo \" ETCD: localhost:2379\"
+ echo \" NATS: localhost:4222\"
+ echo \"\"
+ echo \"Dynamo Components (This Container):\"
+ echo \" Unified Workers: \${#WORKER_PIDS[@]} workers (GPUs $WORKER_GPUS, TP=$TP_SIZE each)\"
+ for i in \$(seq 0 \$((\${#WORKER_PIDS[@]} - 1))); do
+ START_GPU=\$((i * $TP_SIZE))
+ END_GPU=\$(((i + 1) * $TP_SIZE - 1))
+ echo \" Worker \$i: PID \${WORKER_PIDS[\$i]}, GPUs \$START_GPU-\$END_GPU, port \$((30000 + i))\"
+ done
+ echo \" → Registered at: workers.worker.generate (hidden from frontend)\"
+ echo \" Router: PID \$ROUTER_PID (Thompson Sampling + Prometheus)\"
+ echo \" → Registered at: dynamo.router.{find_worker,feedback}\"
+ echo \" → Metrics: http://localhost:\$ROUTER_METRICS_PORT/metrics\"
+ echo \" Processor: PID \$PROCESSOR_PID (NVExt annotation extraction)\"
+ echo \" → Registered at: dynamo.backend.generate (STATIC mode)\"
+ echo \" → Metrics: http://localhost:\$PROCESSOR_METRICS_PORT/metrics\"
+ echo \" Frontend: PID \$FRONTEND_PID (Default Dynamo HTTP API on port $HTTP_PORT)\"
+ echo \" → Discovery: ETCD ModelWatcher\"
+ echo \" → Metrics: http://localhost:$HTTP_PORT/metrics\"
+ echo ''
+ echo 'Request Flow (Dynamic Discovery - Thompson Sampling when routed to processor):'
+ echo ' Client → Default Frontend API (port $HTTP_PORT)'
+ echo ' ↓ (tokenization + nvext parsing)'
+ echo ' Frontend routes via ETCD ModelWatcher (processor OR workers)'
+ echo ' ↓'
+ echo ' IF routed to Processor (dynamo.backend.generate):'
+ echo ' ↓ (extract hints from annotations)'
+ echo ' ↓ (query Thompson Sampling router)'
+ echo ' Custom Router → worker_id'
+ echo ' ↓ (KV overlap + workload-aware selection)'
+ echo ' Processor routes to → workers.worker.generate (with worker_id)'
+ echo ' ↓'
+ echo ' Unified Worker (workers.worker.generate)'
+ echo ' ↓'
+ echo ' Response + Feedback to Router'
+ echo ''
+ echo 'Prometheus Metrics Endpoints:'
+ echo ' - Frontend: http://localhost:$HTTP_PORT/metrics (latency, throughput)'
+    echo \" - Workers: http://localhost:\$WORKER_METRICS_PORT/metrics - \$((WORKER_METRICS_PORT + \${#WORKER_PIDS[@]} - 1))/metrics (KV cache)\"
+    echo \" - Router: http://localhost:\$ROUTER_METRICS_PORT/metrics (thompson_router_*)\"
+    echo \" - Processor: http://localhost:\$PROCESSOR_METRICS_PORT/metrics (thompson_* KVE)\"
+ echo '========================================================='
+
+ # Monitor all processes
+ while true; do
+ if ! kill -0 \$FRONTEND_PID 2>/dev/null; then
+ echo \"ERROR: Frontend died!\"
+ exit 1
+ fi
+ if ! kill -0 \$PROCESSOR_PID 2>/dev/null; then
+ echo \"ERROR: Processor died!\"
+ exit 1
+ fi
+ if ! kill -0 \$ROUTER_PID 2>/dev/null; then
+ echo \"ERROR: Router died!\"
+ exit 1
+ fi
+ for i in \$(seq 0 \$((\${#WORKER_PIDS[@]} - 1))); do
+ if ! kill -0 \${WORKER_PIDS[\$i]} 2>/dev/null; then
+ echo \"ERROR: Worker \$i (PID \${WORKER_PIDS[\$i]}) died!\"
+ exit 1
+ fi
+ done
+ sleep 10
+ done
+ "
+
+# Wait for container to start
+echo ""
+echo "Waiting for container to start..."
+sleep 15
+
+# Check if container started successfully
+if docker ps --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then
+ echo ""
+ echo "========================================================="
+ echo "✓ Dynamo with OPTIMIZED Thompson Sampling Router Started!"
+ echo "========================================================="
+ echo ""
+ echo "Architecture (Model Name Isolation - Thompson Sampling):"
+ echo ""
+ echo " Model Name Isolation Mode:"
+ echo " - Workers register with internal model name (${SERVED_MODEL_NAME}-internal)"
+ echo " - Processor registers with public model name (${SERVED_MODEL_NAME})"
+ echo " - Frontend routes ${SERVED_MODEL_NAME} requests to processor ONLY"
+ echo " - ALL requests go through Thompson Sampling router"
+ echo ""
+ echo " Startup Order:"
+ echo " 1. Workers → model=${SERVED_MODEL_NAME}-internal (not matched by frontend)"
+ echo " 2. Router → dynamo.router.{find_worker,feedback}"
+ echo " 3. Processor → model=${SERVED_MODEL_NAME} (matched by frontend)"
+ echo " 4. Frontend → routes to processor for ${SERVED_MODEL_NAME} requests"
+ echo ""
+ echo " Request Flow (ALL requests go through processor):"
+ echo " Client Request (with nvext.annotations)"
+ echo " ↓"
+ echo " Default Dynamo Frontend (port $HTTP_PORT)"
+ echo " ↓ ETCD ModelWatcher (namespace=dynamo) routes to processor"
+ echo " Custom Processor (dynamo.backend.generate)"
+ echo " ↓ extracts: prefix_id, total_requests, osl, iat"
+ echo " ↓ queries Thompson Sampling router"
+ echo " Custom Router → worker_id"
+ echo " ↓ KV overlap + workload-aware selection"
+ echo " Processor forwards to workers.worker.generate"
+ echo " ↓"
+ echo " Unified Workers ($NUM_WORKERS x TP=$TP_SIZE = $NUM_GPUS GPUs total)"
+ echo " ↓"
+ echo " Response + Feedback Loop"
+ echo ""
+ echo "Infrastructure Services (Managed):"
+ echo " ETCD: etcd-dynamo container, localhost:2379"
+ echo " NATS: nats-dynamo container, localhost:4222"
+ echo ""
+ echo "Prometheus Metrics Endpoints:"
+ echo " Frontend: http://localhost:$HTTP_PORT/metrics (latency, throughput)"
+ echo " Workers: http://localhost:$WORKER_METRICS_PORT/metrics - $((WORKER_METRICS_PORT + NUM_WORKERS - 1))/metrics (KV cache)"
+ echo " Router: http://localhost:$ROUTER_METRICS_PORT/metrics (routing)"
+ echo " Processor: http://localhost:$PROCESSOR_METRICS_PORT/metrics (KVE)"
+ echo ""
+ echo "Dynamo Components:"
+ echo " Frontend: HTTP API on port $HTTP_PORT"
+ echo " Unified Workers: $NUM_WORKERS workers (TP=$TP_SIZE each, ports 30000-$((30000 + NUM_WORKERS - 1)))"
+ echo ""
+ echo "KV Cache Settings:"
+ echo " Block Size: $KV_BLOCK_SIZE tokens (DYNAMO_KV_BLOCK_SIZE)"
+ echo " GPU Mem Fraction: $MEM_FRACTION_STATIC (DYNAMO_MEM_FRACTION_STATIC)"
+ echo ""
+ echo "API Endpoint: http://localhost:$HTTP_PORT/v1/chat/completions"
+ echo "Health Check: http://localhost:$HTTP_PORT/health"
+ echo ""
+ echo "NVExt Annotations (in request body):"
+ echo " \"nvext\": {"
+ echo " \"annotations\": ["
+ echo " \"prefix_id:\","
+ echo " \"total_requests:\","
+ echo " \"osl:LOW|MEDIUM|HIGH\","
+ echo " \"iat:LOW|MEDIUM|HIGH\""
+ echo " ]"
+ echo " }"
+ echo ""
+ echo "Monitoring Dashboards:"
+ echo " Grafana: http://localhost:3000 (no login required)"
+ echo " Prometheus: http://localhost:9090"
+ echo ""
+ echo "Useful Commands:"
+ echo " Interactive shell: docker exec -it $CONTAINER_NAME bash"
+ echo " View Dynamo logs: docker logs -f $CONTAINER_NAME"
+ echo " View ETCD logs: docker logs -f etcd-dynamo"
+ echo " View NATS logs: docker logs -f nats-dynamo"
+ echo " GPU usage: watch -n 2 nvidia-smi"
+ echo " Stop all: bash stop_dynamo.sh"
+ echo " Stop all + metrics: bash stop_dynamo.sh --kill-metrics"
+ echo ""
+ echo "Query Metrics:"
+ echo " curl http://localhost:$HTTP_PORT/metrics | grep dynamo_frontend"
+ echo " curl http://localhost:$WORKER_METRICS_PORT/metrics | grep kvstats"
+ echo " curl http://localhost:$ROUTER_METRICS_PORT/metrics | grep thompson_router"
+ echo " curl http://localhost:$PROCESSOR_METRICS_PORT/metrics | grep thompson_kve"
+ echo ""
+ echo "========================================================="
+ echo "Test Request (with nvext annotations):"
+ echo "========================================================="
+ echo ""
+ echo "# Basic test (no hints)"
+ echo "curl http://localhost:$HTTP_PORT/v1/chat/completions \\"
+ echo " -H 'Content-Type: application/json' \\"
+ echo " -d '{"
+ echo " \"model\": \"$SERVED_MODEL_NAME\","
+ echo " \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}],"
+ echo " \"max_tokens\": 50"
+ echo " }'"
+ echo ""
+ echo "# Test with nvext annotations (routing hints)"
+ echo "curl http://localhost:$HTTP_PORT/v1/chat/completions \\"
+ echo " -H 'Content-Type: application/json' \\"
+ echo " -d '{"
+ echo " \"model\": \"$SERVED_MODEL_NAME\","
+ echo " \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}],"
+ echo " \"max_tokens\": 50,"
+ echo " \"nvext\": {"
+ echo " \"annotations\": ["
+ echo " \"prefix_id:test-session-001\","
+ echo " \"total_requests:5\","
+ echo " \"osl:MEDIUM\","
+ echo " \"iat:LOW\""
+ echo " ]"
+ echo " }"
+ echo " }'"
+ echo ""
+ echo "# Streaming test with hints"
+ echo "curl http://localhost:$HTTP_PORT/v1/chat/completions \\"
+ echo " -H 'Content-Type: application/json' \\"
+ echo " -d '{"
+ echo " \"model\": \"$SERVED_MODEL_NAME\","
+ echo " \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}],"
+ echo " \"max_tokens\": 50,"
+ echo " \"stream\": true,"
+ echo " \"nvext\": {"
+ echo " \"annotations\": [\"prefix_id:stream-test\", \"total_requests:1\"]"
+ echo " }"
+ echo " }'"
+ echo ""
+ echo "========================================================="
+ echo ""
+ echo "Waiting for SGLang to initialize (this may take 5-10 minutes for a 70B model)..."
+ echo "Monitoring logs (Ctrl+C to exit, container continues)..."
+ echo ""
+
+ # Wait for server to be ready
+ echo "Checking for API availability (timeout=${WORKER_INIT_TIMEOUT_S}s)..."
+ max_attempts=$WORKER_INIT_TIMEOUT_S
+ attempt=0
+
+ while [ $attempt -lt $max_attempts ]; do
+ # Use || true to prevent curl connection failures from exiting due to set -e
+ # curl returns "000" for connection refused, so we just need to prevent the exit
+ health_response=$(curl -s --max-time 5 -o /dev/null -w "%{http_code}" http://localhost:$HTTP_PORT/health 2>/dev/null) || true
+ if [ "$health_response" = "200" ]; then
+ echo "✓ Dynamo API is ready! (health check passed)"
+ break
+ fi
+ attempt=$((attempt + 1))
+ if [ $((attempt % 15)) -eq 0 ]; then
+ echo " ... still waiting ($attempt/$max_attempts) - health response: $health_response"
+ fi
+ sleep 1
+ done
+
+ if [ $attempt -ge $max_attempts ]; then
+ echo ""
+ echo "⚠ Timeout waiting for API. Check logs with: docker logs $CONTAINER_NAME"
+ echo ""
+ else
+ echo ""
+ echo "Quick test (polling every 15s for up to 5 minutes):"
+ echo ""
+
+ quick_test_max_attempts=20 # 20 * 15s = 5 minutes
+ quick_test_attempt=0
+ quick_test_success=false
+
+ while [ $quick_test_attempt -lt $quick_test_max_attempts ]; do
+ quick_test_attempt=$((quick_test_attempt + 1))
+ echo " Attempt $quick_test_attempt/$quick_test_max_attempts..."
+
+ quick_test_response=$(curl -s --max-time 60 http://localhost:$HTTP_PORT/v1/chat/completions \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "'$SERVED_MODEL_NAME'",
+ "messages": [{"role": "user", "content": "Say hello"}],
+ "max_tokens": 20
+ }' 2>&1) || true
+
+ # Check if response is empty/null
+ if [ -z "$quick_test_response" ]; then
+ echo " Empty response, retrying in 15s..."
+ sleep 15
+ continue
+ fi
+
+ # Check if response contains an error
+ error_message=$(echo "$quick_test_response" | jq -r '.error.message // .error // empty' 2>/dev/null)
+ if [ -n "$error_message" ]; then
+ echo ""
+ echo "========================================================="
+ echo "✗ Quick test failed with error:"
+ echo " $error_message"
+ echo "========================================================="
+ echo ""
+ echo "Full response:"
+ echo "$quick_test_response" | jq . 2>/dev/null || echo "$quick_test_response"
+ echo ""
+ echo "Check logs with: docker logs $CONTAINER_NAME"
+ exit 1
+ fi
+
+ # Check if response has valid choices (success)
+ choices_content=$(echo "$quick_test_response" | jq -r '.choices[0].message.content // empty' 2>/dev/null)
+ if [ -n "$choices_content" ]; then
+ echo ""
+ echo "========================================================="
+ echo "✓ Quick test successful!"
+ echo "========================================================="
+ echo ""
+ echo "$quick_test_response" | jq '.choices[0].message.content, .usage'
+ echo ""
+ echo "========================================================="
+ echo "Container is running. View logs with:"
+ echo " docker logs -f $CONTAINER_NAME"
+ echo "========================================================="
+ quick_test_success=true
+ break
+ fi
+
+ # Response exists but no choices - might still be loading
+ echo " Response received but no valid choices, retrying in 15s..."
+ echo " Response: $(echo "$quick_test_response" | head -c 200)..."
+ sleep 15
+ done
+
+ if [ "$quick_test_success" = false ]; then
+ echo ""
+ echo "========================================================="
+ echo "⚠ Quick test timed out after 5 minutes"
+ echo "========================================================="
+ echo ""
+ echo "Container is running but may not be fully ready."
+ echo "Try manually: curl http://localhost:$HTTP_PORT/v1/chat/completions ..."
+ echo "Check logs with: docker logs $CONTAINER_NAME"
+ fi
+ fi
+else
+ echo ""
+ echo "========================================================="
+ echo "✗ Container failed to start!"
+ echo "========================================================="
+ echo ""
+ echo "Check logs with: docker logs $CONTAINER_NAME"
+ exit 1
+fi
diff --git a/external/dynamo/start_dynamo_optimized_thompson_hints_vllm.sh b/external/dynamo/start_dynamo_optimized_thompson_hints_vllm.sh
new file mode 100755
index 0000000000..17bf20fe20
--- /dev/null
+++ b/external/dynamo/start_dynamo_optimized_thompson_hints_vllm.sh
@@ -0,0 +1,1089 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Dynamo vLLM with OPTIMIZED Thompson Sampling Router Architecture
+#
+# Key difference from SGLang version:
+# - Uses vLLM backend instead of SGLang
+# - vLLM has native KVBM support for KV event publishing
+# - Different CLI flags (--block-size vs --page-size, etc.)
+# - Enables radix/prefix caching by default (no --disable-radix-cache)
+#
+# Architecture:
+# Client → Default Dynamo Frontend (tokenization + nvext parsing)
+# ↓ PreprocessedRequest with annotations
+# Custom Processor (extracts hints, queries router)
+# ↓ RouterRequest
+# Custom Router (Thompson Sampling + KV overlap)
+# ↓ worker_id
+# vLLM Backend Worker
+# ↓ response tokens
+# Processor sends feedback to Router
+#
+# Components:
+# - ETCD (metadata and worker discovery)
+# - NATS (message queue for KV events)
+# - Default Dynamo Frontend (HTTP API on port 8000)
+# - Custom Router (Thompson Sampling + KV overlap)
+# - Custom Processor (hint extraction + routing)
+# - vLLM Workers (unified mode, multiple workers with TP=2 each)
+#
+# Prometheus Metrics:
+# - Frontend: http://localhost:8000/metrics
+# - Backend/Router/Processor: http://localhost:8081/metrics
+#
+# To stop all components: bash stop_dynamo.sh
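+#
+# CLI flag mapping relative to the SGLang script in this directory:
+#   SGLang --page-size           ↔ vLLM --block-size
+#   SGLang --mem-fraction-static ↔ vLLM --gpu-memory-utilization
+#   SGLang --tp                  ↔ vLLM --tensor-parallel-size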
+
+set -euo pipefail
+
+# Configuration Variables (can be overridden via environment variables)
+# See env.example for documentation on each variable
+CONTAINER_NAME="dynamo-vllm"
+WORKER_GPUS="${DYNAMO_GPU_DEVICES:-0,1,2,3,4,5,6,7}"
+TP_SIZE="${DYNAMO_TP_SIZE:-4}"
+HTTP_PORT="${DYNAMO_HTTP_PORT:-8000}"
+# Metrics ports - each component gets its own port to avoid conflicts
+# Using 18xxx range to avoid conflicts with common services
+# Workers use sequential ports starting at WORKER_METRICS_PORT (18081, 18082, ...)
+# Router and Processor are offset to allow for many workers
+WORKER_METRICS_PORT="${DYNAMO_WORKER_METRICS_PORT:-18081}"
+ROUTER_METRICS_PORT="${DYNAMO_ROUTER_METRICS_PORT:-18090}"
+PROCESSOR_METRICS_PORT="${DYNAMO_PROCESSOR_METRICS_PORT:-18091}"
+MODEL="/workspace/models/Llama-3.3-70B-Instruct"
+SERVED_MODEL_NAME="${DYNAMO_MODEL_NAME:-llama-3.3-70b}"
+
+# ============================================================================
+# MultiLRU Configuration Logic
+# ============================================================================
+# Default behavior (standard vLLM 0.7.1 image):
+# - Uses router.py and processor.py (with @dynamo_worker(static=False))
+# - Uses standard vLLM scheduler (no MultiLRU)
+# - Works with nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.7.1
+#
+# To enable MultiLRU (requires custom-built image):
+# export DYNAMO_USE_MULTILRU=true
+# export DYNAMO_VLLM_IMAGE=dynamo-multi-lru:latest
+# bash start_dynamo_optimized_thompson_hints_vllm.sh
+# ============================================================================
+
+# Enforce safe defaults: only use multilru if EXPLICITLY enabled
+if [ "${DYNAMO_USE_MULTILRU:-}" != "true" ]; then
+ # Not explicitly set to true - use standard configuration
+ DYNAMO_USE_MULTILRU="false"
+ # If image wasn't explicitly set to custom multilru image, use standard
+ if [ "${DYNAMO_VLLM_IMAGE:-}" != "dynamo-multi-lru:latest" ]; then
+ IMAGE="nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.7.1"
+ else
+ IMAGE="${DYNAMO_VLLM_IMAGE}"
+ fi
+else
+ # Explicitly enabled - use multilru configuration
+ DYNAMO_USE_MULTILRU="true"
+ # Default to custom image if not specified
+ IMAGE="${DYNAMO_VLLM_IMAGE:-dynamo-multi-lru:latest}"
+fi
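+# Net effect of the block above:
+#   DYNAMO_USE_MULTILRU unset/!=true → standard vllm-runtime image with the
+#     standard router/processor (a multilru image named in DYNAMO_VLLM_IMAGE
+#     is still honored, but the standard scheduler is used)
+#   DYNAMO_USE_MULTILRU=true → DYNAMO_VLLM_IMAGE if set, else dynamo-multi-lru:latest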
+
+SHM_SIZE="${DYNAMO_SHM_SIZE:-16g}"
+WORKER_INIT_TIMEOUT_S="${DYNAMO_WORKER_INIT_TIMEOUT_S:-1800}"
+
+# KV Cache Configuration
+# Block size in tokens - must match between vLLM (--block-size) and Frontend (--kv-cache-block-size)
+KV_BLOCK_SIZE="${DYNAMO_KV_BLOCK_SIZE:-16}"
+# Fraction of GPU memory for KV cache (0.0-1.0). Reduce to test cache pressure/degradation.
+# NOTE: 0.85 is safer than 0.9+ to avoid OOM during vLLM warmup with large max_num_seqs
+GPU_MEMORY_UTILIZATION="${DYNAMO_GPU_MEMORY_UTILIZATION:-0.85}"
+# Maximum concurrent sequences per worker. Lower values use less memory during warmup.
+# vLLM default is 1024, but this can cause OOM on memory-constrained setups.
+MAX_NUM_SEQS="${DYNAMO_MAX_NUM_SEQS:-256}"
+# Override the number of GPU KV cache blocks (for experiments with limited cache).
+# Set to a small number (e.g., 8-16) to force cache eviction behavior.
+# Leave empty/unset to use automatic calculation based on GPU memory.
+NUM_GPU_BLOCKS_OVERRIDE="${DYNAMO_NUM_GPU_BLOCKS_OVERRIDE:-}"
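+# Illustrative arithmetic: with the default --block-size of 16 tokens, an
+# override of 8 blocks caps the KV cache at 8 * 16 = 128 tokens per worker:
+#   export DYNAMO_NUM_GPU_BLOCKS_OVERRIDE=8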
+
+# Compute container-internal GPU indices (GPUs are renumbered 0,1,2,... inside the container)
+NUM_GPUS=$(echo "$WORKER_GPUS" | tr ',' '\n' | wc -l)
+CONTAINER_GPU_INDICES=$(seq -s, 0 $((NUM_GPUS - 1)))
+
+# Calculate number of workers based on available GPUs and TP size
+NUM_WORKERS=$((NUM_GPUS / TP_SIZE))
+
+# vLLM-specific: Enable KVBM event publishing for radix tree observability
+# Each worker needs a unique KV event port - configured via DYN_VLLM_KV_EVENT_PORT
+# Port allocation: Worker 0 = 20080, Worker 1 = 20081, etc.
+# This is set per-worker at startup time below
+ENABLE_KV_EVENTS="${DYNAMO_ENABLE_KV_EVENTS:-true}"
+KV_EVENT_BASE_PORT="${DYNAMO_KV_EVENT_BASE_PORT:-20080}"
+
+# Local paths - DYNAMO_MODEL_DIR must be set, otherwise the script exits with an error
+if [ -z "${DYNAMO_MODEL_DIR:-}" ]; then
+ echo "ERROR: DYNAMO_MODEL_DIR environment variable must be set"
+ echo ""
+ echo "Example:"
+ echo " export DYNAMO_MODEL_DIR=\"/path/to/your/models/Llama-3.3-70B-Instruct\""
+ echo ""
+ echo "Then run this script again."
+ exit 1
+fi
+
+# Validate model directory
+if [ -d "${DYNAMO_MODEL_DIR}" ]; then
+ if [ ! -f "${DYNAMO_MODEL_DIR}/config.json" ]; then
+ echo "ERROR: ${DYNAMO_MODEL_DIR} exists but is not a valid model directory"
+ echo ""
+ echo "Missing: config.json"
+ echo ""
+ echo "Find it: find ~/.cache/huggingface/hub -name config.json -path '*Llama-3.3-70B*'"
+ exit 1
+ fi
+
+ if ! grep -q '"model_type"' "${DYNAMO_MODEL_DIR}/config.json" 2>/dev/null; then
+ echo "ERROR: ${DYNAMO_MODEL_DIR}/config.json is missing 'model_type' field"
+ echo ""
+ echo "This usually means incomplete/corrupted download. Try:"
+ echo " rm -rf ${DYNAMO_MODEL_DIR}"
+ echo " hf download meta-llama/Llama-3.3-70B-Instruct --local-dir ${DYNAMO_MODEL_DIR}"
+ exit 1
+ fi
+fi
+LOCAL_MODEL_DIR="${DYNAMO_MODEL_DIR}"
+
+# Repository directory - auto-detect from script location
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CUSTOM_DYNAMO_DIR="${SCRIPT_DIR}/optimized"
+
+echo "========================================================="
+echo "Dynamo vLLM with OPTIMIZED Thompson Sampling Router"
+echo "========================================================="
+if [ "$DYNAMO_USE_MULTILRU" = "true" ]; then
+ echo "Configuration: MultiLRU Mode (custom image: $IMAGE)"
+else
+ echo "Configuration: Standard Mode (image: $IMAGE)"
+fi
+echo "Model: Llama-3.3-70B-Instruct"
+echo "Container: $CONTAINER_NAME"
+echo "HTTP Port: $HTTP_PORT (default Dynamo frontend)"
+echo "Metrics Ports:"
+echo " - Worker: $WORKER_METRICS_PORT (KV cache, internal)"
+echo " - Router: $ROUTER_METRICS_PORT (Thompson routing)"
+echo " - Processor: $PROCESSOR_METRICS_PORT (KVE metrics)"
+echo ""
+echo "Architecture Differences (vs SGLang version):"
+echo " - vLLM backend (native KVBM support)"
+echo " - KV events enabled: $ENABLE_KV_EVENTS"
+echo " - Different CLI flags (--block-size, --gpu-memory-utilization)"
+echo " - Prefix caching enabled by default"
+echo ""
+echo "Components:"
+echo " - ETCD (metadata and discovery)"
+echo " - NATS (message queue for KV events)"
+echo " - Default Frontend (HTTP API on port $HTTP_PORT)"
+echo " - Custom Router (Thompson Sampling + KV overlap)"
+echo " - Custom Processor (hint extraction + routing)"
+echo " - vLLM Worker (unified mode)"
+echo ""
+echo "Backend Workers:"
+echo " Workers: $NUM_WORKERS (GPUs: $NUM_GPUS, TP=$TP_SIZE per worker)"
+echo " GPUs: $WORKER_GPUS"
+echo " Mode: UNIFIED (no prefill/decode disaggregation)"
+echo ""
+echo "KV Cache Configuration:"
+echo " Block Size: $KV_BLOCK_SIZE tokens (--block-size / --kv-cache-block-size)"
+echo " GPU Mem Utilization: $GPU_MEMORY_UTILIZATION (--gpu-memory-utilization)"
+echo " Max Concurrent Seqs: $MAX_NUM_SEQS (--max-num-seqs, prevents OOM during warmup)"
+echo " KV Events: $ENABLE_KV_EVENTS (KVBM event publishing)"
+if [ "$ENABLE_KV_EVENTS" = "true" ] && [ "$NUM_WORKERS" -gt 1 ]; then
+ echo " Per-worker ports: $KV_EVENT_BASE_PORT - $((KV_EVENT_BASE_PORT + NUM_WORKERS - 1))"
+fi
+if [ -n "$NUM_GPU_BLOCKS_OVERRIDE" ]; then
+ echo " ⚠️ GPU Blocks Override: $NUM_GPU_BLOCKS_OVERRIDE (EXPERIMENT MODE - limited cache!)"
+fi
+echo ""
+echo "========================================================="
+
+# Select router/processor scripts based on DYNAMO_USE_MULTILRU
+if [ "$DYNAMO_USE_MULTILRU" = "true" ]; then
+ ROUTER_SCRIPT="router_multilru.py"
+ PROCESSOR_SCRIPT="processor_multilru.py"
+else
+ ROUTER_SCRIPT="router.py"
+ PROCESSOR_SCRIPT="processor.py"
+fi
+
+# Verify selected components exist
+if [ ! -f "$CUSTOM_DYNAMO_DIR/$ROUTER_SCRIPT" ]; then
+ echo "✗ ERROR: Custom $ROUTER_SCRIPT not found at: $CUSTOM_DYNAMO_DIR/$ROUTER_SCRIPT"
+ exit 1
+fi
+if [ ! -f "$CUSTOM_DYNAMO_DIR/$PROCESSOR_SCRIPT" ]; then
+ echo "✗ ERROR: Custom $PROCESSOR_SCRIPT not found at: $CUSTOM_DYNAMO_DIR/$PROCESSOR_SCRIPT"
+ exit 1
+fi
+echo "✓ Custom components found in: $CUSTOM_DYNAMO_DIR"
+echo " Router: $ROUTER_SCRIPT"
+echo " Processor: $PROCESSOR_SCRIPT"
+echo ""
+
+# Start ETCD if not running
+if docker ps -a --format '{{.Names}}' | grep -q "^etcd-dynamo$"; then
+ echo "Removing existing ETCD container..."
+ docker rm -f etcd-dynamo
+fi
+
+echo "Starting ETCD container..."
+docker run -d \
+ --name etcd-dynamo \
+ --network host \
+ -e ALLOW_NONE_AUTHENTICATION=yes \
+ -e ETCD_LISTEN_CLIENT_URLS=http://0.0.0.0:2379 \
+ -e ETCD_ADVERTISE_CLIENT_URLS=http://localhost:2379 \
+ bitnamilegacy/etcd:3.6.1
+
+# Wait for ETCD to be ready
+echo "Waiting for ETCD to be ready..."
+for i in {1..30}; do
+ if curl -s http://localhost:2379/health > /dev/null 2>&1; then
+ echo "✓ ETCD is ready"
+ sleep 2
+ break
+ fi
+ if [ $i -eq 30 ]; then
+ echo "✗ ERROR: ETCD failed to start within 30 seconds"
+ docker logs etcd-dynamo
+ exit 1
+ fi
+ sleep 1
+done
+
+# Start NATS if not running
+if docker ps -a --format '{{.Names}}' | grep -q "^nats-dynamo$"; then
+ echo "Removing existing NATS container..."
+ docker rm -f nats-dynamo
+fi
+
+echo "Starting NATS container..."
+docker run -d \
+ --name nats-dynamo \
+ --network host \
+ nats:2.11.4 \
+ -js
+
+# Wait for NATS to be ready
+echo "Waiting for NATS to be ready..."
+for i in {1..30}; do
+ if timeout 2 bash -c 'cat < /dev/null > /dev/tcp/localhost/4222' 2>/dev/null; then
+ echo "✓ NATS is ready"
+ break
+ fi
+ if [ $i -eq 30 ]; then
+ echo "✗ ERROR: NATS failed to start within 30 seconds"
+ docker logs nats-dynamo
+ exit 1
+ fi
+ sleep 1
+done
+echo ""
+
+# Start monitoring stack (Prometheus + Grafana) if not running
+MONITORING_DIR="${SCRIPT_DIR}/monitoring"
+if [ -f "$MONITORING_DIR/docker-compose.yml" ]; then
+ PROMETHEUS_RUNNING=$(docker ps --format '{{.Names}}' | grep -q "^dynamo-prometheus$" && echo "true" || echo "false")
+ GRAFANA_RUNNING=$(docker ps --format '{{.Names}}' | grep -q "^dynamo-grafana$" && echo "true" || echo "false")
+
+ if [ "$PROMETHEUS_RUNNING" = "false" ] || [ "$GRAFANA_RUNNING" = "false" ]; then
+ echo "Starting monitoring stack (Prometheus + Grafana)..."
+ cd "$MONITORING_DIR"
+ docker compose up -d
+ cd "$SCRIPT_DIR"
+
+ # Wait for Prometheus to be ready
+ echo "Waiting for Prometheus to be ready..."
+ for i in {1..30}; do
+ if curl -s http://localhost:9090/-/ready > /dev/null 2>&1; then
+ echo "✓ Prometheus is ready (http://localhost:9090)"
+ break
+ fi
+ if [ $i -eq 30 ]; then
+ echo "⚠ WARNING: Prometheus may not be fully ready yet"
+ fi
+ sleep 1
+ done
+
+ # Wait for Grafana to be ready
+ echo "Waiting for Grafana to be ready..."
+ for i in {1..30}; do
+ if curl -s http://localhost:3000/api/health > /dev/null 2>&1; then
+ echo "✓ Grafana is ready (http://localhost:3000)"
+ break
+ fi
+ if [ $i -eq 30 ]; then
+ echo "⚠ WARNING: Grafana may not be fully ready yet"
+ fi
+ sleep 1
+ done
+ echo ""
+ else
+ echo "✓ Monitoring stack already running"
+ echo " Prometheus: http://localhost:9090"
+ echo " Grafana: http://localhost:3000"
+ echo ""
+ fi
+else
+ echo "⚠ Monitoring docker-compose.yml not found at: $MONITORING_DIR"
+ echo " Skipping monitoring stack startup"
+ echo ""
+fi
+
+# Clean up existing Dynamo container if it exists
+if docker ps -a --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then
+ echo "Removing existing Dynamo container: $CONTAINER_NAME"
+ docker rm -f $CONTAINER_NAME
+fi
+
+# Verify HF_TOKEN is set
+if [ -z "${HF_TOKEN:-}" ]; then
+ echo ""
+ echo "⚠ HF_TOKEN environment variable is not set."
+ echo ""
+ if [ -d "$LOCAL_MODEL_DIR" ]; then
+ echo "✓ Local model found - proceeding without HF_TOKEN"
+ HF_TOKEN="dummy"
+ else
+ echo "✗ Local model NOT found and no HF_TOKEN to download it"
+ echo ""
+ read -p "Please enter your HuggingFace token (or press Enter to skip): " HF_TOKEN
+ if [ -z "$HF_TOKEN" ]; then
+ echo "WARNING: Proceeding without HF_TOKEN."
+ HF_TOKEN="dummy"
+ else
+ echo "✓ HuggingFace token received"
+ fi
+ fi
+else
+ echo "✓ HuggingFace token is set"
+fi
+echo ""
+
+# Verify model exists locally
+if [ ! -d "$LOCAL_MODEL_DIR" ]; then
+ echo "WARNING: Model directory not found at: $LOCAL_MODEL_DIR"
+ echo ""
+ echo "To download the model, run:"
+ echo " hf download meta-llama/Llama-3.3-70B-Instruct --local-dir $LOCAL_MODEL_DIR"
+ echo ""
+ read -p "Continue anyway (model will be downloaded from HuggingFace)? [y/N] " -n 1 -r
+ echo
+ if [[ ! $REPLY =~ ^[Yy]$ ]]; then
+ exit 1
+ fi
+fi
+
+# KV events are configured per-worker via --kv-events-config JSON inside the container
+# Each worker gets a unique endpoint port: tcp://*:$KV_EVENT_PORT
+
+# Start container with optimized Thompson Sampling components
+echo ""
+echo "Starting Dynamo container with OPTIMIZED Thompson Sampling components (vLLM)..."
+docker run -d \
+ --name $CONTAINER_NAME \
+ --gpus "\"device=${WORKER_GPUS}\"" \
+ --network host \
+ --ipc=host \
+ --shm-size=$SHM_SIZE \
+ --ulimit memlock=-1 \
+ --ulimit stack=67108864 \
+ -v $LOCAL_MODEL_DIR:$MODEL:ro \
+ -v $CUSTOM_DYNAMO_DIR:/workspace/custom_dynamo:ro \
+ -v ${SCRIPT_DIR}/monitoring/scripts:/workspace/monitoring/scripts:ro \
+ -e HF_TOKEN="$HF_TOKEN" \
+ -e HUGGING_FACE_HUB_TOKEN="$HF_TOKEN" \
+ -e RUST_BACKTRACE=1 \
+ -e PYTHONUNBUFFERED=1 \
+ -e DYN_HTTP_PORT=$HTTP_PORT \
+ -e DYN_ROUTER_MODE=round-robin \
+ -e WORKER_METRICS_PORT=$WORKER_METRICS_PORT \
+ -e ROUTER_METRICS_PORT=$ROUTER_METRICS_PORT \
+ -e PROCESSOR_METRICS_PORT=$PROCESSOR_METRICS_PORT \
+ -e KV_BLOCK_SIZE=$KV_BLOCK_SIZE \
+ -e GPU_MEMORY_UTILIZATION=$GPU_MEMORY_UTILIZATION \
+ -e MAX_NUM_SEQS=$MAX_NUM_SEQS \
+ -e ENABLE_KV_EVENTS=$ENABLE_KV_EVENTS \
+ -e KV_EVENT_BASE_PORT=$KV_EVENT_BASE_PORT \
+ -e DYNAMO_USE_MULTILRU=$DYNAMO_USE_MULTILRU \
+ -e DYNAMO_WORKER_COMPONENT=backend \
+ $IMAGE \
+ bash -c "
+ set -e
+
+ echo '========================================================='
+ echo 'Verifying external infrastructure services...'
+ echo '========================================================='
+
+ # Verify ETCD is accessible
+ if curl -s http://localhost:2379/health > /dev/null 2>&1; then
+ echo '✓ ETCD accessible at localhost:2379'
+ else
+ echo '✗ ERROR: ETCD not accessible at localhost:2379'
+ exit 1
+ fi
+
+ # Verify NATS is accessible
+    if timeout 2 bash -c 'cat < /dev/null > /dev/tcp/localhost/4222' 2>/dev/null; then
+ echo '✓ NATS accessible at localhost:4222'
+ else
+ echo '✗ ERROR: NATS not accessible at localhost:4222'
+ exit 1
+ fi
+
+ echo ''
+
+ # Function to wait for worker initialization via ETCD registration
+ wait_for_worker() {
+ local worker_type=\$1
+ local pid=\$2
+ # Use WORKER_INIT_TIMEOUT_S (defaults to 1800s / 30 min)
+ local max_wait=$WORKER_INIT_TIMEOUT_S
+ local elapsed=0
+ local poll_interval=5
+
+ echo \"Waiting for \$worker_type worker (PID \$pid) to initialize...\"
+ echo \" Detection: ETCD worker registration\"
+ echo \" Timeout: \${max_wait}s\"
+
+ while [ \$elapsed -lt \$max_wait ]; do
+ if ! kill -0 \$pid 2>/dev/null; then
+ echo \"ERROR: \$worker_type worker process died!\"
+ return 1
+ fi
+
+ local etcd_response=\$(curl -s --max-time 2 http://localhost:2379/v3/kv/range \
+ -X POST \
+ -H \"Content-Type: application/json\" \
+ -d '{\"key\":\"AA==\",\"range_end\":\"AA==\",\"keys_only\":true}' 2>&1)
+
+ if [ \$((elapsed % 30)) -eq 0 ] && [ \$elapsed -gt 0 ]; then
+ echo \" [DEBUG] ETCD count: \$(echo \"\$etcd_response\" | grep -o '\"count\":\"[^\"]*\"')\"
+ fi
+
+ if echo \"\$etcd_response\" | grep -q '\"count\"' && \
+ ! echo \"\$etcd_response\" | grep -q '\"count\":\"0\"'; then
+ echo \"✓ \$worker_type worker is ready (registered with ETCD at \${elapsed}s)\"
+ return 0
+ fi
+
+ sleep \$poll_interval
+ elapsed=\$((elapsed + poll_interval))
+ if [ \$((elapsed % 30)) -eq 0 ]; then
+ echo \" ... \${elapsed}s / \${max_wait}s (waiting for ETCD registration)\"
+ fi
+ done
+
+ echo \"ERROR: \$worker_type worker failed to register with ETCD within \${max_wait}s\"
+ return 1
+ }
+
+ # Function to wait for ALL workers to register with ETCD
+ # Counts workers registered at workers.backend.generate endpoint
+ wait_for_all_workers() {
+ local expected_count=\$1
+ local max_wait=$WORKER_INIT_TIMEOUT_S
+ local elapsed=0
+ local poll_interval=10
+
+ echo \"\"
+ echo \"Waiting for ALL \$expected_count vLLM workers to register with ETCD...\"
+ echo \" Detection: Count workers at workers.backend.generate endpoint\"
+ echo \" Timeout: \${max_wait}s\"
+ echo \"\"
+
+ while [ \$elapsed -lt \$max_wait ]; do
+ # Check all worker PIDs are still alive
+ for wpid in \"\${WORKER_PIDS[@]}\"; do
+ if ! kill -0 \$wpid 2>/dev/null; then
+ echo \"ERROR: Worker process \$wpid died during initialization!\"
+ return 1
+ fi
+ done
+
+ # Count worker registrations in ETCD
+ # Workers register with keys like: v1/instances/workers/backend/generate/
+ local worker_count=\$(curl -s --max-time 2 http://localhost:2379/v3/kv/range \
+ -X POST \
+ -H \"Content-Type: application/json\" \
+ -d '{
+ \"key\": \"'\"djEvaW5zdGFuY2VzL3dvcmtlcnMvYmFja2VuZC9nZW5lcmF0ZS8=\"'\",
+ \"range_end\": \"'\"djEvaW5zdGFuY2VzL3dvcmtlcnMvYmFja2VuZC9nZW5lcmF0ZTA=\"'\",
+ \"count_only\": true
+ }' 2>/dev/null | grep -o '\"count\":\"[^\"]*\"' | grep -o '[0-9]*' || echo \"0\")
+
+ if [ \"\$worker_count\" -ge \"\$expected_count\" ]; then
+ echo \"✓ All \$expected_count vLLM workers registered with ETCD (took \${elapsed}s)\"
+ return 0
+ fi
+
+ if [ \$((elapsed % 30)) -eq 0 ]; then
+ echo \" [\${elapsed}s] Workers registered: \$worker_count / \$expected_count\"
+ fi
+
+ sleep \$poll_interval
+ elapsed=\$((elapsed + poll_interval))
+ done
+
+ echo \"ERROR: Only \$worker_count / \$expected_count workers registered within \${max_wait}s\"
+ echo \" Some workers may still be initializing torch.compile (can take 10+ min first time)\"
+ return 1
+ }
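+  # Manual spot check from inside the container (assuming etcdctl is shipped
+  # in the image; otherwise reuse the curl range query above):
+  #   etcdctl get --prefix v1/instances/workers/backend/generate/ --keys-only | grep -c .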
+
+ # =========================================================================
+ # STARTUP ORDER WITH MODEL NAME ISOLATION
+ # =========================================================================
+ # Using different model names to force ALL traffic through the processor.
+ # Workers register with internal model name (${SERVED_MODEL_NAME}-internal),
+ # while processor registers with public model name (${SERVED_MODEL_NAME}).
+ # Frontend only routes to backends matching the requested model name.
+ #
+ # Order:
+ # 1. Workers (model=${SERVED_MODEL_NAME}-internal, not discovered for public model)
+ # 2. Router (needs workers to be present)
+ # 3. Processor (model=${SERVED_MODEL_NAME}, frontend discovers this)
+ # 4. Frontend (routes ${SERVED_MODEL_NAME} requests to processor ONLY)
+ # =========================================================================
+
+ echo '========================================================='
+ echo 'Step 1: Starting $NUM_WORKERS vLLM Unified Worker(s) (Host GPUs $WORKER_GPUS -> Container GPUs $CONTAINER_GPU_INDICES)...'
+ echo '========================================================='
+ # Workers register at workers.worker.generate (in 'workers' namespace)
+ # They start first so the router can discover them during initialization
+ # DYN_SYSTEM_PORT sets the Prometheus metrics port for this component
+
+ # KV events configuration
+ # NOTE: KV events are configured via --kv-events-config JSON, not --enable-kv-events flag
+ # Each worker gets a unique endpoint port via the config
+ # --enable-prefix-caching is a separate vLLM feature (always enabled by default in unified mode)
+ if [ \"\$ENABLE_KV_EVENTS\" = \"true\" ]; then
+ echo \"KV Events: ENABLED (per-worker ports starting at \$KV_EVENT_BASE_PORT)\"
+ else
+ echo \"KV Events: DISABLED (set DYNAMO_ENABLE_KV_EVENTS=true to enable)\"
+ fi
+
+ # Build optional --num-gpu-blocks-override flag (for cache size experiments)
+ GPU_BLOCKS_OVERRIDE_OPT=\"\"
+ if [ -n \"$NUM_GPU_BLOCKS_OVERRIDE\" ]; then
+ GPU_BLOCKS_OVERRIDE_OPT=\"--num-gpu-blocks-override $NUM_GPU_BLOCKS_OVERRIDE\"
+ echo \"GPU Blocks Override: $NUM_GPU_BLOCKS_OVERRIDE (experiment mode - limited cache!)\"
+ fi
+
+ # Start multiple workers, each using TP_SIZE GPUs
+ WORKER_PIDS=()
+ for i in \$(seq 0 \$(($NUM_WORKERS - 1))); do
+ # Calculate GPU range for this worker (e.g., worker 0: 0,1; worker 1: 2,3; etc.)
+ START_GPU=\$((i * $TP_SIZE))
+ END_GPU=\$(((i + 1) * $TP_SIZE - 1))
+ WORKER_GPU_LIST=\$(seq -s, \$START_GPU \$END_GPU)
+
+ # Calculate port offsets for this worker to avoid ZMQ port conflicts
+ #
+ # 1. NIXL Side Channel Ports (for KV transfer handshake)
+ # Each worker's NIXL connector uses TP_SIZE consecutive ports
+ # Port spacing = TP_SIZE (minimum needed to avoid overlap)
+ # Examples:
+ # TP=1, 8 GPUs → 8 workers: 5557, 5558, 5559, 5560, 5561, 5562, 5563, 5564
+ # TP=2, 8 GPUs → 4 workers: 5557-5558, 5559-5560, 5561-5562, 5563-5564
+ # TP=4, 8 GPUs → 2 workers: 5557-5560, 5561-5564
+ # TP=8, 8 GPUs → 1 worker: 5557-5564
+ NIXL_BASE_PORT=\$((5557 + i * $TP_SIZE))
+
+ # 2. KV Event Publisher Port (for publishing KV cache events to subscriber)
+ # Each worker needs a unique port for its ZMQ publisher
+ # Set via DYN_VLLM_KV_EVENT_PORT environment variable
+ # Default base: 20080, Worker 0: 20080, Worker 1: 20081, etc.
+ KV_EVENT_PORT=\$(($KV_EVENT_BASE_PORT + i))
+
+ echo \"Starting vLLM Worker \$i: GPUs \$WORKER_GPU_LIST (internal model name)\"
+ echo \" KV Block Size: $KV_BLOCK_SIZE tokens, GPU Mem Util: $GPU_MEMORY_UTILIZATION, Max Seqs: $MAX_NUM_SEQS\"
+ echo \" NIXL Port Range: \$NIXL_BASE_PORT - \$((NIXL_BASE_PORT + $TP_SIZE - 1)) (TP=$TP_SIZE)\"
+ echo \" KV Event Port: \$KV_EVENT_PORT (KV Events: $ENABLE_KV_EVENTS)\"
+ # NOTE: dynamo.vllm does NOT accept --host/--port/--endpoint like dynamo.sglang
+ # Endpoint is set via DYN_ENDPOINT env var, namespace via DYN_NAMESPACE
+ # VLLM_NIXL_SIDE_CHANNEL_PORT sets the base port for NIXL handshake listener
+ # DYN_VLLM_KV_EVENT_PORT sets the port for KV event publishing (unique per worker)
+ # KV events are configured via --kv-events-config JSON with unique endpoint per worker
+
+ # Build KV events config JSON for this worker (unique endpoint per worker)
+ KV_EVENTS_JSON=\"{\\\"enable_kv_cache_events\\\":true,\\\"publisher\\\":\\\"zmq\\\",\\\"endpoint\\\":\\\"tcp://*:\$KV_EVENT_PORT\\\"}\"
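+    # For worker 0 with the default base port this expands to (illustrative):
+    #   {\"enable_kv_cache_events\":true,\"publisher\":\"zmq\",\"endpoint\":\"tcp://*:20080\"}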
+
+ # Build scheduler class option - use DynamoScheduler for MultiLruBackend if available
+ # Set DYNAMO_USE_MULTILRU=false to disable
+ SCHEDULER_OPT=\"\"
+ if [ \"\${DYNAMO_USE_MULTILRU:-false}\" = \"true\" ]; then
+ SCHEDULER_OPT=\"--scheduler-cls kvbm.v2.vllm.schedulers.dynamo.DynamoScheduler\"
+ echo \" Scheduler: DynamoScheduler with MultiLruBackend (frequency-based eviction)\"
+ else
+ echo \" Scheduler: Default vLLM scheduler\"
+ fi
+
+ if [ \"\$ENABLE_KV_EVENTS\" = \"true\" ]; then
+ CUDA_VISIBLE_DEVICES=\$WORKER_GPU_LIST \
+ DYN_SYSTEM_PORT=\$((WORKER_METRICS_PORT + i)) \
+ DYN_NAMESPACE=workers \
+ DYN_ENDPOINT=workers.worker.generate \
+ VLLM_NIXL_SIDE_CHANNEL_PORT=\$NIXL_BASE_PORT \
+ DYN_VLLM_KV_EVENT_PORT=\$KV_EVENT_PORT \
+ python3 -m dynamo.vllm \
+ --model $MODEL \
+ --served-model-name ${SERVED_MODEL_NAME}-internal \
+ --tensor-parallel-size $TP_SIZE \
+ --trust-remote-code \
+ --block-size $KV_BLOCK_SIZE \
+ --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
+ --max-num-seqs $MAX_NUM_SEQS \
+ \$SCHEDULER_OPT \
+ \$GPU_BLOCKS_OVERRIDE_OPT \
+ --kv-events-config \"\$KV_EVENTS_JSON\" &
+ else
+ CUDA_VISIBLE_DEVICES=\$WORKER_GPU_LIST \
+ DYN_SYSTEM_PORT=\$((WORKER_METRICS_PORT + i)) \
+ DYN_NAMESPACE=workers \
+ DYN_ENDPOINT=workers.worker.generate \
+ VLLM_NIXL_SIDE_CHANNEL_PORT=\$NIXL_BASE_PORT \
+ DYN_VLLM_KV_EVENT_PORT=\$KV_EVENT_PORT \
+ python3 -m dynamo.vllm \
+ --model $MODEL \
+ --served-model-name ${SERVED_MODEL_NAME}-internal \
+ --tensor-parallel-size $TP_SIZE \
+ --trust-remote-code \
+ --block-size $KV_BLOCK_SIZE \
+ --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
+ --max-num-seqs $MAX_NUM_SEQS \
+ \$SCHEDULER_OPT \
+ \$GPU_BLOCKS_OVERRIDE_OPT &
+ fi
+ WORKER_PIDS+=(\$!)
+ echo \" Worker \$i PID: \${WORKER_PIDS[\$i]}\"
+ done
+ echo \"\"
+ echo \"Total workers started: \${#WORKER_PIDS[@]}\"
+ echo \"Worker PIDs: \${WORKER_PIDS[*]}\"
+ echo \"Registered at: workers.worker.generate (model: ${SERVED_MODEL_NAME}-internal)\"
+ echo \"NOTE: Workers use internal model name so frontend only discovers processor\"
+ echo \"\"
+
+ # Wait for first worker to initialize (checks ETCD registration)
+ wait_for_worker \"vLLM Unified\" \${WORKER_PIDS[0]} || exit 1
+
+ # Wait for ALL workers to register with ETCD
+ # vLLM workers can take a long time to initialize due to torch.compile
+ if [ \${#WORKER_PIDS[@]} -gt 1 ]; then
+ wait_for_all_workers \${#WORKER_PIDS[@]} || {
+ echo \"WARNING: Not all workers initialized. Continuing with available workers.\"
+ echo \" Dashboard metrics may be incomplete.\"
+ }
+ fi
+
+ echo ''
+ echo '========================================================='
+ echo 'Step 2: Starting Custom Router (Thompson Sampling + Prometheus)...'
+ echo '========================================================='
+ # Router uses config.yaml for all parameters
+ # It needs workers to be present (started in Step 1)
+ # DYN_SYSTEM_PORT sets the Prometheus metrics port for this component
+ DYN_SYSTEM_PORT=\$ROUTER_METRICS_PORT \
+ python3 /workspace/custom_dynamo/$ROUTER_SCRIPT \
+ --config /workspace/custom_dynamo/config.yaml &
+ ROUTER_PID=\$!
+ echo \"Router PID: \$ROUTER_PID\"
+ echo \"Metrics at: http://localhost:\$ROUTER_METRICS_PORT/metrics\"
+ sleep 15
+ echo \"\"
+
+ echo ''
+ echo '========================================================='
+ echo 'Step 3: Starting Custom Processor (Static Mode)...'
+ echo '========================================================='
+ # STATIC MODE: Processor uses @dynamo_worker(static=True) so it registers
+ # at dynamo.backend.generate WITHOUT an instance ID. This is required for
+ # --static-endpoint on the frontend to find it.
+ # DYN_SYSTEM_PORT sets the Prometheus metrics port for this component
+ DYN_SYSTEM_PORT=\$PROCESSOR_METRICS_PORT \
+ python3 /workspace/custom_dynamo/$PROCESSOR_SCRIPT \
+ --enable-router \
+ --model-path $MODEL \
+ --model-name $SERVED_MODEL_NAME &
+ PROCESSOR_PID=\$!
+ echo \"Processor PID: \$PROCESSOR_PID\"
+ echo \"Model: $SERVED_MODEL_NAME (from $MODEL)\"
+ echo \"Registered at: dynamo.backend.generate (namespace=dynamo)\"
+ echo \"Forwards to: workers.worker.generate (actual vLLM workers)\"
+ echo \"Metrics at: http://localhost:\$PROCESSOR_METRICS_PORT/metrics\"
+ sleep 15
+ echo \"\"
+
+ echo ''
+ echo '========================================================='
+ echo 'Step 4: Starting Default Dynamo Frontend (Namespace-Scoped Discovery)...'
+ echo '========================================================='
+ # NAMESPACE-SCOPED DISCOVERY: Frontend discovers backends via ETCD ModelWatcher,
+ # but only from the 'dynamo' namespace. Workers are in the 'workers' namespace,
+ # so the frontend will ONLY discover the processor (in 'dynamo' namespace).
+ # This ensures ALL requests go through the Thompson Sampling router.
+ echo \"Frontend KV Block Size: $KV_BLOCK_SIZE tokens (must match worker --block-size)\"
+ python3 -m dynamo.frontend \
+ --http-port $HTTP_PORT \
+ --model-name $SERVED_MODEL_NAME \
+ --model-path $MODEL \
+ --kv-cache-block-size $KV_BLOCK_SIZE \
+ --namespace dynamo &
+ FRONTEND_PID=\$!
+ echo \"Frontend PID: \$FRONTEND_PID\"
+ echo \"Discovery: ETCD ModelWatcher (namespace=dynamo, discovers processor ONLY)\"
+ sleep 15
+ echo \"\"
+
+ echo ''
+ echo '========================================================='
+ echo '✓ All components started successfully!'
+ echo '========================================================='
+ echo \"Infrastructure Services (External):\"
+ echo \" ETCD: localhost:2379\"
+ echo \" NATS: localhost:4222\"
+ echo \"\"
+ echo \"Dynamo Components (This Container):\"
+ echo \" vLLM Unified Workers: \${#WORKER_PIDS[@]} workers (GPUs $WORKER_GPUS, TP=$TP_SIZE each)\"
+ for i in \$(seq 0 \$((\${#WORKER_PIDS[@]} - 1))); do
+ START_GPU=\$((i * $TP_SIZE))
+ END_GPU=\$(((i + 1) * $TP_SIZE - 1))
+ echo \" Worker \$i: PID \${WORKER_PIDS[\$i]}, GPUs \$START_GPU-\$END_GPU\"
+ done
+ echo \" → Registered at: workers.worker.generate (hidden from frontend)\"
+ echo \" Router: PID \$ROUTER_PID (Thompson Sampling + Prometheus)\"
+ echo \" → Registered at: dynamo.router.{find_worker,feedback}\"
+ echo \" → Metrics: http://localhost:\$ROUTER_METRICS_PORT/metrics\"
+ echo \" Processor: PID \$PROCESSOR_PID (NVExt annotation extraction)\"
+ echo \" → Registered at: dynamo.backend.generate (STATIC mode)\"
+ echo \" → Metrics: http://localhost:\$PROCESSOR_METRICS_PORT/metrics\"
+ echo \" Frontend: PID \$FRONTEND_PID (Default Dynamo HTTP API on port $HTTP_PORT)\"
+ echo \" → Discovery: ETCD ModelWatcher\"
+ echo \" → Metrics: http://localhost:$HTTP_PORT/metrics\"
+ echo ''
+ echo 'Request Flow (Dynamic Discovery - Thompson Sampling when routed to processor):'
+ echo ' Client → Default Frontend API (port $HTTP_PORT)'
+ echo ' ↓ (tokenization + nvext parsing)'
+ echo ' Frontend routes via ETCD ModelWatcher (processor OR workers)'
+ echo ' ↓'
+ echo ' IF routed to Processor (dynamo.backend.generate):'
+ echo ' ↓ (extract hints from annotations)'
+ echo ' ↓ (query Thompson Sampling router)'
+ echo ' Custom Router → worker_id'
+ echo ' ↓ (KV overlap + workload-aware selection)'
+ echo ' Processor routes to → workers.worker.generate (with worker_id)'
+ echo ' ↓'
+ echo ' vLLM Unified Worker (workers.worker.generate)'
+ echo ' ↓'
+ echo ' Response + Feedback to Router'
+ echo ''
+ echo 'Prometheus Metrics Endpoints:'
+ echo ' - Frontend: http://localhost:$HTTP_PORT/metrics (latency, throughput)'
+  # Double-quoted (not single) so the container-side \$... port variables expand
+  echo \"  - Workers:   http://localhost:\$WORKER_METRICS_PORT/metrics - \$((WORKER_METRICS_PORT + \${#WORKER_PIDS[@]} - 1))/metrics (KV cache)\"
+  echo \"  - Router:    http://localhost:\$ROUTER_METRICS_PORT/metrics (thompson_router_*)\"
+  echo \"  - Processor: http://localhost:\$PROCESSOR_METRICS_PORT/metrics (thompson_* KVE)\"
+ echo '========================================================='
+
+ # Monitor all processes
+ while true; do
+ if ! kill -0 \$FRONTEND_PID 2>/dev/null; then
+ echo \"ERROR: Frontend died!\"
+ exit 1
+ fi
+ if ! kill -0 \$PROCESSOR_PID 2>/dev/null; then
+ echo \"ERROR: Processor died!\"
+ exit 1
+ fi
+ if ! kill -0 \$ROUTER_PID 2>/dev/null; then
+ echo \"ERROR: Router died!\"
+ exit 1
+ fi
+ for i in \$(seq 0 \$((\${#WORKER_PIDS[@]} - 1))); do
+ if ! kill -0 \${WORKER_PIDS[\$i]} 2>/dev/null; then
+ echo \"ERROR: Worker \$i (PID \${WORKER_PIDS[\$i]}) died!\"
+ exit 1
+ fi
+ done
+ sleep 10
+ done
+ "
+
+# Wait for container to start
+echo ""
+echo "Waiting for container to start..."
+sleep 15
+
+# Check if container started successfully
+if docker ps --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then
+ echo ""
+ echo "========================================================="
+ echo "✓ Dynamo with OPTIMIZED Thompson Sampling Router Started! (vLLM)"
+ echo "========================================================="
+ echo ""
+ echo "Architecture (Model Name Isolation - Thompson Sampling):"
+ echo ""
+ echo " Model Name Isolation Mode:"
+ echo " - Workers register with internal model name (${SERVED_MODEL_NAME}-internal)"
+ echo " - Processor registers with public model name (${SERVED_MODEL_NAME})"
+ echo " - Frontend routes ${SERVED_MODEL_NAME} requests to processor ONLY"
+ echo " - ALL requests go through Thompson Sampling router"
+ echo ""
+ echo " Startup Order:"
+ echo " 1. Workers → model=${SERVED_MODEL_NAME}-internal (not matched by frontend)"
+ echo " 2. Router → dynamo.router.{find_worker,feedback}"
+ echo " 3. Processor → model=${SERVED_MODEL_NAME} (matched by frontend)"
+ echo " 4. Frontend → routes to processor for ${SERVED_MODEL_NAME} requests"
+ echo ""
+ echo " Request Flow (ALL requests go through processor):"
+ echo " Client Request (with nvext.annotations)"
+ echo " ↓"
+ echo " Default Dynamo Frontend (port $HTTP_PORT)"
+ echo " ↓ ETCD ModelWatcher (namespace=dynamo) routes to processor"
+ echo " Custom Processor (dynamo.backend.generate)"
+ echo " ↓ extracts: prefix_id, total_requests, osl, iat"
+ echo " ↓ queries Thompson Sampling router"
+ echo " Custom Router → worker_id"
+ echo " ↓ KV overlap + workload-aware selection"
+ echo " Processor forwards to workers.worker.generate"
+ echo " ↓"
+ echo " vLLM Unified Workers ($NUM_WORKERS x TP=$TP_SIZE = $NUM_GPUS GPUs total)"
+ echo " ↓"
+ echo " Response + Feedback Loop"
+ echo ""
+ echo "Infrastructure Services (Managed):"
+ echo " ETCD: etcd-dynamo container, localhost:2379"
+ echo " NATS: nats-dynamo container, localhost:4222"
+ echo ""
+ echo "Prometheus Metrics Endpoints:"
+ echo " Frontend: http://localhost:$HTTP_PORT/metrics (latency, throughput)"
+ echo " Workers: http://localhost:$WORKER_METRICS_PORT/metrics - $((WORKER_METRICS_PORT + NUM_WORKERS - 1))/metrics (KV cache)"
+ echo " Router: http://localhost:$ROUTER_METRICS_PORT/metrics (routing)"
+ echo " Processor: http://localhost:$PROCESSOR_METRICS_PORT/metrics (KVE)"
+ echo ""
+ echo "Dynamo Components:"
+ echo " Frontend: HTTP API on port $HTTP_PORT"
+ echo " vLLM Unified Workers: $NUM_WORKERS workers (TP=$TP_SIZE each)"
+ echo ""
+ echo "KV Cache Settings:"
+ echo " Block Size: $KV_BLOCK_SIZE tokens (DYNAMO_KV_BLOCK_SIZE)"
+ echo " GPU Mem Utilization: $GPU_MEMORY_UTILIZATION (DYNAMO_GPU_MEMORY_UTILIZATION)"
+ echo " Max Concurrent Seqs: $MAX_NUM_SEQS (DYNAMO_MAX_NUM_SEQS)"
+ echo " KV Events: $ENABLE_KV_EVENTS (DYNAMO_ENABLE_KV_EVENTS)"
+ if [ "${DYNAMO_USE_MULTILRU:-false}" = "true" ]; then
+ echo " Scheduler: DynamoScheduler with MultiLruBackend (DYNAMO_USE_MULTILRU=true)"
+ echo " → 4-pool system: Cold→Warm→Hot→VeryHot"
+ echo " → Promotion thresholds: [2, 6, 15] accesses"
+ else
+ echo " Scheduler: Default vLLM scheduler (DYNAMO_USE_MULTILRU=false)"
+ fi
+ echo ""
+ echo "API Endpoint: http://localhost:$HTTP_PORT/v1/chat/completions"
+ echo "Health Check: http://localhost:$HTTP_PORT/health"
+ echo ""
+ echo "NVExt Annotations (in request body):"
+ echo " \"nvext\": {"
+ echo " \"annotations\": ["
+ echo " \"prefix_id:\","
+ echo " \"total_requests:\","
+ echo " \"osl:LOW|MEDIUM|HIGH\","
+ echo " \"iat:LOW|MEDIUM|HIGH\""
+ echo " ]"
+ echo " }"
+ echo ""
+ echo "Monitoring Dashboards:"
+ echo " Grafana: http://localhost:3000 (no login required)"
+ echo " Prometheus: http://localhost:9090"
+ echo ""
+ echo "Useful Commands:"
+ echo " Interactive shell: docker exec -it $CONTAINER_NAME bash"
+ echo " View Dynamo logs: docker logs -f $CONTAINER_NAME"
+ echo " View ETCD logs: docker logs -f etcd-dynamo"
+ echo " View NATS logs: docker logs -f nats-dynamo"
+ echo " GPU usage: watch -n 2 nvidia-smi"
+ echo " Stop all: bash stop_dynamo.sh"
+ echo " Stop all + metrics: bash stop_dynamo.sh --kill-metrics"
+ echo ""
+ echo "Query Metrics (vLLM uses 'vllm:' prefix):"
+ echo " curl http://localhost:$HTTP_PORT/metrics | grep dynamo_frontend"
+ echo " curl http://localhost:$WORKER_METRICS_PORT/metrics | grep vllm:"
+ echo " curl http://localhost:$ROUTER_METRICS_PORT/metrics | grep thompson_router"
+ echo " curl http://localhost:$PROCESSOR_METRICS_PORT/metrics | grep thompson_kve"
+ echo ""
+ echo "========================================================="
+ echo "Test Request (with nvext annotations):"
+ echo "========================================================="
+ echo ""
+ echo "# Basic test (no hints)"
+ echo "curl http://localhost:$HTTP_PORT/v1/chat/completions \\"
+ echo " -H 'Content-Type: application/json' \\"
+ echo " -d '{"
+ echo " \"model\": \"$SERVED_MODEL_NAME\","
+ echo " \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}],"
+ echo " \"max_tokens\": 50"
+ echo " }'"
+ echo ""
+ echo "# Test with nvext annotations (routing hints)"
+ echo "curl http://localhost:$HTTP_PORT/v1/chat/completions \\"
+ echo " -H 'Content-Type: application/json' \\"
+ echo " -d '{"
+ echo " \"model\": \"$SERVED_MODEL_NAME\","
+ echo " \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}],"
+ echo " \"max_tokens\": 50,"
+ echo " \"nvext\": {"
+ echo " \"annotations\": ["
+ echo " \"prefix_id:test-session-001\","
+ echo " \"total_requests:5\","
+ echo " \"osl:MEDIUM\","
+ echo " \"iat:LOW\""
+ echo " ]"
+ echo " }"
+ echo " }'"
+ echo ""
+ echo "# Streaming test with hints"
+ echo "curl http://localhost:$HTTP_PORT/v1/chat/completions \\"
+ echo " -H 'Content-Type: application/json' \\"
+ echo " -d '{"
+ echo " \"model\": \"$SERVED_MODEL_NAME\","
+ echo " \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}],"
+ echo " \"max_tokens\": 50,"
+ echo " \"stream\": true,"
+ echo " \"nvext\": {"
+ echo " \"annotations\": [\"prefix_id:stream-test\", \"total_requests:1\"]"
+ echo " }"
+ echo " }'"
+ echo ""
+ echo "========================================================="
+ echo ""
+ echo "Waiting for vLLM to initialize (this may take 5-10 minutes for a 70B model)..."
+ echo "Monitoring logs (Ctrl+C to exit, container continues)..."
+ echo ""
+
+ # Wait for server to be ready
+ echo "Checking for API availability (timeout=${WORKER_INIT_TIMEOUT_S}s)..."
+ max_attempts=$WORKER_INIT_TIMEOUT_S
+ attempt=0
+
+ while [ $attempt -lt $max_attempts ]; do
+ # Use || true to prevent curl connection failures from exiting due to set -e
+ # curl returns "000" for connection refused, so we just need to prevent the exit
+ health_response=$(curl -s --max-time 5 -o /dev/null -w "%{http_code}" http://localhost:$HTTP_PORT/health 2>/dev/null) || true
+ if [ "$health_response" = "200" ]; then
+ echo "✓ Dynamo API is ready! (health check passed)"
+ break
+ fi
+ attempt=$((attempt + 1))
+ if [ $((attempt % 15)) -eq 0 ]; then
+ echo " ... still waiting ($attempt/$max_attempts) - health response: $health_response"
+ fi
+ sleep 1
+ done
+
+ if [ $attempt -ge $max_attempts ]; then
+ echo ""
+ echo "⚠ Timeout waiting for API. Check logs with: docker logs $CONTAINER_NAME"
+ echo ""
+ else
+ echo ""
+ echo "Quick test (polling every 15s for up to 5 minutes):"
+ echo ""
+
+ quick_test_max_attempts=20 # 20 * 15s = 5 minutes
+ quick_test_attempt=0
+ quick_test_success=false
+
+ while [ $quick_test_attempt -lt $quick_test_max_attempts ]; do
+ quick_test_attempt=$((quick_test_attempt + 1))
+ echo " Attempt $quick_test_attempt/$quick_test_max_attempts..."
+
+ quick_test_response=$(curl -s --max-time 60 http://localhost:$HTTP_PORT/v1/chat/completions \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "'$SERVED_MODEL_NAME'",
+ "messages": [{"role": "user", "content": "Say hello"}],
+ "max_tokens": 20
+ }' 2>&1) || true
+
+ # Check if response is empty/null
+ if [ -z "$quick_test_response" ]; then
+ echo " Empty response, retrying in 15s..."
+ sleep 15
+ continue
+ fi
+
+ # Check if response contains an error
+ error_message=$(echo "$quick_test_response" | jq -r '.error.message // .error // empty' 2>/dev/null)
+ if [ -n "$error_message" ]; then
+ echo ""
+ echo "========================================================="
+ echo "✗ Quick test failed with error:"
+ echo " $error_message"
+ echo "========================================================="
+ echo ""
+ echo "Full response:"
+ echo "$quick_test_response" | jq . 2>/dev/null || echo "$quick_test_response"
+ echo ""
+ echo "Check logs with: docker logs $CONTAINER_NAME"
+ exit 1
+ fi
+
+ # Check if response has valid choices (success)
+ choices_content=$(echo "$quick_test_response" | jq -r '.choices[0].message.content // empty' 2>/dev/null)
+ if [ -n "$choices_content" ]; then
+ echo ""
+ echo "========================================================="
+ echo "✓ Quick test successful!"
+ echo "========================================================="
+ echo ""
+ echo "$quick_test_response" | jq '.choices[0].message.content, .usage'
+ echo ""
+ echo "========================================================="
+ echo "Container is running. View logs with:"
+ echo " docker logs -f $CONTAINER_NAME"
+ echo "========================================================="
+ quick_test_success=true
+ break
+ fi
+
+ # Response exists but no choices - might still be loading
+ echo " Response received but no valid choices, retrying in 15s..."
+ echo " Response: $(echo "$quick_test_response" | head -c 200)..."
+ sleep 15
+ done
+
+ if [ "$quick_test_success" = false ]; then
+ echo ""
+ echo "========================================================="
+ echo "⚠ Quick test timed out after 5 minutes"
+ echo "========================================================="
+ echo ""
+ echo "Container is running but may not be fully ready."
+ echo "Try manually: curl http://localhost:$HTTP_PORT/v1/chat/completions ..."
+ echo "Check logs with: docker logs $CONTAINER_NAME"
+ fi
+ fi
+else
+ echo ""
+ echo "========================================================="
+ echo "✗ Container failed to start!"
+ echo "========================================================="
+ echo ""
+ echo "Check logs with: docker logs $CONTAINER_NAME"
+ exit 1
+fi
+
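+# Example invocation (illustrative, assuming this launcher follows the same
+# DYNAMO_* override convention as start_dynamo_unified.sh): defaults can be
+# overridden inline, e.g.
+#   DYNAMO_HTTP_PORT=8001 DYNAMO_WORKER_INIT_TIMEOUT_S=3600 \
+#     bash start_dynamo_optimized_thompson_hints_vllm.sh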
diff --git a/external/dynamo/start_dynamo_unified.sh b/external/dynamo/start_dynamo_unified.sh
index c99a3114a9..5ed3e34bd5 100755
--- a/external/dynamo/start_dynamo_unified.sh
+++ b/external/dynamo/start_dynamo_unified.sh
@@ -32,9 +32,9 @@
# Configuration Variables (can be overridden via environment variables)
CONTAINER_NAME="dynamo-sglang"
-WORKER_GPUS="${DYNAMO_GPU_DEVICES:-0,1,2,3}"
-TP_SIZE="${DYNAMO_TP_SIZE:-4}"
-HTTP_PORT="${DYNAMO_HTTP_PORT:-8099}"
+WORKER_GPUS="${DYNAMO_GPU_DEVICES:-0,1,2,3,4,5,6,7}"
+TP_SIZE="${DYNAMO_TP_SIZE:-2}"
+HTTP_PORT="${DYNAMO_HTTP_PORT:-8000}"
MODEL="/workspace/models/Llama-3.3-70B-Instruct"
SERVED_MODEL_NAME="${DYNAMO_MODEL_NAME:-llama-3.3-70b}"
IMAGE="nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.7.1"
@@ -44,12 +44,15 @@ SHM_SIZE="${DYNAMO_SHM_SIZE:-16g}"
ETCD_CLIENT_PORT="${DYNAMO_ETCD_PORT:-2379}"
ETCD_PEER_PORT="${DYNAMO_ETCD_PEER_PORT:-2390}"
NATS_PORT="${DYNAMO_NATS_PORT:-4222}"
-WORKER_INIT_TIMEOUT_S="${DYNAMO_WORKER_INIT_TIMEOUT_S:-600}"
+WORKER_INIT_TIMEOUT_S="${DYNAMO_WORKER_INIT_TIMEOUT_S:-1800}"
# Compute container-internal GPU indices (GPUs are renumbered 0,1,2,... inside the container)
NUM_GPUS=$(echo "$WORKER_GPUS" | tr ',' '\n' | wc -l)
CONTAINER_GPU_INDICES=$(seq -s, 0 $((NUM_GPUS - 1)))
+# Calculate number of workers based on available GPUs and TP size
+NUM_WORKERS=$((NUM_GPUS / TP_SIZE))
+
# Local paths - DYNAMO_MODEL_DIR must be set or script will error
if [ -z "${DYNAMO_MODEL_DIR}" ]; then
echo "ERROR: DYNAMO_MODEL_DIR environment variable must be set"
@@ -82,7 +85,7 @@ if [ -d "${DYNAMO_MODEL_DIR}" ]; then
echo ""
echo "This usually means incomplete/corrupted download. Try:"
echo " rm -rf ${DYNAMO_MODEL_DIR}"
- echo " huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir ${DYNAMO_MODEL_DIR}"
+ echo " hf download meta-llama/Llama-3.3-70B-Instruct --local-dir ${DYNAMO_MODEL_DIR}"
exit 1
fi
fi
@@ -101,8 +104,9 @@ echo " - NATS (message queue for requests)"
echo " - Dynamo Frontend (HTTP API on port $HTTP_PORT)"
echo " - SGLang Worker (unified mode)"
echo ""
-echo "Backend Worker:"
-echo " Unified: GPUs $WORKER_GPUS (TP=$TP_SIZE)"
+echo "Backend Workers:"
+echo " Workers: $NUM_WORKERS (GPUs: $NUM_GPUS, TP=$TP_SIZE per worker)"
+echo " GPUs: $WORKER_GPUS"
echo " Mode: UNIFIED (no prefill/decode disaggregation)"
echo ""
echo "========================================================="
@@ -217,7 +221,7 @@ if [ ! -d "$LOCAL_MODEL_DIR" ]; then
echo "WARNING: Model directory not found at: $LOCAL_MODEL_DIR"
echo ""
echo "To download the model, run:"
- echo " huggingface-cli download meta-llama/Llama-3.3-70B-Instruct --local-dir $LOCAL_MODEL_DIR"
+ echo " hf download meta-llama/Llama-3.3-70B-Instruct --local-dir $LOCAL_MODEL_DIR"
echo ""
read -p "Continue anyway (model will be downloaded from HuggingFace)? [y/N] " -n 1 -r
echo
@@ -277,8 +281,8 @@ docker run -d \
wait_for_worker() {
local worker_type=\$1
local pid=\$2
- # local max_wait=300
- local max_wait=${DYNAMO_WORKER_INIT_TIMEOUT_S:-600}
+ # Use WORKER_INIT_TIMEOUT_S (defaults to 1800s / 30 min)
+ local max_wait=$WORKER_INIT_TIMEOUT_S
local elapsed=0
local poll_interval=5
@@ -329,23 +333,45 @@ docker run -d \
}
echo '========================================================='
- echo 'Step 1: Starting Unified Worker (Host GPUs $WORKER_GPUS -> Container GPUs $CONTAINER_GPU_INDICES)...'
+ echo 'Step 1: Starting $NUM_WORKERS Unified Worker(s) (Host GPUs $WORKER_GPUS -> Container GPUs $CONTAINER_GPU_INDICES)...'
echo '========================================================='
- CUDA_VISIBLE_DEVICES=$CONTAINER_GPU_INDICES \
- python3 -m dynamo.sglang \
- --model-path $MODEL \
- --served-model-name $SERVED_MODEL_NAME \
- --host 0.0.0.0 \
- --port 30000 \
- --tp $TP_SIZE \
- --trust-remote-code \
- --mem-fraction-static 0.8 &
- WORKER_PID=\$!
- echo \"Unified Worker PID: \$WORKER_PID\"
+
+ # Start multiple workers, each using TP_SIZE GPUs
+ WORKER_PIDS=()
+ for i in \$(seq 0 \$(($NUM_WORKERS - 1))); do
+ # Calculate GPU range for this worker (e.g., worker 0: 0,1; worker 1: 2,3; etc.)
+ START_GPU=\$((i * $TP_SIZE))
+ END_GPU=\$(((i + 1) * $TP_SIZE - 1))
+ WORKER_GPU_LIST=\$(seq -s, \$START_GPU \$END_GPU)
+ WORKER_PORT=\$((30000 + i))
+
+ echo \"Starting Worker \$i: GPUs \$WORKER_GPU_LIST, Port \$WORKER_PORT\"
+ CUDA_VISIBLE_DEVICES=\$WORKER_GPU_LIST \
+ python3 -m dynamo.sglang \
+ --model-path $MODEL \
+ --served-model-name $SERVED_MODEL_NAME \
+ --host 0.0.0.0 \
+ --port \$WORKER_PORT \
+ --tp $TP_SIZE \
+ --trust-remote-code \
+ --mem-fraction-static 0.9 &
+ WORKER_PIDS+=(\$!)
+ echo \" Worker \$i PID: \${WORKER_PIDS[\$i]}\"
+ done
echo \"\"
+ echo \"Total workers started: \${#WORKER_PIDS[@]}\"
+ echo \"Worker PIDs: \${WORKER_PIDS[*]}\"
+ echo \"\"
+
+ # Wait for first worker to initialize (checks ETCD registration)
+ # Once one worker is registered, the frontend can start discovering workers
+ wait_for_worker \"Unified\" \${WORKER_PIDS[0]} || exit 1
- # Wait for unified worker to initialize (checks ETCD registration)
- wait_for_worker \"Unified\" \$WORKER_PID || exit 1
+ # Give additional workers time to initialize
+ if [ \${#WORKER_PIDS[@]} -gt 1 ]; then
+ echo \"Waiting additional 30s for remaining workers to initialize...\"
+ sleep 30
+ fi
echo ''
echo '========================================================='
@@ -370,7 +396,12 @@ docker run -d \
echo \" NATS: localhost:$NATS_PORT\"
echo \"\"
echo \"Dynamo Components (This Container):\"
- echo \" Unified Worker: PID \$WORKER_PID (GPUs $WORKER_GPUS, TP=$TP_SIZE, internal port 30000)\"
+ echo \" Unified Workers: \${#WORKER_PIDS[@]} workers (GPUs $WORKER_GPUS, TP=$TP_SIZE each)\"
+ for i in \$(seq 0 \$((\${#WORKER_PIDS[@]} - 1))); do
+ START_GPU=\$((i * $TP_SIZE))
+ END_GPU=\$(((i + 1) * $TP_SIZE - 1))
+ echo \" Worker \$i: PID \${WORKER_PIDS[\$i]}, GPUs \$START_GPU-\$END_GPU, port \$((30000 + i))\"
+ done
echo \" Frontend: PID \$FRONTEND_PID (HTTP API on port $HTTP_PORT)\"
echo ''
echo 'Request Flow:'
@@ -390,10 +421,12 @@ docker run -d \
echo \"ERROR: Frontend died!\"
exit 1
fi
- if ! kill -0 \$WORKER_PID 2>/dev/null; then
- echo \"ERROR: Unified worker died!\"
- exit 1
- fi
+ for i in \$(seq 0 \$((\${#WORKER_PIDS[@]} - 1))); do
+ if ! kill -0 \${WORKER_PIDS[\$i]} 2>/dev/null; then
+ echo \"ERROR: Worker \$i (PID \${WORKER_PIDS[\$i]}) died!\"
+ exit 1
+ fi
+ done
sleep 10
done
"
@@ -417,9 +450,9 @@ if docker ps --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then
echo " ↓"
echo " Frontend discovers workers via ETCD"
echo " ↓"
- echo " Frontend routes to Unified Worker"
+ echo " Frontend routes to one of $NUM_WORKERS Unified Workers"
echo " ↓ (localhost:$ETCD_CLIENT_PORT - worker discovery)"
- echo " Unified Worker (GPUs $WORKER_GPUS, TP=$TP_SIZE)"
+ echo " Unified Workers ($NUM_WORKERS x TP=$TP_SIZE = $NUM_GPUS GPUs total)"
echo " ↓"
echo " Response"
echo ""
@@ -429,7 +462,7 @@ if docker ps --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then
echo ""
echo "Dynamo Components (This Container):"
echo " Frontend: HTTP API on port $HTTP_PORT"
- echo " Unified Worker: GPUs $WORKER_GPUS (TP=$TP_SIZE, internal port 30000)"
+ echo " Unified Workers: $NUM_WORKERS workers (TP=$TP_SIZE each, ports 30000-$((30000 + NUM_WORKERS - 1)))"
echo ""
echo "API Endpoint: http://localhost:$HTTP_PORT/v1/chat/completions"
echo "Health Check: http://localhost:$HTTP_PORT/health"
@@ -445,7 +478,7 @@ if docker ps --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then
echo ""
echo "========================================================="
echo "Test Request:"
   echo "========================================================="
echo ""
echo "# Basic test"
echo "curl http://localhost:$HTTP_PORT/v1/chat/completions \\"
@@ -484,8 +517,8 @@ if docker ps --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then
echo ""
# Wait for server to be ready (check /v1/models which only works when workers are discovered)
- echo "Checking for API availability (timeout=15 minutes)..."
- max_attempts=900
+ echo "Checking for API availability (timeout=${WORKER_INIT_TIMEOUT_S}s)..."
+ max_attempts=$WORKER_INIT_TIMEOUT_S
attempt=0
while [ $attempt -lt $max_attempts ]; do
diff --git a/external/dynamo/start_dynamo_unified_thompson_hints.sh b/external/dynamo/start_dynamo_unified_thompson_hints.sh
index 86977029c2..3a804b892d 100755
--- a/external/dynamo/start_dynamo_unified_thompson_hints.sh
+++ b/external/dynamo/start_dynamo_unified_thompson_hints.sh
@@ -41,7 +41,7 @@
CONTAINER_NAME="dynamo-sglang"
WORKER_GPUS="${DYNAMO_GPU_DEVICES:-0,1,2,3}"
TP_SIZE="${DYNAMO_TP_SIZE:-4}"
-HTTP_PORT="${DYNAMO_HTTP_PORT:-8099}"
+HTTP_PORT="${DYNAMO_HTTP_PORT:-8000}"
MODEL="/workspace/models/Llama-3.3-70B-Instruct"
SERVED_MODEL_NAME="${DYNAMO_MODEL_NAME:-llama-3.3-70b}"
IMAGE="nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.7.1"
diff --git a/external/dynamo/stop_dynamo.sh b/external/dynamo/stop_dynamo.sh
index 231eeb8c90..5afdc83edc 100755
--- a/external/dynamo/stop_dynamo.sh
+++ b/external/dynamo/stop_dynamo.sh
@@ -14,31 +14,73 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-# Dynamo SGLang Shutdown Script
-# Stops all components: Dynamo worker container, ETCD, and NATS
+# Dynamo Shutdown Script
+# Stops all components: Dynamo worker container (SGLang or vLLM), ETCD, and NATS
# Works for: UNIFIED, THOMPSON SAMPLING, and DISAGGREGATED modes
+# Supports both SGLang and vLLM backends
+#
+# Usage:
+# bash stop_dynamo.sh # Stop Dynamo, ETCD, NATS only
+# bash stop_dynamo.sh --kill-metrics # Also stop Prometheus and Grafana
+# bash stop_dynamo.sh --clear-metrics # Stop monitoring stack AND remove Prometheus data volume
+
+# Parse command line arguments
+KILL_METRICS=false
+CLEAR_METRICS=false
+for arg in "$@"; do
+ case $arg in
+ --kill-metrics)
+ KILL_METRICS=true
+ shift
+ ;;
+ --clear-metrics)
+ KILL_METRICS=true
+ CLEAR_METRICS=true
+ shift
+ ;;
+ -h|--help)
+ echo "Usage: bash stop_dynamo.sh [OPTIONS]"
+ echo ""
+ echo "Options:"
+ echo " --kill-metrics Also stop Prometheus and Grafana containers"
+ echo " --clear-metrics Stop monitoring stack AND remove Prometheus data volume (clears old metrics)"
+ echo " -h, --help Show this help message"
+ exit 0
+ ;;
+ esac
+done
echo "========================================================="
-echo "Stopping Dynamo SGLang FULL STACK"
+echo "Stopping Dynamo FULL STACK (SGLang/vLLM)"
echo "========================================================="
echo ""
-# Stop Dynamo containers (check for both standard and thompson variants)
+# Stop Dynamo containers (check for SGLang and vLLM variants)
STOPPED_CONTAINER=false
+# SGLang containers
if docker ps --format '{{.Names}}' | grep -q "^dynamo-sglang$"; then
- echo "Stopping Dynamo container (standard)..."
+ echo "Stopping Dynamo container (SGLang)..."
docker stop dynamo-sglang
docker rm dynamo-sglang
- echo "✓ Dynamo container stopped and removed"
+ echo "✓ Dynamo SGLang container stopped and removed"
STOPPED_CONTAINER=true
fi
if docker ps --format '{{.Names}}' | grep -q "^dynamo-sglang-thompson$"; then
- echo "Stopping Dynamo container (Thompson Sampling)..."
+ echo "Stopping Dynamo container (SGLang Thompson Sampling)..."
docker stop dynamo-sglang-thompson
docker rm dynamo-sglang-thompson
- echo "✓ Dynamo Thompson container stopped and removed"
+ echo "✓ Dynamo SGLang Thompson container stopped and removed"
+ STOPPED_CONTAINER=true
+fi
+
+# vLLM containers
+if docker ps --format '{{.Names}}' | grep -q "^dynamo-vllm$"; then
+ echo "Stopping Dynamo container (vLLM)..."
+ docker stop dynamo-vllm
+ docker rm dynamo-vllm
+ echo "✓ Dynamo vLLM container stopped and removed"
STOPPED_CONTAINER=true
fi
@@ -68,13 +110,57 @@ else
echo " (NATS container not running)"
fi
+# Stop monitoring stack if --kill-metrics flag is set
+if [ "$KILL_METRICS" = true ]; then
+ echo ""
+ echo "========================================================="
+ echo "Stopping Monitoring Stack (--kill-metrics)"
+ echo "========================================================="
+
+ # Stop Prometheus
+ if docker ps --format '{{.Names}}' | grep -q "^dynamo-prometheus$"; then
+ echo ""
+ echo "Stopping Prometheus container..."
+ docker stop dynamo-prometheus
+ docker rm dynamo-prometheus
+ echo "✓ Prometheus container stopped and removed"
+ else
+ echo " (Prometheus container not running)"
+ fi
+
+ # Stop Grafana
+ if docker ps --format '{{.Names}}' | grep -q "^dynamo-grafana$"; then
+ echo ""
+ echo "Stopping Grafana container..."
+ docker stop dynamo-grafana
+ docker rm dynamo-grafana
+ echo "✓ Grafana container stopped and removed"
+ else
+ echo " (Grafana container not running)"
+ fi
+
+ # Clear Prometheus data volume if --clear-metrics flag is set
+ if [ "$CLEAR_METRICS" = true ]; then
+ echo ""
+ echo "Clearing Prometheus data volume..."
+        if docker volume rm monitoring_prometheus_data >/dev/null 2>&1; then
+            echo "✓ Prometheus data volume removed (old metrics cleared)"
+        else
+            echo "  (Prometheus data volume not found or still in use)"
+        fi
+ fi
+fi
+
echo ""
echo "========================================================="
echo "✓ All components stopped!"
+if [ "$KILL_METRICS" = true ]; then
+ echo " (including monitoring stack)"
+fi
+if [ "$CLEAR_METRICS" = true ]; then
+ echo " (Prometheus data volume cleared)"
+fi
echo "========================================================="
echo ""
echo "To restart:"
echo " Standard Unified: bash start_dynamo_unified.sh"
-echo " Thompson Sampling: bash start_dynamo_unified_thompson_hints.sh"
+echo " SGLang Thompson: bash start_dynamo_optimized_thompson_hints_sglang.sh"
+echo " vLLM Thompson: bash start_dynamo_optimized_thompson_hints_vllm.sh"
echo ""
diff --git a/src/nat/data_models/profiler.py b/src/nat/data_models/profiler.py
index cb0ed64544..3c65db8f12 100644
--- a/src/nat/data_models/profiler.py
+++ b/src/nat/data_models/profiler.py
@@ -14,6 +14,7 @@
# limitations under the License.
from pydantic import BaseModel
+from pydantic import Field
class PromptCachingConfig(BaseModel):
@@ -40,6 +41,188 @@ class PrefixSpanConfig(BaseModel):
chain_with_common_prefixes: bool = False
+class DynamoMetricsConfig(BaseModel):
+ """
+ Configuration for collecting Dynamo inference stack metrics.
+
+ Core Optimization Metrics
+ -------------------------
+ The profiler focuses on three core metrics for Dynamo LLM optimization:
+
+ 1. **KV Efficiency (KVE)** (``collect_kv_cache``):
+ Token-agnostic measure of computational work saved via KV cache.
+ Formula: ``KVE = cached_tokens / prompt_tokens``
+ A KVE of 0.8 means 80% of prompt tokens were served from cache.
+ Affected by prefix routing hints (prefix_id, prefix_osl, prefix_iat).
+
+ 2. **Time to First Token - TTFT** (``collect_ttft``):
+ Latency from request to first token. Lower = faster initial response.
+ Affected by queue depth, worker selection, KV cache hits.
+
+ 3. **Inter-Token Latency - ITL** (``collect_itl``):
+ Time between tokens during streaming. Lower = smoother streaming.
+ Affected by batch scheduling, GPU utilization.
+
+ To collect only core metrics for optimization, use::
+
+ config = DynamoMetricsConfig.core_metrics_only()
+
+ Dynamo Endpoints
+ ----------------
+ - Frontend (:8000/metrics): Latency, throughput, token stats
+ - Worker (:8081/metrics): KV cache, SGLang stats
+ - Router (:8082/metrics): Thompson Sampling routing
+ - Processor (:8083/metrics): Thompson Sampling KVE
+
+ Adding New Metrics
+ ------------------
+ To add metrics from any Dynamo endpoint:
+
+ 1. **Identify the metric** from the endpoint::
+
+ curl localhost:8081/metrics | grep kv
+
+ 2. **Add to DynamoMetricsResult** in ``src/nat/profiler/inference_optimization/dynamo_metrics.py``:
+ - Add a new field to the Pydantic model
+ - Add the Prometheus query in ``METRIC_QUERIES``
+
+ 3. **Example - Adding a new metric**::
+
+ # In dynamo_metrics.py METRIC_QUERIES dict:
+ "my_new_metric": "rate(dynamo_component_my_metric_total[5m])"
+
+ # In DynamoMetricsResult model:
+ my_new_metric: float | None = Field(default=None, description="My new metric")
+
+ Metric Reference by Endpoint
+ ----------------------------
+ - **Frontend (:8000)**: ``dynamo_frontend_*`` (requests, latency, tokens)
+ - **Worker (:8081)**: ``dynamo_component_kvstats_*``, ``sglang:*`` (KV cache, SGLang)
+ - **Router (:8082)**: ``dynamo_component_*`` with ``dynamo_component="router"`` label
+ - **Processor (:8083)**: ``dynamo_component_thompson_*`` (Thompson Sampling)
+
+ See ``external/dynamo/monitoring/README.md`` for the complete metrics reference.
+ """
+
+ enable: bool = Field(default=False, description="Enable Dynamo metrics collection")
+
+ prometheus_url: str = Field(
+ default="http://localhost:9090",
+ description="Prometheus server URL for querying Dynamo metrics",
+ )
+
+ # =========================================================================
+ # CORE OPTIMIZATION METRICS (Primary targets)
+ # =========================================================================
+ collect_kv_cache: bool = Field(
+ default=True,
+ description="[CORE] Collect KV Efficiency (KVE = cached_tokens/prompt_tokens) - "
+ "primary metric for prefix caching optimization. Measures fraction of work saved.",
+ )
+ collect_ttft: bool = Field(
+ default=True,
+ description="[CORE] Collect Time to First Token (P50/P95/P99) - primary latency metric",
+ )
+ collect_itl: bool = Field(
+ default=True,
+ description="[CORE] Collect Inter-Token Latency (P50/P95/P99) - primary streaming metric",
+ )
+
+ # =========================================================================
+ # SUPPLEMENTARY METRICS (Context and diagnostics)
+ # =========================================================================
+ collect_inflight_requests: bool = Field(
+ default=True,
+ description="Collect current inflight requests across components",
+ )
+ collect_throughput: bool = Field(
+ default=True,
+ description="Collect requests per minute throughput",
+ )
+ collect_token_throughput: bool = Field(
+ default=True,
+ description="Collect token generation throughput (tokens/sec)",
+ )
+
+ # Query time range for rate calculations
+ query_range: str = Field(
+ default="30s",
+ description="Time range for rate calculations in Prometheus queries. "
+ "Minimum: '15s' (Prometheus scrapes every 5s, need ≥3 points for reliable rates). "
+ "Options: '15s', '30s' (default), '1m', '2m', '5m'. "
+ "Should roughly match experiment duration. Too short = noisy. Too long = stale data included.",
+ )
+
+ # Historical lookback for range queries (set automatically from workflow duration if 0)
+ lookback_seconds: float = Field(
+ default=0.0,
+ description="Lookback time in seconds for Prometheus range queries when instant queries return no data. "
+ "If 0 (default), will be set automatically to the workflow duration + buffer. "
+ "This allows capturing TTFT/ITL metrics from the entire eval run, even after the workflow completes.",
+ )
+
+ # Workflow time window (set automatically by profiler)
+ workflow_start_timestamp: float | None = Field(
+ default=None,
+ description="Unix timestamp when the workflow started (set automatically by profiler). "
+ "Used for precise range query time windows.",
+ )
+ workflow_end_timestamp: float | None = Field(
+ default=None,
+ description="Unix timestamp when the workflow ended (set automatically by profiler). "
+ "Used for precise range query time windows to isolate metrics to this eval run.",
+ )
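+
+    # Illustrative: these two timestamps map onto Prometheus's standard range-query
+    # API, e.g. GET /api/v1/query_range?query=<q>&start=<start>&end=<end>&step=5
+    # (a 5s step matching the scrape interval noted below is an assumption).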
+
+ @classmethod
+ def core_metrics_only(
+ cls,
+ prometheus_url: str = "http://localhost:9090",
+ query_range: str = "30s",
+ ) -> "DynamoMetricsConfig":
+ """
+ Create a config that collects only the three core optimization metrics.
+
+ This is optimized for tight optimization loops where you only need:
+ - KV Cache Efficiency
+ - TTFT (Time to First Token)
+ - ITL (Inter-Token Latency)
+
+ Args:
+ prometheus_url: Prometheus server URL
+ query_range: Time range for rate calculations
+
+ Returns:
+ DynamoMetricsConfig with only core metrics enabled
+
+ Usage::
+
+ config = DynamoMetricsConfig.core_metrics_only()
+ # Equivalent to:
+ # DynamoMetricsConfig(
+ # enable=True,
+ # collect_kv_cache=True,
+ # collect_ttft=True,
+ # collect_itl=True,
+ # collect_inflight_requests=False,
+ # collect_throughput=False,
+ # collect_token_throughput=False,
+ # )
+ """
+ return cls(
+ enable=True,
+ prometheus_url=prometheus_url,
+ query_range=query_range,
+ # Core metrics
+ collect_kv_cache=True,
+ collect_ttft=True,
+ collect_itl=True,
+ # Disable supplementary metrics
+ collect_inflight_requests=False,
+ collect_throughput=False,
+ collect_token_throughput=False,
+ )
+
+
class ProfilerConfig(BaseModel):
base_metrics: bool = False
@@ -52,3 +235,4 @@ class ProfilerConfig(BaseModel):
bottleneck_analysis: BottleneckConfig = BottleneckConfig()
concurrency_spike_analysis: ConcurrencySpikeConfig = ConcurrencySpikeConfig()
prefix_span_analysis: PrefixSpanConfig = PrefixSpanConfig()
+ dynamo_metrics: DynamoMetricsConfig = DynamoMetricsConfig()
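+
+# Illustrative YAML fragment enabling Dynamo metrics through this model; the
+# top-level "profiler" key is an assumption about where ProfilerConfig is mounted:
+#
+#   profiler:
+#     dynamo_metrics:
+#       enable: true
+#       prometheus_url: http://localhost:9090
+#       query_range: 30s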
diff --git a/src/nat/llm/dynamo_llm.py b/src/nat/llm/dynamo_llm.py
index 79667e106b..dc0c4a3058 100644
--- a/src/nat/llm/dynamo_llm.py
+++ b/src/nat/llm/dynamo_llm.py
@@ -13,15 +13,29 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
-Dynamo LLM provider with automatic prefix header injection for KV cache optimization.
+Dynamo LLM provider with automatic prefix injection for KV cache optimization.
-This module provides a specialized OpenAI-compatible LLM that sends Dynamo prefix headers
+This module provides a specialized OpenAI-compatible LLM that sends Dynamo prefix hints
for optimal KV cache management and request routing. The prefix parameters are optimizable
via the NAT optimizer.
-The implementation uses httpx event hooks to inject headers at the HTTP transport level,
+The implementation uses httpx event hooks to inject hints at the HTTP transport level,
making it framework-agnostic (works with LangChain, LlamaIndex, etc.).
+Transport Mechanisms
+--------------------
+
+This module supports two transport mechanisms for routing hints, used simultaneously
+for maximum compatibility:
+
+1. **HTTP Headers** (``x-prefix-*``): For the generalized Thompson Sampling setup
+ that uses custom ``frontend.py`` which reads headers directly.
+ *DEPRECATED: Will be removed when start_dynamo_unified_thompson_hints.sh is retired.*
+
+2. **nvext.annotations** (in request body): For the optimized Thompson Sampling setup
+ that uses the default Dynamo frontend with custom ``processor.py`` which reads
+ annotations from the preprocessed request. *This is the preferred mechanism.*
+
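+Example request body after injection (illustrative; the generated uuid suffix
+is abbreviated)::
+
+    {
+      "model": "llama-3.3-70b",
+      "messages": [{"role": "user", "content": "Hello!"}],
+      "nvext": {
+        "annotations": ["prefix_id:nat-dynamo-<uuid>", "total_requests:8",
+                        "osl:MEDIUM", "iat:LOW"]
+      }
+    }
+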
Dynamo Prefix Parameters
-------------------------
@@ -46,15 +60,13 @@
- Lower values allow more load balancing
"""
+import json
import logging
import uuid
-from collections.abc import Callable
-from collections.abc import Coroutine
from collections.abc import Iterator
from contextlib import contextmanager
from contextvars import ContextVar
from typing import TYPE_CHECKING
-from typing import Any
from typing import Literal
if TYPE_CHECKING:
@@ -164,14 +176,20 @@ def scope(cls, prefix_id: str) -> Iterator[None]:
class DynamoModelConfig(OpenAIModelConfig, name="dynamo"):
"""
- A Dynamo LLM provider with automatic prefix header injection for KV cache optimization.
+ A Dynamo LLM provider with automatic prefix hint injection for KV cache optimization.
- This is a specialized OpenAI-compatible LLM that sends Dynamo prefix headers
- for optimal KV cache management and request routing. Prefix headers are enabled
+ This is a specialized OpenAI-compatible LLM that sends Dynamo prefix hints
+ for optimal KV cache management and request routing. Prefix hints are enabled
by default using the template "nat-dynamo-{uuid}". The prefix routing parameters
(prefix_total_requests, prefix_osl, prefix_iat) are optimizable via the NAT optimizer.
- To disable prefix headers, set prefix_template to null/None in your config.
+ Hints are sent via both HTTP headers (``x-prefix-*``) and ``nvext.annotations``
+ in the request body for compatibility with different Dynamo setups:
+
+ - **Generalized Thompson Sampling** (custom frontend.py): Reads HTTP headers
+ - **Optimized Thompson Sampling** (default frontend + processor.py): Reads nvext.annotations
+
+ To disable prefix hints, set prefix_template to null/None in your config.
"""
# =========================================================================
@@ -246,72 +264,99 @@ def get_dynamo_field_names() -> frozenset[str]:
# =============================================================================
-# HTTPX EVENT HOOK FOR HEADER INJECTION
+# CUSTOM TRANSPORT FOR DYNAMO HINT INJECTION
# =============================================================================
-def _create_dynamo_request_hook(
- prefix_template: str | None,
- total_requests: int,
- osl: str,
- iat: str,
-) -> Callable[["httpx.Request"], Coroutine[Any, Any, None]]:
+class _DynamoTransport:
"""
- Create an httpx event hook that injects Dynamo prefix headers into requests.
-
- This hook is called before each HTTP request is sent, allowing us to inject
- headers dynamically. The prefix ID is generated ONCE when the hook is created,
- ensuring all requests from the same client share the same prefix ID. This enables
- Dynamo's KV cache optimization across multi-turn conversations.
-
- The context variable can override this for scenarios where you need different
- prefix IDs (e.g., per-question in batch evaluation).
+ Custom transport wrapper that injects nvext.annotations into request bodies.
- Args:
- prefix_template: Template string with {uuid} placeholder
- total_requests: Expected number of requests for this prefix
- osl: Output sequence length hint (LOW/MEDIUM/HIGH)
- iat: Inter-arrival time hint (LOW/MEDIUM/HIGH)
-
- Returns:
- An async function suitable for use as an httpx event hook.
+ This approach is more reliable than using event hooks because it modifies
+ the request BEFORE httpx's internal state machine processes it.
"""
- # Generate the default prefix ID ONCE when the hook is created
- # This ensures all requests from this client share the same prefix ID
- unique_id = uuid.uuid4().hex[:16]
- if prefix_template:
- default_prefix_id = prefix_template.format(uuid=unique_id)
- else:
- default_prefix_id = f"nat-dynamo-{unique_id}"
- logger.debug("Created Dynamo request hook with default prefix ID: %s", default_prefix_id)
+ def __init__(
+ self,
+ transport: "httpx.AsyncBaseTransport",
+ prefix_id: str,
+ total_requests: int,
+ osl: str,
+ iat: str,
+ ):
+ self._transport = transport
+ self._prefix_id = prefix_id
+ self._total_requests = total_requests
+ self._osl = osl.upper()
+ self._iat = iat.upper()
+
+ async def handle_async_request(self, request: "httpx.Request") -> "httpx.Response":
+ import httpx
- async def on_request(request):
- """Inject Dynamo prefix headers before each request."""
# Check context variable first (allows per-question override in batch evaluation)
context_prefix_id = DynamoPrefixContext.get()
-
- if context_prefix_id:
- prefix_id = context_prefix_id
- logger.debug("Using context prefix ID: %s", prefix_id)
- else:
- # Use the pre-generated prefix ID (same for all requests from this client)
- prefix_id = default_prefix_id
- logger.debug("Using default prefix ID: %s", prefix_id)
-
- # Inject Dynamo headers
- request.headers["x-prefix-id"] = prefix_id
- request.headers["x-prefix-total-requests"] = str(total_requests)
- request.headers["x-prefix-osl"] = osl.upper()
- request.headers["x-prefix-iat"] = iat.upper()
-
- logger.debug("Injected Dynamo headers: prefix_id=%s, total_requests=%d, osl=%s, iat=%s",
- prefix_id,
- total_requests,
- osl.upper(),
- iat.upper())
-
- return on_request
+ prefix_id = context_prefix_id if context_prefix_id else self._prefix_id
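+        # Illustrative per-request override using the DynamoPrefixContext.scope
+        # context manager defined earlier in this module, e.g. in batch evaluation:
+        #     with DynamoPrefixContext.scope("eval-question-42"):
+        #         ...  # requests issued inside carry prefix_id "eval-question-42"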
+
+ # Add HTTP headers (for generalized setup compatibility)
+ headers = dict(request.headers)
+ headers["x-prefix-id"] = prefix_id
+ headers["x-prefix-total-requests"] = str(self._total_requests)
+ headers["x-prefix-osl"] = self._osl
+ headers["x-prefix-iat"] = self._iat
+
+ # Modify body if it's a POST request with JSON content
+ content = request.content
+ if request.method == "POST" and content:
+ try:
+ body = json.loads(content.decode("utf-8"))
+ if isinstance(body, dict):
+ # Build annotations list
+ annotations = [
+ f"prefix_id:{prefix_id}",
+ f"total_requests:{self._total_requests}",
+ f"osl:{self._osl}",
+ f"iat:{self._iat}",
+ ]
+
+ # Add/merge nvext.annotations
+ if "nvext" not in body:
+ body["nvext"] = {}
+ if not isinstance(body["nvext"], dict):
+ body["nvext"] = {}
+
+ existing = body["nvext"].get("annotations", [])
+ if not isinstance(existing, list):
+ existing = []
+
+ # Our annotations take precedence
+ body["nvext"]["annotations"] = annotations + [
+ a for a in existing
+ if not any(a.startswith(f"{key}:") for key in ["prefix_id", "total_requests", "osl", "iat"])
+ ]
+
+ # Re-encode
+ content = json.dumps(body).encode("utf-8")
+ headers["content-length"] = str(len(content))
+
+ logger.debug("Injected nvext.annotations: %s (body size: %d bytes)",
+ body["nvext"]["annotations"],
+ len(content))
+ except (json.JSONDecodeError, UnicodeDecodeError) as e:
+ logger.debug("Could not inject nvext.annotations: %s", e)
+
+ # Create a new request with modified headers and content
+ new_request = httpx.Request(
+ method=request.method,
+ url=request.url,
+ headers=headers,
+ content=content,
+ extensions=request.extensions,
+ )
+
+ return await self._transport.handle_async_request(new_request)
+
+ async def aclose(self):
+ await self._transport.aclose()
def create_httpx_client_with_dynamo_hooks(
@@ -322,10 +367,16 @@ def create_httpx_client_with_dynamo_hooks(
timeout: float = 600.0,
) -> "httpx.AsyncClient":
"""
- Create an httpx.AsyncClient with Dynamo prefix header injection.
+ Create an httpx.AsyncClient with Dynamo prefix hint injection.
- This client can be passed to the OpenAI SDK to inject headers at the HTTP level,
- making it framework-agnostic.
+ This client can be passed to the OpenAI SDK to inject hints at the HTTP level,
+ making it framework-agnostic. Hints are injected via both HTTP headers and
+ nvext.annotations in the request body for maximum compatibility with different
+ Dynamo setups:
+
+ - **Generalized setup** (custom frontend.py): Reads ``x-prefix-*`` HTTP headers
+ - **Optimized setup** (default frontend + custom processor.py): Reads
+ ``nvext.annotations`` from the request body
Args:
prefix_template: Template string with {uuid} placeholder
@@ -335,14 +386,31 @@ def create_httpx_client_with_dynamo_hooks(
timeout: HTTP request timeout in seconds
Returns:
- An httpx.AsyncClient configured with Dynamo header injection.
+ An httpx.AsyncClient configured with Dynamo hint injection.
"""
import httpx
- request_hook = _create_dynamo_request_hook(prefix_template, total_requests, osl, iat)
+ # Generate the prefix ID once
+ unique_id = uuid.uuid4().hex[:16]
+ if prefix_template:
+ prefix_id = prefix_template.format(uuid=unique_id)
+ else:
+ prefix_id = f"nat-dynamo-{unique_id}"
+
+ logger.debug("Created Dynamo client with prefix ID: %s", prefix_id)
+
+ # Create a base transport and wrap it with our custom transport
+ base_transport = httpx.AsyncHTTPTransport()
+ dynamo_transport = _DynamoTransport(
+ transport=base_transport,
+ prefix_id=prefix_id,
+ total_requests=total_requests,
+ osl=osl,
+ iat=iat,
+ )
return httpx.AsyncClient(
- event_hooks={"request": [request_hook]},
+ transport=dynamo_transport,
timeout=httpx.Timeout(timeout),
)
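+
+
+# Illustrative usage: the returned client can be handed to the OpenAI SDK, which
+# accepts a custom http_client; the parameter values below are placeholders.
+#
+#     from openai import AsyncOpenAI
+#     client = AsyncOpenAI(
+#         base_url="http://localhost:8000/v1",
+#         api_key="dummy",
+#         http_client=create_httpx_client_with_dynamo_hooks(
+#             prefix_template="nat-dynamo-{uuid}",
+#             total_requests=8,
+#             osl="MEDIUM",
+#             iat="LOW",
+#         ),
+#     )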
diff --git a/src/nat/profiler/inference_optimization/dynamo_metrics.py b/src/nat/profiler/inference_optimization/dynamo_metrics.py
new file mode 100644
index 0000000000..d0f6514b5d
--- /dev/null
+++ b/src/nat/profiler/inference_optimization/dynamo_metrics.py
@@ -0,0 +1,1082 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Dynamo Metrics Collector for NAT Profiler.
+
+This module collects performance metrics from the Dynamo inference stack via Prometheus.
+Metrics are collected from four Dynamo components:
+
+- **Frontend** (:8000): User-facing latency, throughput, token statistics
+- **Worker** (:8081): KV cache utilization, SGLang backend metrics
+- **Router** (:8082): Thompson Sampling routing decisions
+- **Processor** (:8083): Thompson Sampling KVE (KV Efficiency) metrics
+
+Core Optimization Metrics
+-------------------------
+
+The profiler focuses on three core metrics for Dynamo LLM optimization:
+
+1. **KV Efficiency (KVE)** - Token-agnostic measure of computational savings:
+
+ - Formula: ``KVE = cached_tokens / prompt_tokens``
+ - Measures the fraction of total work saved via KV cache reuse
+ - A KVE of 0.8 means 80% of prompt tokens were served from cache
+ - Source: Thompson Sampling processor (``dynamo_component_thompson_kve_*``)
+ - Fallback: SGLang native ``cache_hit_rate`` if KVE counters unavailable
+ - Affected by: prefix_id routing, prefix hints (osl, iat), request patterns
+
+2. **Time to First Token (TTFT)** (``ttft_p50``, ``ttft_p95``, ``ttft_p99``):
+
+ - Latency from request arrival to first token generation
+ - Critical for user-perceived responsiveness
+ - Affected by queue depth, worker selection, KV cache hits
+
+3. **Inter-Token Latency (ITL)** (``itl_p50``, ``itl_p95``, ``itl_p99``):
+
+ - Time between consecutive token generations during streaming
+ - Affects smoothness of streaming responses
+ - Influenced by batch scheduling and GPU utilization
+
+Adding New Metrics
+------------------
+
+To add a new metric from any Dynamo endpoint:
+
+1. **Find the metric name** by curling the endpoint::
+
+ curl -s http://localhost:8081/metrics | grep -i kv
+ curl -s http://localhost:8000/metrics | grep -i token
+
+2. **Add the Prometheus query** to ``METRIC_QUERIES``::
+
+ METRIC_QUERIES = {
+ ...
+ "my_new_metric": "rate(dynamo_component_my_metric_total[{range}])",
+ }
+
+ Note: Use ``{range}`` placeholder for time range (replaced with config value).
+
+3. **Add the field** to ``DynamoMetricsResult``::
+
+ class DynamoMetricsResult(BaseModel):
+ ...
+ my_new_metric: float | None = Field(
+ default=None,
+ description="Description of my new metric"
+ )
+
+4. **Update the collector** if needed (optional - for complex metrics):
+
+ If the metric requires special handling (e.g., combining multiple queries),
+ add custom logic in ``DynamoMetricsCollector.collect()``.
+
+Metric Reference by Endpoint
+----------------------------
+
+**Frontend (:8000/metrics)**::
+
+ dynamo_frontend_requests_total # Counter: Total requests
+ dynamo_frontend_inflight_requests # Gauge: Current inflight
+ dynamo_frontend_time_to_first_token_seconds_bucket # Histogram: TTFT
+ dynamo_frontend_inter_token_latency_seconds_bucket # Histogram: ITL
+ dynamo_frontend_output_tokens_total # Counter: Total output tokens
+
+**Worker (:8081/metrics)**::
+
+ dynamo_component_kvstats_gpu_cache_usage_percent # Gauge: KV cache %
+ dynamo_component_kvstats_gpu_prefix_cache_hit_rate # Gauge: Cache hit rate
+ sglang:cache_hit_rate # Gauge: SGLang native cache hit
+ sglang:gen_throughput # Gauge: Generation throughput
+ sglang:num_running_reqs # Gauge: Running requests
+ sglang:num_queue_reqs # Gauge: Queued requests
+
+**Router (:8082/metrics)**::
+
+ dynamo_component_requests_total{dynamo_endpoint="find_worker"}
+ dynamo_component_request_duration_seconds_bucket
+
+**Processor (:8083/metrics)**::
+
+ dynamo_component_thompson_requests_total
+ dynamo_component_thompson_kve_cached_tokens_total
+ dynamo_component_thompson_kve_prompt_tokens_total
+ dynamo_component_thompson_routing_decisions_total
+
+See ``external/dynamo/monitoring/README.md`` for the complete metrics reference.
+"""
+
+import logging
+import math
+import time
+from typing import Any
+
+import httpx
+from pydantic import BaseModel
+from pydantic import Field
+
+from nat.data_models.profiler import DynamoMetricsConfig
+
+logger = logging.getLogger(__name__)
+
+# =============================================================================
+# PROMETHEUS QUERY DEFINITIONS
+# =============================================================================
+
+# Metric queries using Prometheus query language (PromQL).
+# Use {range} placeholder for time range substitution.
+#
+# To add a new metric:
+# 1. Add the query here with a descriptive key
+# 2. Add corresponding field to DynamoMetricsResult
+# 3. The collector will automatically fetch and populate it
+METRIC_QUERIES: dict[str, str] = {
+ # -------------------------------------------------------------------------
+ # Inflight Requests (Gauge metrics - no rate needed)
+ # -------------------------------------------------------------------------
+ "inflight_requests_frontend": "dynamo_frontend_inflight_requests",
+ "inflight_requests_worker": "dynamo_component_inflight_requests",
+ "queued_requests": "dynamo_frontend_queued_requests",
+
+ # -------------------------------------------------------------------------
+ # Throughput (Rate metrics)
+ # -------------------------------------------------------------------------
+ "requests_per_minute": "rate(dynamo_frontend_requests_total[{range}]) * 60",
+ "token_throughput": "rate(dynamo_frontend_output_tokens_total[{range}])",
+
+ # -------------------------------------------------------------------------
+ # Time to First Token (TTFT) - Histogram quantiles
+ # -------------------------------------------------------------------------
+ "ttft_p50": "histogram_quantile(0.50, rate(dynamo_frontend_time_to_first_token_seconds_bucket[{range}]))",
+ "ttft_p95": "histogram_quantile(0.95, rate(dynamo_frontend_time_to_first_token_seconds_bucket[{range}]))",
+ "ttft_p99": "histogram_quantile(0.99, rate(dynamo_frontend_time_to_first_token_seconds_bucket[{range}]))",
+
+ # -------------------------------------------------------------------------
+ # Inter-Token Latency (ITL) - Histogram quantiles
+ # -------------------------------------------------------------------------
+ "itl_p50": "histogram_quantile(0.50, rate(dynamo_frontend_inter_token_latency_seconds_bucket[{range}]))",
+ "itl_p95": "histogram_quantile(0.95, rate(dynamo_frontend_inter_token_latency_seconds_bucket[{range}]))",
+ "itl_p99": "histogram_quantile(0.99, rate(dynamo_frontend_inter_token_latency_seconds_bucket[{range}]))",
+
+ # -------------------------------------------------------------------------
+ # KV Cache Metrics (Gauge metrics)
+ # -------------------------------------------------------------------------
+ "kv_cache_usage_percent": "dynamo_component_kvstats_gpu_cache_usage_percent",
+ "kv_cache_hit_rate_sglang": "sglang:cache_hit_rate", # SGLang native (fallback)
+ "kv_cache_hit_rate_dynamo": "dynamo_component_kvstats_gpu_prefix_cache_hit_rate",
+
+ # -------------------------------------------------------------------------
+ # KV Efficiency (KVE) - TRUE efficiency metric from Thompson Sampling processor
+ # KVE = cached_tokens / prompt_tokens (fraction of work saved)
+ # This is token-agnostic and measures actual computational savings
+ # -------------------------------------------------------------------------
+ "kve_cached_tokens_rate": "rate(dynamo_component_thompson_kve_cached_tokens_total[{range}])",
+ "kve_prompt_tokens_rate": "rate(dynamo_component_thompson_kve_prompt_tokens_total[{range}])",
+ # Block-level KVE metrics for deeper analysis
+ "kve_device_blocks_rate": "rate(dynamo_component_thompson_kve_device_blocks_total[{range}])",
+ "kve_host_blocks_rate": "rate(dynamo_component_thompson_kve_host_blocks_total[{range}])",
+ "kve_disk_blocks_rate": "rate(dynamo_component_thompson_kve_disk_blocks_total[{range}])",
+
+ # -------------------------------------------------------------------------
+ # SGLang Worker Metrics (Gauge metrics)
+ # -------------------------------------------------------------------------
+ "sglang_running_requests": "sglang:num_running_reqs",
+ "sglang_queue_depth": "sglang:num_queue_reqs",
+ "sglang_gen_throughput": "sglang:gen_throughput",
+ "sglang_utilization": "sglang:utilization",
+
+ # -------------------------------------------------------------------------
+ # Thompson Sampling Metrics (Rate metrics)
+ # -------------------------------------------------------------------------
+ "thompson_routing_decisions_rate": "rate(dynamo_component_thompson_routing_decisions_total[{range}])",
+ "thompson_requests_rate": "rate(dynamo_component_thompson_requests_total[{range}])",
+}
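+
+# Illustrative query flow: format the {range} placeholder, then hit Prometheus's
+# standard instant-query API (GET /api/v1/query). The real collector presumably
+# adds error handling; this sketch only shows the shape of the exchange.
+#
+#     query = METRIC_QUERIES["ttft_p95"].format(range="30s")
+#     resp = httpx.get("http://localhost:9090/api/v1/query", params={"query": query})
+#     result = resp.json()["data"]["result"]          # [] when no samples in range
+#     value = float(result[0]["value"][1]) if result else None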
+
+# =============================================================================
+# DATA MODELS
+# =============================================================================
+
+
+class DynamoCoreMetrics(BaseModel):
+ """
+ Core optimization metrics for Dynamo LLM inference.
+
+ These three metrics are the primary targets for optimization:
+
+ 1. **KV Efficiency (KVE)**: Fraction of computational work saved via KV cache.
+ - Formula: ``cached_tokens / prompt_tokens``
+ - Target: Maximize (closer to 1.0 = more work saved)
+ - Affected by: prefix_id routing, prefix hints (osl, iat), request patterns
+ - Token-agnostic measure of actual computational savings
+
+ 2. **TTFT (Time to First Token)**: User-perceived initial latency.
+ - Target: Minimize (lower is better)
+ - Affected by: queue depth, worker selection, KV cache hits
+
+ 3. **ITL (Inter-Token Latency)**: Streaming smoothness.
+ - Target: Minimize (lower is better)
+ - Affected by: batch scheduling, GPU utilization, memory bandwidth
+
+ Usage::
+
+ result = await collector.collect()
+ core = result.get_core_metrics()
+
+ print(f"KV Efficiency: {core.kv_efficiency:.2%}")
+ print(f"TTFT P95: {core.ttft_p95_seconds:.3f}s")
+ print(f"ITL P95: {core.itl_p95_seconds:.3f}s")
+
+ # Check if all core metrics are available
+ if core.is_complete():
+ print("All core metrics collected successfully")
+ """
+
+ # -------------------------------------------------------------------------
+ # KV Efficiency - KVE (CORE METRIC #1)
+ # Goal: MAXIMIZE - Higher efficiency = more computational work saved
+ # Formula: cached_tokens / prompt_tokens
+ # -------------------------------------------------------------------------
+ kv_efficiency: float | None = Field(
+ default=None,
+ description="KV Efficiency (0-1): fraction of prompt tokens served from cache. "
+ "Computed as cached_tokens / prompt_tokens from Thompson Sampling processor. "
+ "Higher values indicate more computational work saved via KV cache reuse. "
+ "This is the PRIMARY metric affected by prefix routing hints (prefix_id, prefix_osl, prefix_iat).",
+ )
+ kv_efficiency_fallback: float | None = Field(
+ default=None,
+ description="Fallback KV efficiency from SGLang native cache_hit_rate. "
+ "Used when Thompson Sampling KVE counters are unavailable.",
+ )
+
+ # -------------------------------------------------------------------------
+ # Time to First Token - TTFT (CORE METRIC #2)
+ # Goal: MINIMIZE - Lower latency = faster initial response
+ # -------------------------------------------------------------------------
+ ttft_p50_seconds: float | None = Field(
+ default=None,
+ description="Time to First Token - 50th percentile (median) in seconds",
+ )
+ ttft_p95_seconds: float | None = Field(
+ default=None,
+ description="Time to First Token - 95th percentile in seconds. "
+ "Primary latency target for optimization.",
+ )
+ ttft_p99_seconds: float | None = Field(
+ default=None,
+ description="Time to First Token - 99th percentile in seconds (tail latency)",
+ )
+
+ # -------------------------------------------------------------------------
+ # Inter-Token Latency - ITL (CORE METRIC #3)
+ # Goal: MINIMIZE - Lower latency = smoother streaming
+ # -------------------------------------------------------------------------
+ itl_p50_seconds: float | None = Field(
+ default=None,
+ description="Inter-Token Latency - 50th percentile (median) in seconds",
+ )
+ itl_p95_seconds: float | None = Field(
+ default=None,
+ description="Inter-Token Latency - 95th percentile in seconds. "
+ "Primary streaming smoothness target.",
+ )
+ itl_p99_seconds: float | None = Field(
+ default=None,
+ description="Inter-Token Latency - 99th percentile in seconds (tail latency)",
+ )
+
+ def get_effective_kv_efficiency(self) -> float | None:
+ """
+ Get the best available KV efficiency value.
+
+ Prefers the true KVE (cached_tokens/prompt_tokens) from Thompson Sampling,
+ falls back to SGLang native cache_hit_rate if KVE is unavailable.
+
+ Returns:
+ KV efficiency (0-1) or None if neither source is available
+ """
+ if self.kv_efficiency is not None:
+ return self.kv_efficiency
+ return self.kv_efficiency_fallback
+
+ def is_complete(self) -> bool:
+ """
+ Check if all core optimization metrics were successfully collected.
+
+ Returns:
+ True if KV efficiency (or fallback), ttft_p95, and itl_p95 are all available
+ """
+ return all([
+ self.get_effective_kv_efficiency() is not None,
+ self.ttft_p95_seconds is not None,
+ self.itl_p95_seconds is not None,
+ ])
+
+ def get_optimization_summary(self) -> dict[str, float | str | None]:
+ """
+ Get a summary dict of the primary optimization targets.
+
+ Returns:
+ Dict with the three key metrics for optimization loops, plus a
+ ``kv_efficiency_source`` label ("kve" or "sglang_fallback")
+ """
+ return {
+ "kv_efficiency": self.get_effective_kv_efficiency(),
+ "kv_efficiency_source": "kve" if self.kv_efficiency is not None else "sglang_fallback",
+ "ttft_p95_seconds": self.ttft_p95_seconds,
+ "itl_p95_seconds": self.itl_p95_seconds,
+ }
+
+ def to_optimization_score(
+ self,
+ kv_weight: float = 0.4,
+ ttft_weight: float = 0.4,
+ itl_weight: float = 0.2,
+ ttft_target_seconds: float = 0.5,
+ itl_target_seconds: float = 0.05,
+ ) -> float | None:
+ """
+ Compute a combined optimization score (higher is better).
+
+ This provides a single scalar for optimization algorithms that combines
+ the three core metrics with configurable weights.
+
+ Args:
+ kv_weight: Weight for KV efficiency (0-1)
+ ttft_weight: Weight for TTFT score (0-1)
+ itl_weight: Weight for ITL score (0-1)
+ ttft_target_seconds: Target TTFT for scoring (score=1.0 at target)
+ itl_target_seconds: Target ITL for scoring (score=1.0 at target)
+
+ Returns:
+ Combined score (0-1) where higher is better, or None if metrics unavailable
+
+ Note:
+ Weights should sum to 1.0. TTFT and ITL scores are computed as
+ target/actual (capped at 1.0) so lower latency = higher score.
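+
+ Illustrative example (hypothetical inputs, default weights)::
+
+ # kv_efficiency=0.8, ttft_p95=1.0s, itl_p95=0.05s
+ # kv_score = 0.8
+ # ttft_score = min(1.0, 0.5 / 1.0) = 0.5
+ # itl_score = min(1.0, 0.05 / 0.05) = 1.0
+ # score = 0.4*0.8 + 0.4*0.5 + 0.2*1.0 = 0.72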
+ """
+ if not self.is_complete():
+ return None
+
+ # KV efficiency score is already 0-1 (higher is better)
+ kv_score = self.get_effective_kv_efficiency() or 0.0
+
+ # TTFT score: target/actual, capped at 1.0 (lower latency = higher score)
+ ttft_score = min(1.0, ttft_target_seconds / max(self.ttft_p95_seconds or ttft_target_seconds, 0.001))
+
+ # ITL score: target/actual, capped at 1.0 (lower latency = higher score)
+ itl_score = min(1.0, itl_target_seconds / max(self.itl_p95_seconds or itl_target_seconds, 0.001))
+
+ return (kv_weight * kv_score) + (ttft_weight * ttft_score) + (itl_weight * itl_score)
+
+
+class DynamoMetricsResult(BaseModel):
+ """
+ Results from Dynamo metrics collection.
+
+ To add a new metric:
+ 1. Add a field here with appropriate type and description
+ 2. Add the corresponding Prometheus query to METRIC_QUERIES above
+ 3. The collector will automatically populate it
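+
+ A minimal sketch of steps 1-2 (``decode_queue_depth`` is a hypothetical metric)::
+
+ # 1. Field on this model:
+ decode_queue_depth: float | None = Field(
+ default=None, description="Requests waiting in the decode queue")
+ # 2. Matching METRIC_QUERIES entry (key must equal the field name):
+ "decode_queue_depth": "dynamo_component_decode_queue_depth"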
+
+ All metrics are optional (None) to handle cases where:
+ - The metric endpoint is unavailable
+ - Prometheus query returns no data
+ - The Dynamo component is not running
+
+ For optimization, use ``get_core_metrics()`` to extract the three primary
+ optimization targets (KV Efficiency, TTFT, ITL).
+ """
+
+ # =========================================================================
+ # CORE OPTIMIZATION METRICS (Primary targets for optimization)
+ # =========================================================================
+
+ # -------------------------------------------------------------------------
+ # KV Efficiency - KVE (CORE METRIC #1)
+ # Dashboard panels: "KV Cache Usage %", "KV Cache Stats"
+ # KVE = cached_tokens / prompt_tokens (fraction of work saved)
+ # -------------------------------------------------------------------------
+ kve_cached_tokens_rate: float | None = Field(
+ default=None,
+ description="Rate of tokens served from KV cache (tokens/sec). KVE numerator.",
+ )
+ kve_prompt_tokens_rate: float | None = Field(
+ default=None,
+ description="Rate of total prompt tokens processed (tokens/sec). KVE denominator.",
+ )
+ kve_device_blocks_rate: float | None = Field(
+ default=None,
+ description="Rate of KV blocks served from GPU memory (blocks/sec)",
+ )
+ kve_host_blocks_rate: float | None = Field(
+ default=None,
+ description="Rate of KV blocks served from CPU/host memory (blocks/sec)",
+ )
+ kve_disk_blocks_rate: float | None = Field(
+ default=None,
+ description="Rate of KV blocks served from disk (blocks/sec)",
+ )
+ kv_cache_usage_percent: float | None = Field(
+ default=None,
+ description="GPU KV cache memory utilization (0-100%)",
+ )
+ kv_cache_hit_rate_sglang: float | None = Field(
+ default=None,
+ description="[FALLBACK] KV cache hit rate from SGLang native metric (0-1). "
+ "Used when Thompson Sampling KVE counters are unavailable.",
+ )
+ kv_cache_hit_rate_dynamo: float | None = Field(
+ default=None,
+ description="KV cache hit rate from Dynamo component (0-1), alternative source",
+ )
+
+ # -------------------------------------------------------------------------
+ # Time to First Token - TTFT (CORE METRIC #2)
+ # Dashboard panels: "Time to First Token (P95)", "TTFT Over Time"
+ # -------------------------------------------------------------------------
+ ttft_p50: float | None = Field(
+ default=None,
+ description="Time to First Token - 50th percentile (seconds)",
+ )
+ ttft_p95: float | None = Field(
+ default=None,
+ description="[CORE] Time to First Token - 95th percentile (seconds). PRIMARY latency target.",
+ )
+ ttft_p99: float | None = Field(
+ default=None,
+ description="Time to First Token - 99th percentile (seconds)",
+ )
+
+ # -------------------------------------------------------------------------
+ # Inter-Token Latency - ITL (CORE METRIC #3)
+ # Dashboard panel: "ITL Over Time" - Inter-token latency trends
+ # -------------------------------------------------------------------------
+ itl_p50: float | None = Field(
+ default=None,
+ description="Inter-Token Latency - 50th percentile (seconds)",
+ )
+ itl_p95: float | None = Field(
+ default=None,
+ description="[CORE] Inter-Token Latency - 95th percentile (seconds). PRIMARY streaming target.",
+ )
+ itl_p99: float | None = Field(
+ default=None,
+ description="Inter-Token Latency - 99th percentile (seconds)",
+ )
+
+ # =========================================================================
+ # SUPPLEMENTARY METRICS (Context and diagnostics)
+ # =========================================================================
+
+ # -------------------------------------------------------------------------
+ # Inflight Requests
+ # Dashboard panel: "Inflight Requests" - Current load across components
+ # -------------------------------------------------------------------------
+ inflight_requests_frontend: float | None = Field(
+ default=None,
+ description="Current inflight requests at the frontend (user-facing API)",
+ )
+ inflight_requests_worker: float | None = Field(
+ default=None,
+ description="Current inflight requests at the worker (SGLang backend)",
+ )
+ queued_requests: float | None = Field(
+ default=None,
+ description="Requests currently queued at the frontend",
+ )
+
+ # -------------------------------------------------------------------------
+ # Throughput
+ # Dashboard panel: "Requests/min" - Throughput
+ # -------------------------------------------------------------------------
+ requests_per_minute: float | None = Field(
+ default=None,
+ description="Request throughput in requests per minute",
+ )
+
+ # -------------------------------------------------------------------------
+ # Token Throughput
+ # Dashboard panel: "Token Throughput" - Tokens generated per second
+ # -------------------------------------------------------------------------
+ token_throughput: float | None = Field(
+ default=None,
+ description="Output token generation rate (tokens/second)",
+ )
+
+ # -------------------------------------------------------------------------
+ # SGLang Worker Metrics
+ # Additional worker-level metrics for deeper analysis
+ # -------------------------------------------------------------------------
+ sglang_running_requests: float | None = Field(
+ default=None,
+ description="Number of requests currently running in SGLang",
+ )
+ sglang_queue_depth: float | None = Field(
+ default=None,
+ description="Number of requests queued in SGLang",
+ )
+ sglang_gen_throughput: float | None = Field(
+ default=None,
+ description="SGLang generation throughput",
+ )
+ sglang_utilization: float | None = Field(
+ default=None,
+ description="SGLang GPU utilization",
+ )
+
+ # -------------------------------------------------------------------------
+ # Thompson Sampling Metrics
+ # Routing efficiency and decision-making metrics
+ # -------------------------------------------------------------------------
+ thompson_routing_decisions_rate: float | None = Field(
+ default=None,
+ description="Rate of Thompson Sampling routing decisions per second",
+ )
+ thompson_requests_rate: float | None = Field(
+ default=None,
+ description="Rate of requests processed by Thompson Sampling processor",
+ )
+
+ # -------------------------------------------------------------------------
+ # Metadata
+ # -------------------------------------------------------------------------
+ collection_timestamp: float | None = Field(
+ default=None,
+ description="Unix timestamp when metrics were collected",
+ )
+ prometheus_url: str | None = Field(
+ default=None,
+ description="Prometheus URL used for collection",
+ )
+ errors: list[str] = Field(
+ default_factory=list,
+ description="Any errors encountered during collection",
+ )
+
+ # =========================================================================
+ # CORE METRICS EXTRACTION
+ # =========================================================================
+
+ def compute_kv_efficiency(self) -> float | None:
+ """
+ Compute KV Efficiency (KVE) from Thompson Sampling processor metrics.
+
+ KVE = cached_tokens / prompt_tokens
+
+ This measures the fraction of computational work saved via KV cache reuse.
+ A KVE of 0.8 means 80% of prompt tokens were served from cache.
+
+ Returns:
+ KVE (0-1) if both metrics are available and prompt_tokens > 0, else None
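+
+ Example: kve_cached_tokens_rate=800 tok/s with kve_prompt_tokens_rate=1000 tok/s
+ yields KVE = 800 / 1000 = 0.8.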
+ """
+ if self.kve_cached_tokens_rate is None or self.kve_prompt_tokens_rate is None:
+ return None
+ if self.kve_prompt_tokens_rate <= 0:
+ return None
+ return self.kve_cached_tokens_rate / self.kve_prompt_tokens_rate
+
+ def get_core_metrics(self) -> DynamoCoreMetrics:
+ """
+ Extract the three core optimization metrics.
+
+ KV Efficiency is computed as cached_tokens / prompt_tokens from the
+ Thompson Sampling processor. Falls back to SGLang native cache_hit_rate
+ if KVE counters are unavailable.
+
+ Returns:
+ DynamoCoreMetrics with KV efficiency, TTFT, and ITL
+
+ Usage::
+
+ result = await collector.collect()
+ core = result.get_core_metrics()
+
+ if core.is_complete():
+ score = core.to_optimization_score()
+ print(f"Optimization score: {score:.3f}")
+ """
+ # Compute true KVE from Thompson Sampling processor metrics
+ kv_efficiency = self.compute_kv_efficiency()
+
+ return DynamoCoreMetrics(
+ kv_efficiency=kv_efficiency,
+ kv_efficiency_fallback=self.kv_cache_hit_rate_sglang,
+ ttft_p50_seconds=self.ttft_p50,
+ ttft_p95_seconds=self.ttft_p95,
+ ttft_p99_seconds=self.ttft_p99,
+ itl_p50_seconds=self.itl_p50,
+ itl_p95_seconds=self.itl_p95,
+ itl_p99_seconds=self.itl_p99,
+ )
+
+ def has_core_metrics(self) -> bool:
+ """
+ Check if all three core optimization metrics are available.
+
+ Returns:
+ True if KV efficiency (or its SGLang fallback), ttft_p95, and itl_p95 are all collected
+ """
+ return self.get_core_metrics().is_complete()
+
+
+# =============================================================================
+# METRICS COLLECTOR
+# =============================================================================
+
+
+class DynamoMetricsCollector:
+ """
+ Collects Dynamo inference stack metrics from Prometheus.
+
+ Usage::
+
+ from nat.profiler.inference_optimization.dynamo_metrics import DynamoMetricsCollector
+ from nat.data_models.profiler import DynamoMetricsConfig
+
+ config = DynamoMetricsConfig(enable=True, prometheus_url="http://localhost:9090")
+ collector = DynamoMetricsCollector(config)
+ result = await collector.collect()
+
+ print(f"TTFT P95: {result.ttft_p95}")
+ print(f"KV Cache Usage: {result.kv_cache_usage_percent}%")
+ """
+
+ def __init__(self, config: DynamoMetricsConfig):
+ """
+ Initialize the collector with configuration.
+
+ Args:
+ config: DynamoMetricsConfig with Prometheus URL and metric toggles
+ """
+ self.config = config
+ self.prometheus_url = config.prometheus_url.rstrip("/")
+
+ async def collect(self) -> DynamoMetricsResult:
+ """
+ Collect all enabled Dynamo metrics from Prometheus.
+
+ Returns:
+ DynamoMetricsResult with collected metric values
+ """
+ result = DynamoMetricsResult(
+ collection_timestamp=time.time(),
+ prometheus_url=self.prometheus_url,
+ )
+
+ # Build list of metrics to collect based on config toggles
+ metrics_to_collect = self._get_enabled_metrics()
+
+ # Log collection parameters
+ if self.config.workflow_start_timestamp is not None:
+ if self.config.workflow_end_timestamp is not None:
+ duration = self.config.workflow_end_timestamp - self.config.workflow_start_timestamp
+ lookback_info = f"isolated_window={duration:.1f}s"
+ else:
+ lookback_info = f"workflow_start={self.config.workflow_start_timestamp:.2f}"
+ elif self.config.lookback_seconds > 0:
+ lookback_info = f"lookback={self.config.lookback_seconds}s"
+ else:
+ lookback_info = "lookback=600s (default)"
+
+ logger.info("Collecting %d Dynamo metrics from %s (query_range=%s, %s)",
+ len(metrics_to_collect),
+ self.prometheus_url,
+ self.config.query_range,
+ lookback_info)
+
+ collected_count = 0
+ null_count = 0
+
+ # Collect each metric
+ async with httpx.AsyncClient(timeout=30.0) as client:
+ for metric_name, query_template in metrics_to_collect.items():
+ try:
+ # Substitute time range placeholder
+ query = query_template.replace("{range}", self.config.query_range)
+ value = await self._query_prometheus(client, query)
+
+ if value is not None:
+ setattr(result, metric_name, value)
+ logger.debug("Collected %s = %s", metric_name, value)
+ collected_count += 1
+ else:
+ logger.debug("No data for metric %s", metric_name)
+ null_count += 1
+
+ except Exception as e:
+ error_msg = f"Failed to collect {metric_name}: {e}"
+ logger.warning(error_msg)
+ result.errors.append(error_msg)
+
+ logger.info("Dynamo metrics collection complete: %d collected, %d null, %d errors",
+ collected_count,
+ null_count,
+ len(result.errors))
+
+ # Log summary of key metrics for debugging
+ core = result.get_core_metrics()
+ if core.ttft_p95_seconds is not None or core.itl_p95_seconds is not None:
+ logger.info("Core metrics - TTFT P95: %s, ITL P95: %s, KV Efficiency: %s",
+ core.ttft_p95_seconds,
+ core.itl_p95_seconds,
+ core.get_effective_kv_efficiency())
+ else:
+ logger.warning("Core metrics (TTFT, ITL) not available - check Prometheus connectivity and metric names")
+
+ return result
+
+ def _get_enabled_metrics(self) -> dict[str, str]:
+ """
+ Get the subset of METRIC_QUERIES enabled by config.
+
+ Returns:
+ Dict mapping metric names to their Prometheus queries
+ """
+ enabled: dict[str, str] = {}
+
+ # Map config flags to metric prefixes/names
+ metric_groups = {
+ "collect_inflight_requests": ["inflight_requests_frontend", "inflight_requests_worker", "queued_requests"],
+ "collect_throughput": ["requests_per_minute"],
+ "collect_ttft": ["ttft_p50", "ttft_p95", "ttft_p99"],
+ "collect_itl": ["itl_p50", "itl_p95", "itl_p99"],
+ "collect_kv_cache": [
+ # KVE metrics (primary - token-level efficiency)
+ "kve_cached_tokens_rate",
+ "kve_prompt_tokens_rate",
+ "kve_device_blocks_rate",
+ "kve_host_blocks_rate",
+ "kve_disk_blocks_rate", # Supplementary KV cache metrics
+ "kv_cache_usage_percent",
+ "kv_cache_hit_rate_sglang", # Fallback for KVE
+ "kv_cache_hit_rate_dynamo",
+ ],
+ "collect_token_throughput": ["token_throughput", "sglang_gen_throughput"],
+ }
+
+ for config_flag, metric_names in metric_groups.items():
+ if getattr(self.config, config_flag, False):
+ for name in metric_names:
+ if name in METRIC_QUERIES:
+ enabled[name] = METRIC_QUERIES[name]
+
+ # Always collect SGLang worker metrics for context
+ for name in ["sglang_running_requests", "sglang_queue_depth", "sglang_utilization"]:
+ if name in METRIC_QUERIES:
+ enabled[name] = METRIC_QUERIES[name]
+
+ # Always collect Thompson Sampling metrics when available
+ for name in ["thompson_routing_decisions_rate", "thompson_requests_rate"]:
+ if name in METRIC_QUERIES:
+ enabled[name] = METRIC_QUERIES[name]
+
+ return enabled
+
+ async def _query_prometheus(self, client: httpx.AsyncClient, query: str) -> float | None:
+ """
+ Execute a Prometheus query and extract the scalar result.
+
+ First attempts an instant query. If no data is returned (e.g., because
+ rate() returns 0 after workflow completion), falls back to a range query
+ with historical lookback to capture the most recent non-zero value.
+
+ Args:
+ client: httpx AsyncClient
+ query: PromQL query string
+
+ Returns:
+ Float value if successful, None if no data or error
+ """
+ # First try instant query
+ value = await self._query_prometheus_instant(client, query)
+ if value is not None:
+ return value
+
+ # If instant query failed, try range query with lookback
+ # This captures historical data when rate() returns 0 after workflow completes
+ logger.debug("Instant query returned no data, trying range query with lookback: %s", query)
+ return await self._query_prometheus_range(client, query)
+
+ async def _query_prometheus_instant(self, client: httpx.AsyncClient, query: str) -> float | None:
+ """
+ Execute a Prometheus instant query.
+
+ Args:
+ client: httpx AsyncClient
+ query: PromQL query string
+
+ Returns:
+ Float value if successful, None if no data or error
+ """
+ url = f"{self.prometheus_url}/api/v1/query"
+ params = {"query": query}
+
+ response = await client.get(url, params=params)
+ response.raise_for_status()
+
+ data = response.json()
+
+ if data.get("status") != "success":
+ logger.warning("Prometheus instant query failed: %s", data.get("error", "unknown"))
+ return None
+
+ results = data.get("data", {}).get("result", [])
+
+ if not results:
+ logger.debug("No data for instant query: %s", query)
+ return None
+
+ # For instant queries, extract the value from the first result
+ # Result format: [{"metric": {...}, "value": [timestamp, "value_string"]}]
+ try:
+ value_str = results[0]["value"][1]
+ value = float(value_str)
+
+ # Handle special float values
+ if math.isnan(value):
+ logger.debug("Instant query returned NaN for: %s", query)
+ return None
+
+ # Treat 0.0 as missing so the range-query fallback can recover historical
+ # data (rate() reports 0 once activity stops); note this also skips
+ # legitimately zero gauges such as an empty queue.
+ if value == 0.0:
+ logger.debug("Instant query returned 0.0, deferring to range query: %s", query)
+ return None
+
+ return value
+ except (KeyError, IndexError, ValueError) as e:
+ logger.debug("Failed to parse Prometheus instant result for query '%s': %s", query, e)
+ return None
+
+ async def _query_prometheus_range(self, client: httpx.AsyncClient, query: str) -> float | None:
+ """
+ Execute a Prometheus range query with historical lookback.
+
+ This captures metrics that were recorded during the workflow execution
+ but are no longer updating (rate() would return 0 for instant queries).
+
+ The time window is determined by:
+ 1. If workflow timestamps are set: query from workflow start to workflow end (isolated to this eval)
+ 2. If lookback_seconds is set: query that many seconds back from now
+ 3. Otherwise: default to 10 minutes (600 seconds)
+
+ Args:
+ client: httpx AsyncClient
+ query: PromQL query string
+
+ Returns:
+ The most recent non-NaN, non-zero value if found, None otherwise
+ """
+ url = f"{self.prometheus_url}/api/v1/query_range"
+
+ # Determine time window based on config
+ # Priority: workflow timestamps > lookback_seconds > default 600s
+ if self.config.workflow_start_timestamp is not None:
+ # Use exact workflow time window (no buffer before, small buffer after for scrape delay)
+ # No buffer before: avoids any risk of including pre-workflow empty data
+ # Small buffer after (15s): accounts for Prometheus scrape interval
+ start_time = self.config.workflow_start_timestamp
+
+ if self.config.workflow_end_timestamp is not None:
+ # Use actual workflow end time + small buffer for scrape delay
+ end_time = self.config.workflow_end_timestamp + 15.0
+ logger.debug("Using isolated workflow time window: %.2f to %.2f (%.1f seconds)",
+ start_time,
+ end_time,
+ end_time - start_time)
+ else:
+ # Fall back to current time if end timestamp not set
+ end_time = time.time()
+ logger.debug("Using workflow start with current time: %.2f to %.2f (%.1f seconds)",
+ start_time,
+ end_time,
+ end_time - start_time)
+ elif self.config.lookback_seconds > 0:
+ end_time = time.time()
+ start_time = end_time - self.config.lookback_seconds
+ logger.debug("Using configured lookback for range query: %.1f seconds", self.config.lookback_seconds)
+ else:
+ # Default to 10 minutes (600 seconds) for backward compatibility
+ end_time = time.time()
+ start_time = end_time - 600
+ logger.debug("Using default 10-minute lookback for range query")
+
+ # Use 15s step to get reasonable granularity
+ step = "15s"
+
+ params = {
+ "query": query,
+ "start": start_time,
+ "end": end_time,
+ "step": step,
+ }
+
+ try:
+ response = await client.get(url, params=params)
+ response.raise_for_status()
+
+ data = response.json()
+
+ if data.get("status") != "success":
+ logger.warning("Prometheus range query failed: %s", data.get("error", "unknown"))
+ return None
+
+ results = data.get("data", {}).get("result", [])
+
+ if not results:
+ logger.debug("No data for range query: %s", query)
+ return None
+
+ # Range query result format:
+ # [{"metric": {...}, "values": [[timestamp, "value_string"], ...]}]
+ # Collect all valid (non-NaN, non-zero) values and compute the average
+ # This gives a representative measurement across the entire workflow
+ valid_values: list[float] = []
+
+ for series in results:
+ values = series.get("values", [])
+ for _timestamp, value_str in values:
+ try:
+ value = float(value_str)
+ if not math.isnan(value) and value != 0.0:
+ valid_values.append(value)
+ except (ValueError, TypeError):
+ continue
+
+ if valid_values:
+ # Use average for a representative measurement across the workflow
+ avg_value = sum(valid_values) / len(valid_values)
+ min_value = min(valid_values)
+ max_value = max(valid_values)
+ logger.debug("Range query found %d valid samples for %s: avg=%.4f, min=%.4f, max=%.4f",
+ len(valid_values),
+ query,
+ avg_value,
+ min_value,
+ max_value)
+ return avg_value
+
+ logger.debug("Range query found no valid values for: %s", query)
+ return None
+
+ except Exception as e:
+ logger.debug("Range query failed for '%s': %s", query, e)
+ return None
+
+ async def health_check(self) -> dict[str, Any]:
+ """
+ Check connectivity to Prometheus and Dynamo endpoints.
+
+ Returns:
+ Dict with health status for each component
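+
+ Usage::
+
+ health = await collector.health_check()
+ if not health["prometheus"]:
+ logger.error("Prometheus unreachable: %s", health["errors"])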
+ """
+ health: dict[str, Any] = {
+ "prometheus": False,
+ "frontend": False,
+ "worker": False,
+ "errors": [],
+ }
+
+ async with httpx.AsyncClient(timeout=10.0) as client:
+ # Check Prometheus
+ try:
+ response = await client.get(f"{self.prometheus_url}/-/healthy")
+ health["prometheus"] = response.status_code == 200
+ except Exception as e:
+ health["errors"].append(f"Prometheus: {e}")
+
+ # Check if Dynamo metrics are being scraped
+ try:
+ # Query 'up' for any dynamo-labeled job to verify scraping; this is a
+ # coarse check, so frontend and worker share the same signal
+ url = f"{self.prometheus_url}/api/v1/query"
+ response = await client.get(url, params={"query": "up{job=~\".*dynamo.*\"}"})
+ if response.status_code == 200:
+ data = response.json()
+ results = data.get("data", {}).get("result", [])
+ health["frontend"] = len(results) > 0
+ health["worker"] = len(results) > 0
+ except Exception as e:
+ health["errors"].append(f"Dynamo metrics check: {e}")
+
+ return health
+
+
+# =============================================================================
+# CONVENIENCE FUNCTIONS
+# =============================================================================
+
+
+async def collect_dynamo_metrics(config: DynamoMetricsConfig) -> DynamoMetricsResult:
+ """
+ Convenience function to collect Dynamo metrics.
+
+ Args:
+ config: DynamoMetricsConfig with collection settings
+
+ Returns:
+ DynamoMetricsResult with collected metrics
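+
+ Usage::
+
+ config = DynamoMetricsConfig(enable=True, prometheus_url="http://localhost:9090")
+ result = await collect_dynamo_metrics(config)
+ print(result.get_core_metrics().get_optimization_summary())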
+ """
+ collector = DynamoMetricsCollector(config)
+ return await collector.collect()
+
+
+async def collect_core_metrics(
+ prometheus_url: str = "http://localhost:9090",
+ query_range: str = "30s",
+) -> DynamoCoreMetrics:
+ """
+ Convenience function to collect only the three core optimization metrics.
+
+ This is a simplified interface for optimization loops that only need:
+ - KV Cache Efficiency
+ - Time to First Token (TTFT)
+ - Inter-Token Latency (ITL)
+
+ Args:
+ prometheus_url: Prometheus server URL
+ query_range: Time range for rate calculations (e.g., '1m', '5m')
+
+ Returns:
+ DynamoCoreMetrics with the three core metrics
+
+ Usage::
+
+ from nat.profiler.inference_optimization.dynamo_metrics import collect_core_metrics
+
+ # Quick collection for optimization
+ core = await collect_core_metrics()
+
+ if core.is_complete():
+ print(f"KV Efficiency: {core.kv_cache_efficiency:.2%}")
+ print(f"TTFT P95: {core.ttft_p95_seconds:.3f}s")
+ print(f"ITL P95: {core.itl_p95_seconds:.3f}s")
+
+ # Get combined optimization score
+ score = core.to_optimization_score()
+ print(f"Combined score: {score:.3f}")
+ """
+ config = DynamoMetricsConfig(
+ enable=True,
+ prometheus_url=prometheus_url,
+ query_range=query_range,
+ # Enable only core metrics for efficiency
+ collect_kv_cache=True,
+ collect_ttft=True,
+ collect_itl=True,
+ # Disable supplementary metrics
+ collect_inflight_requests=False,
+ collect_throughput=False,
+ collect_token_throughput=False,
+ )
+ result = await collect_dynamo_metrics(config)
+ return result.get_core_metrics()
diff --git a/src/nat/profiler/profile_runner.py b/src/nat/profiler/profile_runner.py
index 0ac72f5deb..f7afdd8b90 100644
--- a/src/nat/profiler/profile_runner.py
+++ b/src/nat/profiler/profile_runner.py
@@ -45,6 +45,7 @@ class InferenceOptimizationHolder(BaseModel):
common_prefixes: Any
token_uniqueness: Any
workflow_runtimes: Any
+ dynamo_metrics: Any = None
class ProfilerRunner:
@@ -82,6 +83,41 @@ def __init__(self, profiler_config: ProfilerConfig, output_dir: Path, write_outp
# Ensure output directory
os.makedirs(output_dir, exist_ok=True)
+ def _get_workflow_time_window(
+ self,
+ all_steps: list[list[IntermediateStep]],
+ ) -> tuple[float | None, float | None]:
+ """
+ Extract the workflow time window from intermediate steps.
+
+ Finds the earliest and latest event timestamps across all workflow executions
+ to determine the time range for Prometheus queries.
+
+ Args:
+ all_steps: List of workflow executions, each containing intermediate steps
+
+ Returns:
+ Tuple of (start_timestamp, end_timestamp) in Unix seconds, or (None, None) if no data
+ """
+ min_timestamp = float('inf')
+ max_timestamp = float('-inf')
+
+ for workflow_steps in all_steps:
+ for step in workflow_steps:
+ ts = step.event_timestamp
+ min_timestamp = min(min_timestamp, ts)
+ max_timestamp = max(max_timestamp, ts)
+ # Also check span_event_timestamp for start times of END events
+ span_ts = step.span_event_timestamp
+ if span_ts is not None:
+ min_timestamp = min(min_timestamp, span_ts)
+
+ if min_timestamp == float('inf') or max_timestamp == float('-inf'):
+ logger.warning("Could not determine workflow time window from intermediate steps")
+ return None, None
+
+ return min_timestamp, max_timestamp
+
async def run(self, all_steps: list[list[IntermediateStep]]) -> ProfilerResults:
"""
Main entrypoint: Works on Input DataFrame generated from eval to fit forecasting model,
@@ -187,10 +223,38 @@ async def run(self, all_steps: list[list[IntermediateStep]]) -> ProfilerResults:
workflow_runtimes = compute_workflow_runtime_metrics(all_steps)
workflow_runtimes_results = workflow_runtimes
+ # ------------------------------------------------------------
+ # Collect Dynamo inference stack metrics (if enabled)
+ # ------------------------------------------------------------
+ dynamo_metrics_results = None
+ if self.profile_config.dynamo_metrics.enable:
+ from nat.profiler.inference_optimization.dynamo_metrics import collect_dynamo_metrics
+ try:
+ # Calculate workflow time window from intermediate steps
+ workflow_start, workflow_end = self._get_workflow_time_window(all_steps)
+ if workflow_start is not None and workflow_end is not None:
+ # Set both start and end timestamps so Prometheus range queries
+ # are isolated to THIS eval run (not picking up data from other runs)
+ self.profile_config.dynamo_metrics.workflow_start_timestamp = workflow_start
+ self.profile_config.dynamo_metrics.workflow_end_timestamp = workflow_end
+ workflow_duration = workflow_end - workflow_start
+ logger.info("Workflow time window: %.1f seconds (%.2f to %.2f) - metrics isolated to this eval run",
+ workflow_duration,
+ workflow_start,
+ workflow_end)
+
+ dynamo_metrics_results = await collect_dynamo_metrics(self.profile_config.dynamo_metrics)
+ if dynamo_metrics_results.errors:
+ logger.warning("Dynamo metrics collection had errors: %s", dynamo_metrics_results.errors)
+ logger.info("Collected Dynamo metrics successfully")
+ except Exception as e:
+ logger.warning("Failed to collect Dynamo metrics: %s", e)
+
inference_optimization_results = InferenceOptimizationHolder(confidence_intervals=simple_metrics,
common_prefixes=common_prefix_results,
token_uniqueness=token_uniqueness_results,
- workflow_runtimes=workflow_runtimes_results)
+ workflow_runtimes=workflow_runtimes_results,
+ dynamo_metrics=dynamo_metrics_results)
if self.write_output and inference_optimization_results:
# Save to JSON