NVIDIA · bbednarski9 · Jan 14, 2026 · Jan 22, 2026 · Jan 22, 2026 · Jan 22, 2026
@@ -38,7 +38,8 @@ repos:
         name: Clear Jupyter Notebook Output Cells
         entry: ci/scripts/clear_notebook_output_cells.sh
         files: "\\.ipynb$"
-        language: unsupported_script
+        language: python
+        additional_dependencies: ["nbconvert"]
 
   - repo: https://github.com/tcort/markdown-link-check
     rev: v3.14.1

@@ -148,7 +148,7 @@ uv pip install "nvidia-nat[crewai]"
 
 LangChain is a framework for building applications that utilize large language models (LLMs) to interact with data. It provides a set of tools for creating chains of LLM calls, allowing for complex workflows powered by LLMs. LangChain focuses on modularity and extensibility, making it suitable for integrating custom data pipelines and enhancing intelligent applications.
 
-For more information, visit the [LangChain website](https://www.langchain.com/).
+For more information, visit the [LangChain documentation](https://docs.langchain.com/oss/python/langchain/overview).
 
 
 | Capability              | Providers / Details                                                                 |

@@ -78,7 +78,7 @@ llms:
   dynamo_llm:
     _type: dynamo
     model_name: llama-3.3-70b
-    base_url: http://localhost:8099/v1
+    base_url: http://localhost:8000/v1
     api_key: dummy
     # _type: nim
     # model_name: meta/llama-3.3-70b-instruct

@@ -26,12 +26,24 @@
 # - Bottleneck analysis with nested call stacks
 # - Concurrency spike detection
 # - Prompt caching prefix identification
+# - Dynamo inference stack metrics (KVE, TTFT, ITL from Prometheus)
+#
+# Core Dynamo Optimization Metrics:
+# 1. KV Efficiency (KVE) = cached_tokens / prompt_tokens
+#    - Measures fraction of computational work saved via KV cache
+#    - Higher is better (0.8 = 80% of prompt tokens served from cache)
+# 2. Time to First Token (TTFT) - User-perceived initial latency
+# 3. Inter-Token Latency (ITL) - Streaming smoothness
 #
 # Combines self-evaluating agent with detailed profiler for:
 # - Understanding performance characteristics of rethinking
 # - Identifying optimization opportunities
 # - Generating data for throughput analysis scripts
 #
+# Prerequisites:
+#   - Prometheus running at localhost:9090 (for Dynamo metrics)
+#   - Dynamo stack with monitoring enabled
+#
 # Usage:
 #   nat profile --config_file configs/profile_rethinking_full_test.yml
 #
@@ -130,7 +142,7 @@ llms:
   dynamo_llm:
     _type: dynamo
     model_name: llama-3.3-70b
-    base_url: http://localhost:8099/v1
+    base_url: http://localhost:8000/v1
     api_key: dummy
     temperature: 0.0
     max_tokens: 8192
@@ -151,7 +163,7 @@ llms:
   eval_llm:
     _type: dynamo
     model_name: llama-3.3-70b
-    base_url: http://localhost:8099/v1
+    base_url: http://localhost:8000/v1
     api_key: dummy
     temperature: 0.0
     max_tokens: 1024
@@ -187,7 +199,7 @@ workflow:
 
 eval:
   general:
-    max_concurrency: 36
+    max_concurrency: 8
 
     output:
       dir: ./examples/dynamo_integration/react_benchmark_agent/outputs/dynamo_evals/rethinking_full_test_for_profiling/
@@ -223,6 +235,25 @@ eval:
       concurrency_spike_analysis:
         enable: true
         spike_threshold: 24  # Alert when concurrent functions >= 24
+      # Dynamo inference stack metrics - collect from Prometheus
+      # Core optimization metrics: KV Efficiency, TTFT, ITL
+      dynamo_metrics:
+        enable: true
+        prometheus_url: http://localhost:9090
+        # Time range for rate calculations - should match experiment duration
+        # Minimum: 15s (Prometheus scrapes every 5s, need ≥3 points for reliable rates)
+        # Options: 15s (very short), 30s, 1m, 2m, 5m
+        # Shorter = more accurate for brief experiments, but noisier
+        # Longer = smoother averages, but may include pre-experiment data
+        query_range: 30s
+        # Core metrics (primary optimization targets)
+        collect_kv_cache: true   # KVE = cached_tokens/prompt_tokens (work saved)
+        collect_ttft: true       # Time to First Token (P50/P95/P99)
+        collect_itl: true        # Inter-Token Latency (P50/P95/P99)
+        # Supplementary metrics (context and diagnostics)
+        collect_inflight_requests: true
+        collect_throughput: true
+        collect_token_throughput: true
 
   evaluators:
     tool_selection_quality:

@@ -13,84 +13,119 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Required: Set your model directory path
-export DYNAMO_MODEL_DIR="/path/to/your/models/Llama-3.3-70B-Instruct"
-# Optional: Set repository directory (for Thompson Sampling router)
-export DYNAMO_REPO_DIR="/path/to/NeMo-Agent-Toolkit"
+export HF_HOME=/path/to/local/storage/.cache/huggingface
+
+export HF_TOKEN=my_huggingface_read_token
+
+# Required: Set your model directory path with model weights
+# Example `ls` output from a properly configured directory:
+# ~/models/Llama-3.3-70B-Instruct$ ls
+# LICENSE                           model-00003-of-00030.safetensors  model-00010-of-00030.safetensors  model-00017-of-00030.safetensors  model-00024-of-00030.safetensors  model.safetensors.index.json
+# README.md                         model-00004-of-00030.safetensors  model-00011-of-00030.safetensors  model-00018-of-00030.safetensors  model-00025-of-00030.safetensors  original
+# USE_POLICY.md                     model-00005-of-00030.safetensors  model-00012-of-00030.safetensors  model-00019-of-00030.safetensors  model-00026-of-00030.safetensors  special_tokens_map.json
+# config.json                       model-00006-of-00030.safetensors  model-00013-of-00030.safetensors  model-00020-of-00030.safetensors  model-00027-of-00030.safetensors  tokenizer.json
+# generation_config.json            model-00007-of-00030.safetensors  model-00014-of-00030.safetensors  model-00021-of-00030.safetensors  model-00028-of-00030.safetensors  tokenizer_config.json
+# model-00001-of-00030.safetensors  model-00008-of-00030.safetensors  model-00015-of-00030.safetensors  model-00022-of-00030.safetensors  model-00029-of-00030.safetensors
+# model-00002-of-00030.safetensors  model-00009-of-00030.safetensors  model-00016-of-00030.safetensors  model-00023-of-00030.safetensors  model-00030-of-00030.safetensors
+export DYNAMO_MODEL_DIR=/path/to/your/models/Llama-3.3-70B-Instruct
+
+# Set repository directory (for Thompson Sampling router)
+export DYNAMO_REPO_DIR=/path/to/NeMo-Agent-Toolkit/external/dynamo
 
 # =============================================================================
 # OPTIONAL VARIABLES - GPU Configuration
 # =============================================================================
 
 # GPU device IDs for unified mode (comma-separated)
 # Default: 0,1,2,3
-DYNAMO_GPU_DEVICES="0,1,2,3"
+export DYNAMO_GPU_DEVICES=0,1,2,3
 
 # GPU device IDs for disaggregated mode prefill workers
 # Default: 0,1
-# DYNAMO_PREFILL_GPUS="0,1"
+# DYNAMO_PREFILL_GPUS=0,1
 
 # GPU device IDs for disaggregated mode decode workers
 # Default: 2,3
-# DYNAMO_DECODE_GPUS="2,3"
+# DYNAMO_DECODE_GPUS=2,3
 
 # Tensor parallelism size (number of GPUs per worker)
 # Default: 4 for unified mode, 2 for disaggregated mode
-# DYNAMO_TP_SIZE="4"
+# DYNAMO_TP_SIZE=4
 
 # =============================================================================
 # OPTIONAL VARIABLES - Network Configuration
 # =============================================================================
 
 # HTTP port for Dynamo frontend API
-# Default: 8099
-# DYNAMO_HTTP_PORT="8099"
+# Default: 8000
+export DYNAMO_HTTP_PORT=8000
 
 # ETCD client port for metadata and discovery
 # Default: 2379
-# DYNAMO_ETCD_PORT="2379"
+# DYNAMO_ETCD_PORT=2379
 
 # ETCD peer port
 # Default: 2390
-# DYNAMO_ETCD_PEER_PORT="2390"
+# DYNAMO_ETCD_PEER_PORT=2390
 
 # NATS messaging port
 # Default: 4222
-# DYNAMO_NATS_PORT="4222"
+# DYNAMO_NATS_PORT=4222
 
 # =============================================================================
 # OPTIONAL VARIABLES - Model Configuration
 # =============================================================================
 
 # Model name as exposed by the API
 # Default: llama-3.3-70b
-# DYNAMO_MODEL_NAME="llama-3.3-70b"
+# DYNAMO_MODEL_NAME=llama-3.3-70b
 
 # Shared memory size for Docker container
 # Default: 16g
-# DYNAMO_SHM_SIZE="16g"
+# DYNAMO_SHM_SIZE=16g
 
 # =============================================================================
 # OPTIONAL VARIABLES - Disaggregated Mode
 # =============================================================================
 
 # Bootstrap port for disaggregated mode communication
 # Default: 12345
-# DYNAMO_DISAGG_BOOTSTRAP_PORT="12345"
+# DYNAMO_DISAGG_BOOTSTRAP_PORT=12345
 
 # Transfer backend for KV cache (nixl, nccl, or gloo)
 # Default: nixl
-# DYNAMO_DISAGG_TRANSFER_BACKEND="nixl"
+# DYNAMO_DISAGG_TRANSFER_BACKEND=nixl
+
+# =============================================================================
+# OPTIONAL VARIABLES - Performance Tuning
+# =============================================================================
+
+# Worker initialization timeout (seconds)
+# Increase for large models (70B+) or cold starts
+# Default: 1800 (30 minutes)
+# DYNAMO_WORKER_INIT_TIMEOUT_S=1800
+
+# Block size in tokens - must match between SGLang (--page-size) and Frontend (--kv-cache-block-size)
+# Default: 64 tokens per block
+export DYNAMO_KV_BLOCK_SIZE=64
 
+# Fraction of GPU memory for KV cache (0.0-1.0)
+# Reduce to test cache pressure/degradation scenarios
+# Default: 0.9 (90% of GPU memory for KV cache)
+# DYNAMO_MEM_FRACTION_STATIC=0.9
 
 # =============================================================================
-# OPTIONAL VARIABLES - Custom Thompson Sampler
+# OPTIONAL VARIABLES - LRU development
 # =============================================================================
 
-# Path to CSV file for router decision logging
-# Default: router_metrics.csv
-# ROUTER_METRICS_CSV = "router_metrics.csv"
+# Path to Dynamo source for patching (auto-detected from DYNAMO_REPO_DIR)
+# DYNAMO_SOURCE_DIR=/path/to/dynamo
+
+# vLLM worker option 1: default
+export DYNAMO_USE_MULTILRU=false
+export DYNAMO_VLLM_IMAGE="nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.7.1"
+
+# vLLM worker option 2: MultiLRU from Ryan Olsen's dev branch
+# DYNAMO_USE_MULTILRU=true # forces the script to use processor_multilru.py and router_multilru.py
+# DYNAMO_VLLM_IMAGE="dynamo-multi-lru:latest"
 
-# timeout period for dynamo worker initialization
-# Default: 300
-# DYNAMO_WORKER_INIT_TIMEOUT_S = 300