Skip to content
Open
3 changes: 2 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ repos:
name: Clear Jupyter Notebook Output Cells
entry: ci/scripts/clear_notebook_output_cells.sh
files: "\\.ipynb$"
language: unsupported_script
language: python
additional_dependencies: ["nbconvert"]

- repo: https://github.com/tcort/markdown-link-check
rev: v3.14.1
Expand Down
2 changes: 1 addition & 1 deletion docs/source/components/integrations/frameworks.md
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ uv pip install "nvidia-nat[crewai]"

LangChain is a framework for building applications that utilize large language models (LLMs) to interact with data. It provides a set of tools for creating chains of LLM calls, allowing for complex workflows powered by LLMs. LangChain focuses on modularity and extensibility, making it suitable for integrating custom data pipelines and enhancing intelligent applications.

For more information, visit the [LangChain website](https://www.langchain.com/).
For more information, visit the [LangChain documentation](https://docs.langchain.com/oss/python/langchain/overview).


| Capability | Providers / Details |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ llms:
dynamo_llm:
_type: dynamo
model_name: llama-3.3-70b
base_url: http://localhost:8099/v1
base_url: http://localhost:8000/v1
api_key: dummy
# _type: nim
# model_name: meta/llama-3.3-70b-instruct
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,24 @@
# - Bottleneck analysis with nested call stacks
# - Concurrency spike detection
# - Prompt caching prefix identification
# - Dynamo inference stack metrics (KVE, TTFT, ITL from Prometheus)
#
# Core Dynamo Optimization Metrics:
# 1. KV Efficiency (KVE) = cached_tokens / prompt_tokens
# - Measures fraction of computational work saved via KV cache
# - Higher is better (0.8 = 80% of tokens from cache)
# 2. Time to First Token (TTFT) - User-perceived initial latency
# 3. Inter-Token Latency (ITL) - Streaming smoothness
#
# Combines self-evaluating agent with detailed profiler for:
# - Understanding performance characteristics of rethinking
# - Identifying optimization opportunities
# - Generating data for throughput analysis scripts
#
# Prerequisites:
# - Prometheus running at localhost:9090 (for Dynamo metrics)
# - Dynamo stack with monitoring enabled
#
# Usage:
# nat profile --config_file configs/profile_rethinking_full_test.yml
#
Expand Down Expand Up @@ -130,7 +142,7 @@ llms:
dynamo_llm:
_type: dynamo
model_name: llama-3.3-70b
base_url: http://localhost:8099/v1
base_url: http://localhost:8000/v1
api_key: dummy
temperature: 0.0
max_tokens: 8192
Expand All @@ -151,7 +163,7 @@ llms:
eval_llm:
_type: dynamo
model_name: llama-3.3-70b
base_url: http://localhost:8099/v1
base_url: http://localhost:8000/v1
api_key: dummy
temperature: 0.0
max_tokens: 1024
Expand Down Expand Up @@ -187,7 +199,7 @@ workflow:

eval:
general:
max_concurrency: 36
max_concurrency: 8

output:
dir: ./examples/dynamo_integration/react_benchmark_agent/outputs/dynamo_evals/rethinking_full_test_for_profiling/
Expand Down Expand Up @@ -223,6 +235,25 @@ eval:
concurrency_spike_analysis:
enable: true
spike_threshold: 24 # Alert when concurrent functions >= 24
# Dynamo inference stack metrics - collect from Prometheus
# Core optimization metrics: KV Efficiency, TTFT, ITL
dynamo_metrics:
enable: true
prometheus_url: http://localhost:9090
# Time range for rate calculations - should match experiment duration
# Minimum: 15s (Prometheus scrapes every 5s, need ≥3 points for reliable rates)
# Options: 15s (very short), 30s, 1m, 2m, 5m
# Shorter = more accurate for brief experiments, but noisier
# Longer = smoother averages, but may include pre-experiment data
query_range: 30s
# Core metrics (primary optimization targets)
collect_kv_cache: true # KVE = cached_tokens/prompt_tokens (work saved)
collect_ttft: true # Time to First Token (P50/P95/P99)
collect_itl: true # Inter-Token Latency (P50/P95/P99)
# Supplementary metrics (context and diagnostics)
collect_inflight_requests: true
collect_throughput: true
collect_token_throughput: true

evaluators:
tool_selection_quality:
Expand Down
83 changes: 59 additions & 24 deletions external/dynamo/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -13,84 +13,119 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# Required: Set your model directory path
export DYNAMO_MODEL_DIR="/path/to/your/models/Llama-3.3-70B-Instruct"
# Optional: Set repository directory (for Thompson Sampling router)
export DYNAMO_REPO_DIR="/path/to/NeMo-Agent-Toolkit"
export HF_HOME=/path/to/local/storage/.cache/huggingface

export HF_TOKEN=my_huggingface_read_token

# Required: Set your model directory path with model weights
# Example `ls` output from a properly configured model directory:
# ~/models/Llama-3.3-70B-Instruct$ ls
# LICENSE model-00003-of-00030.safetensors model-00010-of-00030.safetensors model-00017-of-00030.safetensors model-00024-of-00030.safetensors model.safetensors.index.json
# README.md model-00004-of-00030.safetensors model-00011-of-00030.safetensors model-00018-of-00030.safetensors model-00025-of-00030.safetensors original
# USE_POLICY.md model-00005-of-00030.safetensors model-00012-of-00030.safetensors model-00019-of-00030.safetensors model-00026-of-00030.safetensors special_tokens_map.json
# config.json model-00006-of-00030.safetensors model-00013-of-00030.safetensors model-00020-of-00030.safetensors model-00027-of-00030.safetensors tokenizer.json
# generation_config.json model-00007-of-00030.safetensors model-00014-of-00030.safetensors model-00021-of-00030.safetensors model-00028-of-00030.safetensors tokenizer_config.json
# model-00001-of-00030.safetensors model-00008-of-00030.safetensors model-00015-of-00030.safetensors model-00022-of-00030.safetensors model-00029-of-00030.safetensors
# model-00002-of-00030.safetensors model-00009-of-00030.safetensors model-00016-of-00030.safetensors model-00023-of-00030.safetensors model-00030-of-00030.safetensors
export DYNAMO_MODEL_DIR=/path/to/your/models/Llama-3.3-70B-Instruct

# Set repository directory (for Thompson Sampling router)
export DYNAMO_REPO_DIR=/path/to/NeMo-Agent-Toolkit/external/dynamo

# =============================================================================
# OPTIONAL VARIABLES - GPU Configuration
# =============================================================================

# GPU device IDs for unified mode (comma-separated)
# Default: 0,1,2,3
DYNAMO_GPU_DEVICES="0,1,2,3"
export DYNAMO_GPU_DEVICES=0,1,2,3

# GPU device IDs for disaggregated mode prefill workers
# Default: 0,1
# DYNAMO_PREFILL_GPUS="0,1"
# DYNAMO_PREFILL_GPUS=0,1

# GPU device IDs for disaggregated mode decode workers
# Default: 2,3
# DYNAMO_DECODE_GPUS="2,3"
# DYNAMO_DECODE_GPUS=2,3

# Tensor parallelism size (number of GPUs per worker)
# Default: 4 for unified mode, 2 for disaggregated mode
# DYNAMO_TP_SIZE="4"
# DYNAMO_TP_SIZE=4

# =============================================================================
# OPTIONAL VARIABLES - Network Configuration
# =============================================================================

# HTTP port for Dynamo frontend API
# Default: 8099
# DYNAMO_HTTP_PORT="8099"
# Default: 8000
DYNAMO_HTTP_PORT=8000

Comment on lines 59 to 62
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

# First, find the file and check if it exists
fd -type f -name ".env.example" external/dynamo/

Repository: NVIDIA/NeMo-Agent-Toolkit

Length of output: 239


🏁 Script executed:

# Read the file to inspect the specific lines mentioned
head -130 external/dynamo/.env.example | tail -80

Repository: NVIDIA/NeMo-Agent-Toolkit

Length of output: 2835


🏁 Script executed:

# Get line count and read the full file to see export patterns
wc -l external/dynamo/.env.example
cat -n external/dynamo/.env.example

Repository: NVIDIA/NeMo-Agent-Toolkit

Length of output: 6718


🏁 Script executed:

# Check for dotenv-linter configuration or references in the repo
fd -name ".dotenv*" -o -name "*dotenv*" | grep -E "(config|rc|yml|yaml)"

Repository: NVIDIA/NeMo-Agent-Toolkit

Length of output: 299


🏁 Script executed:

# Search for dotenv-linter usage or configuration
rg "dotenv-linter" --type yaml --type json --type toml

Repository: NVIDIA/NeMo-Agent-Toolkit

Length of output: 51


🏁 Script executed:

# Check if dotenv-linter is installed and test it on the file
which dotenv-linter || echo "dotenv-linter not found in PATH"

Repository: NVIDIA/NeMo-Agent-Toolkit

Length of output: 161


Add missing export statements and remove quotes for consistency with dotenv conventions.

The file uses export on some variables (lines 16, 18, 30, 33, 41) but omits it on others (lines 61, 110, 125-126). When sourced, variables without export won't propagate to child processes. Additionally, line 126 has quotes around the image URL, which is non-standard for .env files and can cause parsing issues.

Suggested fix
-DYNAMO_HTTP_PORT=8000
+export DYNAMO_HTTP_PORT=8000
...
-DYNAMO_KV_BLOCK_SIZE=64
+export DYNAMO_KV_BLOCK_SIZE=64
...
-DYNAMO_USE_MULTILRU=false
-DYNAMO_VLLM_IMAGE="nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.7.1"
+export DYNAMO_USE_MULTILRU=false
+export DYNAMO_VLLM_IMAGE=nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.7.1
🤖 Prompt for AI Agents
In `@external/dynamo/.env.example` around lines 59 - 62, Update the .env.example
so all variables are consistently exported and remove unnecessary quotes: add
export to variables missing it (e.g., DYNAMO_HTTP_PORT and the other
non-exported entries referenced in the comment) so they propagate to child
processes, and remove the surrounding quotes from the image URL on the quoted
line so the value follows dotenv conventions; ensure other exported lines (those
already using export) remain unchanged and keep naming identical to existing
keys.

# ETCD client port for metadata and discovery
# Default: 2379
# DYNAMO_ETCD_PORT="2379"
# DYNAMO_ETCD_PORT=2379

# ETCD peer port
# Default: 2390
# DYNAMO_ETCD_PEER_PORT="2390"
# DYNAMO_ETCD_PEER_PORT=2390

# NATS messaging port
# Default: 4222
# DYNAMO_NATS_PORT="4222"
# DYNAMO_NATS_PORT=4222

# =============================================================================
# OPTIONAL VARIABLES - Model Configuration
# =============================================================================

# Model name as exposed by the API
# Default: llama-3.3-70b
# DYNAMO_MODEL_NAME="llama-3.3-70b"
# DYNAMO_MODEL_NAME=llama-3.3-70b

# Shared memory size for Docker container
# Default: 16g
# DYNAMO_SHM_SIZE="16g"
# DYNAMO_SHM_SIZE=16g

# =============================================================================
# OPTIONAL VARIABLES - Disaggregated Mode
# =============================================================================

# Bootstrap port for disaggregated mode communication
# Default: 12345
# DYNAMO_DISAGG_BOOTSTRAP_PORT="12345"
# DYNAMO_DISAGG_BOOTSTRAP_PORT=12345

# Transfer backend for KV cache (nixl, nccl, or gloo)
# Default: nixl
# DYNAMO_DISAGG_TRANSFER_BACKEND="nixl"
# DYNAMO_DISAGG_TRANSFER_BACKEND=nixl

# =============================================================================
# OPTIONAL VARIABLES - Performance Tuning
# =============================================================================

# Worker initialization timeout (seconds)
# Increase for large models (70B+) or cold starts
# Default: 1800 (30 minutes)
# DYNAMO_WORKER_INIT_TIMEOUT_S=1800

# Block size in tokens - must match between SGLang (--page-size) and Frontend (--kv-cache-block-size)
# Default: 64 tokens per block
DYNAMO_KV_BLOCK_SIZE=64

# Fraction of GPU memory for KV cache (0.0-1.0)
# Reduce to test cache pressure/degradation scenarios
# Default: 0.9 (90% of GPU memory for KV cache)
# DYNAMO_MEM_FRACTION_STATIC=0.9

# =============================================================================
# OPTIONAL VARIABLES - Custom Thompson Sampler
# OPTIONAL VARIABLES - LRU development
# =============================================================================

# Path to CSV file for router decision logging
# Default: router_metrics.csv
# ROUTER_METRICS_CSV = "router_metrics.csv"
# Path to Dynamo source for patching (auto-detected from DYNAMO_REPO_DIR)
# DYNAMO_SOURCE_DIR=/path/to/dynamo

# vLLM worker option 1: default
DYNAMO_USE_MULTILRU=false
DYNAMO_VLLM_IMAGE="nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.7.1"

# vLLM worker option 2: MultiLRU from Ryan Olsen's dev branch
# DYNAMO_USE_MULTILRU=true # will force script to use processor_multirlu.py and router_multirlu.py
# DYNAMO_VLLM_IMAGE="dynamo-multi-lru:latest"

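# Timeout period (seconds) for Dynamo worker initialization
# NOTE(review): this variable is also listed under "Performance Tuning" with a
# default of 1800 - confirm which default is current and keep a single entry.
# Default: 300
# DYNAMO_WORKER_INIT_TIMEOUT_S=300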
Loading
Loading