Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .github/actions/docker-build/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,9 @@ runs:
ECR_HOSTNAME: ${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com
run: |
sudo apt-get update && sudo apt-get install -y git build-essential protobuf-compiler libclang-dev
curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain stable
curl --retry 5 --retry-delay 2 --fail --show-error -sSL -o /tmp/rustup-init.sh https://sh.rustup.rs
sh /tmp/rustup-init.sh -y --default-toolchain stable
rm -f /tmp/rustup-init.sh
. "$HOME/.cargo/env"
echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
cargo install cbindgen
Expand Down
10 changes: 7 additions & 3 deletions .github/actions/install-vcluster-cli/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,14 @@ runs:
aarch64) VCLUSTER_ARCH="arm64" ;;
*) echo "Unsupported architecture: ${ARCH}"; exit 1 ;;
esac
curl -sL -o /tmp/vcluster \
TMP_BIN="$(mktemp -p /tmp vcluster.XXXXXX)"
curl --retry 5 --retry-delay 2 \
--connect-timeout 10 --max-time 120 \
--fail --show-error -sL \
-o "${TMP_BIN}" \
"https://github.com/loft-sh/vcluster/releases/download/${{ inputs.vcluster_version }}/vcluster-linux-${VCLUSTER_ARCH}"
sudo mv /tmp/vcluster /usr/local/bin/vcluster
sudo chmod +x /usr/local/bin/vcluster
sudo install -m 0755 "${TMP_BIN}" /usr/local/bin/vcluster
rm -f "${TMP_BIN}"
vcluster version
fi
echo "::endgroup::"
2 changes: 1 addition & 1 deletion .github/workflows/nightly-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ jobs:
PROTOC_VER="30.2"
PROTOC_ZIP="protoc-${PROTOC_VER}-linux-x86_64.zip"
PROTOC_SHA256="327e9397c6fb3ea2a542513a3221334c6f76f7aa524a7d2561142b67b312a01f"
curl -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}"
curl --retry 5 --retry-delay 2 -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}"
echo "${PROTOC_SHA256} ${PROTOC_ZIP}" | sha256sum -c -
unzip "${PROTOC_ZIP}" -d $HOME/.local
rm "${PROTOC_ZIP}"
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/pre-merge.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ jobs:
PROTOC_VER="30.2"
PROTOC_ZIP="protoc-${PROTOC_VER}-linux-x86_64.zip"
PROTOC_SHA256="327e9397c6fb3ea2a542513a3221334c6f76f7aa524a7d2561142b67b312a01f"
curl -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}"
curl --retry 5 --retry-delay 2 -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}"
echo "${PROTOC_SHA256} ${PROTOC_ZIP}" | sha256sum -c -
unzip "${PROTOC_ZIP}" -d $HOME/.local
rm "${PROTOC_ZIP}"
Expand Down Expand Up @@ -140,7 +140,7 @@ jobs:
PROTOC_VER="30.2"
PROTOC_ZIP="protoc-${PROTOC_VER}-linux-x86_64.zip"
PROTOC_SHA256="327e9397c6fb3ea2a542513a3221334c6f76f7aa524a7d2561142b67b312a01f"
curl -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}"
curl --retry 5 --retry-delay 2 -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}"
echo "${PROTOC_SHA256} ${PROTOC_ZIP}" | sha256sum -c -
unzip "${PROTOC_ZIP}" -d $HOME/.local
rm "${PROTOC_ZIP}"
Expand Down
10 changes: 8 additions & 2 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,10 @@ jobs:
env:
CRANE_VERSION: v0.20.2
run: |
curl -sL "https://github.com/google/go-containerregistry/releases/download/${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" \
curl --retry 5 --retry-delay 2 \
--connect-timeout 10 --max-time 120 \
--fail --show-error -sL \
"https://github.com/google/go-containerregistry/releases/download/${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" \
| tar -xzf - crane
sudo mv crane /usr/local/bin/
crane version
Expand Down Expand Up @@ -440,7 +443,10 @@ jobs:
env:
CRANE_VERSION: v0.20.2
run: |
curl -sL "https://github.com/google/go-containerregistry/releases/download/${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" \
curl --retry 5 --retry-delay 2 \
--connect-timeout 10 --max-time 120 \
--fail --show-error -sL \
"https://github.com/google/go-containerregistry/releases/download/${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" \
| tar -xzf - crane
sudo mv crane /usr/local/bin/
crane version
Expand Down
6 changes: 4 additions & 2 deletions components/src/dynamo/planner/config/planner_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ class PlannerConfig(BaseModel):
"kubernetes", "virtual", "global-planner"
] = SLAPlannerDefaults.environment
namespace: str = Field(
default_factory=lambda: os.environ.get("DYN_NAMESPACE", "dynamo")
default_factory=lambda: os.environ.get("DYN_NAMESPACE", "dynamo"),
exclude=True,
)
backend: Literal["vllm", "sglang", "trtllm", "mocker"] = SLAPlannerDefaults.backend
mode: Literal["disagg", "prefill", "decode", "agg"] = SLAPlannerDefaults.mode
Expand Down Expand Up @@ -109,7 +110,8 @@ class PlannerConfig(BaseModel):
default_factory=lambda: os.environ.get(
"PROMETHEUS_ENDPOINT",
"http://prometheus-kube-prometheus-prometheus.monitoring.svc.cluster.local:9090",
)
),
exclude=True,
)
metric_reporting_prometheus_port: int = Field(
default_factory=lambda: int(os.environ.get("PLANNER_PROMETHEUS_PORT", 0))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -446,18 +446,20 @@ async def _process_token_stream(
# Internal transport field consumed by frontend nvext mapping.
out["disaggregated_params"] = {"routed_experts": routed_experts}
if finish_reason:
input_tokens = res["meta_info"]["prompt_tokens"]
completion_tokens = res["meta_info"]["completion_tokens"]
cached_tokens = res["meta_info"]["cached_tokens"]
meta_info = res.get("meta_info", {})
input_tokens = meta_info.get("prompt_tokens")
completion_tokens = meta_info.get("completion_tokens")
cached_tokens = meta_info.get("cached_tokens")
prefill_prompt_tokens_details = None
if cached_tokens is not None and cached_tokens > 0:
prefill_prompt_tokens_details = {"cached_tokens": cached_tokens}
out["completion_usage"] = {
"prompt_tokens": input_tokens,
"completion_tokens": completion_tokens,
"total_tokens": input_tokens + completion_tokens,
"prompt_tokens_details": prefill_prompt_tokens_details,
}
if input_tokens is not None and completion_tokens is not None:
out["completion_usage"] = {
"prompt_tokens": input_tokens,
"completion_tokens": completion_tokens,
"total_tokens": input_tokens + completion_tokens,
"prompt_tokens_details": prefill_prompt_tokens_details,
}
if not context.is_stopped():
yield out

Expand Down
41 changes: 40 additions & 1 deletion components/src/dynamo/sglang/tests/test_sglang_decode_handler.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from contextlib import asynccontextmanager

import pytest

from dynamo.sglang.request_handlers.llm.decode_handler import _extract_media_urls
from dynamo.sglang.request_handlers.llm.decode_handler import (
DecodeWorkerHandler,
_extract_media_urls,
)

pytestmark = [
pytest.mark.unit,
Expand Down Expand Up @@ -34,3 +39,37 @@ def test_extract_media_urls_returns_none_for_missing_or_invalid_items():
assert (
_extract_media_urls({"image_url": [{"ignored": "value"}]}, "image_url") is None
)


class _FakeContext:
    """Minimal stub standing in for the runtime request context."""

    def id(self):
        # Stable fake request identifier; the handler only logs/propagates it.
        return "ctx"

    def is_stopped(self):
        # The fake request is never cancelled, so streaming always proceeds.
        return False


@pytest.mark.asyncio
async def test_process_token_stream_treats_completion_usage_fields_as_optional():
    """A final chunk whose meta_info lacks token counts must not raise.

    Only finish_reason/token_ids should be emitted when prompt_tokens and
    completion_tokens are absent, i.e. completion_usage is optional.
    """
    # Bypass __init__ so no engine/runtime wiring is required for this unit test.
    handler = DecodeWorkerHandler.__new__(DecodeWorkerHandler)

    @asynccontextmanager
    async def _disabled_monitor(request_id_future, context):
        # Cancellation monitoring is irrelevant here; yield a no-op handle.
        yield None

    handler._cancellation_monitor = _disabled_monitor

    async def _token_stream():
        # Final chunk: finish_reason present, but no prompt/completion counts.
        yield {
            "meta_info": {"id": "rid", "finish_reason": {"type": "stop"}},
            "output_ids": [],
        }

    collected = []
    async for chunk in handler._process_token_stream(_token_stream(), _FakeContext()):
        collected.append(chunk)

    assert collected == [{"finish_reason": "stop", "token_ids": []}]
2 changes: 2 additions & 0 deletions deploy/operator/internal/consts/consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ const (
DynamoNixlPort = 19090
DynamoNixlPortName = "nixl"

DynamoFPMBasePort = 20380

MpiRunSshPort = 2222

// Default security context values
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -773,6 +773,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
{Name: "CONTAINER_NAME", Value: commonconsts.MainContainerName},
{Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypeWorker},
{Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"},
{Name: "DYN_FORWARDPASS_METRIC_PORT", Value: "20380"},
{Name: "DYN_HEALTH_CHECK_ENABLED", Value: "false"},
{Name: commonconsts.DynamoNamespaceEnvVar, Value: "default-test-lws-deploy"},
{Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-lws-deploy"},
Expand Down Expand Up @@ -916,6 +917,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
{Name: "CONTAINER_NAME", Value: commonconsts.MainContainerName},
{Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypeWorker},
{Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"},
{Name: "DYN_FORWARDPASS_METRIC_PORT", Value: "20380"},
{Name: "DYN_HEALTH_CHECK_ENABLED", Value: "false"},
{Name: commonconsts.DynamoNamespaceEnvVar, Value: "default-test-lws-deploy"},
{Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-lws-deploy"},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1307,6 +1307,12 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
Value: string(dgdr.UID),
},
}
if r.Config.Infrastructure.PrometheusEndpoint != "" {
profilerEnv = append(profilerEnv, corev1.EnvVar{
Name: "PROMETHEUS_ENDPOINT",
Value: r.Config.Infrastructure.PrometheusEndpoint,
})
}

// Build volume mounts
volumeMounts := []corev1.VolumeMount{
Expand Down
4 changes: 4 additions & 0 deletions deploy/operator/internal/dynamo/component_worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,10 @@ func (w *WorkerDefaults) GetBaseContainer(context ComponentContext) (corev1.Cont
Name: "NIXL_TELEMETRY_PROMETHEUS_PORT",
Value: fmt.Sprintf("%d", commonconsts.DynamoNixlPort),
},
{
Name: "DYN_FORWARDPASS_METRIC_PORT",
Value: fmt.Sprintf("%d", commonconsts.DynamoFPMBasePort),
},
}...)

if context.WorkerHashSuffix != "" {
Expand Down
17 changes: 17 additions & 0 deletions deploy/operator/internal/dynamo/graph_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2160,6 +2160,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: "NIXL_TELEMETRY_PROMETHEUS_PORT",
Value: "19090",
},
{
Name: "DYN_FORWARDPASS_METRIC_PORT",
Value: "20380",
},
{
Name: "DYN_PARENT_DGD_K8S_NAME",
Value: "test-dynamo-graph-deployment",
Expand Down Expand Up @@ -2374,6 +2378,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: "NIXL_TELEMETRY_PROMETHEUS_PORT",
Value: "19090",
},
{
Name: "DYN_FORWARDPASS_METRIC_PORT",
Value: "20380",
},
{
Name: "DYN_PARENT_DGD_K8S_NAME",
Value: "test-dynamo-graph-deployment",
Expand Down Expand Up @@ -3187,6 +3195,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: "NIXL_TELEMETRY_PROMETHEUS_PORT",
Value: "19090",
},
{
Name: "DYN_FORWARDPASS_METRIC_PORT",
Value: "20380",
},
{
Name: "DYN_PARENT_DGD_K8S_NAME",
Value: "test-dynamo-graph-deployment",
Expand Down Expand Up @@ -3388,6 +3400,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: "NIXL_TELEMETRY_PROMETHEUS_PORT",
Value: "19090",
},
{
Name: "DYN_FORWARDPASS_METRIC_PORT",
Value: "20380",
},
{
Name: "DYN_PARENT_DGD_K8S_NAME",
Value: "test-dynamo-graph-deployment",
Expand Down Expand Up @@ -5635,6 +5651,7 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) {
{Name: "CONTAINER_NAME", Value: commonconsts.MainContainerName},
{Name: commonconsts.DynamoComponentEnvVar, Value: "worker"},
{Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"},
{Name: "DYN_FORWARDPASS_METRIC_PORT", Value: "20380"},
{Name: "DYN_HEALTH_CHECK_ENABLED", Value: "false"},
{Name: commonconsts.DynamoNamespaceEnvVar, Value: "default-test-deployment"},
{Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-deployment"},
Expand Down
1 change: 1 addition & 0 deletions recipes/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ These recipes are under active development and may require additional setup step
| Model | Framework | Mode | GPUs | Deployment | Notes |
|-------|-----------|------|------|------------|-------|
| **[GLM-5-NVFP4](glm-5-nvfp4/sglang/disagg/)** | SGLang | Disagg Prefill/Decode | 20x GB200 | ✅ | NVFP4, EAGLE speculative decoding, TP16 decode + TP4 prefill. Requires [custom container build](glm-5-nvfp4/). |
| **[Nemotron-3-Nano-Omni-NVFP4](nemotron-3-nano-omni/vllm/agg/)** | vLLM | Aggregated | 1x GPU | ✅ | Multimodal text/image/video/audio serving. Requires [custom container build](nemotron-3-nano-omni/). |
| **[nvidia/Kimi-K2.5-NVFP4](kimi-k2.5/trtllm/agg/nvidia/)** | TensorRT-LLM | Aggregated | 8x B200 | ✅ | Text only — MoE model, TP8×EP8, reasoning + tool calling. Vision input not yet functional. |
| **[DeepSeek-V4-Flash](deepseek-v4/deepseek-v4-flash/vllm/agg/)** | vLLM | Aggregated | 4x B200 | ✅ | Text only — MoE model (284B / 13B active), DP=4 + EP, FP8 KV cache, reasoning + tool calling. Requires [custom container build](deepseek-v4/container/). |
| **[DeepSeek-V4-Flash](deepseek-v4/deepseek-v4-flash/sglang/agg/)** | SGLang | Aggregated | 4x B200 | ✅ | Text only — MoE model (284B / 13B active), TP=4, MXFP4 MoE via FlashInfer, EAGLE MTP (3 steps / 4 draft tokens), reasoning + tool calling. Prebuilt image available; optional [custom container build](deepseek-v4/container/). |
Expand Down
60 changes: 60 additions & 0 deletions recipes/nemotron-3-nano-omni/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# syntax=docker/dockerfile:1.10.0
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Layer the ai-dynamo wheel (and the omni multimodal extras) onto an upstream
# vLLM image. Dynamo is installed from https://pypi.nvidia.com/ai-dynamo/ —
# no source build, no Rust toolchain. Match DYNAMO_VERSION to a nightly that
# targets the same vLLM minor as BASE_IMAGE (see the wheel's METADATA for the
# pinned vllm version).
#
# Build:
# docker build -t <registry>/nemotron-omni-vllm:latest \
# -f recipes/nemotron-3-nano-omni/Dockerfile \
# recipes/nemotron-3-nano-omni
#
# Override defaults with --build-arg, e.g.:
# --build-arg BASE_IMAGE=vllm/vllm-openai:v0.20.0
# --build-arg DYNAMO_VERSION=1.2.0.dev20260427

ARG BASE_IMAGE="vllm/vllm-openai:v0.20.0"
ARG DYNAMO_VERSION="1.2.0.dev20260427"

FROM ${BASE_IMAGE}
USER root

ARG DYNAMO_VERSION

# ai-dynamo Python package (dynamo.frontend, dynamo.vllm, ...) installed with
# --no-deps so that:
# - the base image's vLLM 0.20 / torch / cuda stack is preserved (otherwise
# pip would try to re-install vllm[flashinfer,otel,runai]==0.20.0 which is
# declared by ai-dynamo[vllm]), and
# - we skip `nixl` (KV transport, only used for disagg) and `ray` (multi-node
# orchestration), neither of which an aggregated single-GPU recipe needs.
RUN pip install --no-cache-dir --no-deps \
--extra-index-url https://pypi.nvidia.com \
ai-dynamo==${DYNAMO_VERSION}

# ai-dynamo's declared core dependencies, plus the Rust _core extension wheel
# (`ai-dynamo-runtime`) that the Python package imports natively. `kubernetes`,
# `pydantic`, and `pydantic-settings` are explicitly pinned because the latest
# unpinned versions (kubernetes 35.x, pydantic 2.13.x) violate ai-dynamo's
# declared compatibility ranges.
RUN pip install --no-cache-dir \
--extra-index-url https://pypi.nvidia.com \
ai-dynamo-runtime==${DYNAMO_VERSION} \
"kubernetes<33.0.0,>=32.0.1" \
"pydantic<2.13" "pydantic-settings<2.13.0" \
msgpack msgspec prometheus-client pyzmq transformers

# ai-dynamo[vllm] extras minus nixl + ray (handled above), plus the
# multimodal-video Python deps (av/ftfy/nvtx/sentencepiece) that the Nemotron
# Nano Omni model needs at runtime but which aren't pulled in by any extra.
RUN pip install --no-cache-dir \
blake3 librosa soundfile uvloop \
av ftfy nvtx sentencepiece

# vllm/vllm-openai's default ENTRYPOINT runs `vllm serve`; reset it so the
# image behaves as a plain dynamo runtime image.
ENTRYPOINT ["/bin/bash"]
Loading
Loading