diff --git a/.github/actions/docker-build/action.yml b/.github/actions/docker-build/action.yml index c3637b853bc6..5c3299fc3877 100644 --- a/.github/actions/docker-build/action.yml +++ b/.github/actions/docker-build/action.yml @@ -80,7 +80,9 @@ runs: ECR_HOSTNAME: ${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com run: | sudo apt-get update && sudo apt-get install -y git build-essential protobuf-compiler libclang-dev - curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain stable + curl --retry 5 --retry-delay 2 --fail --show-error -sSL -o /tmp/rustup-init.sh https://sh.rustup.rs + sh /tmp/rustup-init.sh -y --default-toolchain stable + rm -f /tmp/rustup-init.sh . "$HOME/.cargo/env" echo "$HOME/.cargo/bin" >> "$GITHUB_PATH" cargo install cbindgen diff --git a/.github/actions/install-vcluster-cli/action.yml b/.github/actions/install-vcluster-cli/action.yml index 572d979a153a..b3e127b36e24 100644 --- a/.github/actions/install-vcluster-cli/action.yml +++ b/.github/actions/install-vcluster-cli/action.yml @@ -23,10 +23,14 @@ runs: aarch64) VCLUSTER_ARCH="arm64" ;; *) echo "Unsupported architecture: ${ARCH}"; exit 1 ;; esac - curl -sL -o /tmp/vcluster \ + TMP_BIN="$(mktemp -p /tmp vcluster.XXXXXX)" + curl --retry 5 --retry-delay 2 \ + --connect-timeout 10 --max-time 120 \ + --fail --show-error -sL \ + -o "${TMP_BIN}" \ "https://github.com/loft-sh/vcluster/releases/download/${{ inputs.vcluster_version }}/vcluster-linux-${VCLUSTER_ARCH}" - sudo mv /tmp/vcluster /usr/local/bin/vcluster - sudo chmod +x /usr/local/bin/vcluster + sudo install -m 0755 "${TMP_BIN}" /usr/local/bin/vcluster + rm -f "${TMP_BIN}" vcluster version fi echo "::endgroup::" diff --git a/.github/workflows/nightly-ci.yml b/.github/workflows/nightly-ci.yml index d92bbe88d71c..d671a7c98618 100644 --- a/.github/workflows/nightly-ci.yml +++ b/.github/workflows/nightly-ci.yml @@ -258,7 +258,7 @@ jobs: PROTOC_VER="30.2" PROTOC_ZIP="protoc-${PROTOC_VER}-linux-x86_64.zip" 
PROTOC_SHA256="327e9397c6fb3ea2a542513a3221334c6f76f7aa524a7d2561142b67b312a01f" - curl -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}" + curl --retry 5 --retry-delay 2 -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}" echo "${PROTOC_SHA256} ${PROTOC_ZIP}" | sha256sum -c - unzip "${PROTOC_ZIP}" -d $HOME/.local rm "${PROTOC_ZIP}" diff --git a/.github/workflows/pre-merge.yml b/.github/workflows/pre-merge.yml index 3693b06728a2..7e96b753e350 100644 --- a/.github/workflows/pre-merge.yml +++ b/.github/workflows/pre-merge.yml @@ -81,7 +81,7 @@ jobs: PROTOC_VER="30.2" PROTOC_ZIP="protoc-${PROTOC_VER}-linux-x86_64.zip" PROTOC_SHA256="327e9397c6fb3ea2a542513a3221334c6f76f7aa524a7d2561142b67b312a01f" - curl -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}" + curl --retry 5 --retry-delay 2 -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}" echo "${PROTOC_SHA256} ${PROTOC_ZIP}" | sha256sum -c - unzip "${PROTOC_ZIP}" -d $HOME/.local rm "${PROTOC_ZIP}" @@ -140,7 +140,7 @@ jobs: PROTOC_VER="30.2" PROTOC_ZIP="protoc-${PROTOC_VER}-linux-x86_64.zip" PROTOC_SHA256="327e9397c6fb3ea2a542513a3221334c6f76f7aa524a7d2561142b67b312a01f" - curl -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}" + curl --retry 5 --retry-delay 2 -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}" echo "${PROTOC_SHA256} ${PROTOC_ZIP}" | sha256sum -c - unzip "${PROTOC_ZIP}" -d $HOME/.local rm "${PROTOC_ZIP}" diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 24dd4547445f..cc145b4c037b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -176,7 +176,10 @@ jobs: env: CRANE_VERSION: v0.20.2 run: | - curl -sL "https://github.com/google/go-containerregistry/releases/download/${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" \ + curl --retry 5 --retry-delay 2 \ + --connect-timeout 10 --max-time 120 \ + --fail --show-error -sL \ + 
"https://github.com/google/go-containerregistry/releases/download/${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" \ | tar -xzf - crane sudo mv crane /usr/local/bin/ crane version @@ -440,7 +443,10 @@ jobs: env: CRANE_VERSION: v0.20.2 run: | - curl -sL "https://github.com/google/go-containerregistry/releases/download/${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" \ + curl --retry 5 --retry-delay 2 \ + --connect-timeout 10 --max-time 120 \ + --fail --show-error -sL \ + "https://github.com/google/go-containerregistry/releases/download/${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" \ | tar -xzf - crane sudo mv crane /usr/local/bin/ crane version diff --git a/components/src/dynamo/frontend/sglang_processor.py b/components/src/dynamo/frontend/sglang_processor.py index 473cf9ca9ec8..daaff052ad08 100644 --- a/components/src/dynamo/frontend/sglang_processor.py +++ b/components/src/dynamo/frontend/sglang_processor.py @@ -39,6 +39,7 @@ preprocess_chat_request, ) from .utils import ( + FrontendRoundRobinRouter, PreprocessError, extract_mm_urls, handle_engine_error, @@ -624,9 +625,16 @@ async def chat_engine_factory( kv_router_config=self.router_config.kv_router_config, ) else: - router = await generate_endpoint.client( + client = await generate_endpoint.client( router_mode=self.router_config.router_mode ) + if self.router_config.router_mode == RouterMode.RoundRobin: + router = FrontendRoundRobinRouter( + client, + f"{namespace_name}.{component_name}.{endpoint_name}", + ) + else: + router = client preprocess_pool = None preprocess_workers = self.config.preprocess_workers diff --git a/components/src/dynamo/frontend/tests/test_frontend_routing_utils.py b/components/src/dynamo/frontend/tests/test_frontend_routing_utils.py new file mode 100644 index 000000000000..229eed89e567 --- /dev/null +++ b/components/src/dynamo/frontend/tests/test_frontend_routing_utils.py @@ -0,0 +1,93 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & 
class _FakeClient:
    """Scripted stand-in for the runtime client used by the router tests.

    ``instance_sequences`` is a list of membership snapshots. Each call to
    ``instance_ids`` hands out the next snapshot; once they are exhausted the
    last snapshot is returned on every further call.
    """

    def __init__(self, instance_sequences):
        self._instance_sequences = list(instance_sequences)
        self._instance_idx = 0
        # Every ``direct`` invocation is recorded as (request, instance_id, annotated).
        self.direct_calls = []

    def instance_ids(self):
        """Return the next scripted membership snapshot (sticky on the last one)."""
        position = self._instance_idx
        if position >= len(self._instance_sequences):
            return self._instance_sequences[-1]
        self._instance_idx = position + 1
        return self._instance_sequences[position]

    async def wait_for_instances(self):
        """Pretend that instances 9 and 10 showed up after waiting."""
        return [9, 10]

    async def direct(self, request, instance_id, annotated=True):
        """Record the call and echo its arguments back as a dict."""
        call_record = (request, instance_id, annotated)
        self.direct_calls.append(call_record)
        echoed = dict(
            instance=str(instance_id),
            request=request,
            annotated=annotated,
        )
        return echoed
class FrontendRoundRobinRouter:
    """Frontend-managed round-robin over the current runtime client membership.

    This avoids sticky routing behavior in the opaque runtime round-robin client by
    selecting an instance in Python and sending the request via ``Client.direct``.

    Membership is re-read from the client on every request and sorted, and a
    monotonically increasing cursor (modulo the current instance count) picks
    the target instance.

    Setting the ``DYN_FRONTEND_ROUTING_DEBUG`` environment variable to one of
    ``1``/``true``/``yes``/``on`` (case-insensitive) logs every routing decision.
    """

    _TRUTHY_ENV_VALUES = frozenset({"1", "true", "yes", "on"})

    def __init__(self, client: Any, endpoint_name: str):
        """Wrap *client*; *endpoint_name* is only used in log and error messages."""
        self._client = client
        self._endpoint_name = endpoint_name
        self._cursor = 0
        self._lock = asyncio.Lock()
        self._debug = (
            os.getenv("DYN_FRONTEND_ROUTING_DEBUG", "").lower()
            in self._TRUTHY_ENV_VALUES
        )

    async def generate(self, request: dict[str, Any], **kwargs: Any):
        """Send *request* to the next instance in round-robin order.

        Args:
            request: Payload forwarded unchanged to ``Client.direct``.
            **kwargs: Only ``annotated`` is accepted. It is forwarded to
                ``Client.direct`` only when the caller supplied it, so that an
                omitted argument leaves the client's own default in effect
                instead of being overridden with ``None``.

        Returns:
            Whatever ``Client.direct`` returns for the selected instance.

        Raises:
            TypeError: If any keyword other than ``annotated`` is passed.
            RuntimeError: If the client reports no instances, even after
                ``wait_for_instances``.
        """
        direct_kwargs: dict[str, Any] = {}
        if "annotated" in kwargs:
            direct_kwargs["annotated"] = kwargs.pop("annotated")
        if kwargs:
            raise TypeError(
                f"Unsupported kwargs for frontend round-robin router: {sorted(kwargs)}"
            )

        # Prefer the client's current view; only block on discovery when it is empty.
        instance_ids = list(self._client.instance_ids())
        if not instance_ids:
            instance_ids = list(await self._client.wait_for_instances())
        if not instance_ids:
            raise RuntimeError(
                f"No active backend instances available for {self._endpoint_name}"
            )

        # Sort so the rotation order does not depend on the order the client reports.
        instance_ids.sort()
        async with self._lock:
            instance_id = instance_ids[self._cursor % len(instance_ids)]
            self._cursor += 1

        if self._debug:
            logger.info(
                "Frontend routing selected endpoint=%s instance=%s instances=%s annotated=%s",
                self._endpoint_name,
                instance_id,
                instance_ids,
                direct_kwargs.get("annotated"),
            )

        return await self._client.direct(
            request,
            instance_id=instance_id,
            **direct_kwargs,
        )
_MEDIA_CONTENT_TYPES = ("image_url", "audio_url", "video_url") diff --git a/components/src/dynamo/frontend/vllm_processor.py b/components/src/dynamo/frontend/vllm_processor.py index 8962bfd02ec5..81c368ba09bc 100644 --- a/components/src/dynamo/frontend/vllm_processor.py +++ b/components/src/dynamo/frontend/vllm_processor.py @@ -46,6 +46,7 @@ from .prepost import StreamingPostProcessor, preprocess_chat_request from .utils import ( + FrontendRoundRobinRouter, extract_mm_urls, handle_engine_error, make_internal_error, @@ -800,9 +801,16 @@ async def chat_engine_factory( kv_router_config=self.router_config.kv_router_config, ) else: - router = await generate_endpoint.client( + client = await generate_endpoint.client( router_mode=self.router_config.router_mode ) + if self.router_config.router_mode == RouterMode.RoundRobin: + router = FrontendRoundRobinRouter( + client, + f"{namespace_name}.{component_name}.{endpoint_name}", + ) + else: + router = client block_size = self.config.kv_cache_block_size or 16 diff --git a/components/src/dynamo/planner/config/planner_config.py b/components/src/dynamo/planner/config/planner_config.py index 93e074bcb6fd..307757381d45 100644 --- a/components/src/dynamo/planner/config/planner_config.py +++ b/components/src/dynamo/planner/config/planner_config.py @@ -52,7 +52,8 @@ class PlannerConfig(BaseModel): "kubernetes", "virtual", "global-planner" ] = SLAPlannerDefaults.environment namespace: str = Field( - default_factory=lambda: os.environ.get("DYN_NAMESPACE", "dynamo") + default_factory=lambda: os.environ.get("DYN_NAMESPACE", "dynamo"), + exclude=True, ) backend: Literal["vllm", "sglang", "trtllm", "mocker"] = SLAPlannerDefaults.backend mode: Literal["disagg", "prefill", "decode", "agg"] = SLAPlannerDefaults.mode @@ -109,7 +110,8 @@ class PlannerConfig(BaseModel): default_factory=lambda: os.environ.get( "PROMETHEUS_ENDPOINT", "http://prometheus-kube-prometheus-prometheus.monitoring.svc.cluster.local:9090", - ) + ), + exclude=True, ) 
metric_reporting_prometheus_port: int = Field( default_factory=lambda: int(os.environ.get("PLANNER_PROMETHEUS_PORT", 0)) diff --git a/deploy/operator/internal/consts/consts.go b/deploy/operator/internal/consts/consts.go index 2d8b67ca533b..a1b9222bb98c 100644 --- a/deploy/operator/internal/consts/consts.go +++ b/deploy/operator/internal/consts/consts.go @@ -27,6 +27,8 @@ const ( DynamoNixlPort = 19090 DynamoNixlPortName = "nixl" + DynamoFPMBasePort = 20380 + MpiRunSshPort = 2222 // Default security context values diff --git a/deploy/operator/internal/controller/dynamocomponentdeployment_controller_test.go b/deploy/operator/internal/controller/dynamocomponentdeployment_controller_test.go index 9216c3c453da..4aaf920cf286 100644 --- a/deploy/operator/internal/controller/dynamocomponentdeployment_controller_test.go +++ b/deploy/operator/internal/controller/dynamocomponentdeployment_controller_test.go @@ -773,6 +773,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. {Name: "CONTAINER_NAME", Value: commonconsts.MainContainerName}, {Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypeWorker}, {Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"}, + {Name: "DYN_FORWARDPASS_METRIC_PORT", Value: "20380"}, {Name: "DYN_HEALTH_CHECK_ENABLED", Value: "false"}, {Name: commonconsts.DynamoNamespaceEnvVar, Value: "default-test-lws-deploy"}, {Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-lws-deploy"}, @@ -916,6 +917,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. 
{Name: "CONTAINER_NAME", Value: commonconsts.MainContainerName}, {Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypeWorker}, {Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"}, + {Name: "DYN_FORWARDPASS_METRIC_PORT", Value: "20380"}, {Name: "DYN_HEALTH_CHECK_ENABLED", Value: "false"}, {Name: commonconsts.DynamoNamespaceEnvVar, Value: "default-test-lws-deploy"}, {Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-lws-deploy"}, diff --git a/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go b/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go index 59c0a8982b4a..eb70054264a0 100644 --- a/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go +++ b/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go @@ -1307,6 +1307,12 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context. Value: string(dgdr.UID), }, } + if r.Config.Infrastructure.PrometheusEndpoint != "" { + profilerEnv = append(profilerEnv, corev1.EnvVar{ + Name: "PROMETHEUS_ENDPOINT", + Value: r.Config.Infrastructure.PrometheusEndpoint, + }) + } // Build volume mounts volumeMounts := []corev1.VolumeMount{ diff --git a/deploy/operator/internal/dynamo/component_worker.go b/deploy/operator/internal/dynamo/component_worker.go index 38d9d0c45503..be19b61c3863 100644 --- a/deploy/operator/internal/dynamo/component_worker.go +++ b/deploy/operator/internal/dynamo/component_worker.go @@ -107,6 +107,10 @@ func (w *WorkerDefaults) GetBaseContainer(context ComponentContext) (corev1.Cont Name: "NIXL_TELEMETRY_PROMETHEUS_PORT", Value: fmt.Sprintf("%d", commonconsts.DynamoNixlPort), }, + { + Name: "DYN_FORWARDPASS_METRIC_PORT", + Value: fmt.Sprintf("%d", commonconsts.DynamoFPMBasePort), + }, }...) 
if context.WorkerHashSuffix != "" { diff --git a/deploy/operator/internal/dynamo/graph_test.go b/deploy/operator/internal/dynamo/graph_test.go index a6ab18bfc825..f5cab1e18a0d 100644 --- a/deploy/operator/internal/dynamo/graph_test.go +++ b/deploy/operator/internal/dynamo/graph_test.go @@ -2160,6 +2160,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { Name: "NIXL_TELEMETRY_PROMETHEUS_PORT", Value: "19090", }, + { + Name: "DYN_FORWARDPASS_METRIC_PORT", + Value: "20380", + }, { Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-dynamo-graph-deployment", @@ -2374,6 +2378,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { Name: "NIXL_TELEMETRY_PROMETHEUS_PORT", Value: "19090", }, + { + Name: "DYN_FORWARDPASS_METRIC_PORT", + Value: "20380", + }, { Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-dynamo-graph-deployment", @@ -3187,6 +3195,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { Name: "NIXL_TELEMETRY_PROMETHEUS_PORT", Value: "19090", }, + { + Name: "DYN_FORWARDPASS_METRIC_PORT", + Value: "20380", + }, { Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-dynamo-graph-deployment", @@ -3388,6 +3400,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { Name: "NIXL_TELEMETRY_PROMETHEUS_PORT", Value: "19090", }, + { + Name: "DYN_FORWARDPASS_METRIC_PORT", + Value: "20380", + }, { Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-dynamo-graph-deployment", @@ -5635,6 +5651,7 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) { {Name: "CONTAINER_NAME", Value: commonconsts.MainContainerName}, {Name: commonconsts.DynamoComponentEnvVar, Value: "worker"}, {Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"}, + {Name: "DYN_FORWARDPASS_METRIC_PORT", Value: "20380"}, {Name: "DYN_HEALTH_CHECK_ENABLED", Value: "false"}, {Name: commonconsts.DynamoNamespaceEnvVar, Value: "default-test-deployment"}, {Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-deployment"}, diff --git a/recipes/README.md b/recipes/README.md index 0783a8c87377..500aad85fa01 100644 --- 
a/recipes/README.md +++ b/recipes/README.md @@ -68,6 +68,7 @@ These recipes are under active development and may require additional setup step | Model | Framework | Mode | GPUs | Deployment | Notes | |-------|-----------|------|------|------------|-------| | **[GLM-5-NVFP4](glm-5-nvfp4/sglang/disagg/)** | SGLang | Disagg Prefill/Decode | 20x GB200 | ✅ | NVFP4, EAGLE speculative decoding, TP16 decode + TP4 prefill. Requires [custom container build](glm-5-nvfp4/). | +| **[Nemotron-3-Nano-Omni-NVFP4](nemotron-3-nano-omni/vllm/agg/)** | vLLM | Aggregated | 1x GPU | ✅ | Multimodal text/image/video/audio serving. Requires [custom container build](nemotron-3-nano-omni/). | | **[nvidia/Kimi-K2.5-NVFP4](kimi-k2.5/trtllm/agg/nvidia/)** | TensorRT-LLM | Aggregated | 8x B200 | ✅ | Text only — MoE model, TP8×EP8, reasoning + tool calling. Vision input not yet functional. | | **[DeepSeek-V4-Flash](deepseek-v4/deepseek-v4-flash/vllm/agg/)** | vLLM | Aggregated | 4x B200 | ✅ | Text only — MoE model (284B / 13B active), DP=4 + EP, FP8 KV cache, reasoning + tool calling. Requires [custom container build](deepseek-v4/container/). | | **[DeepSeek-V4-Flash](deepseek-v4/deepseek-v4-flash/sglang/agg/)** | SGLang | Aggregated | 4x B200 | ✅ | Text only — MoE model (284B / 13B active), TP=4, MXFP4 MoE via FlashInfer, EAGLE MTP (3 steps / 4 draft tokens), reasoning + tool calling. Prebuilt image available; optional [custom container build](deepseek-v4/container/). | diff --git a/recipes/nemotron-3-nano-omni/Dockerfile b/recipes/nemotron-3-nano-omni/Dockerfile new file mode 100644 index 000000000000..1d13afe628ff --- /dev/null +++ b/recipes/nemotron-3-nano-omni/Dockerfile @@ -0,0 +1,60 @@ +# syntax=docker/dockerfile:1.10.0 +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Layer the ai-dynamo wheel (and the omni multimodal extras) onto an upstream +# vLLM image. 
Dynamo is installed from https://pypi.nvidia.com/ai-dynamo/ — +# no source build, no Rust toolchain. Match DYNAMO_VERSION to a nightly that +# targets the same vLLM minor as BASE_IMAGE (see the wheel's METADATA for the +# pinned vllm version). +# +# Build: +# docker build -t /nemotron-omni-vllm:latest \ +# -f recipes/nemotron-3-nano-omni/Dockerfile \ +# recipes/nemotron-3-nano-omni +# +# Override defaults with --build-arg, e.g.: +# --build-arg BASE_IMAGE=vllm/vllm-openai:v0.20.0 +# --build-arg DYNAMO_VERSION=1.2.0.dev20260427 + +ARG BASE_IMAGE="vllm/vllm-openai:v0.20.0" +ARG DYNAMO_VERSION="1.2.0.dev20260427" + +FROM ${BASE_IMAGE} +USER root + +ARG DYNAMO_VERSION + +# ai-dynamo Python package (dynamo.frontend, dynamo.vllm, ...) installed with +# --no-deps so that: +# - the base image's vLLM 0.20 / torch / cuda stack is preserved (otherwise +# pip would try to re-install vllm[flashinfer,otel,runai]==0.20.0 which is +# declared by ai-dynamo[vllm]), and +# - we skip `nixl` (KV transport, only used for disagg) and `ray` (multi-node +# orchestration), neither of which an aggregated single-GPU recipe needs. +RUN pip install --no-cache-dir --no-deps \ + --extra-index-url https://pypi.nvidia.com \ + ai-dynamo==${DYNAMO_VERSION} + +# ai-dynamo's declared core dependencies, plus the Rust _core extension wheel +# (`ai-dynamo-runtime`) that the Python package imports natively. `kubernetes`, +# `pydantic`, and `pydantic-settings` are explicitly pinned because the latest +# unpinned versions (kubernetes 35.x, pydantic 2.13.x) violate ai-dynamo's +# declared compatibility ranges. 
+RUN pip install --no-cache-dir \ + --extra-index-url https://pypi.nvidia.com \ + ai-dynamo-runtime==${DYNAMO_VERSION} \ + "kubernetes<33.0.0,>=32.0.1" \ + "pydantic<2.13" "pydantic-settings<2.13.0" \ + msgpack msgspec prometheus-client pyzmq transformers + +# ai-dynamo[vllm] extras minus nixl + ray (handled above), plus the +# multimodal-video Python deps (av/ftfy/nvtx/sentencepiece) that the Nemotron +# Nano Omni model needs at runtime but which aren't pulled in by any extra. +RUN pip install --no-cache-dir \ + blake3 librosa soundfile uvloop \ + av ftfy nvtx sentencepiece + +# vllm/vllm-openai's default ENTRYPOINT runs `vllm serve`; reset it so the +# image behaves as a plain dynamo runtime image. +ENTRYPOINT ["/bin/bash"] diff --git a/recipes/nemotron-3-nano-omni/README.md b/recipes/nemotron-3-nano-omni/README.md new file mode 100644 index 000000000000..e4cbfe6031a1 --- /dev/null +++ b/recipes/nemotron-3-nano-omni/README.md @@ -0,0 +1,185 @@ + + +# Nemotron 3 Nano Omni NVFP4 + +Serves [nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4](https://huggingface.co/nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4) +using vLLM with an aggregated Dynamo deployment. + +This recipe builds a custom container that layers the `ai-dynamo` wheel +(from ) onto an upstream vLLM image — no +source build, no Rust toolchain. 
+ +## Topology + +| Role | Replicas | GPUs/replica | Notes | +|------|----------|--------------|-------| +| Frontend | 1 | 0 | Dynamo frontend with prefix-hash KV routing | +| vLLM worker | 1 | 1 | Text, image, video, and audio inputs | + +## Prerequisites + +- A Kubernetes cluster with the [Dynamo Operator](../../docs/kubernetes/README.md) installed +- One NVIDIA GPU per worker replica +- Shared PVC storage for the Hugging Face model cache +- Hugging Face access to `nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4` + +## Step 1: Build the Container + +```bash +docker build \ + -t <your-registry>/nemotron-omni-vllm:latest \ + -f recipes/nemotron-3-nano-omni/Dockerfile \ + recipes/nemotron-3-nano-omni +docker push <your-registry>/nemotron-omni-vllm:latest +``` + +Useful build args: + +- `BASE_IMAGE=<image>` — pin to a different vLLM base (default `vllm/vllm-openai:v0.20.0`). +- `DYNAMO_VERSION=<version>` — pin to a specific `ai-dynamo` release or nightly from <https://pypi.nvidia.com/ai-dynamo/>. Default tracks the latest tested nightly. Make sure the chosen wheel's `vllm` dependency matches `BASE_IMAGE`. + +## Step 2: Download the Model + +Create the PVC, Hugging Face token secret, and download the model weights: + +```bash +export NAMESPACE=<your-namespace> + +# Create the namespace if it does not already exist. +kubectl create namespace ${NAMESPACE} --dry-run=client -o yaml | kubectl apply -f - + +# First edit storageClassName in model-cache.yaml for your cluster. 
+kubectl apply -f recipes/nemotron-3-nano-omni/model-cache/model-cache.yaml -n ${NAMESPACE} + +kubectl create secret generic hf-token-secret \ + --from-literal=HF_TOKEN=<your-hf-token> \ + -n ${NAMESPACE} + +kubectl apply -f recipes/nemotron-3-nano-omni/model-cache/model-download.yaml -n ${NAMESPACE} +kubectl wait --for=condition=complete job/model-download -n ${NAMESPACE} --timeout=3600s +``` + +## Step 3: Deploy + +Edit `vllm/agg/deploy.yaml` and replace all `<your-registry>` values: + +- `<your-registry>/nemotron-omni-vllm:latest` - your built container image + +If your registry is private, add the appropriate `imagePullSecrets` to the +deployment. + +```bash +kubectl apply -f recipes/nemotron-3-nano-omni/vllm/agg/deploy.yaml -n ${NAMESPACE} +``` + +Monitor startup: + +```bash +kubectl get pods -n ${NAMESPACE} -l nvidia.com/dynamo-graph-deployment-name=nemotron-omni-vllm-agg -w +``` + +## Step 4: Test + +```bash +kubectl port-forward svc/nemotron-omni-vllm-agg-frontend 8000:8000 -n ${NAMESPACE} +``` + +In another terminal, send a minimal text request: + +```bash +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4", + "messages": [{"role": "user", "content": "Hello!"}], + "max_tokens": 128 + }' +``` + +To exercise the multimodal path, attach an image: + +```bash +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4", + "messages": [{ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png"}}, + {"type": "text", "text": "Describe what is in this image."} + ] + }], + "max_tokens": 256 + }' +``` + +…or an audio clip: + +```bash +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": 
"nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4", + "messages": [{ + "role": "user", + "content": [ + {"type": "audio_url", "audio_url": {"url": "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav"}}, + {"type": "text", "text": "Transcribe this audio clip."} + ] + }], + "max_tokens": 256 + }' +``` + +## Key Configuration Notes + +- `--enable-multimodal` enables image, video, and audio inputs. +- `--media-io-kwargs '{"video": {"num_frames": 512, "fps": 1}}'` samples long + videos at one frame per second, capped at 512 frames. +- `--dyn-tool-call-parser nemotron_nano` and + `--dyn-reasoning-parser nemotron_nano` enable Nemotron Nano tool-call and + reasoning parsing. +- The frontend uses `--router-mode kv --no-kv-events`, which approximates + KV-aware routing with prefix hashing without requiring backend KV events. + +## Optional: Run without NATS + +The Dynamo runtime defaults to NATS for the event plane and connects to a +NATS server if `NATS_SERVER` is set in the environment (the operator +auto-injects this on most clusters). On clusters without NATS — or where +you'd rather avoid the dependency — you can run on TCP request plane + ZMQ +event plane only. Add to both Frontend and VllmWorker: + +```yaml +mainContainer: + env: + - name: DYN_EVENT_PLANE + value: zmq + command: ["/bin/bash", "-lc"] + args: + # Operator-injected NATS_SERVER takes effect even when set to ""; we have + # to actually unset it before the runtime reads env. + - >- + unset NATS_SERVER && + exec python3 -m dynamo.frontend ... # or dynamo.vllm +``` + +The request plane defaults to TCP already, so no further flags are needed. 
+ +## File Layout + +```text +recipes/nemotron-3-nano-omni/ + README.md + Dockerfile + model-cache/ + model-cache.yaml + model-download.yaml + vllm/ + agg/ + deploy.yaml +``` diff --git a/recipes/nemotron-3-nano-omni/model-cache/model-cache.yaml b/recipes/nemotron-3-nano-omni/model-cache/model-cache.yaml new file mode 100644 index 000000000000..5d6e2b6e998b --- /dev/null +++ b/recipes/nemotron-3-nano-omni/model-cache/model-cache.yaml @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: model-cache +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 250Gi + storageClassName: "your-storage-class-name" diff --git a/recipes/nemotron-3-nano-omni/model-cache/model-download.yaml b/recipes/nemotron-3-nano-omni/model-cache/model-download.yaml new file mode 100644 index 000000000000..6e34cf65512b --- /dev/null +++ b/recipes/nemotron-3-nano-omni/model-cache/model-download.yaml @@ -0,0 +1,48 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +apiVersion: batch/v1 +kind: Job +metadata: + name: model-download +spec: + backoffLimit: 3 + completions: 1 + parallelism: 1 + template: + metadata: + labels: + app: model-download + spec: + restartPolicy: Never + containers: + - name: model-download + image: python:3.10-slim + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + seccompProfile: + type: RuntimeDefault + command: ["sh", "-c"] + envFrom: + - secretRef: + name: hf-token-secret + env: + - name: MODEL_NAME + value: nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4 + - name: HF_HOME + value: /model-store + - name: HF_XET_HIGH_PERFORMANCE + value: "1" + args: + - | + set -eux + pip install --no-cache-dir huggingface_hub==1.11.0 + hf download "$MODEL_NAME" + volumeMounts: + - name: model-cache + mountPath: /model-store + volumes: + - name: model-cache + persistentVolumeClaim: + claimName: model-cache diff --git a/recipes/nemotron-3-nano-omni/vllm/agg/deploy.yaml b/recipes/nemotron-3-nano-omni/vllm/agg/deploy.yaml new file mode 100644 index 000000000000..1c03c392f811 --- /dev/null +++ b/recipes/nemotron-3-nano-omni/vllm/agg/deploy.yaml @@ -0,0 +1,102 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# Nemotron Nano Omni aggregated vLLM deployment. 
+# +# Prerequisites: +# - Dynamo Platform installed +# - Model weights downloaded into the model-cache PVC +# - Container built from recipes/nemotron-3-nano-omni/Dockerfile +# - HF_TOKEN secret created: +# kubectl create secret generic hf-token-secret \ +# --from-literal=HF_TOKEN= -n +# +# Replace image references before applying: +# /nemotron-omni-vllm:latest +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: nemotron-omni-vllm-agg +spec: + backendFramework: vllm + pvcs: + - name: model-cache + create: false + services: + Frontend: + componentType: frontend + envFromSecret: hf-token-secret + replicas: 1 + volumeMounts: + - name: model-cache + mountPoint: /model-store + extraPodSpec: + mainContainer: + image: /nemotron-omni-vllm:latest + imagePullPolicy: IfNotPresent + startupProbe: + httpGet: + path: /health + port: 8000 + periodSeconds: 10 + timeoutSeconds: 1800 + failureThreshold: 60 + env: + - name: HF_HOME + value: /model-store + command: + - /bin/bash + - -lc + args: + - >- + exec python3 -m dynamo.frontend + --router-mode kv + --no-kv-events + --http-port 8000 + + VllmWorker: + componentType: worker + envFromSecret: hf-token-secret + replicas: 1 + resources: + limits: + gpu: "1" + requests: + gpu: "1" + volumeMounts: + - name: model-cache + mountPoint: /model-store + sharedMemory: + size: 16Gi + extraPodSpec: + mainContainer: + image: /nemotron-omni-vllm:latest + imagePullPolicy: IfNotPresent + startupProbe: + httpGet: + path: /health + port: 9090 + periodSeconds: 10 + timeoutSeconds: 10 + failureThreshold: 120 + env: + - name: HF_HOME + value: /model-store + # Match the --media-io-kwargs num_frames so dynamo's multimodal + # preprocessor and vLLM agree on the video frame ceiling. 
+ - name: DYN_MM_VIDEO_NUM_FRAMES + value: "512" + command: + - /bin/bash + - -lc + args: + - >- + exec python3 -m dynamo.vllm + --model nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4 + --served-model-name nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4 + --max-model-len 131072 + --enable-multimodal + --media-io-kwargs '{"video": {"num_frames": 512, "fps": 1}}' + --trust-remote-code + --video-pruning-rate 0.5 + --dyn-tool-call-parser nemotron_nano + --dyn-reasoning-parser nemotron_nano