diff --git a/.github/actions/docker-build/action.yml b/.github/actions/docker-build/action.yml index c3637b853bc6..5c3299fc3877 100644 --- a/.github/actions/docker-build/action.yml +++ b/.github/actions/docker-build/action.yml @@ -80,7 +80,9 @@ runs: ECR_HOSTNAME: ${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com run: | sudo apt-get update && sudo apt-get install -y git build-essential protobuf-compiler libclang-dev - curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain stable + curl --retry 5 --retry-delay 2 --fail --show-error -sSL -o /tmp/rustup-init.sh https://sh.rustup.rs + sh /tmp/rustup-init.sh -y --default-toolchain stable + rm -f /tmp/rustup-init.sh . "$HOME/.cargo/env" echo "$HOME/.cargo/bin" >> "$GITHUB_PATH" cargo install cbindgen diff --git a/.github/actions/install-vcluster-cli/action.yml b/.github/actions/install-vcluster-cli/action.yml index 572d979a153a..b3e127b36e24 100644 --- a/.github/actions/install-vcluster-cli/action.yml +++ b/.github/actions/install-vcluster-cli/action.yml @@ -23,10 +23,14 @@ runs: aarch64) VCLUSTER_ARCH="arm64" ;; *) echo "Unsupported architecture: ${ARCH}"; exit 1 ;; esac - curl -sL -o /tmp/vcluster \ + TMP_BIN="$(mktemp -p /tmp vcluster.XXXXXX)" + curl --retry 5 --retry-delay 2 \ + --connect-timeout 10 --max-time 120 \ + --fail --show-error -sL \ + -o "${TMP_BIN}" \ "https://github.com/loft-sh/vcluster/releases/download/${{ inputs.vcluster_version }}/vcluster-linux-${VCLUSTER_ARCH}" - sudo mv /tmp/vcluster /usr/local/bin/vcluster - sudo chmod +x /usr/local/bin/vcluster + sudo install -m 0755 "${TMP_BIN}" /usr/local/bin/vcluster + rm -f "${TMP_BIN}" vcluster version fi echo "::endgroup::" diff --git a/.github/workflows/nightly-ci.yml b/.github/workflows/nightly-ci.yml index d92bbe88d71c..d671a7c98618 100644 --- a/.github/workflows/nightly-ci.yml +++ b/.github/workflows/nightly-ci.yml @@ -258,7 +258,7 @@ jobs: PROTOC_VER="30.2" PROTOC_ZIP="protoc-${PROTOC_VER}-linux-x86_64.zip" 
PROTOC_SHA256="327e9397c6fb3ea2a542513a3221334c6f76f7aa524a7d2561142b67b312a01f" - curl -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}" + curl --retry 5 --retry-delay 2 -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}" echo "${PROTOC_SHA256} ${PROTOC_ZIP}" | sha256sum -c - unzip "${PROTOC_ZIP}" -d $HOME/.local rm "${PROTOC_ZIP}" diff --git a/.github/workflows/pre-merge.yml b/.github/workflows/pre-merge.yml index 3693b06728a2..7e96b753e350 100644 --- a/.github/workflows/pre-merge.yml +++ b/.github/workflows/pre-merge.yml @@ -81,7 +81,7 @@ jobs: PROTOC_VER="30.2" PROTOC_ZIP="protoc-${PROTOC_VER}-linux-x86_64.zip" PROTOC_SHA256="327e9397c6fb3ea2a542513a3221334c6f76f7aa524a7d2561142b67b312a01f" - curl -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}" + curl --retry 5 --retry-delay 2 -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}" echo "${PROTOC_SHA256} ${PROTOC_ZIP}" | sha256sum -c - unzip "${PROTOC_ZIP}" -d $HOME/.local rm "${PROTOC_ZIP}" @@ -140,7 +140,7 @@ jobs: PROTOC_VER="30.2" PROTOC_ZIP="protoc-${PROTOC_VER}-linux-x86_64.zip" PROTOC_SHA256="327e9397c6fb3ea2a542513a3221334c6f76f7aa524a7d2561142b67b312a01f" - curl -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}" + curl --retry 5 --retry-delay 2 -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}" echo "${PROTOC_SHA256} ${PROTOC_ZIP}" | sha256sum -c - unzip "${PROTOC_ZIP}" -d $HOME/.local rm "${PROTOC_ZIP}" diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 24dd4547445f..cc145b4c037b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -176,7 +176,10 @@ jobs: env: CRANE_VERSION: v0.20.2 run: | - curl -sL "https://github.com/google/go-containerregistry/releases/download/${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" \ + curl --retry 5 --retry-delay 2 \ + --connect-timeout 10 --max-time 120 \ + --fail --show-error -sL \ + 
"https://github.com/google/go-containerregistry/releases/download/${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" \ | tar -xzf - crane sudo mv crane /usr/local/bin/ crane version @@ -440,7 +443,10 @@ jobs: env: CRANE_VERSION: v0.20.2 run: | - curl -sL "https://github.com/google/go-containerregistry/releases/download/${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" \ + curl --retry 5 --retry-delay 2 \ + --connect-timeout 10 --max-time 120 \ + --fail --show-error -sL \ + "https://github.com/google/go-containerregistry/releases/download/${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" \ | tar -xzf - crane sudo mv crane /usr/local/bin/ crane version diff --git a/components/src/dynamo/planner/config/planner_config.py b/components/src/dynamo/planner/config/planner_config.py index 93e074bcb6fd..307757381d45 100644 --- a/components/src/dynamo/planner/config/planner_config.py +++ b/components/src/dynamo/planner/config/planner_config.py @@ -52,7 +52,8 @@ class PlannerConfig(BaseModel): "kubernetes", "virtual", "global-planner" ] = SLAPlannerDefaults.environment namespace: str = Field( - default_factory=lambda: os.environ.get("DYN_NAMESPACE", "dynamo") + default_factory=lambda: os.environ.get("DYN_NAMESPACE", "dynamo"), + exclude=True, ) backend: Literal["vllm", "sglang", "trtllm", "mocker"] = SLAPlannerDefaults.backend mode: Literal["disagg", "prefill", "decode", "agg"] = SLAPlannerDefaults.mode @@ -109,7 +110,8 @@ class PlannerConfig(BaseModel): default_factory=lambda: os.environ.get( "PROMETHEUS_ENDPOINT", "http://prometheus-kube-prometheus-prometheus.monitoring.svc.cluster.local:9090", - ) + ), + exclude=True, ) metric_reporting_prometheus_port: int = Field( default_factory=lambda: int(os.environ.get("PLANNER_PROMETHEUS_PORT", 0)) diff --git a/components/src/dynamo/sglang/request_handlers/handler_base.py b/components/src/dynamo/sglang/request_handlers/handler_base.py index 8276113e0fc1..56f4c50ebfa7 100644 --- 
a/components/src/dynamo/sglang/request_handlers/handler_base.py +++ b/components/src/dynamo/sglang/request_handlers/handler_base.py @@ -24,6 +24,10 @@ ) import sglang as sgl +from sglang.srt.managers.io_struct import ( + DestroyWeightsUpdateGroupReqInput, + InitWeightsUpdateGroupReqInput, +) from dynamo._core import Context from dynamo.common.utils.input_params import InputParamManager @@ -733,6 +737,12 @@ def _priority_kwargs(self, priority: Any) -> Dict[str, Any]: return {"priority": normalized} return {} + def _weight_update_unsupported_response(self) -> dict: + return { + "success": False, + "message": "weight update control not supported on this worker", + } + async def release_memory_occupation(self, body: dict) -> dict: """Release GPU memory occupation and unregister from discovery. @@ -857,6 +867,30 @@ async def update_weights_from_disk(self, body: dict) -> dict: "num_paused_requests": num_paused_requests, } + async def init_weights_update_group(self, body: dict) -> dict: + """Initialize distributed weight-update NCCL group on the worker.""" + if self.engine is None: + return self._weight_update_unsupported_response() + + req = InitWeightsUpdateGroupReqInput(**body) + ( + success, + message, + ) = await self.engine.tokenizer_manager.init_weights_update_group(req, None) + return {"success": success, "message": message} + + async def destroy_weights_update_group(self, body: dict) -> dict: + """Destroy distributed weight-update NCCL group on the worker.""" + if self.engine is None: + return self._weight_update_unsupported_response() + + req = DestroyWeightsUpdateGroupReqInput(**body) + ( + success, + message, + ) = await self.engine.tokenizer_manager.destroy_weights_update_group(req, None) + return {"success": success, "message": message} + async def update_weights_from_tensor(self, body: dict) -> dict: """Update model weights from tensors without restarting the server.""" from sglang.srt.managers.io_struct import UpdateWeightsFromTensorReqInput @@ -980,6 
+1014,15 @@ async def session_control(self, request, context=None): result = {"status": "error", "message": f"Unknown action: {action}"} yield result + async def get_weight_version(self, body: dict) -> dict: + """Return the active weight version currently served by the worker.""" + _ = body + if self.engine is None: + return self._weight_update_unsupported_response() + return { + "weight_version": self.engine.tokenizer_manager.server_args.weight_version + } + def register_engine_routes(self, runtime: DistributedRuntime) -> None: """Register all engine routes for this handler. @@ -994,6 +1037,12 @@ def register_engine_routes(self, runtime: DistributedRuntime) -> None: runtime.register_engine_route( "resume_memory_occupation", self.resume_memory_occupation ) + runtime.register_engine_route( + "init_weights_update_group", self.init_weights_update_group + ) + runtime.register_engine_route( + "destroy_weights_update_group", self.destroy_weights_update_group + ) runtime.register_engine_route( "update_weights_from_disk", self.update_weights_from_disk ) @@ -1009,6 +1058,7 @@ def register_engine_routes(self, runtime: DistributedRuntime) -> None: runtime.register_engine_route( "update_weight_version", self.update_weight_version ) + runtime.register_engine_route("get_weight_version", self.get_weight_version) if getattr(self.config, "dynamo_args", None) and getattr( self.config.dynamo_args, "enable_rl", False ): diff --git a/components/src/dynamo/sglang/tests/test_sglang_handler_base.py b/components/src/dynamo/sglang/tests/test_sglang_handler_base.py new file mode 100644 index 000000000000..2b62698284b7 --- /dev/null +++ b/components/src/dynamo/sglang/tests/test_sglang_handler_base.py @@ -0,0 +1,72 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import types + +import pytest + +from dynamo.sglang.request_handlers.handler_base import BaseWorkerHandler + +pytestmark = [ + pytest.mark.unit, + pytest.mark.sglang, + pytest.mark.gpu_0, + pytest.mark.pre_merge, +] + + +class _DummyRuntime: + def __init__(self): + self.routes = {} + + def register_engine_route(self, name, handler): + self.routes[name] = handler + + +class _DummyWorkerHandler(BaseWorkerHandler): + async def generate(self, request, context): + if False: + yield request, context + + +def test_register_engine_routes_includes_weight_update_routes(): + handler = _DummyWorkerHandler.__new__(_DummyWorkerHandler) + runtime = _DummyRuntime() + + handler.register_engine_routes(runtime) + + assert "init_weights_update_group" in runtime.routes + assert "destroy_weights_update_group" in runtime.routes + assert "get_weight_version" in runtime.routes + + +@pytest.mark.asyncio +async def test_get_weight_version_reads_active_version_from_server_args(): + handler = _DummyWorkerHandler.__new__(_DummyWorkerHandler) + handler.engine = types.SimpleNamespace( + tokenizer_manager=types.SimpleNamespace( + server_args=types.SimpleNamespace(weight_version=17) + ) + ) + + result = await handler.get_weight_version({}) + + assert result == {"weight_version": 17} + + +@pytest.mark.asyncio +async def test_weight_update_routes_return_unsupported_without_engine(): + handler = _DummyWorkerHandler.__new__(_DummyWorkerHandler) + handler.engine = None + + init_result = await handler.init_weights_update_group({}) + destroy_result = await handler.destroy_weights_update_group({}) + version_result = await handler.get_weight_version({}) + + expected = { + "success": False, + "message": "weight update control not supported on this worker", + } + assert init_result == expected + assert destroy_result == expected + assert version_result == expected diff --git a/deploy/operator/internal/consts/consts.go b/deploy/operator/internal/consts/consts.go index 
2d8b67ca533b..a1b9222bb98c 100644 --- a/deploy/operator/internal/consts/consts.go +++ b/deploy/operator/internal/consts/consts.go @@ -27,6 +27,8 @@ const ( DynamoNixlPort = 19090 DynamoNixlPortName = "nixl" + DynamoFPMBasePort = 20380 + MpiRunSshPort = 2222 // Default security context values diff --git a/deploy/operator/internal/controller/dynamocomponentdeployment_controller_test.go b/deploy/operator/internal/controller/dynamocomponentdeployment_controller_test.go index 9216c3c453da..4aaf920cf286 100644 --- a/deploy/operator/internal/controller/dynamocomponentdeployment_controller_test.go +++ b/deploy/operator/internal/controller/dynamocomponentdeployment_controller_test.go @@ -773,6 +773,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. {Name: "CONTAINER_NAME", Value: commonconsts.MainContainerName}, {Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypeWorker}, {Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"}, + {Name: "DYN_FORWARDPASS_METRIC_PORT", Value: "20380"}, {Name: "DYN_HEALTH_CHECK_ENABLED", Value: "false"}, {Name: commonconsts.DynamoNamespaceEnvVar, Value: "default-test-lws-deploy"}, {Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-lws-deploy"}, @@ -916,6 +917,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. 
{Name: "CONTAINER_NAME", Value: commonconsts.MainContainerName}, {Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypeWorker}, {Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"}, + {Name: "DYN_FORWARDPASS_METRIC_PORT", Value: "20380"}, {Name: "DYN_HEALTH_CHECK_ENABLED", Value: "false"}, {Name: commonconsts.DynamoNamespaceEnvVar, Value: "default-test-lws-deploy"}, {Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-lws-deploy"}, diff --git a/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go b/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go index 59c0a8982b4a..eb70054264a0 100644 --- a/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go +++ b/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go @@ -1307,6 +1307,12 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context. Value: string(dgdr.UID), }, } + if r.Config.Infrastructure.PrometheusEndpoint != "" { + profilerEnv = append(profilerEnv, corev1.EnvVar{ + Name: "PROMETHEUS_ENDPOINT", + Value: r.Config.Infrastructure.PrometheusEndpoint, + }) + } // Build volume mounts volumeMounts := []corev1.VolumeMount{ diff --git a/deploy/operator/internal/dynamo/component_worker.go b/deploy/operator/internal/dynamo/component_worker.go index 38d9d0c45503..be19b61c3863 100644 --- a/deploy/operator/internal/dynamo/component_worker.go +++ b/deploy/operator/internal/dynamo/component_worker.go @@ -107,6 +107,10 @@ func (w *WorkerDefaults) GetBaseContainer(context ComponentContext) (corev1.Cont Name: "NIXL_TELEMETRY_PROMETHEUS_PORT", Value: fmt.Sprintf("%d", commonconsts.DynamoNixlPort), }, + { + Name: "DYN_FORWARDPASS_METRIC_PORT", + Value: fmt.Sprintf("%d", commonconsts.DynamoFPMBasePort), + }, }...) 
if context.WorkerHashSuffix != "" { diff --git a/deploy/operator/internal/dynamo/graph_test.go b/deploy/operator/internal/dynamo/graph_test.go index a6ab18bfc825..f5cab1e18a0d 100644 --- a/deploy/operator/internal/dynamo/graph_test.go +++ b/deploy/operator/internal/dynamo/graph_test.go @@ -2160,6 +2160,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { Name: "NIXL_TELEMETRY_PROMETHEUS_PORT", Value: "19090", }, + { + Name: "DYN_FORWARDPASS_METRIC_PORT", + Value: "20380", + }, { Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-dynamo-graph-deployment", @@ -2374,6 +2378,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { Name: "NIXL_TELEMETRY_PROMETHEUS_PORT", Value: "19090", }, + { + Name: "DYN_FORWARDPASS_METRIC_PORT", + Value: "20380", + }, { Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-dynamo-graph-deployment", @@ -3187,6 +3195,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { Name: "NIXL_TELEMETRY_PROMETHEUS_PORT", Value: "19090", }, + { + Name: "DYN_FORWARDPASS_METRIC_PORT", + Value: "20380", + }, { Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-dynamo-graph-deployment", @@ -3388,6 +3400,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { Name: "NIXL_TELEMETRY_PROMETHEUS_PORT", Value: "19090", }, + { + Name: "DYN_FORWARDPASS_METRIC_PORT", + Value: "20380", + }, { Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-dynamo-graph-deployment", @@ -5635,6 +5651,7 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) { {Name: "CONTAINER_NAME", Value: commonconsts.MainContainerName}, {Name: commonconsts.DynamoComponentEnvVar, Value: "worker"}, {Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"}, + {Name: "DYN_FORWARDPASS_METRIC_PORT", Value: "20380"}, {Name: "DYN_HEALTH_CHECK_ENABLED", Value: "false"}, {Name: commonconsts.DynamoNamespaceEnvVar, Value: "default-test-deployment"}, {Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-deployment"}, diff --git a/recipes/README.md b/recipes/README.md index 0783a8c87377..500aad85fa01 100644 --- 
a/recipes/README.md +++ b/recipes/README.md @@ -68,6 +68,7 @@ These recipes are under active development and may require additional setup step | Model | Framework | Mode | GPUs | Deployment | Notes | |-------|-----------|------|------|------------|-------| | **[GLM-5-NVFP4](glm-5-nvfp4/sglang/disagg/)** | SGLang | Disagg Prefill/Decode | 20x GB200 | ✅ | NVFP4, EAGLE speculative decoding, TP16 decode + TP4 prefill. Requires [custom container build](glm-5-nvfp4/). | +| **[Nemotron-3-Nano-Omni-NVFP4](nemotron-3-nano-omni/vllm/agg/)** | vLLM | Aggregated | 1x GPU | ✅ | Multimodal text/image/video/audio serving. Requires [custom container build](nemotron-3-nano-omni/). | | **[nvidia/Kimi-K2.5-NVFP4](kimi-k2.5/trtllm/agg/nvidia/)** | TensorRT-LLM | Aggregated | 8x B200 | ✅ | Text only — MoE model, TP8×EP8, reasoning + tool calling. Vision input not yet functional. | | **[DeepSeek-V4-Flash](deepseek-v4/deepseek-v4-flash/vllm/agg/)** | vLLM | Aggregated | 4x B200 | ✅ | Text only — MoE model (284B / 13B active), DP=4 + EP, FP8 KV cache, reasoning + tool calling. Requires [custom container build](deepseek-v4/container/). | | **[DeepSeek-V4-Flash](deepseek-v4/deepseek-v4-flash/sglang/agg/)** | SGLang | Aggregated | 4x B200 | ✅ | Text only — MoE model (284B / 13B active), TP=4, MXFP4 MoE via FlashInfer, EAGLE MTP (3 steps / 4 draft tokens), reasoning + tool calling. Prebuilt image available; optional [custom container build](deepseek-v4/container/). | diff --git a/recipes/nemotron-3-nano-omni/Dockerfile b/recipes/nemotron-3-nano-omni/Dockerfile new file mode 100644 index 000000000000..1d13afe628ff --- /dev/null +++ b/recipes/nemotron-3-nano-omni/Dockerfile @@ -0,0 +1,60 @@ +# syntax=docker/dockerfile:1.10.0 +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Layer the ai-dynamo wheel (and the omni multimodal extras) onto an upstream +# vLLM image. 
Dynamo is installed from https://pypi.nvidia.com/ai-dynamo/ — +# no source build, no Rust toolchain. Match DYNAMO_VERSION to a nightly that +# targets the same vLLM minor as BASE_IMAGE (see the wheel's METADATA for the +# pinned vllm version). +# +# Build: +# docker build -t /nemotron-omni-vllm:latest \ +# -f recipes/nemotron-3-nano-omni/Dockerfile \ +# recipes/nemotron-3-nano-omni +# +# Override defaults with --build-arg, e.g.: +# --build-arg BASE_IMAGE=vllm/vllm-openai:v0.20.0 +# --build-arg DYNAMO_VERSION=1.2.0.dev20260427 + +ARG BASE_IMAGE="vllm/vllm-openai:v0.20.0" +ARG DYNAMO_VERSION="1.2.0.dev20260427" + +FROM ${BASE_IMAGE} +USER root + +ARG DYNAMO_VERSION + +# ai-dynamo Python package (dynamo.frontend, dynamo.vllm, ...) installed with +# --no-deps so that: +# - the base image's vLLM 0.20 / torch / cuda stack is preserved (otherwise +# pip would try to re-install vllm[flashinfer,otel,runai]==0.20.0 which is +# declared by ai-dynamo[vllm]), and +# - we skip `nixl` (KV transport, only used for disagg) and `ray` (multi-node +# orchestration), neither of which an aggregated single-GPU recipe needs. +RUN pip install --no-cache-dir --no-deps \ + --extra-index-url https://pypi.nvidia.com \ + ai-dynamo==${DYNAMO_VERSION} + +# ai-dynamo's declared core dependencies, plus the Rust _core extension wheel +# (`ai-dynamo-runtime`) that the Python package imports natively. `kubernetes`, +# `pydantic`, and `pydantic-settings` are explicitly pinned because the latest +# unpinned versions (kubernetes 35.x, pydantic 2.13.x) violate ai-dynamo's +# declared compatibility ranges. 
+RUN pip install --no-cache-dir \ + --extra-index-url https://pypi.nvidia.com \ + ai-dynamo-runtime==${DYNAMO_VERSION} \ + "kubernetes<33.0.0,>=32.0.1" \ + "pydantic<2.13" "pydantic-settings<2.13.0" \ + msgpack msgspec prometheus-client pyzmq transformers + +# ai-dynamo[vllm] extras minus nixl + ray (handled above), plus the +# multimodal-video Python deps (av/ftfy/nvtx/sentencepiece) that the Nemotron +# Nano Omni model needs at runtime but which aren't pulled in by any extra. +RUN pip install --no-cache-dir \ + blake3 librosa soundfile uvloop \ + av ftfy nvtx sentencepiece + +# vllm/vllm-openai's default ENTRYPOINT runs `vllm serve`; reset it so the +# image behaves as a plain dynamo runtime image. +ENTRYPOINT ["/bin/bash"] diff --git a/recipes/nemotron-3-nano-omni/README.md b/recipes/nemotron-3-nano-omni/README.md new file mode 100644 index 000000000000..e4cbfe6031a1 --- /dev/null +++ b/recipes/nemotron-3-nano-omni/README.md @@ -0,0 +1,185 @@ + + +# Nemotron 3 Nano Omni NVFP4 + +Serves [nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4](https://huggingface.co/nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4) +using vLLM with an aggregated Dynamo deployment. + +This recipe builds a custom container that layers the `ai-dynamo` wheel +(from <https://pypi.nvidia.com/ai-dynamo/>) onto an upstream vLLM image — no +source build, no Rust toolchain. 
+ +## Topology + +| Role | Replicas | GPUs/replica | Notes | +|------|----------|--------------|-------| +| Frontend | 1 | 0 | Dynamo frontend with prefix-hash KV routing | +| vLLM worker | 1 | 1 | Text, image, video, and audio inputs | + +## Prerequisites + +- A Kubernetes cluster with the [Dynamo Operator](../../docs/kubernetes/README.md) installed +- One NVIDIA GPU per worker replica +- Shared PVC storage for the Hugging Face model cache +- Hugging Face access to `nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4` + +## Step 1: Build the Container + +```bash +docker build \ + -t <your-registry>/nemotron-omni-vllm:latest \ + -f recipes/nemotron-3-nano-omni/Dockerfile \ + recipes/nemotron-3-nano-omni +docker push <your-registry>/nemotron-omni-vllm:latest +``` + +Useful build args: + +- `BASE_IMAGE=<image>` — pin to a different vLLM base (default `vllm/vllm-openai:v0.20.0`). +- `DYNAMO_VERSION=<version>` — pin to a specific `ai-dynamo` release or nightly from <https://pypi.nvidia.com/ai-dynamo/>. Default tracks the latest tested nightly. Make sure the chosen wheel's `vllm` dependency matches `BASE_IMAGE`. + +## Step 2: Download the Model + +Create the PVC, Hugging Face token secret, and download the model weights: + +```bash +export NAMESPACE=<your-namespace> + +# Create the namespace if it does not already exist. +kubectl create namespace ${NAMESPACE} --dry-run=client -o yaml | kubectl apply -f - + +# First edit storageClassName in model-cache.yaml for your cluster. 
+kubectl apply -f recipes/nemotron-3-nano-omni/model-cache/model-cache.yaml -n ${NAMESPACE} + +kubectl create secret generic hf-token-secret \ + --from-literal=HF_TOKEN=<your-hf-token> \ + -n ${NAMESPACE} + +kubectl apply -f recipes/nemotron-3-nano-omni/model-cache/model-download.yaml -n ${NAMESPACE} +kubectl wait --for=condition=complete job/model-download -n ${NAMESPACE} --timeout=3600s +``` + +## Step 3: Deploy + +Edit `vllm/agg/deploy.yaml` and replace all `<your-registry>` values: + +- `<your-registry>/nemotron-omni-vllm:latest` - your built container image + +If your registry is private, add the appropriate `imagePullSecrets` to the +deployment. + +```bash +kubectl apply -f recipes/nemotron-3-nano-omni/vllm/agg/deploy.yaml -n ${NAMESPACE} +``` + +Monitor startup: + +```bash +kubectl get pods -n ${NAMESPACE} -l nvidia.com/dynamo-graph-deployment-name=nemotron-omni-vllm-agg -w +``` + +## Step 4: Test + +```bash +kubectl port-forward svc/nemotron-omni-vllm-agg-frontend 8000:8000 -n ${NAMESPACE} +``` + +In another terminal, send a minimal text request: + +```bash +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4", + "messages": [{"role": "user", "content": "Hello!"}], + "max_tokens": 128 + }' +``` + +To exercise the multimodal path, attach an image: + +```bash +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4", + "messages": [{ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png"}}, + {"type": "text", "text": "Describe what is in this image."} + ] + }], + "max_tokens": 256 + }' +``` + +…or an audio clip: + +```bash +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": 
"nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4", + "messages": [{ + "role": "user", + "content": [ + {"type": "audio_url", "audio_url": {"url": "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav"}}, + {"type": "text", "text": "Transcribe this audio clip."} + ] + }], + "max_tokens": 256 + }' +``` + +## Key Configuration Notes + +- `--enable-multimodal` enables image, video, and audio inputs. +- `--media-io-kwargs '{"video": {"num_frames": 512, "fps": 1}}'` samples long + videos at one frame per second, capped at 512 frames. +- `--dyn-tool-call-parser nemotron_nano` and + `--dyn-reasoning-parser nemotron_nano` enable Nemotron Nano tool-call and + reasoning parsing. +- The frontend uses `--router-mode kv --no-kv-events`, which approximates + KV-aware routing with prefix hashing without requiring backend KV events. + +## Optional: Run without NATS + +The Dynamo runtime defaults to NATS for the event plane and connects to a +NATS server if `NATS_SERVER` is set in the environment (the operator +auto-injects this on most clusters). On clusters without NATS — or where +you'd rather avoid the dependency — you can run on TCP request plane + ZMQ +event plane only. Add to both Frontend and VllmWorker: + +```yaml +mainContainer: + env: + - name: DYN_EVENT_PLANE + value: zmq + command: ["/bin/bash", "-lc"] + args: + # Operator-injected NATS_SERVER takes effect even when set to ""; we have + # to actually unset it before the runtime reads env. + - >- + unset NATS_SERVER && + exec python3 -m dynamo.frontend ... # or dynamo.vllm +``` + +The request plane defaults to TCP already, so no further flags are needed. 
+ +## File Layout + +```text +recipes/nemotron-3-nano-omni/ + README.md + Dockerfile + model-cache/ + model-cache.yaml + model-download.yaml + vllm/ + agg/ + deploy.yaml +``` diff --git a/recipes/nemotron-3-nano-omni/model-cache/model-cache.yaml b/recipes/nemotron-3-nano-omni/model-cache/model-cache.yaml new file mode 100644 index 000000000000..5d6e2b6e998b --- /dev/null +++ b/recipes/nemotron-3-nano-omni/model-cache/model-cache.yaml @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: model-cache +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 250Gi + storageClassName: "your-storage-class-name" diff --git a/recipes/nemotron-3-nano-omni/model-cache/model-download.yaml b/recipes/nemotron-3-nano-omni/model-cache/model-download.yaml new file mode 100644 index 000000000000..6e34cf65512b --- /dev/null +++ b/recipes/nemotron-3-nano-omni/model-cache/model-download.yaml @@ -0,0 +1,48 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +apiVersion: batch/v1 +kind: Job +metadata: + name: model-download +spec: + backoffLimit: 3 + completions: 1 + parallelism: 1 + template: + metadata: + labels: + app: model-download + spec: + restartPolicy: Never + containers: + - name: model-download + image: python:3.10-slim + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + seccompProfile: + type: RuntimeDefault + command: ["sh", "-c"] + envFrom: + - secretRef: + name: hf-token-secret + env: + - name: MODEL_NAME + value: nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4 + - name: HF_HOME + value: /model-store + - name: HF_XET_HIGH_PERFORMANCE + value: "1" + args: + - | + set -eux + pip install --no-cache-dir huggingface_hub==1.11.0 + hf download "$MODEL_NAME" + volumeMounts: + - name: model-cache + mountPath: /model-store + volumes: + - name: model-cache + persistentVolumeClaim: + claimName: model-cache diff --git a/recipes/nemotron-3-nano-omni/vllm/agg/deploy.yaml b/recipes/nemotron-3-nano-omni/vllm/agg/deploy.yaml new file mode 100644 index 000000000000..1c03c392f811 --- /dev/null +++ b/recipes/nemotron-3-nano-omni/vllm/agg/deploy.yaml @@ -0,0 +1,102 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# Nemotron Nano Omni aggregated vLLM deployment. 
+# +# Prerequisites: +# - Dynamo Platform installed +# - Model weights downloaded into the model-cache PVC +# - Container built from recipes/nemotron-3-nano-omni/Dockerfile +# - HF_TOKEN secret created: +# kubectl create secret generic hf-token-secret \ +# --from-literal=HF_TOKEN= -n +# +# Replace image references before applying: +# /nemotron-omni-vllm:latest +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: nemotron-omni-vllm-agg +spec: + backendFramework: vllm + pvcs: + - name: model-cache + create: false + services: + Frontend: + componentType: frontend + envFromSecret: hf-token-secret + replicas: 1 + volumeMounts: + - name: model-cache + mountPoint: /model-store + extraPodSpec: + mainContainer: + image: /nemotron-omni-vllm:latest + imagePullPolicy: IfNotPresent + startupProbe: + httpGet: + path: /health + port: 8000 + periodSeconds: 10 + timeoutSeconds: 1800 + failureThreshold: 60 + env: + - name: HF_HOME + value: /model-store + command: + - /bin/bash + - -lc + args: + - >- + exec python3 -m dynamo.frontend + --router-mode kv + --no-kv-events + --http-port 8000 + + VllmWorker: + componentType: worker + envFromSecret: hf-token-secret + replicas: 1 + resources: + limits: + gpu: "1" + requests: + gpu: "1" + volumeMounts: + - name: model-cache + mountPoint: /model-store + sharedMemory: + size: 16Gi + extraPodSpec: + mainContainer: + image: /nemotron-omni-vllm:latest + imagePullPolicy: IfNotPresent + startupProbe: + httpGet: + path: /health + port: 9090 + periodSeconds: 10 + timeoutSeconds: 10 + failureThreshold: 120 + env: + - name: HF_HOME + value: /model-store + # Match the --media-io-kwargs num_frames so dynamo's multimodal + # preprocessor and vLLM agree on the video frame ceiling. 
+ - name: DYN_MM_VIDEO_NUM_FRAMES + value: "512" + command: + - /bin/bash + - -lc + args: + - >- + exec python3 -m dynamo.vllm + --model nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4 + --served-model-name nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4 + --max-model-len 131072 + --enable-multimodal + --media-io-kwargs '{"video": {"num_frames": 512, "fps": 1}}' + --trust-remote-code + --video-pruning-rate 0.5 + --dyn-tool-call-parser nemotron_nano + --dyn-reasoning-parser nemotron_nano