diff --git a/.github/actions/docker-build/action.yml b/.github/actions/docker-build/action.yml
index c3637b853bc6..5c3299fc3877 100644
--- a/.github/actions/docker-build/action.yml
+++ b/.github/actions/docker-build/action.yml
@@ -80,7 +80,9 @@ runs:
         ECR_HOSTNAME: ${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com
       run: |
         sudo apt-get update && sudo apt-get install -y git build-essential protobuf-compiler libclang-dev
-        curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain stable
+        curl --proto '=https' --tlsv1.2 --retry 5 --retry-delay 2 --fail --show-error -sSL -o /tmp/rustup-init.sh https://sh.rustup.rs  # HTTPS/TLS1.2+ only, even across redirects
+        sh /tmp/rustup-init.sh -y --default-toolchain stable
+        rm -f /tmp/rustup-init.sh
         . "$HOME/.cargo/env"
         echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
         cargo install cbindgen
diff --git a/.github/actions/install-vcluster-cli/action.yml b/.github/actions/install-vcluster-cli/action.yml
index 572d979a153a..b3e127b36e24 100644
--- a/.github/actions/install-vcluster-cli/action.yml
+++ b/.github/actions/install-vcluster-cli/action.yml
@@ -23,10 +23,14 @@ runs:
             aarch64) VCLUSTER_ARCH="arm64" ;;
             *)       echo "Unsupported architecture: ${ARCH}"; exit 1 ;;
           esac
-          curl -sL -o /tmp/vcluster \
+          TMP_BIN="$(mktemp -p /tmp vcluster.XXXXXX)"
+          curl --retry 5 --retry-delay 2 \
+            --connect-timeout 10 --max-time 120 \
+            --fail --show-error -sL \
+            -o "${TMP_BIN}" \
             "https://github.com/loft-sh/vcluster/releases/download/${{ inputs.vcluster_version }}/vcluster-linux-${VCLUSTER_ARCH}"
-          sudo mv /tmp/vcluster /usr/local/bin/vcluster
-          sudo chmod +x /usr/local/bin/vcluster
+          sudo install -m 0755 "${TMP_BIN}" /usr/local/bin/vcluster
+          rm -f "${TMP_BIN}"
           vcluster version
         fi
         echo "::endgroup::"
diff --git a/.github/workflows/nightly-ci.yml b/.github/workflows/nightly-ci.yml
index d92bbe88d71c..d671a7c98618 100644
--- a/.github/workflows/nightly-ci.yml
+++ b/.github/workflows/nightly-ci.yml
@@ -258,7 +258,7 @@ jobs:
         PROTOC_VER="30.2"
         PROTOC_ZIP="protoc-${PROTOC_VER}-linux-x86_64.zip"
         PROTOC_SHA256="327e9397c6fb3ea2a542513a3221334c6f76f7aa524a7d2561142b67b312a01f"
-        curl -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}"
+        curl --retry 5 --retry-delay 2 -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}"
         echo "${PROTOC_SHA256}  ${PROTOC_ZIP}" | sha256sum -c -
         unzip "${PROTOC_ZIP}" -d $HOME/.local
         rm "${PROTOC_ZIP}"
diff --git a/.github/workflows/pre-merge.yml b/.github/workflows/pre-merge.yml
index 3693b06728a2..7e96b753e350 100644
--- a/.github/workflows/pre-merge.yml
+++ b/.github/workflows/pre-merge.yml
@@ -81,7 +81,7 @@ jobs:
         PROTOC_VER="30.2"
         PROTOC_ZIP="protoc-${PROTOC_VER}-linux-x86_64.zip"
         PROTOC_SHA256="327e9397c6fb3ea2a542513a3221334c6f76f7aa524a7d2561142b67b312a01f"
-        curl -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}"
+        curl --retry 5 --retry-delay 2 -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}"
         echo "${PROTOC_SHA256}  ${PROTOC_ZIP}" | sha256sum -c -
         unzip "${PROTOC_ZIP}" -d $HOME/.local
         rm "${PROTOC_ZIP}"
@@ -140,7 +140,7 @@ jobs:
         PROTOC_VER="30.2"
         PROTOC_ZIP="protoc-${PROTOC_VER}-linux-x86_64.zip"
         PROTOC_SHA256="327e9397c6fb3ea2a542513a3221334c6f76f7aa524a7d2561142b67b312a01f"
-        curl -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}"
+        curl --retry 5 --retry-delay 2 -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}"
         echo "${PROTOC_SHA256}  ${PROTOC_ZIP}" | sha256sum -c -
         unzip "${PROTOC_ZIP}" -d $HOME/.local
         rm "${PROTOC_ZIP}"
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 24dd4547445f..cc145b4c037b 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -176,7 +176,10 @@ jobs:
         env:
           CRANE_VERSION: v0.20.2
         run: |
-          curl -sL "https://github.com/google/go-containerregistry/releases/download/${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" \
+          curl --retry 5 --retry-delay 2 \
+            --connect-timeout 10 --max-time 120 \
+            --fail --show-error -sL \
+            "https://github.com/google/go-containerregistry/releases/download/${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" \
             | tar -xzf - crane
           sudo mv crane /usr/local/bin/
           crane version
@@ -440,7 +443,10 @@ jobs:
         env:
           CRANE_VERSION: v0.20.2
         run: |
-          curl -sL "https://github.com/google/go-containerregistry/releases/download/${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" \
+          curl --retry 5 --retry-delay 2 \
+            --connect-timeout 10 --max-time 120 \
+            --fail --show-error -sL \
+            "https://github.com/google/go-containerregistry/releases/download/${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" \
             | tar -xzf - crane
           sudo mv crane /usr/local/bin/
           crane version
diff --git a/components/src/dynamo/frontend/sglang_processor.py b/components/src/dynamo/frontend/sglang_processor.py
index 473cf9ca9ec8..daaff052ad08 100644
--- a/components/src/dynamo/frontend/sglang_processor.py
+++ b/components/src/dynamo/frontend/sglang_processor.py
@@ -39,6 +39,7 @@
     preprocess_chat_request,
 )
 from .utils import (
+    FrontendRoundRobinRouter,
     PreprocessError,
     extract_mm_urls,
     handle_engine_error,
@@ -624,9 +625,16 @@ async def chat_engine_factory(
                 kv_router_config=self.router_config.kv_router_config,
             )
         else:
-            router = await generate_endpoint.client(
+            client = await generate_endpoint.client(
                 router_mode=self.router_config.router_mode
             )
+            if self.router_config.router_mode == RouterMode.RoundRobin:
+                router = FrontendRoundRobinRouter(
+                    client,
+                    f"{namespace_name}.{component_name}.{endpoint_name}",
+                )
+            else:
+                router = client
 
         preprocess_pool = None
         preprocess_workers = self.config.preprocess_workers
diff --git a/components/src/dynamo/frontend/tests/test_frontend_routing_utils.py b/components/src/dynamo/frontend/tests/test_frontend_routing_utils.py
new file mode 100644
index 000000000000..229eed89e567
--- /dev/null
+++ b/components/src/dynamo/frontend/tests/test_frontend_routing_utils.py
@@ -0,0 +1,93 @@
+#  SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#  SPDX-License-Identifier: Apache-2.0
+
+import pytest
+
+from dynamo.frontend.utils import FrontendRoundRobinRouter
+
+pytestmark = [pytest.mark.unit, pytest.mark.gpu_0, pytest.mark.pre_merge]
+
+
+class _FakeClient:
+    def __init__(self, instance_sequences):
+        self._instance_sequences = list(instance_sequences)  # scripted instance_ids() results
+        self._instance_idx = 0  # index of the next scripted entry to hand out
+        self.direct_calls = []  # (request, instance_id, annotated) for each direct() call
+
+    def instance_ids(self):
+        if self._instance_idx < len(self._instance_sequences):
+            ids = self._instance_sequences[self._instance_idx]
+            self._instance_idx += 1
+            return ids
+        return self._instance_sequences[-1]  # script exhausted: keep repeating the last entry
+
+    async def wait_for_instances(self):
+        return [9, 10]  # fixed membership reported once a caller has to wait
+
+    async def direct(self, request, instance_id, annotated=True):
+        self.direct_calls.append((request, instance_id, annotated))
+        return {  # echo the call back so tests can assert on the routing decision
+            "instance": str(instance_id),
+            "request": request,
+            "annotated": annotated,
+        }
+
+
+@pytest.mark.asyncio
+async def test_frontend_round_robin_router_balances_sorted_instance_ids():
+    client = _FakeClient([[20, 10, 30], [20, 10, 30], [20, 10, 30], [20, 10, 30]])
+    router = FrontendRoundRobinRouter(client, "dynamo.backend.generate")
+
+    results = []
+    for idx in range(4):
+        results.append(await router.generate({"seq": idx}, annotated=False))
+
+    assert [item["instance"] for item in results] == ["10", "20", "30", "10"]  # ascending ids, then wraps
+    assert [call[2] for call in client.direct_calls] == [False, False, False, False]  # annotated forwarded as given
+
+
+@pytest.mark.asyncio
+async def test_frontend_round_robin_router_refreshes_membership_each_request():
+    client = _FakeClient([[2, 1], [3, 2, 1], [3, 2, 1]])  # instance 3 appears after the first request
+    router = FrontendRoundRobinRouter(client, "dynamo.backend.generate")
+
+    first = await router.generate({"seq": 0}, annotated=False)
+    second = await router.generate({"seq": 1}, annotated=False)
+    third = await router.generate({"seq": 2}, annotated=False)
+
+    assert first["instance"] == "1"
+    assert second["instance"] == "2"
+    assert third["instance"] == "3"  # only reachable if membership was re-read after the first call
+
+
+@pytest.mark.asyncio
+async def test_frontend_round_robin_router_waits_for_instances_when_empty():
+    client = _FakeClient([[]])  # instance_ids() reports no members
+    router = FrontendRoundRobinRouter(client, "dynamo.backend.generate")
+
+    result = await router.generate({"seq": 0}, annotated=False)
+
+    assert result["instance"] == "9"  # lowest id returned by _FakeClient.wait_for_instances
+
+
+@pytest.mark.asyncio
+async def test_frontend_round_robin_router_raises_when_no_instances_ever_appear():
+    client = _FakeClient([[]])
+    client.wait_for_instances = _empty_instances  # make waiting come back empty as well
+    router = FrontendRoundRobinRouter(client, "dynamo.backend.generate")
+
+    with pytest.raises(RuntimeError, match="No active backend instances available"):
+        await router.generate({"seq": 0}, annotated=False)
+
+
+@pytest.mark.asyncio
+async def test_frontend_round_robin_router_rejects_unexpected_kwargs():
+    client = _FakeClient([[1]])
+    router = FrontendRoundRobinRouter(client, "dynamo.backend.generate")
+
+    with pytest.raises(TypeError, match="Unsupported kwargs"):
+        await router.generate({"seq": 0}, annotated=False, foo=1)  # foo is not an accepted keyword
+
+
+async def _empty_instances():  # stand-in for wait_for_instances that finds no backend
+    return []
diff --git a/components/src/dynamo/frontend/utils.py b/components/src/dynamo/frontend/utils.py
index b85a5dae59f7..76db2df7c96d 100644
--- a/components/src/dynamo/frontend/utils.py
+++ b/components/src/dynamo/frontend/utils.py
@@ -3,11 +3,14 @@
 
 """Shared utilities for frontend chat processors (vLLM, SGLang)."""
 
+import asyncio
 import logging
+import os
 import uuid
 from typing import Any
 
 _MASK_64_BITS = (1 << 64) - 1
+logger = logging.getLogger(__name__)
 
 
 def random_uuid() -> str:
@@ -33,6 +36,61 @@ def __init__(self, error_dict: dict[str, Any]):
         super().__init__(str(error_dict))
 
 
+class FrontendRoundRobinRouter:
+    """Frontend-managed round-robin over the current runtime client membership.
+
+    This avoids sticky routing behavior in the opaque runtime round-robin client by
+    selecting an instance in Python and sending the request via ``Client.direct``.
+    """
+
+    def __init__(self, client: Any, endpoint_name: str):
+        self._client = client
+        self._endpoint_name = endpoint_name  # used only in log and error messages
+        self._cursor = 0  # grows by one per request; reduced modulo the live instance count
+        self._lock = asyncio.Lock()  # guards the read-and-increment of _cursor
+        self._debug = os.getenv("DYN_FRONTEND_ROUTING_DEBUG", "").lower() in {
+            "1",
+            "true",
+            "yes",
+            "on",
+        }
+
+    async def generate(self, request: dict[str, Any], **kwargs: Any):
+        annotated = kwargs.pop("annotated", None)  # the only accepted keyword; None when omitted
+        if kwargs:
+            raise TypeError(
+                f"Unsupported kwargs for frontend round-robin router: {sorted(kwargs)}"
+            )
+
+        instance_ids = list(self._client.instance_ids())  # membership is re-read on every call
+        if not instance_ids:
+            instance_ids = list(await self._client.wait_for_instances())  # nothing known yet: wait once
+        if not instance_ids:
+            raise RuntimeError(
+                f"No active backend instances available for {self._endpoint_name}"
+            )
+
+        instance_ids = sorted(instance_ids)  # stable order so the cursor indexes consistently
+        async with self._lock:
+            instance_id = instance_ids[self._cursor % len(instance_ids)]
+            self._cursor += 1
+
+        if self._debug:  # opt-in via DYN_FRONTEND_ROUTING_DEBUG, read once in __init__
+            logger.info(
+                "Frontend routing selected endpoint=%s instance=%s instances=%s annotated=%s",
+                self._endpoint_name,
+                instance_id,
+                instance_ids,
+                annotated,
+            )
+
+        return await self._client.direct(
+            request,
+            instance_id=instance_id,
+            annotated=annotated,
+        )
+
+
 # Content part types that carry media URLs, mapped to the key used in the
 # multimodal data dict sent to the backend handler.
 _MEDIA_CONTENT_TYPES = ("image_url", "audio_url", "video_url")
diff --git a/components/src/dynamo/frontend/vllm_processor.py b/components/src/dynamo/frontend/vllm_processor.py
index 8962bfd02ec5..81c368ba09bc 100644
--- a/components/src/dynamo/frontend/vllm_processor.py
+++ b/components/src/dynamo/frontend/vllm_processor.py
@@ -46,6 +46,7 @@
 
 from .prepost import StreamingPostProcessor, preprocess_chat_request
 from .utils import (
+    FrontendRoundRobinRouter,
     extract_mm_urls,
     handle_engine_error,
     make_internal_error,
@@ -800,9 +801,16 @@ async def chat_engine_factory(
                 kv_router_config=self.router_config.kv_router_config,
             )
         else:
-            router = await generate_endpoint.client(
+            client = await generate_endpoint.client(
                 router_mode=self.router_config.router_mode
             )
+            if self.router_config.router_mode == RouterMode.RoundRobin:
+                router = FrontendRoundRobinRouter(
+                    client,
+                    f"{namespace_name}.{component_name}.{endpoint_name}",
+                )
+            else:
+                router = client
 
         block_size = self.config.kv_cache_block_size or 16
 
diff --git a/components/src/dynamo/planner/config/planner_config.py b/components/src/dynamo/planner/config/planner_config.py
index 93e074bcb6fd..307757381d45 100644
--- a/components/src/dynamo/planner/config/planner_config.py
+++ b/components/src/dynamo/planner/config/planner_config.py
@@ -52,7 +52,8 @@ class PlannerConfig(BaseModel):
         "kubernetes", "virtual", "global-planner"
     ] = SLAPlannerDefaults.environment
     namespace: str = Field(
-        default_factory=lambda: os.environ.get("DYN_NAMESPACE", "dynamo")
+        default_factory=lambda: os.environ.get("DYN_NAMESPACE", "dynamo"),
+        exclude=True,
     )
     backend: Literal["vllm", "sglang", "trtllm", "mocker"] = SLAPlannerDefaults.backend
     mode: Literal["disagg", "prefill", "decode", "agg"] = SLAPlannerDefaults.mode
@@ -109,7 +110,8 @@ class PlannerConfig(BaseModel):
         default_factory=lambda: os.environ.get(
             "PROMETHEUS_ENDPOINT",
             "http://prometheus-kube-prometheus-prometheus.monitoring.svc.cluster.local:9090",
-        )
+        ),
+        exclude=True,
     )
     metric_reporting_prometheus_port: int = Field(
         default_factory=lambda: int(os.environ.get("PLANNER_PROMETHEUS_PORT", 0))
diff --git a/deploy/operator/internal/consts/consts.go b/deploy/operator/internal/consts/consts.go
index 2d8b67ca533b..a1b9222bb98c 100644
--- a/deploy/operator/internal/consts/consts.go
+++ b/deploy/operator/internal/consts/consts.go
@@ -27,6 +27,8 @@ const (
 	DynamoNixlPort     = 19090
 	DynamoNixlPortName = "nixl"
 
+	DynamoFPMBasePort = 20380
+
 	MpiRunSshPort = 2222
 
 	// Default security context values
diff --git a/deploy/operator/internal/controller/dynamocomponentdeployment_controller_test.go b/deploy/operator/internal/controller/dynamocomponentdeployment_controller_test.go
index 9216c3c453da..4aaf920cf286 100644
--- a/deploy/operator/internal/controller/dynamocomponentdeployment_controller_test.go
+++ b/deploy/operator/internal/controller/dynamocomponentdeployment_controller_test.go
@@ -773,6 +773,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
 											{Name: "CONTAINER_NAME", Value: commonconsts.MainContainerName},
 											{Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypeWorker},
 											{Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"},
+											{Name: "DYN_FORWARDPASS_METRIC_PORT", Value: "20380"},
 											{Name: "DYN_HEALTH_CHECK_ENABLED", Value: "false"},
 											{Name: commonconsts.DynamoNamespaceEnvVar, Value: "default-test-lws-deploy"},
 											{Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-lws-deploy"},
@@ -916,6 +917,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
 											{Name: "CONTAINER_NAME", Value: commonconsts.MainContainerName},
 											{Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypeWorker},
 											{Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"},
+											{Name: "DYN_FORWARDPASS_METRIC_PORT", Value: "20380"},
 											{Name: "DYN_HEALTH_CHECK_ENABLED", Value: "false"},
 											{Name: commonconsts.DynamoNamespaceEnvVar, Value: "default-test-lws-deploy"},
 											{Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-lws-deploy"},
diff --git a/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go b/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go
index 59c0a8982b4a..eb70054264a0 100644
--- a/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go
+++ b/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go
@@ -1307,6 +1307,12 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
 				Value: string(dgdr.UID),
 			},
 		}
+		if r.Config.Infrastructure.PrometheusEndpoint != "" {
+			profilerEnv = append(profilerEnv, corev1.EnvVar{
+				Name:  "PROMETHEUS_ENDPOINT",
+				Value: r.Config.Infrastructure.PrometheusEndpoint,
+			})
+		}
 
 		// Build volume mounts
 		volumeMounts := []corev1.VolumeMount{
diff --git a/deploy/operator/internal/dynamo/component_worker.go b/deploy/operator/internal/dynamo/component_worker.go
index 38d9d0c45503..be19b61c3863 100644
--- a/deploy/operator/internal/dynamo/component_worker.go
+++ b/deploy/operator/internal/dynamo/component_worker.go
@@ -107,6 +107,10 @@ func (w *WorkerDefaults) GetBaseContainer(context ComponentContext) (corev1.Cont
 			Name:  "NIXL_TELEMETRY_PROMETHEUS_PORT",
 			Value: fmt.Sprintf("%d", commonconsts.DynamoNixlPort),
 		},
+		{
+			Name:  "DYN_FORWARDPASS_METRIC_PORT",
+			Value: fmt.Sprintf("%d", commonconsts.DynamoFPMBasePort),
+		},
 	}...)
 
 	if context.WorkerHashSuffix != "" {
diff --git a/deploy/operator/internal/dynamo/graph_test.go b/deploy/operator/internal/dynamo/graph_test.go
index a6ab18bfc825..f5cab1e18a0d 100644
--- a/deploy/operator/internal/dynamo/graph_test.go
+++ b/deploy/operator/internal/dynamo/graph_test.go
@@ -2160,6 +2160,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 														Name:  "NIXL_TELEMETRY_PROMETHEUS_PORT",
 														Value: "19090",
 													},
+													{
+														Name:  "DYN_FORWARDPASS_METRIC_PORT",
+														Value: "20380",
+													},
 													{
 														Name:  "DYN_PARENT_DGD_K8S_NAME",
 														Value: "test-dynamo-graph-deployment",
@@ -2374,6 +2378,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 														Name:  "NIXL_TELEMETRY_PROMETHEUS_PORT",
 														Value: "19090",
 													},
+													{
+														Name:  "DYN_FORWARDPASS_METRIC_PORT",
+														Value: "20380",
+													},
 													{
 														Name:  "DYN_PARENT_DGD_K8S_NAME",
 														Value: "test-dynamo-graph-deployment",
@@ -3187,6 +3195,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 														Name:  "NIXL_TELEMETRY_PROMETHEUS_PORT",
 														Value: "19090",
 													},
+													{
+														Name:  "DYN_FORWARDPASS_METRIC_PORT",
+														Value: "20380",
+													},
 													{
 														Name:  "DYN_PARENT_DGD_K8S_NAME",
 														Value: "test-dynamo-graph-deployment",
@@ -3388,6 +3400,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 														Name:  "NIXL_TELEMETRY_PROMETHEUS_PORT",
 														Value: "19090",
 													},
+													{
+														Name:  "DYN_FORWARDPASS_METRIC_PORT",
+														Value: "20380",
+													},
 													{
 														Name:  "DYN_PARENT_DGD_K8S_NAME",
 														Value: "test-dynamo-graph-deployment",
@@ -5635,6 +5651,7 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) {
 							{Name: "CONTAINER_NAME", Value: commonconsts.MainContainerName},
 							{Name: commonconsts.DynamoComponentEnvVar, Value: "worker"},
 							{Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"},
+							{Name: "DYN_FORWARDPASS_METRIC_PORT", Value: "20380"},
 							{Name: "DYN_HEALTH_CHECK_ENABLED", Value: "false"},
 							{Name: commonconsts.DynamoNamespaceEnvVar, Value: "default-test-deployment"},
 							{Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-deployment"},
diff --git a/recipes/README.md b/recipes/README.md
index 0783a8c87377..500aad85fa01 100644
--- a/recipes/README.md
+++ b/recipes/README.md
@@ -68,6 +68,7 @@ These recipes are under active development and may require additional setup step
 | Model | Framework | Mode | GPUs | Deployment | Notes |
 |-------|-----------|------|------|------------|-------|
 | **[GLM-5-NVFP4](glm-5-nvfp4/sglang/disagg/)** | SGLang | Disagg Prefill/Decode | 20x GB200 | ✅ | NVFP4, EAGLE speculative decoding, TP16 decode + TP4 prefill. Requires [custom container build](glm-5-nvfp4/). |
+| **[Nemotron-3-Nano-Omni-NVFP4](nemotron-3-nano-omni/vllm/agg/)** | vLLM | Aggregated | 1x GPU | ✅ | Multimodal text/image/video/audio serving. Requires [custom container build](nemotron-3-nano-omni/). |
 | **[nvidia/Kimi-K2.5-NVFP4](kimi-k2.5/trtllm/agg/nvidia/)** | TensorRT-LLM | Aggregated | 8x B200 | ✅ | Text only — MoE model, TP8×EP8, reasoning + tool calling. Vision input not yet functional. |
 | **[DeepSeek-V4-Flash](deepseek-v4/deepseek-v4-flash/vllm/agg/)** | vLLM | Aggregated | 4x B200 | ✅ | Text only — MoE model (284B / 13B active), DP=4 + EP, FP8 KV cache, reasoning + tool calling. Requires [custom container build](deepseek-v4/container/). |
 | **[DeepSeek-V4-Flash](deepseek-v4/deepseek-v4-flash/sglang/agg/)** | SGLang | Aggregated | 4x B200 | ✅ | Text only — MoE model (284B / 13B active), TP=4, MXFP4 MoE via FlashInfer, EAGLE MTP (3 steps / 4 draft tokens), reasoning + tool calling. Prebuilt image available; optional [custom container build](deepseek-v4/container/). |
diff --git a/recipes/nemotron-3-nano-omni/Dockerfile b/recipes/nemotron-3-nano-omni/Dockerfile
new file mode 100644
index 000000000000..1d13afe628ff
--- /dev/null
+++ b/recipes/nemotron-3-nano-omni/Dockerfile
@@ -0,0 +1,60 @@
+# syntax=docker/dockerfile:1.10.0
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Layer the ai-dynamo wheel (and the omni multimodal extras) onto an upstream
+# vLLM image. Dynamo is installed from https://pypi.nvidia.com/ai-dynamo/ —
+# no source build, no Rust toolchain. Match DYNAMO_VERSION to a nightly that
+# targets the same vLLM minor as BASE_IMAGE (see the wheel's METADATA for the
+# pinned vllm version).
+#
+# Build:
+#   docker build -t <registry>/nemotron-omni-vllm:latest \
+#                -f recipes/nemotron-3-nano-omni/Dockerfile \
+#                recipes/nemotron-3-nano-omni
+#
+# Override defaults with --build-arg, e.g.:
+#   --build-arg BASE_IMAGE=vllm/vllm-openai:v0.20.0
+#   --build-arg DYNAMO_VERSION=1.2.0.dev20260427
+
+ARG BASE_IMAGE="vllm/vllm-openai:v0.20.0"
+ARG DYNAMO_VERSION="1.2.0.dev20260427"
+
+FROM ${BASE_IMAGE}
+USER root
+
+ARG DYNAMO_VERSION
+
+# ai-dynamo Python package (dynamo.frontend, dynamo.vllm, ...) installed with
+# --no-deps so that:
+#   - the base image's vLLM 0.20 / torch / cuda stack is preserved (otherwise
+#     pip would try to re-install vllm[flashinfer,otel,runai]==0.20.0 which is
+#     declared by ai-dynamo[vllm]), and
+#   - we skip `nixl` (KV transport, only used for disagg) and `ray` (multi-node
+#     orchestration), neither of which an aggregated single-GPU recipe needs.
+RUN pip install --no-cache-dir --no-deps \
+      --extra-index-url https://pypi.nvidia.com \
+      ai-dynamo==${DYNAMO_VERSION}
+
+# ai-dynamo's declared core dependencies, plus the Rust _core extension wheel
+# (`ai-dynamo-runtime`) that the Python package imports natively. `kubernetes`,
+# `pydantic`, and `pydantic-settings` are explicitly version-capped because the
+# latest releases (kubernetes 35.x, pydantic 2.13.x) fall outside ai-dynamo's
+# declared compatibility ranges.
+RUN pip install --no-cache-dir \
+      --extra-index-url https://pypi.nvidia.com \
+      ai-dynamo-runtime==${DYNAMO_VERSION} \
+      "kubernetes<33.0.0,>=32.0.1" \
+      "pydantic<2.13" "pydantic-settings<2.13.0" \
+      msgpack msgspec prometheus-client pyzmq transformers
+
+# ai-dynamo[vllm] extras minus nixl + ray (deliberately skipped, see above),
+# plus the multimodal-video Python deps (av/ftfy/nvtx/sentencepiece) that the
+# Nemotron Nano Omni model needs at runtime but which no extra pulls in.
+RUN pip install --no-cache-dir \
+      blake3 librosa soundfile uvloop \
+      av ftfy nvtx sentencepiece
+
+# vllm/vllm-openai's default ENTRYPOINT runs `vllm serve`; reset it so the
+# image behaves as a plain dynamo runtime image.
+ENTRYPOINT ["/bin/bash"]
diff --git a/recipes/nemotron-3-nano-omni/README.md b/recipes/nemotron-3-nano-omni/README.md
new file mode 100644
index 000000000000..e4cbfe6031a1
--- /dev/null
+++ b/recipes/nemotron-3-nano-omni/README.md
@@ -0,0 +1,185 @@
+<!--
+SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0
+-->
+
+# Nemotron 3 Nano Omni NVFP4
+
+Serves [nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4](https://huggingface.co/nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4)
+using vLLM with an aggregated Dynamo deployment.
+
+This recipe builds a custom container that layers the `ai-dynamo` wheel
+(from <https://pypi.nvidia.com/ai-dynamo/>) onto an upstream vLLM image — no
+source build, no Rust toolchain.
+
+## Topology
+
+| Role | Replicas | GPUs/replica | Notes |
+|------|----------|--------------|-------|
+| Frontend | 1 | 0 | Dynamo frontend with prefix-hash KV routing |
+| vLLM worker | 1 | 1 | Text, image, video, and audio inputs |
+
+## Prerequisites
+
+- A Kubernetes cluster with the [Dynamo Operator](../../docs/kubernetes/README.md) installed
+- One NVIDIA GPU per worker replica
+- Shared PVC storage for the Hugging Face model cache
+- Hugging Face access to `nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4`
+
+## Step 1: Build the Container
+
+```bash
+docker build \
+  -t <your-registry>/nemotron-omni-vllm:latest \
+  -f recipes/nemotron-3-nano-omni/Dockerfile \
+  recipes/nemotron-3-nano-omni
+docker push <your-registry>/nemotron-omni-vllm:latest
+```
+
+Useful build args:
+
+- `BASE_IMAGE=<image>` — pin to a different vLLM base (default `vllm/vllm-openai:v0.20.0`).
+- `DYNAMO_VERSION=<version>` — pin to a specific `ai-dynamo` release or nightly from <https://pypi.nvidia.com/ai-dynamo/>. Default tracks the latest tested nightly. Make sure the chosen wheel's `vllm` dependency matches `BASE_IMAGE`.
+
+## Step 2: Download the Model
+
+Create the PVC, Hugging Face token secret, and download the model weights:
+
+```bash
+export NAMESPACE=<your-namespace>
+
+# Create the namespace if it does not already exist.
+kubectl create namespace ${NAMESPACE} --dry-run=client -o yaml | kubectl apply -f -
+
+# First edit storageClassName in model-cache.yaml for your cluster.
+kubectl apply -f recipes/nemotron-3-nano-omni/model-cache/model-cache.yaml -n ${NAMESPACE}
+
+kubectl create secret generic hf-token-secret \
+  --from-literal=HF_TOKEN=<your-hf-token> \
+  -n ${NAMESPACE}
+
+kubectl apply -f recipes/nemotron-3-nano-omni/model-cache/model-download.yaml -n ${NAMESPACE}
+kubectl wait --for=condition=complete job/model-download -n ${NAMESPACE} --timeout=3600s
+```
+
+## Step 3: Deploy
+
+Edit `vllm/agg/deploy.yaml` and replace all `<placeholder>` values:
+
+- `<your-registry>/nemotron-omni-vllm:latest` - your built container image
+
+If your registry is private, add the appropriate `imagePullSecrets` to the
+deployment.
+
+```bash
+kubectl apply -f recipes/nemotron-3-nano-omni/vllm/agg/deploy.yaml -n ${NAMESPACE}
+```
+
+Monitor startup:
+
+```bash
+kubectl get pods -n ${NAMESPACE} -l nvidia.com/dynamo-graph-deployment-name=nemotron-omni-vllm-agg -w
+```
+
+## Step 4: Test
+
+```bash
+kubectl port-forward svc/nemotron-omni-vllm-agg-frontend 8000:8000 -n ${NAMESPACE}
+```
+
+In another terminal, send a minimal text request:
+
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4",
+    "messages": [{"role": "user", "content": "Hello!"}],
+    "max_tokens": 128
+  }'
+```
+
+To exercise the multimodal path, attach an image:
+
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4",
+    "messages": [{
+      "role": "user",
+      "content": [
+        {"type": "image_url", "image_url": {"url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png"}},
+        {"type": "text", "text": "Describe what is in this image."}
+      ]
+    }],
+    "max_tokens": 256
+  }'
+```
+
+…or an audio clip:
+
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4",
+    "messages": [{
+      "role": "user",
+      "content": [
+        {"type": "audio_url", "audio_url": {"url": "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav"}},
+        {"type": "text", "text": "Transcribe this audio clip."}
+      ]
+    }],
+    "max_tokens": 256
+  }'
+```
+
+## Key Configuration Notes
+
+- `--enable-multimodal` enables image, video, and audio inputs.
+- `--media-io-kwargs '{"video": {"num_frames": 512, "fps": 1}}'` samples long
+  videos at one frame per second, capped at 512 frames.
+- `--dyn-tool-call-parser nemotron_nano` and
+  `--dyn-reasoning-parser nemotron_nano` enable Nemotron Nano tool-call and
+  reasoning parsing.
+- The frontend uses `--router-mode kv --no-kv-events`, which approximates
+  KV-aware routing with prefix hashing without requiring backend KV events.
+
+## Optional: Run without NATS
+
+The Dynamo runtime defaults to NATS for the event plane and connects to a
+NATS server if `NATS_SERVER` is set in the environment (the operator
+auto-injects this on most clusters). On clusters without NATS — or where
+you'd rather avoid the dependency — you can run with just the TCP request plane
+and the ZMQ event plane. Add the following to both Frontend and VllmWorker:
+
+```yaml
+mainContainer:
+  env:
+    - name: DYN_EVENT_PLANE
+      value: zmq
+  command: ["/bin/bash", "-lc"]
+  args:
+    # Operator-injected NATS_SERVER takes effect even when set to ""; we have
+    # to actually unset it before the runtime reads env.
+    - >-
+      unset NATS_SERVER &&
+      exec python3 -m dynamo.frontend ...   # or dynamo.vllm
+```
+
+The request plane defaults to TCP already, so no further flags are needed.
+
+## File Layout
+
+```text
+recipes/nemotron-3-nano-omni/
+  README.md
+  Dockerfile
+  model-cache/
+    model-cache.yaml
+    model-download.yaml
+  vllm/
+    agg/
+      deploy.yaml
+```
diff --git a/recipes/nemotron-3-nano-omni/model-cache/model-cache.yaml b/recipes/nemotron-3-nano-omni/model-cache/model-cache.yaml
new file mode 100644
index 000000000000..5d6e2b6e998b
--- /dev/null
+++ b/recipes/nemotron-3-nano-omni/model-cache/model-cache.yaml
@@ -0,0 +1,13 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: model-cache  # claim name used by the model-download Job and the deployment
+spec:
+  accessModes:
+    - ReadWriteMany  # mounted by the download Job and by the Frontend and VllmWorker pods
+  resources:
+    requests:
+      storage: 250Gi
+  storageClassName: "your-storage-class-name"  # placeholder; use a class that can provide ReadWriteMany
diff --git a/recipes/nemotron-3-nano-omni/model-cache/model-download.yaml b/recipes/nemotron-3-nano-omni/model-cache/model-download.yaml
new file mode 100644
index 000000000000..6e34cf65512b
--- /dev/null
+++ b/recipes/nemotron-3-nano-omni/model-cache/model-download.yaml
@@ -0,0 +1,48 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: model-download
+spec:
+  backoffLimit: 3  # retry a failed download pod up to 3 times before failing the Job
+  completions: 1
+  parallelism: 1
+  template:
+    metadata:
+      labels:
+        app: model-download
+    spec:
+      restartPolicy: Never  # no in-place restarts; retries are new pods governed by backoffLimit
+      containers:
+        - name: model-download
+          image: python:3.10-slim  # plain Python image; huggingface_hub is pip-installed in args
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop: ["ALL"]
+            seccompProfile:
+              type: RuntimeDefault
+          command: ["sh", "-c"]  # the single args entry below is the script passed to sh -c
+          envFrom:
+            - secretRef:
+                name: hf-token-secret  # secret holding HF_TOKEN (see deploy.yaml prerequisites)
+          env:
+            - name: MODEL_NAME  # repo id consumed by `hf download` in args
+              value: nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4
+            - name: HF_HOME  # same path as the PVC mountPath below
+              value: /model-store
+            - name: HF_XET_HIGH_PERFORMANCE
+              value: "1"  # quoted: env values must be strings
+          args:
+            - |
+              set -eux
+              pip install --no-cache-dir huggingface_hub==1.11.0
+              hf download "$MODEL_NAME"
+          volumeMounts:
+            - name: model-cache
+              mountPath: /model-store
+      volumes:
+        - name: model-cache
+          persistentVolumeClaim:
+            claimName: model-cache  # PVC defined in model-cache/model-cache.yaml
diff --git a/recipes/nemotron-3-nano-omni/vllm/agg/deploy.yaml b/recipes/nemotron-3-nano-omni/vllm/agg/deploy.yaml
new file mode 100644
index 000000000000..1c03c392f811
--- /dev/null
+++ b/recipes/nemotron-3-nano-omni/vllm/agg/deploy.yaml
@@ -0,0 +1,102 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# Nemotron Nano Omni aggregated vLLM deployment.
+#
+# Prerequisites:
+#   - Dynamo Platform installed
+#   - Model weights downloaded into the model-cache PVC
+#   - Container built from recipes/nemotron-3-nano-omni/Dockerfile
+#   - HF_TOKEN secret created:
+#       kubectl create secret generic hf-token-secret \
+#         --from-literal=HF_TOKEN=<your-token> -n <namespace>
+#
+# Replace image references before applying:
+#   <your-registry>/nemotron-omni-vllm:latest
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: nemotron-omni-vllm-agg
+spec:
+  backendFramework: vllm
+  pvcs:
+    - name: model-cache  # PVC from model-cache/model-cache.yaml, mounted by both services
+      create: false  # not created here; the PVC must already exist (see Prerequisites above)
+  services:
+    Frontend:
+      componentType: frontend
+      envFromSecret: hf-token-secret
+      replicas: 1
+      volumeMounts:
+        - name: model-cache
+          mountPoint: /model-store
+      extraPodSpec:
+        mainContainer:
+          image: <your-registry>/nemotron-omni-vllm:latest  # placeholder; replace before applying
+          imagePullPolicy: IfNotPresent
+          startupProbe:
+            httpGet:
+              path: /health
+              port: 8000  # must match --http-port in args below
+            periodSeconds: 10
+            timeoutSeconds: 10  # per-attempt timeout (not the startup budget); matches VllmWorker
+            failureThreshold: 60  # 60 attempts x 10s period = ~10 min allowed for startup
+          env:
+            - name: HF_HOME  # same path as the model-cache mountPoint above
+              value: /model-store
+          command:
+            - /bin/bash
+            - -lc
+          args:
+            - >-
+              exec python3 -m dynamo.frontend
+              --router-mode kv
+              --no-kv-events
+              --http-port 8000
+
+    VllmWorker:
+      componentType: worker
+      envFromSecret: hf-token-secret
+      replicas: 1
+      resources:
+        limits:
+          gpu: "1"  # one GPU per worker replica; quoted so it stays a string
+        requests:
+          gpu: "1"
+      volumeMounts:
+        - name: model-cache
+          mountPoint: /model-store
+      sharedMemory:
+        size: 16Gi  # NOTE(review): presumably sizes /dev/shm for the worker - confirm against the CRD
+      extraPodSpec:
+        mainContainer:
+          image: <your-registry>/nemotron-omni-vllm:latest  # placeholder; replace before applying
+          imagePullPolicy: IfNotPresent
+          startupProbe:
+            httpGet:
+              path: /health
+              port: 9090  # NOTE(review): not set anywhere in this manifest; presumably the worker's system port - confirm
+            periodSeconds: 10
+            timeoutSeconds: 10
+            failureThreshold: 120  # 120 attempts x 10s period = ~20 min allowed for startup
+          env:
+            - name: HF_HOME  # same path as the model-cache mountPoint above
+              value: /model-store
+            # Match the --media-io-kwargs num_frames so dynamo's multimodal
+            # preprocessor and vLLM agree on the video frame ceiling.
+            - name: DYN_MM_VIDEO_NUM_FRAMES
+              value: "512"  # keep in sync with num_frames in --media-io-kwargs below
+          command:
+            - /bin/bash
+            - -lc
+          args:
+            - >-
+              exec python3 -m dynamo.vllm
+              --model nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4
+              --served-model-name nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4
+              --max-model-len 131072
+              --enable-multimodal
+              --media-io-kwargs '{"video": {"num_frames": 512, "fps": 1}}'
+              --trust-remote-code
+              --video-pruning-rate 0.5
+              --dyn-tool-call-parser nemotron_nano
+              --dyn-reasoning-parser nemotron_nano