diff --git a/.github/actions/docker-build/action.yml b/.github/actions/docker-build/action.yml index c3637b853bc6..5c3299fc3877 100644 --- a/.github/actions/docker-build/action.yml +++ b/.github/actions/docker-build/action.yml @@ -80,7 +80,9 @@ runs: ECR_HOSTNAME: ${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com run: | sudo apt-get update && sudo apt-get install -y git build-essential protobuf-compiler libclang-dev - curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain stable + curl --retry 5 --retry-delay 2 --fail --show-error -sSL -o /tmp/rustup-init.sh https://sh.rustup.rs + sh /tmp/rustup-init.sh -y --default-toolchain stable + rm -f /tmp/rustup-init.sh . "$HOME/.cargo/env" echo "$HOME/.cargo/bin" >> "$GITHUB_PATH" cargo install cbindgen diff --git a/.github/actions/install-vcluster-cli/action.yml b/.github/actions/install-vcluster-cli/action.yml index 572d979a153a..b3e127b36e24 100644 --- a/.github/actions/install-vcluster-cli/action.yml +++ b/.github/actions/install-vcluster-cli/action.yml @@ -23,10 +23,14 @@ runs: aarch64) VCLUSTER_ARCH="arm64" ;; *) echo "Unsupported architecture: ${ARCH}"; exit 1 ;; esac - curl -sL -o /tmp/vcluster \ + TMP_BIN="$(mktemp -p /tmp vcluster.XXXXXX)" + curl --retry 5 --retry-delay 2 \ + --connect-timeout 10 --max-time 120 \ + --fail --show-error -sL \ + -o "${TMP_BIN}" \ "https://github.com/loft-sh/vcluster/releases/download/${{ inputs.vcluster_version }}/vcluster-linux-${VCLUSTER_ARCH}" - sudo mv /tmp/vcluster /usr/local/bin/vcluster - sudo chmod +x /usr/local/bin/vcluster + sudo install -m 0755 "${TMP_BIN}" /usr/local/bin/vcluster + rm -f "${TMP_BIN}" vcluster version fi echo "::endgroup::" diff --git a/.github/workflows/nightly-ci.yml b/.github/workflows/nightly-ci.yml index d92bbe88d71c..d671a7c98618 100644 --- a/.github/workflows/nightly-ci.yml +++ b/.github/workflows/nightly-ci.yml @@ -258,7 +258,7 @@ jobs: PROTOC_VER="30.2" PROTOC_ZIP="protoc-${PROTOC_VER}-linux-x86_64.zip" 
PROTOC_SHA256="327e9397c6fb3ea2a542513a3221334c6f76f7aa524a7d2561142b67b312a01f" - curl -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}" + curl --retry 5 --retry-delay 2 -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}" echo "${PROTOC_SHA256} ${PROTOC_ZIP}" | sha256sum -c - unzip "${PROTOC_ZIP}" -d $HOME/.local rm "${PROTOC_ZIP}" diff --git a/.github/workflows/pre-merge.yml b/.github/workflows/pre-merge.yml index 3693b06728a2..7e96b753e350 100644 --- a/.github/workflows/pre-merge.yml +++ b/.github/workflows/pre-merge.yml @@ -81,7 +81,7 @@ jobs: PROTOC_VER="30.2" PROTOC_ZIP="protoc-${PROTOC_VER}-linux-x86_64.zip" PROTOC_SHA256="327e9397c6fb3ea2a542513a3221334c6f76f7aa524a7d2561142b67b312a01f" - curl -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}" + curl --retry 5 --retry-delay 2 -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}" echo "${PROTOC_SHA256} ${PROTOC_ZIP}" | sha256sum -c - unzip "${PROTOC_ZIP}" -d $HOME/.local rm "${PROTOC_ZIP}" @@ -140,7 +140,7 @@ jobs: PROTOC_VER="30.2" PROTOC_ZIP="protoc-${PROTOC_VER}-linux-x86_64.zip" PROTOC_SHA256="327e9397c6fb3ea2a542513a3221334c6f76f7aa524a7d2561142b67b312a01f" - curl -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}" + curl --retry 5 --retry-delay 2 -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}" echo "${PROTOC_SHA256} ${PROTOC_ZIP}" | sha256sum -c - unzip "${PROTOC_ZIP}" -d $HOME/.local rm "${PROTOC_ZIP}" diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 24dd4547445f..cc145b4c037b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -176,7 +176,10 @@ jobs: env: CRANE_VERSION: v0.20.2 run: | - curl -sL "https://github.com/google/go-containerregistry/releases/download/${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" \ + curl --retry 5 --retry-delay 2 \ + --connect-timeout 10 --max-time 120 \ + --fail --show-error -sL \ + 
"https://github.com/google/go-containerregistry/releases/download/${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" \ | tar -xzf - crane sudo mv crane /usr/local/bin/ crane version @@ -440,7 +443,10 @@ jobs: env: CRANE_VERSION: v0.20.2 run: | - curl -sL "https://github.com/google/go-containerregistry/releases/download/${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" \ + curl --retry 5 --retry-delay 2 \ + --connect-timeout 10 --max-time 120 \ + --fail --show-error -sL \ + "https://github.com/google/go-containerregistry/releases/download/${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" \ | tar -xzf - crane sudo mv crane /usr/local/bin/ crane version diff --git a/components/src/dynamo/frontend/sglang_prepost.py b/components/src/dynamo/frontend/sglang_prepost.py index ec9ebf0f6830..3edbbc8cb55d 100644 --- a/components/src/dynamo/frontend/sglang_prepost.py +++ b/components/src/dynamo/frontend/sglang_prepost.py @@ -511,6 +511,69 @@ def _try_parse_json_array(text: str) -> list | None: return None +def _decode_single_token(tokenizer, token_id: int) -> str | None: + try: + return tokenizer.decode([token_id], skip_special_tokens=False) + except Exception as exc: + logger.exception( + "Failed to decode token for chat logprobs: token_id=%s tokenizer=%s", + token_id, + type(tokenizer).__name__, + ) + raise RuntimeError( + f"Failed to decode token_id={token_id} with tokenizer={type(tokenizer).__name__}" + ) from exc + + +def _build_chat_logprobs( + *, + tokenizer, + token_ids: list[int], + log_probs: list[float] | None, + top_logprobs: list[list[dict[str, Any]]] | None, +) -> dict[str, Any] | None: + if not log_probs: + return None + + content: list[dict[str, Any]] = [] + for idx, logprob in enumerate(log_probs): + if idx >= len(token_ids): + break + + token_id = token_ids[idx] + token_text = _decode_single_token(tokenizer, token_id) + token_top_logprobs = [] + if top_logprobs and idx < len(top_logprobs): + for item in top_logprobs[idx]: + 
token_text_alt = item.get("token") + if token_text_alt is None and item.get("token_id") is not None: + token_text_alt = _decode_single_token(tokenizer, item["token_id"]) + token_top_logprobs.append( + { + "token": token_text_alt, + "logprob": item.get("logprob"), + "bytes": ( + list(token_text_alt.encode("utf-8")) + if token_text_alt is not None + else None + ), + } + ) + + content.append( + { + "token": token_text, + "logprob": float(logprob), + "bytes": ( + list(token_text.encode("utf-8")) if token_text is not None else None + ), + "top_logprobs": token_top_logprobs, + } + ) + + return {"content": content} + + class SglangStreamingPostProcessor: """Streaming post-processor using SGLang parsers and HF tokenizer detokenization. @@ -621,6 +684,12 @@ def process_output(self, engine_response: dict[str, Any]) -> dict[str, Any] | No raw_ids = engine_response.get("token_ids") token_ids = raw_ids if isinstance(raw_ids, list) else list(raw_ids or []) finish_reason = engine_response.get("finish_reason") + logprobs = _build_chat_logprobs( + tokenizer=self.tokenizer, + token_ids=token_ids, + log_probs=engine_response.get("log_probs"), + top_logprobs=engine_response.get("top_logprobs"), + ) delta_text = self._incremental_decode(token_ids) if token_ids else "" @@ -630,14 +699,14 @@ def process_output(self, engine_response: dict[str, Any]) -> dict[str, Any] | No "index": 0, "delta": {"role": "assistant", "content": delta_text}, "finish_reason": finish_reason, - "logprobs": None, + "logprobs": logprobs, } elif finish_reason: return { "index": 0, "delta": {}, "finish_reason": finish_reason, - "logprobs": None, + "logprobs": logprobs, } return None @@ -854,7 +923,7 @@ def process_output(self, engine_response: dict[str, Any]) -> dict[str, Any] | No "index": 0, "delta": delta if has_content else {}, "finish_reason": effective_finish, - "logprobs": None, + "logprobs": logprobs, } return None diff --git a/components/src/dynamo/frontend/tests/test_sglang_processor_unit.py 
b/components/src/dynamo/frontend/tests/test_sglang_processor_unit.py index ba68a376bf9f..08bc69374933 100644 --- a/components/src/dynamo/frontend/tests/test_sglang_processor_unit.py +++ b/components/src/dynamo/frontend/tests/test_sglang_processor_unit.py @@ -1416,6 +1416,32 @@ def test_fast_path_content_output(self, tokenizer): assert choice["index"] == 0 assert choice["logprobs"] is None + def test_fast_path_includes_logprobs(self, tokenizer): + """Fast path maps engine log_probs/top_logprobs into OpenAI chat logprobs.""" + post = SglangStreamingPostProcessor( + tokenizer=tokenizer, tool_call_parser=None, reasoning_parser=None + ) + token_ids = tokenizer.encode("Hello") + choice = post.process_output( + { + "token_ids": token_ids, + "log_probs": [-0.1] * len(token_ids), + "top_logprobs": [ + [{"token": tokenizer.decode([tid]), "logprob": -0.1}] + for tid in token_ids + ], + "finish_reason": None, + } + ) + + assert choice is not None + assert choice["logprobs"] is not None + content = choice["logprobs"]["content"] + assert len(content) == len(token_ids) + assert content[0]["logprob"] == -0.1 + assert "top_logprobs" in content[0] + assert "bytes" in content[0] + # --------------------------------------------------------------------------- # SglangStreamingPostProcessor: reasoning parsing diff --git a/components/src/dynamo/planner/config/planner_config.py b/components/src/dynamo/planner/config/planner_config.py index 93e074bcb6fd..307757381d45 100644 --- a/components/src/dynamo/planner/config/planner_config.py +++ b/components/src/dynamo/planner/config/planner_config.py @@ -52,7 +52,8 @@ class PlannerConfig(BaseModel): "kubernetes", "virtual", "global-planner" ] = SLAPlannerDefaults.environment namespace: str = Field( - default_factory=lambda: os.environ.get("DYN_NAMESPACE", "dynamo") + default_factory=lambda: os.environ.get("DYN_NAMESPACE", "dynamo"), + exclude=True, ) backend: Literal["vllm", "sglang", "trtllm", "mocker"] = SLAPlannerDefaults.backend mode: 
Literal["disagg", "prefill", "decode", "agg"] = SLAPlannerDefaults.mode @@ -109,7 +110,8 @@ class PlannerConfig(BaseModel): default_factory=lambda: os.environ.get( "PROMETHEUS_ENDPOINT", "http://prometheus-kube-prometheus-prometheus.monitoring.svc.cluster.local:9090", - ) + ), + exclude=True, ) metric_reporting_prometheus_port: int = Field( default_factory=lambda: int(os.environ.get("PLANNER_PROMETHEUS_PORT", 0)) diff --git a/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py b/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py index 50d123dd92f3..4a068cee2d67 100644 --- a/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py +++ b/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py @@ -201,63 +201,74 @@ def _build_logprob_kwargs(request: Dict[str, Any]) -> Dict[str, Any]: return kwargs @staticmethod + def _format_top_logprobs(raw_top_logprobs: Any) -> list[dict[str, Any]]: + formatted: list[dict[str, Any]] = [] + if not raw_top_logprobs: + return formatted + + for rank, item in enumerate(raw_top_logprobs): + if not item: + continue + + logprob = float(item[0]) if item[0] is not None else None + token_id = item[1] if len(item) > 1 else None + token = item[2] if len(item) > 2 else None + formatted.append( + { + "rank": rank, + "token_id": token_id, + "token": token, + "logprob": logprob, + "bytes": list(token.encode("utf-8")) if token else None, + } + ) + + return formatted + + @classmethod def _extract_logprobs( - meta_info: Dict[str, Any], num_output_logprobs_so_far: int - ) -> tuple: - """Extract logprobs from SGLang meta_info for new tokens. 
+ cls, meta_info: Dict[str, Any], start_offset: int, num_new_tokens: int + ) -> tuple[list[float] | None, list[list[dict[str, Any]]] | None, int]: + """Extract logprobs from SGLang meta_info for newly emitted tokens.""" + if num_new_tokens <= 0 or start_offset < 0: + return None, None, start_offset + + output_token_logprobs = meta_info.get("output_token_logprobs") or [] + if not output_token_logprobs or start_offset >= len(output_token_logprobs): + return None, None, start_offset + + end_offset = start_offset + num_new_tokens + selected_logprobs = output_token_logprobs[start_offset:end_offset] + + output_top_logprobs = meta_info.get("output_top_logprobs") or [] + selected_top_logprobs = ( + output_top_logprobs[start_offset:end_offset] if output_top_logprobs else [] + ) - While Dynamo forces stream_output=True (args.py) so that output_ids - are disjoint per chunk, SGLang's output_token_logprobs and - output_top_logprobs in meta_info are always cumulative. We track an - offset to slice out only the new entries each chunk. + log_probs: list[float] = [] + top_logprobs: list[list[dict[str, Any]]] = [] - Args: - meta_info: SGLang response meta_info dict. - num_output_logprobs_so_far: Number of logprob entries already - processed in previous chunks. 
+ for idx, token_logprob in enumerate(selected_logprobs): + if not token_logprob: + continue - Returns: - Tuple of (log_probs, top_logprobs, new_total): - - log_probs: List of floats (selected token logprob per position) - - top_logprobs: List of lists of dicts with rank/token_id/token/logprob - - new_total: Updated count of logprob entries processed so far - """ - output_token_logprobs = meta_info.get("output_token_logprobs") - if not output_token_logprobs: - return None, None, num_output_logprobs_so_far - - new_logprobs = output_token_logprobs[num_output_logprobs_so_far:] - if not new_logprobs: - return None, None, num_output_logprobs_so_far - - # Extract selected-token logprobs: each entry is (logprob, token_id, text_or_None) - log_probs = [float(entry[0]) for entry in new_logprobs] - - # Extract top logprobs if available - top_logprobs: list[list[dict[str, Any]]] | None = None - output_top = meta_info.get("output_top_logprobs") - if output_top: - new_top = output_top[num_output_logprobs_so_far:] - if new_top: - top_logprobs = [] - for position_entries in new_top: - if position_entries is None: - top_logprobs.append([]) - continue - position_list = [] - for rank_idx, entry in enumerate(position_entries): - position_list.append( - { - "rank": rank_idx + 1, - "token_id": entry[1], - "token": entry[2], - "logprob": float(entry[0]), - } - ) - top_logprobs.append(position_list) - - new_total = len(output_token_logprobs) - return log_probs, top_logprobs, new_total + log_probs.append(float(token_logprob[0])) + if selected_top_logprobs: + raw_top = ( + selected_top_logprobs[idx] + if idx < len(selected_top_logprobs) + else None + ) + top_logprobs.append(cls._format_top_logprobs(raw_top)) + + if not log_probs: + return None, None, start_offset + + return ( + log_probs, + top_logprobs if selected_top_logprobs else None, + start_offset + len(selected_logprobs), + ) async def generate( self, request: Dict[str, Any], context: Context @@ -390,8 +401,9 @@ async def 
_process_token_stream( """ # Use Future pattern for request ID - will be set when first response arrives request_id_future: asyncio.Future[str] = asyncio.Future() - # Logprob offset: output_ids are disjoint (stream_output=True) but - # meta_info logprobs are cumulative — track how many we've emitted. + # output_ids are disjoint per chunk, while SGLang logprob arrays are + # cumulative. Track the emitted-token offset so each chunk slices the + # matching cumulative window. num_output_logprobs_so_far = 0 async with self._cancellation_monitor(request_id_future, context): async for res in stream_source: @@ -425,18 +437,19 @@ async def _process_token_stream( # Pass through disjoint token segments directly out["token_ids"] = output_ids - - # Extract logprobs for new tokens if available ( log_probs, top_logprobs, num_output_logprobs_so_far, - ) = self._extract_logprobs(res["meta_info"], num_output_logprobs_so_far) + ) = self._extract_logprobs( + res.get("meta_info", {}), + num_output_logprobs_so_far, + len(output_ids), + ) if log_probs is not None: out["log_probs"] = log_probs if top_logprobs is not None: out["top_logprobs"] = top_logprobs - routed_experts = res["meta_info"].get("routed_experts") if routed_experts is not None: # Base64-encode tensor bytes to match sglang's output format. 
diff --git a/components/src/dynamo/sglang/tests/test_sglang_decode_handler.py b/components/src/dynamo/sglang/tests/test_sglang_decode_handler.py index 7e3d546549b8..226973082508 100644 --- a/components/src/dynamo/sglang/tests/test_sglang_decode_handler.py +++ b/components/src/dynamo/sglang/tests/test_sglang_decode_handler.py @@ -3,7 +3,10 @@ import pytest -from dynamo.sglang.request_handlers.llm.decode_handler import _extract_media_urls +from dynamo.sglang.request_handlers.llm.decode_handler import ( + DecodeWorkerHandler, + _extract_media_urls, +) pytestmark = [ pytest.mark.unit, @@ -34,3 +37,74 @@ def test_extract_media_urls_returns_none_for_missing_or_invalid_items(): assert ( _extract_media_urls({"image_url": [{"ignored": "value"}]}, "image_url") is None ) + + +def test_build_logprob_kwargs_token_request_uses_output_options_logprobs(): + assert DecodeWorkerHandler._build_logprob_kwargs( + {"output_options": {"logprobs": 3}} + ) == { + "return_logprob": True, + "top_logprobs_num": 3, + } + + +def test_extract_logprobs_from_meta_info(): + log_probs, top_logprobs = DecodeWorkerHandler._extract_logprobs( + { + "output_token_logprobs": [[-0.25, 11, " hello"], [-0.5, 12, " world"]], + "output_top_logprobs": [ + [[-0.25, 11, " hello"], [-1.0, 99, " hi"]], + [[-0.5, 12, " world"]], + ], + }, + 0, + 2, + ) + + assert log_probs == [-0.25, -0.5] + assert top_logprobs is not None + assert top_logprobs[0][0]["token"] == " hello" + assert top_logprobs[0][0]["bytes"] == list(" hello".encode("utf-8")) + + +def test_extract_logprobs_uses_start_offset_for_streaming_chunks(): + log_probs, top_logprobs = DecodeWorkerHandler._extract_logprobs( + { + "output_token_logprobs": [ + [-0.1, 10, " a"], + [-0.2, 11, " b"], + [-0.3, 12, " c"], + ], + "output_top_logprobs": [ + [[-0.1, 10, " a"]], + [[-0.2, 11, " b"]], + [[-0.3, 12, " c"]], + ], + }, + 1, + 1, + ) + + assert log_probs == [-0.2] + assert top_logprobs == [ + [ + { + "rank": 0, + "token_id": 11, + "token": " b", + "logprob": 
-0.2, + "bytes": list(" b".encode("utf-8")), + } + ] + ] + + +def test_extract_logprobs_returns_none_when_offset_exceeds_available_entries(): + log_probs, top_logprobs = DecodeWorkerHandler._extract_logprobs( + {"output_token_logprobs": [[-0.25, 11, " hello"]]}, + 1, + 5, + ) + + assert log_probs is None + assert top_logprobs is None diff --git a/deploy/operator/internal/consts/consts.go b/deploy/operator/internal/consts/consts.go index 2d8b67ca533b..a1b9222bb98c 100644 --- a/deploy/operator/internal/consts/consts.go +++ b/deploy/operator/internal/consts/consts.go @@ -27,6 +27,8 @@ const ( DynamoNixlPort = 19090 DynamoNixlPortName = "nixl" + DynamoFPMBasePort = 20380 + MpiRunSshPort = 2222 // Default security context values diff --git a/deploy/operator/internal/controller/dynamocomponentdeployment_controller_test.go b/deploy/operator/internal/controller/dynamocomponentdeployment_controller_test.go index 9216c3c453da..4aaf920cf286 100644 --- a/deploy/operator/internal/controller/dynamocomponentdeployment_controller_test.go +++ b/deploy/operator/internal/controller/dynamocomponentdeployment_controller_test.go @@ -773,6 +773,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. {Name: "CONTAINER_NAME", Value: commonconsts.MainContainerName}, {Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypeWorker}, {Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"}, + {Name: "DYN_FORWARDPASS_METRIC_PORT", Value: "20380"}, {Name: "DYN_HEALTH_CHECK_ENABLED", Value: "false"}, {Name: commonconsts.DynamoNamespaceEnvVar, Value: "default-test-lws-deploy"}, {Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-lws-deploy"}, @@ -916,6 +917,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. 
{Name: "CONTAINER_NAME", Value: commonconsts.MainContainerName}, {Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypeWorker}, {Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"}, + {Name: "DYN_FORWARDPASS_METRIC_PORT", Value: "20380"}, {Name: "DYN_HEALTH_CHECK_ENABLED", Value: "false"}, {Name: commonconsts.DynamoNamespaceEnvVar, Value: "default-test-lws-deploy"}, {Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-lws-deploy"}, diff --git a/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go b/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go index 59c0a8982b4a..eb70054264a0 100644 --- a/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go +++ b/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go @@ -1307,6 +1307,12 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context. Value: string(dgdr.UID), }, } + if r.Config.Infrastructure.PrometheusEndpoint != "" { + profilerEnv = append(profilerEnv, corev1.EnvVar{ + Name: "PROMETHEUS_ENDPOINT", + Value: r.Config.Infrastructure.PrometheusEndpoint, + }) + } // Build volume mounts volumeMounts := []corev1.VolumeMount{ diff --git a/deploy/operator/internal/dynamo/component_worker.go b/deploy/operator/internal/dynamo/component_worker.go index 38d9d0c45503..be19b61c3863 100644 --- a/deploy/operator/internal/dynamo/component_worker.go +++ b/deploy/operator/internal/dynamo/component_worker.go @@ -107,6 +107,10 @@ func (w *WorkerDefaults) GetBaseContainer(context ComponentContext) (corev1.Cont Name: "NIXL_TELEMETRY_PROMETHEUS_PORT", Value: fmt.Sprintf("%d", commonconsts.DynamoNixlPort), }, + { + Name: "DYN_FORWARDPASS_METRIC_PORT", + Value: fmt.Sprintf("%d", commonconsts.DynamoFPMBasePort), + }, }...) 
if context.WorkerHashSuffix != "" { diff --git a/deploy/operator/internal/dynamo/graph_test.go b/deploy/operator/internal/dynamo/graph_test.go index a6ab18bfc825..f5cab1e18a0d 100644 --- a/deploy/operator/internal/dynamo/graph_test.go +++ b/deploy/operator/internal/dynamo/graph_test.go @@ -2160,6 +2160,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { Name: "NIXL_TELEMETRY_PROMETHEUS_PORT", Value: "19090", }, + { + Name: "DYN_FORWARDPASS_METRIC_PORT", + Value: "20380", + }, { Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-dynamo-graph-deployment", @@ -2374,6 +2378,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { Name: "NIXL_TELEMETRY_PROMETHEUS_PORT", Value: "19090", }, + { + Name: "DYN_FORWARDPASS_METRIC_PORT", + Value: "20380", + }, { Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-dynamo-graph-deployment", @@ -3187,6 +3195,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { Name: "NIXL_TELEMETRY_PROMETHEUS_PORT", Value: "19090", }, + { + Name: "DYN_FORWARDPASS_METRIC_PORT", + Value: "20380", + }, { Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-dynamo-graph-deployment", @@ -3388,6 +3400,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { Name: "NIXL_TELEMETRY_PROMETHEUS_PORT", Value: "19090", }, + { + Name: "DYN_FORWARDPASS_METRIC_PORT", + Value: "20380", + }, { Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-dynamo-graph-deployment", @@ -5635,6 +5651,7 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) { {Name: "CONTAINER_NAME", Value: commonconsts.MainContainerName}, {Name: commonconsts.DynamoComponentEnvVar, Value: "worker"}, {Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"}, + {Name: "DYN_FORWARDPASS_METRIC_PORT", Value: "20380"}, {Name: "DYN_HEALTH_CHECK_ENABLED", Value: "false"}, {Name: commonconsts.DynamoNamespaceEnvVar, Value: "default-test-deployment"}, {Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-deployment"}, diff --git a/recipes/README.md b/recipes/README.md index 0783a8c87377..500aad85fa01 100644 --- 
a/recipes/README.md +++ b/recipes/README.md @@ -68,6 +68,7 @@ These recipes are under active development and may require additional setup step | Model | Framework | Mode | GPUs | Deployment | Notes | |-------|-----------|------|------|------------|-------| | **[GLM-5-NVFP4](glm-5-nvfp4/sglang/disagg/)** | SGLang | Disagg Prefill/Decode | 20x GB200 | ✅ | NVFP4, EAGLE speculative decoding, TP16 decode + TP4 prefill. Requires [custom container build](glm-5-nvfp4/). | +| **[Nemotron-3-Nano-Omni-NVFP4](nemotron-3-nano-omni/vllm/agg/)** | vLLM | Aggregated | 1x GPU | ✅ | Multimodal text/image/video/audio serving. Requires [custom container build](nemotron-3-nano-omni/). | | **[nvidia/Kimi-K2.5-NVFP4](kimi-k2.5/trtllm/agg/nvidia/)** | TensorRT-LLM | Aggregated | 8x B200 | ✅ | Text only — MoE model, TP8×EP8, reasoning + tool calling. Vision input not yet functional. | | **[DeepSeek-V4-Flash](deepseek-v4/deepseek-v4-flash/vllm/agg/)** | vLLM | Aggregated | 4x B200 | ✅ | Text only — MoE model (284B / 13B active), DP=4 + EP, FP8 KV cache, reasoning + tool calling. Requires [custom container build](deepseek-v4/container/). | | **[DeepSeek-V4-Flash](deepseek-v4/deepseek-v4-flash/sglang/agg/)** | SGLang | Aggregated | 4x B200 | ✅ | Text only — MoE model (284B / 13B active), TP=4, MXFP4 MoE via FlashInfer, EAGLE MTP (3 steps / 4 draft tokens), reasoning + tool calling. Prebuilt image available; optional [custom container build](deepseek-v4/container/). | diff --git a/recipes/nemotron-3-nano-omni/Dockerfile b/recipes/nemotron-3-nano-omni/Dockerfile new file mode 100644 index 000000000000..1d13afe628ff --- /dev/null +++ b/recipes/nemotron-3-nano-omni/Dockerfile @@ -0,0 +1,60 @@ +# syntax=docker/dockerfile:1.10.0 +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Layer the ai-dynamo wheel (and the omni multimodal extras) onto an upstream +# vLLM image. 
Dynamo is installed from https://pypi.nvidia.com/ai-dynamo/ — +# no source build, no Rust toolchain. Match DYNAMO_VERSION to a nightly that +# targets the same vLLM minor as BASE_IMAGE (see the wheel's METADATA for the +# pinned vllm version). +# +# Build: +# docker build -t /nemotron-omni-vllm:latest \ +# -f recipes/nemotron-3-nano-omni/Dockerfile \ +# recipes/nemotron-3-nano-omni +# +# Override defaults with --build-arg, e.g.: +# --build-arg BASE_IMAGE=vllm/vllm-openai:v0.20.0 +# --build-arg DYNAMO_VERSION=1.2.0.dev20260427 + +ARG BASE_IMAGE="vllm/vllm-openai:v0.20.0" +ARG DYNAMO_VERSION="1.2.0.dev20260427" + +FROM ${BASE_IMAGE} +USER root + +ARG DYNAMO_VERSION + +# ai-dynamo Python package (dynamo.frontend, dynamo.vllm, ...) installed with +# --no-deps so that: +# - the base image's vLLM 0.20 / torch / cuda stack is preserved (otherwise +# pip would try to re-install vllm[flashinfer,otel,runai]==0.20.0 which is +# declared by ai-dynamo[vllm]), and +# - we skip `nixl` (KV transport, only used for disagg) and `ray` (multi-node +# orchestration), neither of which an aggregated single-GPU recipe needs. +RUN pip install --no-cache-dir --no-deps \ + --extra-index-url https://pypi.nvidia.com \ + ai-dynamo==${DYNAMO_VERSION} + +# ai-dynamo's declared core dependencies, plus the Rust _core extension wheel +# (`ai-dynamo-runtime`) that the Python package imports natively. `kubernetes`, +# `pydantic`, and `pydantic-settings` are explicitly pinned because the latest +# unpinned versions (kubernetes 35.x, pydantic 2.13.x) violate ai-dynamo's +# declared compatibility ranges. 
+RUN pip install --no-cache-dir \ + --extra-index-url https://pypi.nvidia.com \ + ai-dynamo-runtime==${DYNAMO_VERSION} \ + "kubernetes<33.0.0,>=32.0.1" \ + "pydantic<2.13" "pydantic-settings<2.13.0" \ + msgpack msgspec prometheus-client pyzmq transformers + +# ai-dynamo[vllm] extras minus nixl + ray (handled above), plus the +# multimodal-video Python deps (av/ftfy/nvtx/sentencepiece) that the Nemotron +# Nano Omni model needs at runtime but which aren't pulled in by any extra. +RUN pip install --no-cache-dir \ + blake3 librosa soundfile uvloop \ + av ftfy nvtx sentencepiece + +# vllm/vllm-openai's default ENTRYPOINT runs `vllm serve`; reset it so the +# image behaves as a plain dynamo runtime image. +ENTRYPOINT ["/bin/bash"] diff --git a/recipes/nemotron-3-nano-omni/README.md b/recipes/nemotron-3-nano-omni/README.md new file mode 100644 index 000000000000..e4cbfe6031a1 --- /dev/null +++ b/recipes/nemotron-3-nano-omni/README.md @@ -0,0 +1,185 @@ + + +# Nemotron 3 Nano Omni NVFP4 + +Serves [nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4](https://huggingface.co/nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4) +using vLLM with an aggregated Dynamo deployment. + +This recipe builds a custom container that layers the `ai-dynamo` wheel +(from ) onto an upstream vLLM image — no +source build, no Rust toolchain. 
+ +## Topology + +| Role | Replicas | GPUs/replica | Notes | +|------|----------|--------------|-------| +| Frontend | 1 | 0 | Dynamo frontend with prefix-hash KV routing | +| vLLM worker | 1 | 1 | Text, image, video, and audio inputs | + +## Prerequisites + +- A Kubernetes cluster with the [Dynamo Operator](../../docs/kubernetes/README.md) installed +- One NVIDIA GPU per worker replica +- Shared PVC storage for the Hugging Face model cache +- Hugging Face access to `nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4` + +## Step 1: Build the Container + +```bash +docker build \ + -t /nemotron-omni-vllm:latest \ + -f recipes/nemotron-3-nano-omni/Dockerfile \ + recipes/nemotron-3-nano-omni +docker push /nemotron-omni-vllm:latest +``` + +Useful build args: + +- `BASE_IMAGE=` — pin to a different vLLM base (default `vllm/vllm-openai:v0.20.0`). +- `DYNAMO_VERSION=` — pin to a specific `ai-dynamo` release or nightly from . Default tracks the latest tested nightly. Make sure the chosen wheel's `vllm` dependency matches `BASE_IMAGE`. + +## Step 2: Download the Model + +Create the PVC, Hugging Face token secret, and download the model weights: + +```bash +export NAMESPACE= + +# Create the namespace if it does not already exist. +kubectl create namespace ${NAMESPACE} --dry-run=client -o yaml | kubectl apply -f - + +# First edit storageClassName in model-cache.yaml for your cluster. 
+kubectl apply -f recipes/nemotron-3-nano-omni/model-cache/model-cache.yaml -n ${NAMESPACE} + +kubectl create secret generic hf-token-secret \ + --from-literal=HF_TOKEN= \ + -n ${NAMESPACE} + +kubectl apply -f recipes/nemotron-3-nano-omni/model-cache/model-download.yaml -n ${NAMESPACE} +kubectl wait --for=condition=complete job/model-download -n ${NAMESPACE} --timeout=3600s +``` + +## Step 3: Deploy + +Edit `vllm/agg/deploy.yaml` and replace all `` values: + +- `/nemotron-omni-vllm:latest` - your built container image + +If your registry is private, add the appropriate `imagePullSecrets` to the +deployment. + +```bash +kubectl apply -f recipes/nemotron-3-nano-omni/vllm/agg/deploy.yaml -n ${NAMESPACE} +``` + +Monitor startup: + +```bash +kubectl get pods -n ${NAMESPACE} -l nvidia.com/dynamo-graph-deployment-name=nemotron-omni-vllm-agg -w +``` + +## Step 4: Test + +```bash +kubectl port-forward svc/nemotron-omni-vllm-agg-frontend 8000:8000 -n ${NAMESPACE} +``` + +In another terminal, send a minimal text request: + +```bash +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4", + "messages": [{"role": "user", "content": "Hello!"}], + "max_tokens": 128 + }' +``` + +To exercise the multimodal path, attach an image: + +```bash +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4", + "messages": [{ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png"}}, + {"type": "text", "text": "Describe what is in this image."} + ] + }], + "max_tokens": 256 + }' +``` + +…or an audio clip: + +```bash +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": 
"nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4", + "messages": [{ + "role": "user", + "content": [ + {"type": "audio_url", "audio_url": {"url": "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav"}}, + {"type": "text", "text": "Transcribe this audio clip."} + ] + }], + "max_tokens": 256 + }' +``` + +## Key Configuration Notes + +- `--enable-multimodal` enables image, video, and audio inputs. +- `--media-io-kwargs '{"video": {"num_frames": 512, "fps": 1}}'` samples long + videos at one frame per second, capped at 512 frames. +- `--dyn-tool-call-parser nemotron_nano` and + `--dyn-reasoning-parser nemotron_nano` enable Nemotron Nano tool-call and + reasoning parsing. +- The frontend uses `--router-mode kv --no-kv-events`, which approximates + KV-aware routing with prefix hashing without requiring backend KV events. + +## Optional: Run without NATS + +The Dynamo runtime defaults to NATS for the event plane and connects to a +NATS server if `NATS_SERVER` is set in the environment (the operator +auto-injects this on most clusters). On clusters without NATS — or where +you'd rather avoid the dependency — you can run on TCP request plane + ZMQ +event plane only. Add to both Frontend and VllmWorker: + +```yaml +mainContainer: + env: + - name: DYN_EVENT_PLANE + value: zmq + command: ["/bin/bash", "-lc"] + args: + # Operator-injected NATS_SERVER takes effect even when set to ""; we have + # to actually unset it before the runtime reads env. + - >- + unset NATS_SERVER && + exec python3 -m dynamo.frontend ... # or dynamo.vllm +``` + +The request plane defaults to TCP already, so no further flags are needed. 
+ +## File Layout + +```text +recipes/nemotron-3-nano-omni/ + README.md + Dockerfile + model-cache/ + model-cache.yaml + model-download.yaml + vllm/ + agg/ + deploy.yaml +``` diff --git a/recipes/nemotron-3-nano-omni/model-cache/model-cache.yaml b/recipes/nemotron-3-nano-omni/model-cache/model-cache.yaml new file mode 100644 index 000000000000..5d6e2b6e998b --- /dev/null +++ b/recipes/nemotron-3-nano-omni/model-cache/model-cache.yaml @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: model-cache +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 250Gi + storageClassName: "your-storage-class-name" diff --git a/recipes/nemotron-3-nano-omni/model-cache/model-download.yaml b/recipes/nemotron-3-nano-omni/model-cache/model-download.yaml new file mode 100644 index 000000000000..6e34cf65512b --- /dev/null +++ b/recipes/nemotron-3-nano-omni/model-cache/model-download.yaml @@ -0,0 +1,48 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +apiVersion: batch/v1 +kind: Job +metadata: + name: model-download +spec: + backoffLimit: 3 + completions: 1 + parallelism: 1 + template: + metadata: + labels: + app: model-download + spec: + restartPolicy: Never + containers: + - name: model-download + image: python:3.10-slim + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + seccompProfile: + type: RuntimeDefault + command: ["sh", "-c"] + envFrom: + - secretRef: + name: hf-token-secret + env: + - name: MODEL_NAME + value: nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4 + - name: HF_HOME + value: /model-store + - name: HF_XET_HIGH_PERFORMANCE + value: "1" + args: + - | + set -eux + pip install --no-cache-dir huggingface_hub==1.11.0 + hf download "$MODEL_NAME" + volumeMounts: + - name: model-cache + mountPath: /model-store + volumes: + - name: model-cache + persistentVolumeClaim: + claimName: model-cache diff --git a/recipes/nemotron-3-nano-omni/vllm/agg/deploy.yaml b/recipes/nemotron-3-nano-omni/vllm/agg/deploy.yaml new file mode 100644 index 000000000000..1c03c392f811 --- /dev/null +++ b/recipes/nemotron-3-nano-omni/vllm/agg/deploy.yaml @@ -0,0 +1,102 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# Nemotron Nano Omni aggregated vLLM deployment. 
+# +# Prerequisites: +# - Dynamo Platform installed +# - Model weights downloaded into the model-cache PVC +# - Container built from recipes/nemotron-3-nano-omni/Dockerfile +# - HF_TOKEN secret created: +# kubectl create secret generic hf-token-secret \ +# --from-literal=HF_TOKEN=<YOUR_HF_TOKEN> -n <YOUR_NAMESPACE> +# +# Replace image references before applying: +# <YOUR_REGISTRY>/nemotron-omni-vllm:latest +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: nemotron-omni-vllm-agg +spec: + backendFramework: vllm + pvcs: + - name: model-cache + create: false + services: + Frontend: + componentType: frontend + envFromSecret: hf-token-secret + replicas: 1 + volumeMounts: + - name: model-cache + mountPoint: /model-store + extraPodSpec: + mainContainer: + image: <YOUR_REGISTRY>/nemotron-omni-vllm:latest + imagePullPolicy: IfNotPresent + startupProbe: + httpGet: + path: /health + port: 8000 + periodSeconds: 10 + timeoutSeconds: 1800 + failureThreshold: 60 + env: + - name: HF_HOME + value: /model-store + command: + - /bin/bash + - -lc + args: + - >- + exec python3 -m dynamo.frontend + --router-mode kv + --no-kv-events + --http-port 8000 + + VllmWorker: + componentType: worker + envFromSecret: hf-token-secret + replicas: 1 + resources: + limits: + gpu: "1" + requests: + gpu: "1" + volumeMounts: + - name: model-cache + mountPoint: /model-store + sharedMemory: + size: 16Gi + extraPodSpec: + mainContainer: + image: <YOUR_REGISTRY>/nemotron-omni-vllm:latest + imagePullPolicy: IfNotPresent + startupProbe: + httpGet: + path: /health + port: 9090 + periodSeconds: 10 + timeoutSeconds: 10 + failureThreshold: 120 + env: + - name: HF_HOME + value: /model-store + # Match the --media-io-kwargs num_frames so dynamo's multimodal + # preprocessor and vLLM agree on the video frame ceiling. 
+ - name: DYN_MM_VIDEO_NUM_FRAMES + value: "512" + command: + - /bin/bash + - -lc + args: + - >- + exec python3 -m dynamo.vllm + --model nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4 + --served-model-name nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4 + --max-model-len 131072 + --enable-multimodal + --media-io-kwargs '{"video": {"num_frames": 512, "fps": 1}}' + --trust-remote-code + --video-pruning-rate 0.5 + --dyn-tool-call-parser nemotron_nano + --dyn-reasoning-parser nemotron_nano