diff --git a/.github/actions/docker-build/action.yml b/.github/actions/docker-build/action.yml index c3637b853bc6..5c3299fc3877 100644 --- a/.github/actions/docker-build/action.yml +++ b/.github/actions/docker-build/action.yml @@ -80,7 +80,9 @@ runs: ECR_HOSTNAME: ${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com run: | sudo apt-get update && sudo apt-get install -y git build-essential protobuf-compiler libclang-dev - curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain stable + curl --retry 5 --retry-delay 2 --fail --show-error -sSL -o /tmp/rustup-init.sh https://sh.rustup.rs + sh /tmp/rustup-init.sh -y --default-toolchain stable + rm -f /tmp/rustup-init.sh . "$HOME/.cargo/env" echo "$HOME/.cargo/bin" >> "$GITHUB_PATH" cargo install cbindgen diff --git a/.github/actions/install-vcluster-cli/action.yml b/.github/actions/install-vcluster-cli/action.yml index 572d979a153a..b3e127b36e24 100644 --- a/.github/actions/install-vcluster-cli/action.yml +++ b/.github/actions/install-vcluster-cli/action.yml @@ -23,10 +23,14 @@ runs: aarch64) VCLUSTER_ARCH="arm64" ;; *) echo "Unsupported architecture: ${ARCH}"; exit 1 ;; esac - curl -sL -o /tmp/vcluster \ + TMP_BIN="$(mktemp -p /tmp vcluster.XXXXXX)" + curl --retry 5 --retry-delay 2 \ + --connect-timeout 10 --max-time 120 \ + --fail --show-error -sL \ + -o "${TMP_BIN}" \ "https://github.com/loft-sh/vcluster/releases/download/${{ inputs.vcluster_version }}/vcluster-linux-${VCLUSTER_ARCH}" - sudo mv /tmp/vcluster /usr/local/bin/vcluster - sudo chmod +x /usr/local/bin/vcluster + sudo install -m 0755 "${TMP_BIN}" /usr/local/bin/vcluster + rm -f "${TMP_BIN}" vcluster version fi echo "::endgroup::" diff --git a/.github/workflows/nightly-ci.yml b/.github/workflows/nightly-ci.yml index d92bbe88d71c..d671a7c98618 100644 --- a/.github/workflows/nightly-ci.yml +++ b/.github/workflows/nightly-ci.yml @@ -258,7 +258,7 @@ jobs: PROTOC_VER="30.2" PROTOC_ZIP="protoc-${PROTOC_VER}-linux-x86_64.zip" 
PROTOC_SHA256="327e9397c6fb3ea2a542513a3221334c6f76f7aa524a7d2561142b67b312a01f" - curl -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}" + curl --retry 5 --retry-delay 2 -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}" echo "${PROTOC_SHA256} ${PROTOC_ZIP}" | sha256sum -c - unzip "${PROTOC_ZIP}" -d $HOME/.local rm "${PROTOC_ZIP}" diff --git a/.github/workflows/pre-merge.yml b/.github/workflows/pre-merge.yml index 3693b06728a2..7e96b753e350 100644 --- a/.github/workflows/pre-merge.yml +++ b/.github/workflows/pre-merge.yml @@ -81,7 +81,7 @@ jobs: PROTOC_VER="30.2" PROTOC_ZIP="protoc-${PROTOC_VER}-linux-x86_64.zip" PROTOC_SHA256="327e9397c6fb3ea2a542513a3221334c6f76f7aa524a7d2561142b67b312a01f" - curl -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}" + curl --retry 5 --retry-delay 2 -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}" echo "${PROTOC_SHA256} ${PROTOC_ZIP}" | sha256sum -c - unzip "${PROTOC_ZIP}" -d $HOME/.local rm "${PROTOC_ZIP}" @@ -140,7 +140,7 @@ jobs: PROTOC_VER="30.2" PROTOC_ZIP="protoc-${PROTOC_VER}-linux-x86_64.zip" PROTOC_SHA256="327e9397c6fb3ea2a542513a3221334c6f76f7aa524a7d2561142b67b312a01f" - curl -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}" + curl --retry 5 --retry-delay 2 -fsSLO "$PB_REL/download/v${PROTOC_VER}/${PROTOC_ZIP}" echo "${PROTOC_SHA256} ${PROTOC_ZIP}" | sha256sum -c - unzip "${PROTOC_ZIP}" -d $HOME/.local rm "${PROTOC_ZIP}" diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 24dd4547445f..cc145b4c037b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -176,7 +176,10 @@ jobs: env: CRANE_VERSION: v0.20.2 run: | - curl -sL "https://github.com/google/go-containerregistry/releases/download/${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" \ + curl --retry 5 --retry-delay 2 \ + --connect-timeout 10 --max-time 120 \ + --fail --show-error -sL \ + 
"https://github.com/google/go-containerregistry/releases/download/${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" \ | tar -xzf - crane sudo mv crane /usr/local/bin/ crane version @@ -440,7 +443,10 @@ jobs: env: CRANE_VERSION: v0.20.2 run: | - curl -sL "https://github.com/google/go-containerregistry/releases/download/${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" \ + curl --retry 5 --retry-delay 2 \ + --connect-timeout 10 --max-time 120 \ + --fail --show-error -sL \ + "https://github.com/google/go-containerregistry/releases/download/${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" \ | tar -xzf - crane sudo mv crane /usr/local/bin/ crane version diff --git a/components/src/dynamo/frontend/sglang_prepost.py b/components/src/dynamo/frontend/sglang_prepost.py index ec9ebf0f6830..3edbbc8cb55d 100644 --- a/components/src/dynamo/frontend/sglang_prepost.py +++ b/components/src/dynamo/frontend/sglang_prepost.py @@ -511,6 +511,69 @@ def _try_parse_json_array(text: str) -> list | None: return None +def _decode_single_token(tokenizer, token_id: int) -> str | None: + try: + return tokenizer.decode([token_id], skip_special_tokens=False) + except Exception as exc: + logger.exception( + "Failed to decode token for chat logprobs: token_id=%s tokenizer=%s", + token_id, + type(tokenizer).__name__, + ) + raise RuntimeError( + f"Failed to decode token_id={token_id} with tokenizer={type(tokenizer).__name__}" + ) from exc + + +def _build_chat_logprobs( + *, + tokenizer, + token_ids: list[int], + log_probs: list[float] | None, + top_logprobs: list[list[dict[str, Any]]] | None, +) -> dict[str, Any] | None: + if not log_probs: + return None + + content: list[dict[str, Any]] = [] + for idx, logprob in enumerate(log_probs): + if idx >= len(token_ids): + break + + token_id = token_ids[idx] + token_text = _decode_single_token(tokenizer, token_id) + token_top_logprobs = [] + if top_logprobs and idx < len(top_logprobs): + for item in top_logprobs[idx]: + 
token_text_alt = item.get("token") + if token_text_alt is None and item.get("token_id") is not None: + token_text_alt = _decode_single_token(tokenizer, item["token_id"]) + token_top_logprobs.append( + { + "token": token_text_alt, + "logprob": item.get("logprob"), + "bytes": ( + list(token_text_alt.encode("utf-8")) + if token_text_alt is not None + else None + ), + } + ) + + content.append( + { + "token": token_text, + "logprob": float(logprob), + "bytes": ( + list(token_text.encode("utf-8")) if token_text is not None else None + ), + "top_logprobs": token_top_logprobs, + } + ) + + return {"content": content} + + class SglangStreamingPostProcessor: """Streaming post-processor using SGLang parsers and HF tokenizer detokenization. @@ -621,6 +684,12 @@ def process_output(self, engine_response: dict[str, Any]) -> dict[str, Any] | No raw_ids = engine_response.get("token_ids") token_ids = raw_ids if isinstance(raw_ids, list) else list(raw_ids or []) finish_reason = engine_response.get("finish_reason") + logprobs = _build_chat_logprobs( + tokenizer=self.tokenizer, + token_ids=token_ids, + log_probs=engine_response.get("log_probs"), + top_logprobs=engine_response.get("top_logprobs"), + ) delta_text = self._incremental_decode(token_ids) if token_ids else "" @@ -630,14 +699,14 @@ def process_output(self, engine_response: dict[str, Any]) -> dict[str, Any] | No "index": 0, "delta": {"role": "assistant", "content": delta_text}, "finish_reason": finish_reason, - "logprobs": None, + "logprobs": logprobs, } elif finish_reason: return { "index": 0, "delta": {}, "finish_reason": finish_reason, - "logprobs": None, + "logprobs": logprobs, } return None @@ -854,7 +923,7 @@ def process_output(self, engine_response: dict[str, Any]) -> dict[str, Any] | No "index": 0, "delta": delta if has_content else {}, "finish_reason": effective_finish, - "logprobs": None, + "logprobs": logprobs, } return None diff --git a/components/src/dynamo/frontend/tests/test_sglang_processor_unit.py 
b/components/src/dynamo/frontend/tests/test_sglang_processor_unit.py index ba68a376bf9f..08bc69374933 100644 --- a/components/src/dynamo/frontend/tests/test_sglang_processor_unit.py +++ b/components/src/dynamo/frontend/tests/test_sglang_processor_unit.py @@ -1416,6 +1416,32 @@ def test_fast_path_content_output(self, tokenizer): assert choice["index"] == 0 assert choice["logprobs"] is None + def test_fast_path_includes_logprobs(self, tokenizer): + """Fast path maps engine log_probs/top_logprobs into OpenAI chat logprobs.""" + post = SglangStreamingPostProcessor( + tokenizer=tokenizer, tool_call_parser=None, reasoning_parser=None + ) + token_ids = tokenizer.encode("Hello") + choice = post.process_output( + { + "token_ids": token_ids, + "log_probs": [-0.1] * len(token_ids), + "top_logprobs": [ + [{"token": tokenizer.decode([tid]), "logprob": -0.1}] + for tid in token_ids + ], + "finish_reason": None, + } + ) + + assert choice is not None + assert choice["logprobs"] is not None + content = choice["logprobs"]["content"] + assert len(content) == len(token_ids) + assert content[0]["logprob"] == -0.1 + assert "top_logprobs" in content[0] + assert "bytes" in content[0] + # --------------------------------------------------------------------------- # SglangStreamingPostProcessor: reasoning parsing diff --git a/components/src/dynamo/planner/config/planner_config.py b/components/src/dynamo/planner/config/planner_config.py index 93e074bcb6fd..307757381d45 100644 --- a/components/src/dynamo/planner/config/planner_config.py +++ b/components/src/dynamo/planner/config/planner_config.py @@ -52,7 +52,8 @@ class PlannerConfig(BaseModel): "kubernetes", "virtual", "global-planner" ] = SLAPlannerDefaults.environment namespace: str = Field( - default_factory=lambda: os.environ.get("DYN_NAMESPACE", "dynamo") + default_factory=lambda: os.environ.get("DYN_NAMESPACE", "dynamo"), + exclude=True, ) backend: Literal["vllm", "sglang", "trtllm", "mocker"] = SLAPlannerDefaults.backend mode: 
Literal["disagg", "prefill", "decode", "agg"] = SLAPlannerDefaults.mode @@ -109,7 +110,8 @@ class PlannerConfig(BaseModel): default_factory=lambda: os.environ.get( "PROMETHEUS_ENDPOINT", "http://prometheus-kube-prometheus-prometheus.monitoring.svc.cluster.local:9090", - ) + ), + exclude=True, ) metric_reporting_prometheus_port: int = Field( default_factory=lambda: int(os.environ.get("PLANNER_PROMETHEUS_PORT", 0)) diff --git a/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py b/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py index 50d123dd92f3..4a068cee2d67 100644 --- a/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py +++ b/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py @@ -201,63 +201,74 @@ def _build_logprob_kwargs(request: Dict[str, Any]) -> Dict[str, Any]: return kwargs @staticmethod + def _format_top_logprobs(raw_top_logprobs: Any) -> list[dict[str, Any]]: + formatted: list[dict[str, Any]] = [] + if not raw_top_logprobs: + return formatted + + for rank, item in enumerate(raw_top_logprobs): + if not item: + continue + + logprob = float(item[0]) if item[0] is not None else None + token_id = item[1] if len(item) > 1 else None + token = item[2] if len(item) > 2 else None + formatted.append( + { + "rank": rank, + "token_id": token_id, + "token": token, + "logprob": logprob, + "bytes": list(token.encode("utf-8")) if token else None, + } + ) + + return formatted + + @classmethod def _extract_logprobs( - meta_info: Dict[str, Any], num_output_logprobs_so_far: int - ) -> tuple: - """Extract logprobs from SGLang meta_info for new tokens. 
+ cls, meta_info: Dict[str, Any], start_offset: int, num_new_tokens: int + ) -> tuple[list[float] | None, list[list[dict[str, Any]]] | None, int]: + """Extract logprobs from SGLang meta_info for newly emitted tokens.""" + if num_new_tokens <= 0 or start_offset < 0: + return None, None, start_offset + + output_token_logprobs = meta_info.get("output_token_logprobs") or [] + if not output_token_logprobs or start_offset >= len(output_token_logprobs): + return None, None, start_offset + + end_offset = start_offset + num_new_tokens + selected_logprobs = output_token_logprobs[start_offset:end_offset] + + output_top_logprobs = meta_info.get("output_top_logprobs") or [] + selected_top_logprobs = ( + output_top_logprobs[start_offset:end_offset] if output_top_logprobs else [] + ) - While Dynamo forces stream_output=True (args.py) so that output_ids - are disjoint per chunk, SGLang's output_token_logprobs and - output_top_logprobs in meta_info are always cumulative. We track an - offset to slice out only the new entries each chunk. + log_probs: list[float] = [] + top_logprobs: list[list[dict[str, Any]]] = [] - Args: - meta_info: SGLang response meta_info dict. - num_output_logprobs_so_far: Number of logprob entries already - processed in previous chunks. 
+ for idx, token_logprob in enumerate(selected_logprobs): + if not token_logprob: + continue - Returns: - Tuple of (log_probs, top_logprobs, new_total): - - log_probs: List of floats (selected token logprob per position) - - top_logprobs: List of lists of dicts with rank/token_id/token/logprob - - new_total: Updated count of logprob entries processed so far - """ - output_token_logprobs = meta_info.get("output_token_logprobs") - if not output_token_logprobs: - return None, None, num_output_logprobs_so_far - - new_logprobs = output_token_logprobs[num_output_logprobs_so_far:] - if not new_logprobs: - return None, None, num_output_logprobs_so_far - - # Extract selected-token logprobs: each entry is (logprob, token_id, text_or_None) - log_probs = [float(entry[0]) for entry in new_logprobs] - - # Extract top logprobs if available - top_logprobs: list[list[dict[str, Any]]] | None = None - output_top = meta_info.get("output_top_logprobs") - if output_top: - new_top = output_top[num_output_logprobs_so_far:] - if new_top: - top_logprobs = [] - for position_entries in new_top: - if position_entries is None: - top_logprobs.append([]) - continue - position_list = [] - for rank_idx, entry in enumerate(position_entries): - position_list.append( - { - "rank": rank_idx + 1, - "token_id": entry[1], - "token": entry[2], - "logprob": float(entry[0]), - } - ) - top_logprobs.append(position_list) - - new_total = len(output_token_logprobs) - return log_probs, top_logprobs, new_total + log_probs.append(float(token_logprob[0])) + if selected_top_logprobs: + raw_top = ( + selected_top_logprobs[idx] + if idx < len(selected_top_logprobs) + else None + ) + top_logprobs.append(cls._format_top_logprobs(raw_top)) + + if not log_probs: + return None, None, start_offset + + return ( + log_probs, + top_logprobs if selected_top_logprobs else None, + start_offset + len(selected_logprobs), + ) async def generate( self, request: Dict[str, Any], context: Context @@ -390,8 +401,9 @@ async def 
_process_token_stream( """ # Use Future pattern for request ID - will be set when first response arrives request_id_future: asyncio.Future[str] = asyncio.Future() - # Logprob offset: output_ids are disjoint (stream_output=True) but - # meta_info logprobs are cumulative — track how many we've emitted. + # output_ids are disjoint per chunk, while SGLang logprob arrays are + # cumulative. Track the emitted-token offset so each chunk slices the + # matching cumulative window. num_output_logprobs_so_far = 0 async with self._cancellation_monitor(request_id_future, context): async for res in stream_source: @@ -425,18 +437,19 @@ async def _process_token_stream( # Pass through disjoint token segments directly out["token_ids"] = output_ids - - # Extract logprobs for new tokens if available ( log_probs, top_logprobs, num_output_logprobs_so_far, - ) = self._extract_logprobs(res["meta_info"], num_output_logprobs_so_far) + ) = self._extract_logprobs( + res.get("meta_info", {}), + num_output_logprobs_so_far, + len(output_ids), + ) if log_probs is not None: out["log_probs"] = log_probs if top_logprobs is not None: out["top_logprobs"] = top_logprobs - routed_experts = res["meta_info"].get("routed_experts") if routed_experts is not None: # Base64-encode tensor bytes to match sglang's output format. 
diff --git a/components/src/dynamo/sglang/tests/test_sglang_decode_handler.py b/components/src/dynamo/sglang/tests/test_sglang_decode_handler.py index 7e3d546549b8..226973082508 100644 --- a/components/src/dynamo/sglang/tests/test_sglang_decode_handler.py +++ b/components/src/dynamo/sglang/tests/test_sglang_decode_handler.py @@ -3,7 +3,10 @@ import pytest -from dynamo.sglang.request_handlers.llm.decode_handler import _extract_media_urls +from dynamo.sglang.request_handlers.llm.decode_handler import ( + DecodeWorkerHandler, + _extract_media_urls, +) pytestmark = [ pytest.mark.unit, @@ -34,3 +37,74 @@ def test_extract_media_urls_returns_none_for_missing_or_invalid_items(): assert ( _extract_media_urls({"image_url": [{"ignored": "value"}]}, "image_url") is None ) + + +def test_build_logprob_kwargs_token_request_uses_output_options_logprobs(): + assert DecodeWorkerHandler._build_logprob_kwargs( + {"output_options": {"logprobs": 3}} + ) == { + "return_logprob": True, + "top_logprobs_num": 3, + } + + +def test_extract_logprobs_from_meta_info(): + log_probs, top_logprobs = DecodeWorkerHandler._extract_logprobs( + { + "output_token_logprobs": [[-0.25, 11, " hello"], [-0.5, 12, " world"]], + "output_top_logprobs": [ + [[-0.25, 11, " hello"], [-1.0, 99, " hi"]], + [[-0.5, 12, " world"]], + ], + }, + 0, + 2, + ) + + assert log_probs == [-0.25, -0.5] + assert top_logprobs is not None + assert top_logprobs[0][0]["token"] == " hello" + assert top_logprobs[0][0]["bytes"] == list(" hello".encode("utf-8")) + + +def test_extract_logprobs_uses_start_offset_for_streaming_chunks(): + log_probs, top_logprobs = DecodeWorkerHandler._extract_logprobs( + { + "output_token_logprobs": [ + [-0.1, 10, " a"], + [-0.2, 11, " b"], + [-0.3, 12, " c"], + ], + "output_top_logprobs": [ + [[-0.1, 10, " a"]], + [[-0.2, 11, " b"]], + [[-0.3, 12, " c"]], + ], + }, + 1, + 1, + ) + + assert log_probs == [-0.2] + assert top_logprobs == [ + [ + { + "rank": 0, + "token_id": 11, + "token": " b", + "logprob": 
-0.2, + "bytes": list(" b".encode("utf-8")), + } + ] + ] + + +def test_extract_logprobs_returns_none_when_offset_exceeds_available_entries(): + log_probs, top_logprobs = DecodeWorkerHandler._extract_logprobs( + {"output_token_logprobs": [[-0.25, 11, " hello"]]}, + 1, + 5, + ) + + assert log_probs is None + assert top_logprobs is None diff --git a/deploy/operator/internal/consts/consts.go b/deploy/operator/internal/consts/consts.go index 2d8b67ca533b..a1b9222bb98c 100644 --- a/deploy/operator/internal/consts/consts.go +++ b/deploy/operator/internal/consts/consts.go @@ -27,6 +27,8 @@ const ( DynamoNixlPort = 19090 DynamoNixlPortName = "nixl" + DynamoFPMBasePort = 20380 + MpiRunSshPort = 2222 // Default security context values diff --git a/deploy/operator/internal/controller/dynamocomponentdeployment_controller_test.go b/deploy/operator/internal/controller/dynamocomponentdeployment_controller_test.go index 9216c3c453da..4aaf920cf286 100644 --- a/deploy/operator/internal/controller/dynamocomponentdeployment_controller_test.go +++ b/deploy/operator/internal/controller/dynamocomponentdeployment_controller_test.go @@ -773,6 +773,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. {Name: "CONTAINER_NAME", Value: commonconsts.MainContainerName}, {Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypeWorker}, {Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"}, + {Name: "DYN_FORWARDPASS_METRIC_PORT", Value: "20380"}, {Name: "DYN_HEALTH_CHECK_ENABLED", Value: "false"}, {Name: commonconsts.DynamoNamespaceEnvVar, Value: "default-test-lws-deploy"}, {Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-lws-deploy"}, @@ -916,6 +917,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. 
{Name: "CONTAINER_NAME", Value: commonconsts.MainContainerName}, {Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypeWorker}, {Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"}, + {Name: "DYN_FORWARDPASS_METRIC_PORT", Value: "20380"}, {Name: "DYN_HEALTH_CHECK_ENABLED", Value: "false"}, {Name: commonconsts.DynamoNamespaceEnvVar, Value: "default-test-lws-deploy"}, {Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-lws-deploy"}, diff --git a/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go b/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go index 59c0a8982b4a..eb70054264a0 100644 --- a/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go +++ b/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go @@ -1307,6 +1307,12 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context. Value: string(dgdr.UID), }, } + if r.Config.Infrastructure.PrometheusEndpoint != "" { + profilerEnv = append(profilerEnv, corev1.EnvVar{ + Name: "PROMETHEUS_ENDPOINT", + Value: r.Config.Infrastructure.PrometheusEndpoint, + }) + } // Build volume mounts volumeMounts := []corev1.VolumeMount{ diff --git a/deploy/operator/internal/dynamo/component_worker.go b/deploy/operator/internal/dynamo/component_worker.go index 38d9d0c45503..be19b61c3863 100644 --- a/deploy/operator/internal/dynamo/component_worker.go +++ b/deploy/operator/internal/dynamo/component_worker.go @@ -107,6 +107,10 @@ func (w *WorkerDefaults) GetBaseContainer(context ComponentContext) (corev1.Cont Name: "NIXL_TELEMETRY_PROMETHEUS_PORT", Value: fmt.Sprintf("%d", commonconsts.DynamoNixlPort), }, + { + Name: "DYN_FORWARDPASS_METRIC_PORT", + Value: fmt.Sprintf("%d", commonconsts.DynamoFPMBasePort), + }, }...) 
if context.WorkerHashSuffix != "" { diff --git a/deploy/operator/internal/dynamo/graph_test.go b/deploy/operator/internal/dynamo/graph_test.go index a6ab18bfc825..f5cab1e18a0d 100644 --- a/deploy/operator/internal/dynamo/graph_test.go +++ b/deploy/operator/internal/dynamo/graph_test.go @@ -2160,6 +2160,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { Name: "NIXL_TELEMETRY_PROMETHEUS_PORT", Value: "19090", }, + { + Name: "DYN_FORWARDPASS_METRIC_PORT", + Value: "20380", + }, { Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-dynamo-graph-deployment", @@ -2374,6 +2378,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { Name: "NIXL_TELEMETRY_PROMETHEUS_PORT", Value: "19090", }, + { + Name: "DYN_FORWARDPASS_METRIC_PORT", + Value: "20380", + }, { Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-dynamo-graph-deployment", @@ -3187,6 +3195,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { Name: "NIXL_TELEMETRY_PROMETHEUS_PORT", Value: "19090", }, + { + Name: "DYN_FORWARDPASS_METRIC_PORT", + Value: "20380", + }, { Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-dynamo-graph-deployment", @@ -3388,6 +3400,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { Name: "NIXL_TELEMETRY_PROMETHEUS_PORT", Value: "19090", }, + { + Name: "DYN_FORWARDPASS_METRIC_PORT", + Value: "20380", + }, { Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-dynamo-graph-deployment", @@ -5635,6 +5651,7 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) { {Name: "CONTAINER_NAME", Value: commonconsts.MainContainerName}, {Name: commonconsts.DynamoComponentEnvVar, Value: "worker"}, {Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"}, + {Name: "DYN_FORWARDPASS_METRIC_PORT", Value: "20380"}, {Name: "DYN_HEALTH_CHECK_ENABLED", Value: "false"}, {Name: commonconsts.DynamoNamespaceEnvVar, Value: "default-test-deployment"}, {Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-deployment"}, diff --git a/recipes/README.md b/recipes/README.md index 0783a8c87377..500aad85fa01 100644 --- 
a/recipes/README.md +++ b/recipes/README.md @@ -68,6 +68,7 @@ These recipes are under active development and may require additional setup step | Model | Framework | Mode | GPUs | Deployment | Notes | |-------|-----------|------|------|------------|-------| | **[GLM-5-NVFP4](glm-5-nvfp4/sglang/disagg/)** | SGLang | Disagg Prefill/Decode | 20x GB200 | ✅ | NVFP4, EAGLE speculative decoding, TP16 decode + TP4 prefill. Requires [custom container build](glm-5-nvfp4/). | +| **[Nemotron-3-Nano-Omni-NVFP4](nemotron-3-nano-omni/vllm/agg/)** | vLLM | Aggregated | 1x GPU | ✅ | Multimodal text/image/video/audio serving. Requires [custom container build](nemotron-3-nano-omni/). | | **[nvidia/Kimi-K2.5-NVFP4](kimi-k2.5/trtllm/agg/nvidia/)** | TensorRT-LLM | Aggregated | 8x B200 | ✅ | Text only — MoE model, TP8×EP8, reasoning + tool calling. Vision input not yet functional. | | **[DeepSeek-V4-Flash](deepseek-v4/deepseek-v4-flash/vllm/agg/)** | vLLM | Aggregated | 4x B200 | ✅ | Text only — MoE model (284B / 13B active), DP=4 + EP, FP8 KV cache, reasoning + tool calling. Requires [custom container build](deepseek-v4/container/). | | **[DeepSeek-V4-Flash](deepseek-v4/deepseek-v4-flash/sglang/agg/)** | SGLang | Aggregated | 4x B200 | ✅ | Text only — MoE model (284B / 13B active), TP=4, MXFP4 MoE via FlashInfer, EAGLE MTP (3 steps / 4 draft tokens), reasoning + tool calling. Prebuilt image available; optional [custom container build](deepseek-v4/container/). | diff --git a/recipes/nemotron-3-nano-omni/Dockerfile b/recipes/nemotron-3-nano-omni/Dockerfile new file mode 100644 index 000000000000..1d13afe628ff --- /dev/null +++ b/recipes/nemotron-3-nano-omni/Dockerfile @@ -0,0 +1,60 @@ +# syntax=docker/dockerfile:1.10.0 +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Layer the ai-dynamo wheel (and the omni multimodal extras) onto an upstream +# vLLM image. 
Dynamo is installed from https://pypi.nvidia.com/ai-dynamo/ — +# no source build, no Rust toolchain. Match DYNAMO_VERSION to a nightly that +# targets the same vLLM minor as BASE_IMAGE (see the wheel's METADATA for the +# pinned vllm version). +# +# Build: +# docker build -t /nemotron-omni-vllm:latest \ +# -f recipes/nemotron-3-nano-omni/Dockerfile \ +# recipes/nemotron-3-nano-omni +# +# Override defaults with --build-arg, e.g.: +# --build-arg BASE_IMAGE=vllm/vllm-openai:v0.20.0 +# --build-arg DYNAMO_VERSION=1.2.0.dev20260427 + +ARG BASE_IMAGE="vllm/vllm-openai:v0.20.0" +ARG DYNAMO_VERSION="1.2.0.dev20260427" + +FROM ${BASE_IMAGE} +USER root + +ARG DYNAMO_VERSION + +# ai-dynamo Python package (dynamo.frontend, dynamo.vllm, ...) installed with +# --no-deps so that: +# - the base image's vLLM 0.20 / torch / cuda stack is preserved (otherwise +# pip would try to re-install vllm[flashinfer,otel,runai]==0.20.0 which is +# declared by ai-dynamo[vllm]), and +# - we skip `nixl` (KV transport, only used for disagg) and `ray` (multi-node +# orchestration), neither of which an aggregated single-GPU recipe needs. +RUN pip install --no-cache-dir --no-deps \ + --extra-index-url https://pypi.nvidia.com \ + ai-dynamo==${DYNAMO_VERSION} + +# ai-dynamo's declared core dependencies, plus the Rust _core extension wheel +# (`ai-dynamo-runtime`) that the Python package imports natively. `kubernetes`, +# `pydantic`, and `pydantic-settings` are explicitly pinned because the latest +# unpinned versions (kubernetes 35.x, pydantic 2.13.x) violate ai-dynamo's +# declared compatibility ranges. 
+RUN pip install --no-cache-dir \ + --extra-index-url https://pypi.nvidia.com \ + ai-dynamo-runtime==${DYNAMO_VERSION} \ + "kubernetes<33.0.0,>=32.0.1" \ + "pydantic<2.13" "pydantic-settings<2.13.0" \ + msgpack msgspec prometheus-client pyzmq transformers + +# ai-dynamo[vllm] extras minus nixl + ray (handled above), plus the +# multimodal-video Python deps (av/ftfy/nvtx/sentencepiece) that the Nemotron +# Nano Omni model needs at runtime but which aren't pulled in by any extra. +RUN pip install --no-cache-dir \ + blake3 librosa soundfile uvloop \ + av ftfy nvtx sentencepiece + +# vllm/vllm-openai's default ENTRYPOINT runs `vllm serve`; reset it so the +# image behaves as a plain dynamo runtime image. +ENTRYPOINT ["/bin/bash"] diff --git a/recipes/nemotron-3-nano-omni/README.md b/recipes/nemotron-3-nano-omni/README.md new file mode 100644 index 000000000000..e4cbfe6031a1 --- /dev/null +++ b/recipes/nemotron-3-nano-omni/README.md @@ -0,0 +1,185 @@ + + +# Nemotron 3 Nano Omni NVFP4 + +Serves [nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4](https://huggingface.co/nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4) +using vLLM with an aggregated Dynamo deployment. + +This recipe builds a custom container that layers the `ai-dynamo` wheel +(from ) onto an upstream vLLM image — no +source build, no Rust toolchain. 
+ +## Topology + +| Role | Replicas | GPUs/replica | Notes | +|------|----------|--------------|-------| +| Frontend | 1 | 0 | Dynamo frontend with prefix-hash KV routing | +| vLLM worker | 1 | 1 | Text, image, video, and audio inputs | + +## Prerequisites + +- A Kubernetes cluster with the [Dynamo Operator](../../docs/kubernetes/README.md) installed +- One NVIDIA GPU per worker replica +- Shared PVC storage for the Hugging Face model cache +- Hugging Face access to `nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4` + +## Step 1: Build the Container + +```bash +docker build \ + -t /nemotron-omni-vllm:latest \ + -f recipes/nemotron-3-nano-omni/Dockerfile \ + recipes/nemotron-3-nano-omni +docker push /nemotron-omni-vllm:latest +``` + +Useful build args: + +- `BASE_IMAGE=` — pin to a different vLLM base (default `vllm/vllm-openai:v0.20.0`). +- `DYNAMO_VERSION=` — pin to a specific `ai-dynamo` release or nightly from . Default tracks the latest tested nightly. Make sure the chosen wheel's `vllm` dependency matches `BASE_IMAGE`. + +## Step 2: Download the Model + +Create the PVC, Hugging Face token secret, and download the model weights: + +```bash +export NAMESPACE= + +# Create the namespace if it does not already exist. +kubectl create namespace ${NAMESPACE} --dry-run=client -o yaml | kubectl apply -f - + +# First edit storageClassName in model-cache.yaml for your cluster. 
+kubectl apply -f recipes/nemotron-3-nano-omni/model-cache/model-cache.yaml -n ${NAMESPACE} + +kubectl create secret generic hf-token-secret \ + --from-literal=HF_TOKEN= \ + -n ${NAMESPACE} + +kubectl apply -f recipes/nemotron-3-nano-omni/model-cache/model-download.yaml -n ${NAMESPACE} +kubectl wait --for=condition=complete job/model-download -n ${NAMESPACE} --timeout=3600s +``` + +## Step 3: Deploy + +Edit `vllm/agg/deploy.yaml` and replace all `` values: + +- `/nemotron-omni-vllm:latest` - your built container image + +If your registry is private, add the appropriate `imagePullSecrets` to the +deployment. + +```bash +kubectl apply -f recipes/nemotron-3-nano-omni/vllm/agg/deploy.yaml -n ${NAMESPACE} +``` + +Monitor startup: + +```bash +kubectl get pods -n ${NAMESPACE} -l nvidia.com/dynamo-graph-deployment-name=nemotron-omni-vllm-agg -w +``` + +## Step 4: Test + +```bash +kubectl port-forward svc/nemotron-omni-vllm-agg-frontend 8000:8000 -n ${NAMESPACE} +``` + +In another terminal, send a minimal text request: + +```bash +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4", + "messages": [{"role": "user", "content": "Hello!"}], + "max_tokens": 128 + }' +``` + +To exercise the multimodal path, attach an image: + +```bash +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4", + "messages": [{ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png"}}, + {"type": "text", "text": "Describe what is in this image."} + ] + }], + "max_tokens": 256 + }' +``` + +…or an audio clip: + +```bash +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": 
"nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4", + "messages": [{ + "role": "user", + "content": [ + {"type": "audio_url", "audio_url": {"url": "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav"}}, + {"type": "text", "text": "Transcribe this audio clip."} + ] + }], + "max_tokens": 256 + }' +``` + +## Key Configuration Notes + +- `--enable-multimodal` enables image, video, and audio inputs. +- `--media-io-kwargs '{"video": {"num_frames": 512, "fps": 1}}'` samples long + videos at one frame per second, capped at 512 frames. +- `--dyn-tool-call-parser nemotron_nano` and + `--dyn-reasoning-parser nemotron_nano` enable Nemotron Nano tool-call and + reasoning parsing. +- The frontend uses `--router-mode kv --no-kv-events`, which approximates + KV-aware routing with prefix hashing without requiring backend KV events. + +## Optional: Run without NATS + +The Dynamo runtime defaults to NATS for the event plane and connects to a +NATS server if `NATS_SERVER` is set in the environment (the operator +auto-injects this on most clusters). On clusters without NATS — or where +you'd rather avoid the dependency — you can run on TCP request plane + ZMQ +event plane only. Add to both Frontend and VllmWorker: + +```yaml +mainContainer: + env: + - name: DYN_EVENT_PLANE + value: zmq + command: ["/bin/bash", "-lc"] + args: + # Operator-injected NATS_SERVER takes effect even when set to ""; we have + # to actually unset it before the runtime reads env. + - >- + unset NATS_SERVER && + exec python3 -m dynamo.frontend ... # or dynamo.vllm +``` + +The request plane defaults to TCP already, so no further flags are needed. 
+ +## File Layout + +```text +recipes/nemotron-3-nano-omni/ + README.md + Dockerfile + model-cache/ + model-cache.yaml + model-download.yaml + vllm/ + agg/ + deploy.yaml +``` diff --git a/recipes/nemotron-3-nano-omni/model-cache/model-cache.yaml b/recipes/nemotron-3-nano-omni/model-cache/model-cache.yaml new file mode 100644 index 000000000000..5d6e2b6e998b --- /dev/null +++ b/recipes/nemotron-3-nano-omni/model-cache/model-cache.yaml @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: model-cache +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 250Gi + storageClassName: "your-storage-class-name" diff --git a/recipes/nemotron-3-nano-omni/model-cache/model-download.yaml b/recipes/nemotron-3-nano-omni/model-cache/model-download.yaml new file mode 100644 index 000000000000..6e34cf65512b --- /dev/null +++ b/recipes/nemotron-3-nano-omni/model-cache/model-download.yaml @@ -0,0 +1,48 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +apiVersion: batch/v1 +kind: Job +metadata: + name: model-download +spec: + backoffLimit: 3 + completions: 1 + parallelism: 1 + template: + metadata: + labels: + app: model-download + spec: + restartPolicy: Never + containers: + - name: model-download + image: python:3.10-slim + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + seccompProfile: + type: RuntimeDefault + command: ["sh", "-c"] + envFrom: + - secretRef: + name: hf-token-secret + env: + - name: MODEL_NAME + value: nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4 + - name: HF_HOME + value: /model-store + - name: HF_XET_HIGH_PERFORMANCE + value: "1" + args: + - | + set -eux + pip install --no-cache-dir huggingface_hub==1.11.0 + hf download "$MODEL_NAME" + volumeMounts: + - name: model-cache + mountPath: /model-store + volumes: + - name: model-cache + persistentVolumeClaim: + claimName: model-cache diff --git a/recipes/nemotron-3-nano-omni/vllm/agg/deploy.yaml b/recipes/nemotron-3-nano-omni/vllm/agg/deploy.yaml new file mode 100644 index 000000000000..1c03c392f811 --- /dev/null +++ b/recipes/nemotron-3-nano-omni/vllm/agg/deploy.yaml @@ -0,0 +1,102 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# Nemotron Nano Omni aggregated vLLM deployment. 
+# +# Prerequisites: +# - Dynamo Platform installed +# - Model weights downloaded into the model-cache PVC +# - Container built from recipes/nemotron-3-nano-omni/Dockerfile +# - HF_TOKEN secret created: +# kubectl create secret generic hf-token-secret \ +# --from-literal=HF_TOKEN=<YOUR_HF_TOKEN> -n <YOUR_NAMESPACE> +# +# Replace image references before applying: +# <YOUR_REGISTRY>/nemotron-omni-vllm:latest +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: nemotron-omni-vllm-agg +spec: + backendFramework: vllm + pvcs: + - name: model-cache + create: false + services: + Frontend: + componentType: frontend + envFromSecret: hf-token-secret + replicas: 1 + volumeMounts: + - name: model-cache + mountPoint: /model-store + extraPodSpec: + mainContainer: + image: <YOUR_REGISTRY>/nemotron-omni-vllm:latest + imagePullPolicy: IfNotPresent + startupProbe: + httpGet: + path: /health + port: 8000 + periodSeconds: 10 + timeoutSeconds: 1800 + failureThreshold: 60 + env: + - name: HF_HOME + value: /model-store + command: + - /bin/bash + - -lc + args: + - >- + exec python3 -m dynamo.frontend + --router-mode kv + --no-kv-events + --http-port 8000 + + VllmWorker: + componentType: worker + envFromSecret: hf-token-secret + replicas: 1 + resources: + limits: + gpu: "1" + requests: + gpu: "1" + volumeMounts: + - name: model-cache + mountPoint: /model-store + sharedMemory: + size: 16Gi + extraPodSpec: + mainContainer: + image: <YOUR_REGISTRY>/nemotron-omni-vllm:latest + imagePullPolicy: IfNotPresent + startupProbe: + httpGet: + path: /health + port: 9090 + periodSeconds: 10 + timeoutSeconds: 10 + failureThreshold: 120 + env: + - name: HF_HOME + value: /model-store + # Match the --media-io-kwargs num_frames so dynamo's multimodal + # preprocessor and vLLM agree on the video frame ceiling. 
+ - name: DYN_MM_VIDEO_NUM_FRAMES + value: "512" + command: + - /bin/bash + - -lc + args: + - >- + exec python3 -m dynamo.vllm + --model nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4 + --served-model-name nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4 + --max-model-len 131072 + --enable-multimodal + --media-io-kwargs '{"video": {"num_frames": 512, "fps": 1}}' + --trust-remote-code + --video-pruning-rate 0.5 + --dyn-tool-call-parser nemotron_nano + --dyn-reasoning-parser nemotron_nano