From 7c0c63d3981f212aa73c4d96c2fc4ca5c5340729 Mon Sep 17 00:00:00 2001 From: bongwoobak Date: Wed, 1 Apr 2026 02:00:11 +0900 Subject: [PATCH 1/4] MAF-19524: feat(preset): add H200 presets for AI& April launch models Add vLLM v0.15.1 E2E presets for H200-SXM targeting the AI& April launch model scope. All presets include ISVC_USE_KV_EVENTS for precise-prefix-cache-aware Heimdall scheduling. New models: - Qwen3.5-9B (tp1), Qwen3.5-27B (tp1), Qwen3.5-27B-FP8 (tp1) - Qwen3.5-397B (tp8, expert parallel) - DeepSeek V3.2 (tp8, expert parallel) - Nemotron Super 120B BF16/FP8 (tp2) - Nemotron Nano 30B BF16/FP8 (tp1) - GLM-5 BF16/FP8 (tp8, expert parallel) --- ...v3.2-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml | 52 ++++++++++++++++++ ...o-30b-a3b-bf16-nvidia-h200-sxm-1.helm.yaml | 46 ++++++++++++++++ ...no-30b-a3b-fp8-nvidia-h200-sxm-1.helm.yaml | 46 ++++++++++++++++ ...bf16-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml | 48 +++++++++++++++++ ...-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml | 48 +++++++++++++++++ ...wen3.5-27b-fp8-nvidia-h200-sxm-1.helm.yaml | 46 ++++++++++++++++ ...en-qwen3.5-27b-nvidia-h200-sxm-1.helm.yaml | 46 ++++++++++++++++ ...397b-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml | 50 +++++++++++++++++ ...wen-qwen3.5-9b-nvidia-h200-sxm-1.helm.yaml | 46 ++++++++++++++++ ...-fp8-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml | 53 +++++++++++++++++++ ...lm-5-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml | 53 +++++++++++++++++++ 11 files changed, 534 insertions(+) create mode 100644 deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml create mode 100644 deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-bf16-nvidia-h200-sxm-1.helm.yaml create mode 100644 deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-fp8-nvidia-h200-sxm-1.helm.yaml create mode 100644 
deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-bf16-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml create mode 100644 deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml create mode 100644 deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-fp8-nvidia-h200-sxm-1.helm.yaml create mode 100644 deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-nvidia-h200-sxm-1.helm.yaml create mode 100644 deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-397b-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml create mode 100644 deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-9b-nvidia-h200-sxm-1.helm.yaml create mode 100644 deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml create mode 100644 deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml new file mode 100644 index 0000000..1103b70 --- /dev/null +++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml @@ -0,0 +1,52 @@ +apiVersion: odin.moreh.io/v1alpha1 +kind: InferenceServiceTemplate +metadata: + name: vllm-v0.15.1-deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-tp8-moe-ep8 + namespace: {{ include "common.names.namespace" . }} + labels: + {{- include "mif.preset.labels" . 
| nindent 4 }} + mif.moreh.io/model.org: deepseek-ai + mif.moreh.io/model.name: deepseek-v3.2 + mif.moreh.io/role: e2e + mif.moreh.io/accelerator.vendor: nvidia + mif.moreh.io/accelerator.model: h200-sxm + mif.moreh.io/parallelism: "tp8-moe-ep8" +spec: + framework: vllm + model: + name: deepseek-ai/DeepSeek-V3.2 + parallelism: + tensor: 8 + expert: true + template: + spec: + containers: + - name: main + image: vllm/vllm-openai:v0.15.1 + env: + - name: VLLM_USE_DEEP_GEMM + value: "1" + - name: ISVC_USE_KV_EVENTS + value: "true" + - name: ISVC_EXTRA_ARGS + value: >- + --trust-remote-code + --max-model-len 65536 + --max-num-seqs 64 + --gpu-memory-utilization 0.90 + --kv-cache-dtype fp8 + --enable-chunked-prefill + --disable-uvicorn-access-log + --no-enable-log-requests + resources: + requests: + nvidia.com/gpu: "8" + limits: + nvidia.com/gpu: "8" + nodeSelector: + moai.moreh.io/accelerator.vendor: nvidia + moai.moreh.io/accelerator.model: h200-sxm + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-bf16-nvidia-h200-sxm-1.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-bf16-nvidia-h200-sxm-1.helm.yaml new file mode 100644 index 0000000..1c37fd9 --- /dev/null +++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-bf16-nvidia-h200-sxm-1.helm.yaml @@ -0,0 +1,46 @@ +apiVersion: odin.moreh.io/v1alpha1 +kind: InferenceServiceTemplate +metadata: + name: vllm-v0.15.1-nvidia-nvidia-nemotron-3-nano-30b-a3b-bf16-nvidia-h200-sxm-1 + namespace: {{ include "common.names.namespace" . }} + labels: + {{- include "mif.preset.labels" . 
| nindent 4 }} + mif.moreh.io/model.org: nvidia + mif.moreh.io/model.name: nvidia-nemotron-3-nano-30b-a3b-bf16 + mif.moreh.io/role: e2e + mif.moreh.io/accelerator.vendor: nvidia + mif.moreh.io/accelerator.model: h200-sxm + mif.moreh.io/parallelism: "1" +spec: + framework: vllm + model: + name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 + template: + spec: + containers: + - name: main + image: vllm/vllm-openai:v0.15.1 + env: + - name: ISVC_USE_KV_EVENTS + value: "true" + - name: ISVC_EXTRA_ARGS + value: >- + --trust-remote-code + --max-model-len 131072 + --max-num-seqs 128 + --gpu-memory-utilization 0.90 + --kv-cache-dtype fp8 + --disable-uvicorn-access-log + --no-enable-log-requests + resources: + requests: + nvidia.com/gpu: "1" + limits: + nvidia.com/gpu: "1" + nodeSelector: + moai.moreh.io/accelerator.vendor: nvidia + moai.moreh.io/accelerator.model: h200-sxm + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-fp8-nvidia-h200-sxm-1.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-fp8-nvidia-h200-sxm-1.helm.yaml new file mode 100644 index 0000000..16a8ebe --- /dev/null +++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-fp8-nvidia-h200-sxm-1.helm.yaml @@ -0,0 +1,46 @@ +apiVersion: odin.moreh.io/v1alpha1 +kind: InferenceServiceTemplate +metadata: + name: vllm-v0.15.1-nvidia-nvidia-nemotron-3-nano-30b-a3b-fp8-nvidia-h200-sxm-1 + namespace: {{ include "common.names.namespace" . }} + labels: + {{- include "mif.preset.labels" . 
| nindent 4 }} + mif.moreh.io/model.org: nvidia + mif.moreh.io/model.name: nvidia-nemotron-3-nano-30b-a3b-fp8 + mif.moreh.io/role: e2e + mif.moreh.io/accelerator.vendor: nvidia + mif.moreh.io/accelerator.model: h200-sxm + mif.moreh.io/parallelism: "1" +spec: + framework: vllm + model: + name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8 + template: + spec: + containers: + - name: main + image: vllm/vllm-openai:v0.15.1 + env: + - name: ISVC_USE_KV_EVENTS + value: "true" + - name: ISVC_EXTRA_ARGS + value: >- + --trust-remote-code + --max-model-len 131072 + --max-num-seqs 128 + --gpu-memory-utilization 0.90 + --kv-cache-dtype fp8 + --disable-uvicorn-access-log + --no-enable-log-requests + resources: + requests: + nvidia.com/gpu: "1" + limits: + nvidia.com/gpu: "1" + nodeSelector: + moai.moreh.io/accelerator.vendor: nvidia + moai.moreh.io/accelerator.model: h200-sxm + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-bf16-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-bf16-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml new file mode 100644 index 0000000..639c7f5 --- /dev/null +++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-bf16-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml @@ -0,0 +1,48 @@ +apiVersion: odin.moreh.io/v1alpha1 +kind: InferenceServiceTemplate +metadata: + name: vllm-v0.15.1-nvidia-nvidia-nemotron-3-super-120b-a12b-bf16-nvidia-h200-sxm-tp2-moe-tp2 + namespace: {{ include "common.names.namespace" . }} + labels: + {{- include "mif.preset.labels" . 
| nindent 4 }} + mif.moreh.io/model.org: nvidia + mif.moreh.io/model.name: nvidia-nemotron-3-super-120b-a12b-bf16 + mif.moreh.io/role: e2e + mif.moreh.io/accelerator.vendor: nvidia + mif.moreh.io/accelerator.model: h200-sxm + mif.moreh.io/parallelism: "tp2-moe-tp2" +spec: + framework: vllm + model: + name: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16 + parallelism: + tensor: 2 + template: + spec: + containers: + - name: main + image: vllm/vllm-openai:v0.15.1 + env: + - name: ISVC_USE_KV_EVENTS + value: "true" + - name: ISVC_EXTRA_ARGS + value: >- + --trust-remote-code + --max-model-len 131072 + --max-num-seqs 128 + --gpu-memory-utilization 0.90 + --kv-cache-dtype fp8 + --disable-uvicorn-access-log + --no-enable-log-requests + resources: + requests: + nvidia.com/gpu: "2" + limits: + nvidia.com/gpu: "2" + nodeSelector: + moai.moreh.io/accelerator.vendor: nvidia + moai.moreh.io/accelerator.model: h200-sxm + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml new file mode 100644 index 0000000..0706ffb --- /dev/null +++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml @@ -0,0 +1,48 @@ +apiVersion: odin.moreh.io/v1alpha1 +kind: InferenceServiceTemplate +metadata: + name: vllm-v0.15.1-nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2 + namespace: {{ include "common.names.namespace" . }} + labels: + {{- include "mif.preset.labels" . 
| nindent 4 }} + mif.moreh.io/model.org: nvidia + mif.moreh.io/model.name: nvidia-nemotron-3-super-120b-a12b-fp8 + mif.moreh.io/role: e2e + mif.moreh.io/accelerator.vendor: nvidia + mif.moreh.io/accelerator.model: h200-sxm + mif.moreh.io/parallelism: "tp2-moe-tp2" +spec: + framework: vllm + model: + name: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8 + parallelism: + tensor: 2 + template: + spec: + containers: + - name: main + image: vllm/vllm-openai:v0.15.1 + env: + - name: ISVC_USE_KV_EVENTS + value: "true" + - name: ISVC_EXTRA_ARGS + value: >- + --trust-remote-code + --max-model-len 131072 + --max-num-seqs 128 + --gpu-memory-utilization 0.90 + --kv-cache-dtype fp8 + --disable-uvicorn-access-log + --no-enable-log-requests + resources: + requests: + nvidia.com/gpu: "2" + limits: + nvidia.com/gpu: "2" + nodeSelector: + moai.moreh.io/accelerator.vendor: nvidia + moai.moreh.io/accelerator.model: h200-sxm + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-fp8-nvidia-h200-sxm-1.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-fp8-nvidia-h200-sxm-1.helm.yaml new file mode 100644 index 0000000..f6e20b3 --- /dev/null +++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-fp8-nvidia-h200-sxm-1.helm.yaml @@ -0,0 +1,46 @@ +apiVersion: odin.moreh.io/v1alpha1 +kind: InferenceServiceTemplate +metadata: + name: vllm-v0.15.1-qwen-qwen3.5-27b-fp8-nvidia-h200-sxm-1 + namespace: {{ include "common.names.namespace" . }} + labels: + {{- include "mif.preset.labels" . 
| nindent 4 }} + mif.moreh.io/model.org: qwen + mif.moreh.io/model.name: qwen3.5-27b-fp8 + mif.moreh.io/role: e2e + mif.moreh.io/accelerator.vendor: nvidia + mif.moreh.io/accelerator.model: h200-sxm + mif.moreh.io/parallelism: "1" +spec: + framework: vllm + model: + name: Qwen/Qwen3.5-27B-FP8 + template: + spec: + containers: + - name: main + image: vllm/vllm-openai:v0.15.1 + env: + - name: ISVC_USE_KV_EVENTS + value: "true" + - name: ISVC_EXTRA_ARGS + value: >- + --trust-remote-code + --max-model-len 131072 + --max-num-seqs 128 + --gpu-memory-utilization 0.90 + --kv-cache-dtype fp8 + --disable-uvicorn-access-log + --no-enable-log-requests + resources: + requests: + nvidia.com/gpu: "1" + limits: + nvidia.com/gpu: "1" + nodeSelector: + moai.moreh.io/accelerator.vendor: nvidia + moai.moreh.io/accelerator.model: h200-sxm + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-nvidia-h200-sxm-1.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-nvidia-h200-sxm-1.helm.yaml new file mode 100644 index 0000000..f5f9e41 --- /dev/null +++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-nvidia-h200-sxm-1.helm.yaml @@ -0,0 +1,46 @@ +apiVersion: odin.moreh.io/v1alpha1 +kind: InferenceServiceTemplate +metadata: + name: vllm-v0.15.1-qwen-qwen3.5-27b-nvidia-h200-sxm-1 + namespace: {{ include "common.names.namespace" . }} + labels: + {{- include "mif.preset.labels" . 
| nindent 4 }} + mif.moreh.io/model.org: qwen + mif.moreh.io/model.name: qwen3.5-27b + mif.moreh.io/role: e2e + mif.moreh.io/accelerator.vendor: nvidia + mif.moreh.io/accelerator.model: h200-sxm + mif.moreh.io/parallelism: "1" +spec: + framework: vllm + model: + name: Qwen/Qwen3.5-27B + template: + spec: + containers: + - name: main + image: vllm/vllm-openai:v0.15.1 + env: + - name: ISVC_USE_KV_EVENTS + value: "true" + - name: ISVC_EXTRA_ARGS + value: >- + --trust-remote-code + --max-model-len 131072 + --max-num-seqs 128 + --gpu-memory-utilization 0.90 + --kv-cache-dtype fp8 + --disable-uvicorn-access-log + --no-enable-log-requests + resources: + requests: + nvidia.com/gpu: "1" + limits: + nvidia.com/gpu: "1" + nodeSelector: + moai.moreh.io/accelerator.vendor: nvidia + moai.moreh.io/accelerator.model: h200-sxm + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-397b-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-397b-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml new file mode 100644 index 0000000..321723e --- /dev/null +++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-397b-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml @@ -0,0 +1,50 @@ +apiVersion: odin.moreh.io/v1alpha1 +kind: InferenceServiceTemplate +metadata: + name: vllm-v0.15.1-qwen-qwen3.5-397b-nvidia-h200-sxm-tp8-moe-ep8 + namespace: {{ include "common.names.namespace" . }} + labels: + {{- include "mif.preset.labels" . 
| nindent 4 }} + mif.moreh.io/model.org: qwen + mif.moreh.io/model.name: qwen3.5-397b + mif.moreh.io/role: e2e + mif.moreh.io/accelerator.vendor: nvidia + mif.moreh.io/accelerator.model: h200-sxm + mif.moreh.io/parallelism: "tp8-moe-ep8" +spec: + framework: vllm + model: + name: Qwen/Qwen3.5-397B + parallelism: + tensor: 8 + expert: true + template: + spec: + containers: + - name: main + image: vllm/vllm-openai:v0.15.1 + env: + - name: ISVC_USE_KV_EVENTS + value: "true" + - name: ISVC_EXTRA_ARGS + value: >- + --trust-remote-code + --max-model-len 65536 + --max-num-seqs 64 + --gpu-memory-utilization 0.90 + --kv-cache-dtype fp8 + --enable-chunked-prefill + --disable-uvicorn-access-log + --no-enable-log-requests + resources: + requests: + nvidia.com/gpu: "8" + limits: + nvidia.com/gpu: "8" + nodeSelector: + moai.moreh.io/accelerator.vendor: nvidia + moai.moreh.io/accelerator.model: h200-sxm + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-9b-nvidia-h200-sxm-1.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-9b-nvidia-h200-sxm-1.helm.yaml new file mode 100644 index 0000000..75f6d14 --- /dev/null +++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-9b-nvidia-h200-sxm-1.helm.yaml @@ -0,0 +1,46 @@ +apiVersion: odin.moreh.io/v1alpha1 +kind: InferenceServiceTemplate +metadata: + name: vllm-v0.15.1-qwen-qwen3.5-9b-nvidia-h200-sxm-1 + namespace: {{ include "common.names.namespace" . }} + labels: + {{- include "mif.preset.labels" . 
| nindent 4 }} + mif.moreh.io/model.org: qwen + mif.moreh.io/model.name: qwen3.5-9b + mif.moreh.io/role: e2e + mif.moreh.io/accelerator.vendor: nvidia + mif.moreh.io/accelerator.model: h200-sxm + mif.moreh.io/parallelism: "1" +spec: + framework: vllm + model: + name: Qwen/Qwen3.5-9B + template: + spec: + containers: + - name: main + image: vllm/vllm-openai:v0.15.1 + env: + - name: ISVC_USE_KV_EVENTS + value: "true" + - name: ISVC_EXTRA_ARGS + value: >- + --trust-remote-code + --max-model-len 131072 + --max-num-seqs 128 + --gpu-memory-utilization 0.90 + --kv-cache-dtype fp8 + --disable-uvicorn-access-log + --no-enable-log-requests + resources: + requests: + nvidia.com/gpu: "1" + limits: + nvidia.com/gpu: "1" + nodeSelector: + moai.moreh.io/accelerator.vendor: nvidia + moai.moreh.io/accelerator.model: h200-sxm + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml new file mode 100644 index 0000000..1f5d87e --- /dev/null +++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml @@ -0,0 +1,53 @@ +apiVersion: odin.moreh.io/v1alpha1 +kind: InferenceServiceTemplate +metadata: + name: vllm-v0.15.1-zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-ep8 + namespace: {{ include "common.names.namespace" . }} + labels: + {{- include "mif.preset.labels" . 
| nindent 4 }} + mif.moreh.io/model.org: zai-org + mif.moreh.io/model.name: glm-5-fp8 + mif.moreh.io/role: e2e + mif.moreh.io/accelerator.vendor: nvidia + mif.moreh.io/accelerator.model: h200-sxm + mif.moreh.io/parallelism: "tp8-moe-ep8" +spec: + framework: vllm + model: + name: zai-org/GLM-5-FP8 + parallelism: + tensor: 8 + expert: true + template: + spec: + containers: + - name: main + image: vllm/vllm-openai:v0.15.1 + env: + - name: ISVC_USE_KV_EVENTS + value: "true" + - name: ISVC_EXTRA_ARGS + value: >- + --trust-remote-code + --tool-call-parser glm47 + --reasoning-parser glm45 + --enable-auto-tool-choice + --max-model-len 131072 + --max-num-seqs 64 + --gpu-memory-utilization 0.85 + --kv-cache-dtype fp8 + --enable-chunked-prefill + --disable-uvicorn-access-log + --no-enable-log-requests + resources: + requests: + nvidia.com/gpu: "8" + limits: + nvidia.com/gpu: "8" + nodeSelector: + moai.moreh.io/accelerator.vendor: nvidia + moai.moreh.io/accelerator.model: h200-sxm + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml new file mode 100644 index 0000000..39b957b --- /dev/null +++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml @@ -0,0 +1,53 @@ +apiVersion: odin.moreh.io/v1alpha1 +kind: InferenceServiceTemplate +metadata: + name: vllm-v0.15.1-zai-org-glm-5-nvidia-h200-sxm-tp8-moe-ep8 + namespace: {{ include "common.names.namespace" . }} + labels: + {{- include "mif.preset.labels" . 
| nindent 4 }} + mif.moreh.io/model.org: zai-org + mif.moreh.io/model.name: glm-5 + mif.moreh.io/role: e2e + mif.moreh.io/accelerator.vendor: nvidia + mif.moreh.io/accelerator.model: h200-sxm + mif.moreh.io/parallelism: "tp8-moe-ep8" +spec: + framework: vllm + model: + name: zai-org/GLM-5 + parallelism: + tensor: 8 + expert: true + template: + spec: + containers: + - name: main + image: vllm/vllm-openai:v0.15.1 + env: + - name: ISVC_USE_KV_EVENTS + value: "true" + - name: ISVC_EXTRA_ARGS + value: >- + --trust-remote-code + --tool-call-parser glm47 + --reasoning-parser glm45 + --enable-auto-tool-choice + --max-model-len 131072 + --max-num-seqs 64 + --gpu-memory-utilization 0.85 + --kv-cache-dtype auto + --enable-chunked-prefill + --disable-uvicorn-access-log + --no-enable-log-requests + resources: + requests: + nvidia.com/gpu: "8" + limits: + nvidia.com/gpu: "8" + nodeSelector: + moai.moreh.io/accelerator.vendor: nvidia + moai.moreh.io/accelerator.model: h200-sxm + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule From c2f7fc187bd0e25f443d7e0f51dcc79679d795cc Mon Sep 17 00:00:00 2001 From: bongwoobak Date: Thu, 2 Apr 2026 01:28:48 +0900 Subject: [PATCH 2/4] MAF-19524: feat(preset): replace v0.15.1 presets with tested v0.17.0 presets Replace untested v0.15.1 H200 presets with v0.17.0 presets validated on aiand-rke2 cluster. All 7 models serving and BBR routing confirmed. 
Removed: 11 v0.15.1 preset files (incompatible with CUDA driver 580) Added: 7 tested presets: - Qwen3.5-9B (L40S tp1, DEEP_GEMM=0, reasoning-parser qwen3) - Qwen3.5-27B-FP8 (L40S tp1, enforce-eager, DEEP_GEMM=0) - GPT-OSS-120B (H100-NVL tp2, max-num-seqs 128) - Nemotron Super 120B FP8 (H200 tp2, mamba-ssm-cache-dtype float16) - DeepSeek V3.2 (H200 dp8-moe-ep8, tokenizer-mode deepseek_v32) - Qwen3.5-397B-A17B-FP8 (H200 dp8-moe-ep8, DEEP_GEMM=0) - GLM-5-FP8 (H200 tp8, dev.local/vllm-glm5:final, gpu-mem-util 0.85) --- ...fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml} | 18 +++---- ...o-30b-a3b-bf16-nvidia-h200-sxm-1.helm.yaml | 46 ---------------- ...no-30b-a3b-fp8-nvidia-h200-sxm-1.helm.yaml | 46 ---------------- ...en-qwen3.5-27b-nvidia-h200-sxm-1.helm.yaml | 46 ---------------- ...lm-5-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml | 53 ------------------- ...3.2-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml} | 25 ++++----- ...-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml | 12 ++--- ...20b-nvidia-h100-nvl-tp2-moe-tp2.helm.yaml} | 21 +++----- ...n-qwen3.5-27b-fp8-nvidia-l40s-1.helm.yaml} | 19 +++---- ...fp8-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml} | 25 ++++----- .../qwen-qwen3.5-9b-nvidia-l40s-1.helm.yaml} | 18 +++---- 11 files changed, 52 insertions(+), 277 deletions(-) rename deploy/helm/moai-inference-preset/templates/presets/vllm/{v0.15.1/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml => glm5/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml} (71%) delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-bf16-nvidia-h200-sxm-1.helm.yaml delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-fp8-nvidia-h200-sxm-1.helm.yaml delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-nvidia-h200-sxm-1.helm.yaml delete mode 100644 
deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml rename deploy/helm/moai-inference-preset/templates/presets/vllm/{v0.15.1/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml => v0.17.0/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml} (66%) rename deploy/helm/moai-inference-preset/templates/presets/vllm/{v0.15.1 => v0.17.0}/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml (75%) rename deploy/helm/moai-inference-preset/templates/presets/vllm/{v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-bf16-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml => v0.17.0/openai-gpt-oss-120b-nvidia-h100-nvl-tp2-moe-tp2.helm.yaml} (59%) rename deploy/helm/moai-inference-preset/templates/presets/vllm/{v0.15.1/qwen-qwen3.5-27b-fp8-nvidia-h200-sxm-1.helm.yaml => v0.17.0/qwen-qwen3.5-27b-fp8-nvidia-l40s-1.helm.yaml} (67%) rename deploy/helm/moai-inference-preset/templates/presets/vllm/{v0.15.1/qwen-qwen3.5-397b-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml => v0.17.0/qwen-qwen3.5-397b-a17b-fp8-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml} (63%) rename deploy/helm/moai-inference-preset/templates/presets/vllm/{v0.15.1/qwen-qwen3.5-9b-nvidia-h200-sxm-1.helm.yaml => v0.17.0/qwen-qwen3.5-9b-nvidia-l40s-1.helm.yaml} (67%) diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml similarity index 71% rename from deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml rename to deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml index 1f5d87e..a6a0c73 100644 --- 
a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml @@ -1,7 +1,7 @@ apiVersion: odin.moreh.io/v1alpha1 kind: InferenceServiceTemplate metadata: - name: vllm-v0.15.1-zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-ep8 + name: vllm-glm5-zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8 namespace: {{ include "common.names.namespace" . }} labels: {{- include "mif.preset.labels" . | nindent 4 }} @@ -10,35 +10,29 @@ metadata: mif.moreh.io/role: e2e mif.moreh.io/accelerator.vendor: nvidia mif.moreh.io/accelerator.model: h200-sxm - mif.moreh.io/parallelism: "tp8-moe-ep8" + mif.moreh.io/parallelism: "tp8-moe-tp8" spec: framework: vllm model: name: zai-org/GLM-5-FP8 parallelism: tensor: 8 - expert: true template: spec: containers: - name: main - image: vllm/vllm-openai:v0.15.1 + image: dev.local/vllm-glm5:final env: - - name: ISVC_USE_KV_EVENTS - value: "true" + - name: ISVC_MODEL_PATH + value: /models/glm5-fp8 - name: ISVC_EXTRA_ARGS value: >- --trust-remote-code --tool-call-parser glm47 --reasoning-parser glm45 --enable-auto-tool-choice - --max-model-len 131072 - --max-num-seqs 64 - --gpu-memory-utilization 0.85 - --kv-cache-dtype fp8 - --enable-chunked-prefill + --max-model-len -1 --disable-uvicorn-access-log - --no-enable-log-requests resources: requests: nvidia.com/gpu: "8" diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-bf16-nvidia-h200-sxm-1.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-bf16-nvidia-h200-sxm-1.helm.yaml deleted file mode 100644 index 1c37fd9..0000000 --- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-bf16-nvidia-h200-sxm-1.helm.yaml +++ /dev/null @@ -1,46 +0,0 @@ -apiVersion: 
odin.moreh.io/v1alpha1 -kind: InferenceServiceTemplate -metadata: - name: vllm-v0.15.1-nvidia-nvidia-nemotron-3-nano-30b-a3b-bf16-nvidia-h200-sxm-1 - namespace: {{ include "common.names.namespace" . }} - labels: - {{- include "mif.preset.labels" . | nindent 4 }} - mif.moreh.io/model.org: nvidia - mif.moreh.io/model.name: nvidia-nemotron-3-nano-30b-a3b-bf16 - mif.moreh.io/role: e2e - mif.moreh.io/accelerator.vendor: nvidia - mif.moreh.io/accelerator.model: h200-sxm - mif.moreh.io/parallelism: "1" -spec: - framework: vllm - model: - name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 - template: - spec: - containers: - - name: main - image: vllm/vllm-openai:v0.15.1 - env: - - name: ISVC_USE_KV_EVENTS - value: "true" - - name: ISVC_EXTRA_ARGS - value: >- - --trust-remote-code - --max-model-len 131072 - --max-num-seqs 128 - --gpu-memory-utilization 0.90 - --kv-cache-dtype fp8 - --disable-uvicorn-access-log - --no-enable-log-requests - resources: - requests: - nvidia.com/gpu: "1" - limits: - nvidia.com/gpu: "1" - nodeSelector: - moai.moreh.io/accelerator.vendor: nvidia - moai.moreh.io/accelerator.model: h200-sxm - tolerations: - - key: nvidia.com/gpu - operator: Exists - effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-fp8-nvidia-h200-sxm-1.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-fp8-nvidia-h200-sxm-1.helm.yaml deleted file mode 100644 index 16a8ebe..0000000 --- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-fp8-nvidia-h200-sxm-1.helm.yaml +++ /dev/null @@ -1,46 +0,0 @@ -apiVersion: odin.moreh.io/v1alpha1 -kind: InferenceServiceTemplate -metadata: - name: vllm-v0.15.1-nvidia-nvidia-nemotron-3-nano-30b-a3b-fp8-nvidia-h200-sxm-1 - namespace: {{ include "common.names.namespace" . }} - labels: - {{- include "mif.preset.labels" . 
| nindent 4 }} - mif.moreh.io/model.org: nvidia - mif.moreh.io/model.name: nvidia-nemotron-3-nano-30b-a3b-fp8 - mif.moreh.io/role: e2e - mif.moreh.io/accelerator.vendor: nvidia - mif.moreh.io/accelerator.model: h200-sxm - mif.moreh.io/parallelism: "1" -spec: - framework: vllm - model: - name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8 - template: - spec: - containers: - - name: main - image: vllm/vllm-openai:v0.15.1 - env: - - name: ISVC_USE_KV_EVENTS - value: "true" - - name: ISVC_EXTRA_ARGS - value: >- - --trust-remote-code - --max-model-len 131072 - --max-num-seqs 128 - --gpu-memory-utilization 0.90 - --kv-cache-dtype fp8 - --disable-uvicorn-access-log - --no-enable-log-requests - resources: - requests: - nvidia.com/gpu: "1" - limits: - nvidia.com/gpu: "1" - nodeSelector: - moai.moreh.io/accelerator.vendor: nvidia - moai.moreh.io/accelerator.model: h200-sxm - tolerations: - - key: nvidia.com/gpu - operator: Exists - effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-nvidia-h200-sxm-1.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-nvidia-h200-sxm-1.helm.yaml deleted file mode 100644 index f5f9e41..0000000 --- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-nvidia-h200-sxm-1.helm.yaml +++ /dev/null @@ -1,46 +0,0 @@ -apiVersion: odin.moreh.io/v1alpha1 -kind: InferenceServiceTemplate -metadata: - name: vllm-v0.15.1-qwen-qwen3.5-27b-nvidia-h200-sxm-1 - namespace: {{ include "common.names.namespace" . }} - labels: - {{- include "mif.preset.labels" . 
| nindent 4 }} - mif.moreh.io/model.org: qwen - mif.moreh.io/model.name: qwen3.5-27b - mif.moreh.io/role: e2e - mif.moreh.io/accelerator.vendor: nvidia - mif.moreh.io/accelerator.model: h200-sxm - mif.moreh.io/parallelism: "1" -spec: - framework: vllm - model: - name: Qwen/Qwen3.5-27B - template: - spec: - containers: - - name: main - image: vllm/vllm-openai:v0.15.1 - env: - - name: ISVC_USE_KV_EVENTS - value: "true" - - name: ISVC_EXTRA_ARGS - value: >- - --trust-remote-code - --max-model-len 131072 - --max-num-seqs 128 - --gpu-memory-utilization 0.90 - --kv-cache-dtype fp8 - --disable-uvicorn-access-log - --no-enable-log-requests - resources: - requests: - nvidia.com/gpu: "1" - limits: - nvidia.com/gpu: "1" - nodeSelector: - moai.moreh.io/accelerator.vendor: nvidia - moai.moreh.io/accelerator.model: h200-sxm - tolerations: - - key: nvidia.com/gpu - operator: Exists - effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml deleted file mode 100644 index 39b957b..0000000 --- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml +++ /dev/null @@ -1,53 +0,0 @@ -apiVersion: odin.moreh.io/v1alpha1 -kind: InferenceServiceTemplate -metadata: - name: vllm-v0.15.1-zai-org-glm-5-nvidia-h200-sxm-tp8-moe-ep8 - namespace: {{ include "common.names.namespace" . }} - labels: - {{- include "mif.preset.labels" . 
| nindent 4 }} - mif.moreh.io/model.org: zai-org - mif.moreh.io/model.name: glm-5 - mif.moreh.io/role: e2e - mif.moreh.io/accelerator.vendor: nvidia - mif.moreh.io/accelerator.model: h200-sxm - mif.moreh.io/parallelism: "tp8-moe-ep8" -spec: - framework: vllm - model: - name: zai-org/GLM-5 - parallelism: - tensor: 8 - expert: true - template: - spec: - containers: - - name: main - image: vllm/vllm-openai:v0.15.1 - env: - - name: ISVC_USE_KV_EVENTS - value: "true" - - name: ISVC_EXTRA_ARGS - value: >- - --trust-remote-code - --tool-call-parser glm47 - --reasoning-parser glm45 - --enable-auto-tool-choice - --max-model-len 131072 - --max-num-seqs 64 - --gpu-memory-utilization 0.85 - --kv-cache-dtype auto - --enable-chunked-prefill - --disable-uvicorn-access-log - --no-enable-log-requests - resources: - requests: - nvidia.com/gpu: "8" - limits: - nvidia.com/gpu: "8" - nodeSelector: - moai.moreh.io/accelerator.vendor: nvidia - moai.moreh.io/accelerator.model: h200-sxm - tolerations: - - key: nvidia.com/gpu - operator: Exists - effect: NoSchedule diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml similarity index 66% rename from deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml rename to deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml index 1103b70..9182c1d 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml @@ -1,7 +1,7 @@ apiVersion: odin.moreh.io/v1alpha1 kind: 
InferenceServiceTemplate metadata: - name: vllm-v0.15.1-deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-tp8-moe-ep8 + name: vllm-v0.17.0-deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-dp8-moe-ep8 namespace: {{ include "common.names.namespace" . }} labels: {{- include "mif.preset.labels" . | nindent 4 }} @@ -10,34 +10,29 @@ metadata: mif.moreh.io/role: e2e mif.moreh.io/accelerator.vendor: nvidia mif.moreh.io/accelerator.model: h200-sxm - mif.moreh.io/parallelism: "tp8-moe-ep8" + mif.moreh.io/parallelism: "dp8-moe-ep8" spec: framework: vllm model: name: deepseek-ai/DeepSeek-V3.2 parallelism: - tensor: 8 + data: 8 expert: true - template: + workerTemplate: spec: containers: - name: main - image: vllm/vllm-openai:v0.15.1 + image: vllm/vllm-openai:v0.17.0 env: - - name: VLLM_USE_DEEP_GEMM - value: "1" - - name: ISVC_USE_KV_EVENTS - value: "true" - name: ISVC_EXTRA_ARGS value: >- --trust-remote-code - --max-model-len 65536 - --max-num-seqs 64 - --gpu-memory-utilization 0.90 - --kv-cache-dtype fp8 - --enable-chunked-prefill + --tokenizer-mode deepseek_v32 + --tool-call-parser deepseek_v32 + --reasoning-parser deepseek_v3 + --enable-auto-tool-choice + --max-model-len -1 --disable-uvicorn-access-log - --no-enable-log-requests resources: requests: nvidia.com/gpu: "8" diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml similarity index 75% rename from deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml rename to deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml index 0706ffb..6470026 100644 --- 
a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml @@ -1,7 +1,7 @@ apiVersion: odin.moreh.io/v1alpha1 kind: InferenceServiceTemplate metadata: - name: vllm-v0.15.1-nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2 + name: vllm-v0.17.0-nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2 namespace: {{ include "common.names.namespace" . }} labels: {{- include "mif.preset.labels" . | nindent 4 }} @@ -21,19 +21,13 @@ spec: spec: containers: - name: main - image: vllm/vllm-openai:v0.15.1 + image: vllm/vllm-openai:v0.17.0 env: - - name: ISVC_USE_KV_EVENTS - value: "true" - name: ISVC_EXTRA_ARGS value: >- --trust-remote-code - --max-model-len 131072 - --max-num-seqs 128 - --gpu-memory-utilization 0.90 - --kv-cache-dtype fp8 + --max-model-len -1 --disable-uvicorn-access-log - --no-enable-log-requests resources: requests: nvidia.com/gpu: "2" diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-bf16-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/openai-gpt-oss-120b-nvidia-h100-nvl-tp2-moe-tp2.helm.yaml similarity index 59% rename from deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-bf16-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml rename to deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/openai-gpt-oss-120b-nvidia-h100-nvl-tp2-moe-tp2.helm.yaml index 639c7f5..8791331 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-bf16-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml +++ 
b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/openai-gpt-oss-120b-nvidia-h100-nvl-tp2-moe-tp2.helm.yaml @@ -1,39 +1,34 @@ apiVersion: odin.moreh.io/v1alpha1 kind: InferenceServiceTemplate metadata: - name: vllm-v0.15.1-nvidia-nvidia-nemotron-3-super-120b-a12b-bf16-nvidia-h200-sxm-tp2-moe-tp2 + name: vllm-v0.17.0-openai-gpt-oss-120b-nvidia-h100-nvl-tp2-moe-tp2 namespace: {{ include "common.names.namespace" . }} labels: {{- include "mif.preset.labels" . | nindent 4 }} - mif.moreh.io/model.org: nvidia - mif.moreh.io/model.name: nvidia-nemotron-3-super-120b-a12b-bf16 + mif.moreh.io/model.org: openai + mif.moreh.io/model.name: gpt-oss-120b mif.moreh.io/role: e2e mif.moreh.io/accelerator.vendor: nvidia - mif.moreh.io/accelerator.model: h200-sxm + mif.moreh.io/accelerator.model: h100-nvl mif.moreh.io/parallelism: "tp2-moe-tp2" spec: framework: vllm model: - name: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16 + name: openai/gpt-oss-120b parallelism: tensor: 2 template: spec: containers: - name: main - image: vllm/vllm-openai:v0.15.1 + image: vllm/vllm-openai:v0.17.0 env: - - name: ISVC_USE_KV_EVENTS - value: "true" - name: ISVC_EXTRA_ARGS value: >- --trust-remote-code - --max-model-len 131072 + --max-model-len -1 --max-num-seqs 128 - --gpu-memory-utilization 0.90 - --kv-cache-dtype fp8 --disable-uvicorn-access-log - --no-enable-log-requests resources: requests: nvidia.com/gpu: "2" @@ -41,7 +36,7 @@ spec: nvidia.com/gpu: "2" nodeSelector: moai.moreh.io/accelerator.vendor: nvidia - moai.moreh.io/accelerator.model: h200-sxm + moai.moreh.io/accelerator.model: h100-nvl tolerations: - key: nvidia.com/gpu operator: Exists diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-fp8-nvidia-h200-sxm-1.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-27b-fp8-nvidia-l40s-1.helm.yaml similarity index 67% rename from 
deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-fp8-nvidia-h200-sxm-1.helm.yaml rename to deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-27b-fp8-nvidia-l40s-1.helm.yaml index f6e20b3..ce17c68 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-fp8-nvidia-h200-sxm-1.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-27b-fp8-nvidia-l40s-1.helm.yaml @@ -1,7 +1,7 @@ apiVersion: odin.moreh.io/v1alpha1 kind: InferenceServiceTemplate metadata: - name: vllm-v0.15.1-qwen-qwen3.5-27b-fp8-nvidia-h200-sxm-1 + name: vllm-v0.17.0-qwen-qwen3.5-27b-fp8-nvidia-l40s-1 namespace: {{ include "common.names.namespace" . }} labels: {{- include "mif.preset.labels" . | nindent 4 }} @@ -9,7 +9,7 @@ metadata: mif.moreh.io/model.name: qwen3.5-27b-fp8 mif.moreh.io/role: e2e mif.moreh.io/accelerator.vendor: nvidia - mif.moreh.io/accelerator.model: h200-sxm + mif.moreh.io/accelerator.model: l40s mif.moreh.io/parallelism: "1" spec: framework: vllm @@ -19,19 +19,16 @@ spec: spec: containers: - name: main - image: vllm/vllm-openai:v0.15.1 + image: vllm/vllm-openai:v0.17.0 env: - - name: ISVC_USE_KV_EVENTS - value: "true" + - name: VLLM_USE_DEEP_GEMM + value: "0" - name: ISVC_EXTRA_ARGS value: >- --trust-remote-code - --max-model-len 131072 - --max-num-seqs 128 - --gpu-memory-utilization 0.90 - --kv-cache-dtype fp8 + --max-model-len -1 + --enforce-eager --disable-uvicorn-access-log - --no-enable-log-requests resources: requests: nvidia.com/gpu: "1" @@ -39,7 +36,7 @@ spec: nvidia.com/gpu: "1" nodeSelector: moai.moreh.io/accelerator.vendor: nvidia - moai.moreh.io/accelerator.model: h200-sxm + moai.moreh.io/accelerator.model: l40s tolerations: - key: nvidia.com/gpu operator: Exists diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-397b-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml 
b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-397b-a17b-fp8-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml similarity index 63% rename from deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-397b-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml rename to deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-397b-a17b-fp8-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml index 321723e..c548d1d 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-397b-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-397b-a17b-fp8-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml @@ -1,41 +1,36 @@ apiVersion: odin.moreh.io/v1alpha1 kind: InferenceServiceTemplate metadata: - name: vllm-v0.15.1-qwen-qwen3.5-397b-nvidia-h200-sxm-tp8-moe-ep8 + name: vllm-v0.17.0-qwen-qwen3.5-397b-a17b-fp8-nvidia-h200-sxm-dp8-moe-ep8 namespace: {{ include "common.names.namespace" . }} labels: {{- include "mif.preset.labels" . 
| nindent 4 }} mif.moreh.io/model.org: qwen - mif.moreh.io/model.name: qwen3.5-397b + mif.moreh.io/model.name: qwen3.5-397b-a17b-fp8 mif.moreh.io/role: e2e mif.moreh.io/accelerator.vendor: nvidia mif.moreh.io/accelerator.model: h200-sxm - mif.moreh.io/parallelism: "tp8-moe-ep8" + mif.moreh.io/parallelism: "dp8-moe-ep8" spec: framework: vllm model: - name: Qwen/Qwen3.5-397B + name: Qwen/Qwen3.5-397B-A17B-FP8 parallelism: - tensor: 8 + data: 8 expert: true - template: + workerTemplate: spec: containers: - name: main - image: vllm/vllm-openai:v0.15.1 + image: vllm/vllm-openai:v0.17.0 env: - - name: ISVC_USE_KV_EVENTS - value: "true" + - name: VLLM_USE_DEEP_GEMM + value: "0" - name: ISVC_EXTRA_ARGS value: >- --trust-remote-code - --max-model-len 65536 - --max-num-seqs 64 - --gpu-memory-utilization 0.90 - --kv-cache-dtype fp8 - --enable-chunked-prefill + --max-model-len -1 --disable-uvicorn-access-log - --no-enable-log-requests resources: requests: nvidia.com/gpu: "8" diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-9b-nvidia-h200-sxm-1.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-9b-nvidia-l40s-1.helm.yaml similarity index 67% rename from deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-9b-nvidia-h200-sxm-1.helm.yaml rename to deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-9b-nvidia-l40s-1.helm.yaml index 75f6d14..8856873 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-9b-nvidia-h200-sxm-1.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-9b-nvidia-l40s-1.helm.yaml @@ -1,7 +1,7 @@ apiVersion: odin.moreh.io/v1alpha1 kind: InferenceServiceTemplate metadata: - name: vllm-v0.15.1-qwen-qwen3.5-9b-nvidia-h200-sxm-1 + name: vllm-v0.17.0-qwen-qwen3.5-9b-nvidia-l40s-1 namespace: {{ include "common.names.namespace" . 
}} labels: {{- include "mif.preset.labels" . | nindent 4 }} @@ -9,7 +9,7 @@ metadata: mif.moreh.io/model.name: qwen3.5-9b mif.moreh.io/role: e2e mif.moreh.io/accelerator.vendor: nvidia - mif.moreh.io/accelerator.model: h200-sxm + mif.moreh.io/accelerator.model: l40s mif.moreh.io/parallelism: "1" spec: framework: vllm @@ -19,19 +19,15 @@ spec: spec: containers: - name: main - image: vllm/vllm-openai:v0.15.1 + image: vllm/vllm-openai:v0.17.0 env: - - name: ISVC_USE_KV_EVENTS - value: "true" + - name: VLLM_USE_DEEP_GEMM + value: "0" - name: ISVC_EXTRA_ARGS value: >- --trust-remote-code - --max-model-len 131072 - --max-num-seqs 128 - --gpu-memory-utilization 0.90 - --kv-cache-dtype fp8 + --max-model-len -1 --disable-uvicorn-access-log - --no-enable-log-requests resources: requests: nvidia.com/gpu: "1" @@ -39,7 +35,7 @@ spec: nvidia.com/gpu: "1" nodeSelector: moai.moreh.io/accelerator.vendor: nvidia - moai.moreh.io/accelerator.model: h200-sxm + moai.moreh.io/accelerator.model: l40s tolerations: - key: nvidia.com/gpu operator: Exists From 93ae01470d173de616ffa461a5e9a35f4fdea097 Mon Sep 17 00:00:00 2001 From: bongwoobak Date: Thu, 2 Apr 2026 02:43:39 +0900 Subject: [PATCH 3/4] =?UTF-8?q?fix(preset):=20address=20Copilot=20review?= =?UTF-8?q?=20=E2=80=94=20add=20missing=20args=20to=20all=20presets?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add --no-enable-log-requests to all 7 presets (runtime base override) - Add --reasoning-parser qwen3 to Qwen3.5 9B, 27B, 397B - Add --mamba-ssm-cache-dtype float16 --enable-chunked-prefill to Nemotron - Add --gpu-memory-utilization 0.85 to GLM-5 --- .../zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml | 2 ++ ...seek-ai-deepseek-v3.2-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml | 1 + ...3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml | 3 +++ .../openai-gpt-oss-120b-nvidia-h100-nvl-tp2-moe-tp2.helm.yaml | 1 + .../vllm/v0.17.0/qwen-qwen3.5-27b-fp8-nvidia-l40s-1.helm.yaml | 2 ++ 
...qwen3.5-397b-a17b-fp8-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml | 2 ++ .../vllm/v0.17.0/qwen-qwen3.5-9b-nvidia-l40s-1.helm.yaml | 2 ++ 7 files changed, 13 insertions(+) diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml index a6a0c73..c90613a 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml @@ -32,7 +32,9 @@ spec: --reasoning-parser glm45 --enable-auto-tool-choice --max-model-len -1 + --gpu-memory-utilization 0.85 --disable-uvicorn-access-log + --no-enable-log-requests resources: requests: nvidia.com/gpu: "8" diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml index 9182c1d..197c7a1 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml @@ -33,6 +33,7 @@ spec: --enable-auto-tool-choice --max-model-len -1 --disable-uvicorn-access-log + --no-enable-log-requests resources: requests: nvidia.com/gpu: "8" diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml index 6470026..b24d246 100644 
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml @@ -27,7 +27,10 @@ spec: value: >- --trust-remote-code --max-model-len -1 + --mamba-ssm-cache-dtype float16 + --enable-chunked-prefill --disable-uvicorn-access-log + --no-enable-log-requests resources: requests: nvidia.com/gpu: "2" diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/openai-gpt-oss-120b-nvidia-h100-nvl-tp2-moe-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/openai-gpt-oss-120b-nvidia-h100-nvl-tp2-moe-tp2.helm.yaml index 8791331..cab9930 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/openai-gpt-oss-120b-nvidia-h100-nvl-tp2-moe-tp2.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/openai-gpt-oss-120b-nvidia-h100-nvl-tp2-moe-tp2.helm.yaml @@ -29,6 +29,7 @@ spec: --max-model-len -1 --max-num-seqs 128 --disable-uvicorn-access-log + --no-enable-log-requests resources: requests: nvidia.com/gpu: "2" diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-27b-fp8-nvidia-l40s-1.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-27b-fp8-nvidia-l40s-1.helm.yaml index ce17c68..cd36dd8 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-27b-fp8-nvidia-l40s-1.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-27b-fp8-nvidia-l40s-1.helm.yaml @@ -28,7 +28,9 @@ spec: --trust-remote-code --max-model-len -1 --enforce-eager + --reasoning-parser qwen3 --disable-uvicorn-access-log + --no-enable-log-requests resources: requests: nvidia.com/gpu: "1" diff --git 
a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-397b-a17b-fp8-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-397b-a17b-fp8-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml index c548d1d..ad3ef7f 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-397b-a17b-fp8-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-397b-a17b-fp8-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml @@ -30,7 +30,9 @@ spec: value: >- --trust-remote-code --max-model-len -1 + --reasoning-parser qwen3 --disable-uvicorn-access-log + --no-enable-log-requests resources: requests: nvidia.com/gpu: "8" diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-9b-nvidia-l40s-1.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-9b-nvidia-l40s-1.helm.yaml index 8856873..7e829f4 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-9b-nvidia-l40s-1.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-9b-nvidia-l40s-1.helm.yaml @@ -27,7 +27,9 @@ spec: value: >- --trust-remote-code --max-model-len -1 + --reasoning-parser qwen3 --disable-uvicorn-access-log + --no-enable-log-requests resources: requests: nvidia.com/gpu: "1" From a3c1ee732c61ba18a5ec6286549d410ee9cc950c Mon Sep 17 00:00:00 2001 From: bongwoobak Date: Fri, 3 Apr 2026 03:56:35 +0900 Subject: [PATCH 4/4] fix(preset): remove ISVC_MODEL_PATH from GLM-5 preset ISVC_MODEL_PATH should be overridden in InferenceService, not in the preset template, to keep the preset general-purpose. 
--- ...zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml index c90613a..4c26611 100644 --- a/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml +++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml @@ -21,10 +21,11 @@ spec: spec: containers: - name: main - image: dev.local/vllm-glm5:final + # glm_moe_dsa arch is not supported in official vLLM. + # CUDA driver 580+ clusters may need a custom image override + # in InferenceService (e.g. dev.local/vllm-glm5:final). + image: vllm/vllm-openai:glm5 env: - - name: ISVC_MODEL_PATH - value: /models/glm5-fp8 - name: ISVC_EXTRA_ARGS value: >- --trust-remote-code