From 7c0c63d3981f212aa73c4d96c2fc4ca5c5340729 Mon Sep 17 00:00:00 2001
From: bongwoobak <bongwoobak@gmail.com>
Date: Wed, 1 Apr 2026 02:00:11 +0900
Subject: [PATCH 1/4] MAF-19524: feat(preset): add H200 presets for AI& April
 launch models

Add vLLM v0.15.1 E2E presets for H200-SXM targeting the AI& April
launch model scope. All presets include ISVC_USE_KV_EVENTS for
precise-prefix-cache-aware Heimdall scheduling.

New models:
- Qwen3.5-9B (tp1), Qwen3.5-27B (tp1), Qwen3.5-27B-FP8 (tp1)
- Qwen3.5-397B (tp8, expert parallel)
- DeepSeek V3.2 (tp8, expert parallel)
- Nemotron Super 120B BF16/FP8 (tp2)
- Nemotron Nano 30B BF16/FP8 (tp1)
- GLM-5 BF16/FP8 (tp8, expert parallel)
---
 ...v3.2-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml | 52 ++++++++++++++++++
 ...o-30b-a3b-bf16-nvidia-h200-sxm-1.helm.yaml | 46 ++++++++++++++++
 ...no-30b-a3b-fp8-nvidia-h200-sxm-1.helm.yaml | 46 ++++++++++++++++
 ...bf16-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml | 48 +++++++++++++++++
 ...-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml | 48 +++++++++++++++++
 ...wen3.5-27b-fp8-nvidia-h200-sxm-1.helm.yaml | 46 ++++++++++++++++
 ...en-qwen3.5-27b-nvidia-h200-sxm-1.helm.yaml | 46 ++++++++++++++++
 ...397b-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml | 50 +++++++++++++++++
 ...wen-qwen3.5-9b-nvidia-h200-sxm-1.helm.yaml | 46 ++++++++++++++++
 ...-fp8-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml | 53 +++++++++++++++++++
 ...lm-5-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml | 53 +++++++++++++++++++
 11 files changed, 534 insertions(+)
 create mode 100644 deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml
 create mode 100644 deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-bf16-nvidia-h200-sxm-1.helm.yaml
 create mode 100644 deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-fp8-nvidia-h200-sxm-1.helm.yaml
 create mode 100644 deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-bf16-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml
 create mode 100644 deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml
 create mode 100644 deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-fp8-nvidia-h200-sxm-1.helm.yaml
 create mode 100644 deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-nvidia-h200-sxm-1.helm.yaml
 create mode 100644 deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-397b-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml
 create mode 100644 deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-9b-nvidia-h200-sxm-1.helm.yaml
 create mode 100644 deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml
 create mode 100644 deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml

diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml
new file mode 100644
index 0000000..1103b70
--- /dev/null
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml
@@ -0,0 +1,52 @@
+apiVersion: odin.moreh.io/v1alpha1
+kind: InferenceServiceTemplate
+metadata:
+  name: vllm-v0.15.1-deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-tp8-moe-ep8
+  namespace: {{ include "common.names.namespace" . }}
+  labels:
+    {{- include "mif.preset.labels" . | nindent 4 }}
+    mif.moreh.io/model.org: deepseek-ai
+    mif.moreh.io/model.name: deepseek-v3.2
+    mif.moreh.io/role: e2e
+    mif.moreh.io/accelerator.vendor: nvidia
+    mif.moreh.io/accelerator.model: h200-sxm
+    mif.moreh.io/parallelism: "tp8-moe-ep8"
+spec:
+  framework: vllm
+  model:
+    name: deepseek-ai/DeepSeek-V3.2
+  parallelism:
+    tensor: 8
+    expert: true
+  template:
+    spec:
+      containers:
+        - name: main
+          image: vllm/vllm-openai:v0.15.1
+          env:
+            - name: VLLM_USE_DEEP_GEMM
+              value: "1"
+            - name: ISVC_USE_KV_EVENTS
+              value: "true"
+            - name: ISVC_EXTRA_ARGS
+              value: >-
+                --trust-remote-code
+                --max-model-len 65536
+                --max-num-seqs 64
+                --gpu-memory-utilization 0.90
+                --kv-cache-dtype fp8
+                --enable-chunked-prefill
+                --disable-uvicorn-access-log
+                --no-enable-log-requests
+          resources:
+            requests:
+              nvidia.com/gpu: "8"
+            limits:
+              nvidia.com/gpu: "8"
+      nodeSelector:
+        moai.moreh.io/accelerator.vendor: nvidia
+        moai.moreh.io/accelerator.model: h200-sxm
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-bf16-nvidia-h200-sxm-1.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-bf16-nvidia-h200-sxm-1.helm.yaml
new file mode 100644
index 0000000..1c37fd9
--- /dev/null
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-bf16-nvidia-h200-sxm-1.helm.yaml
@@ -0,0 +1,46 @@
+apiVersion: odin.moreh.io/v1alpha1
+kind: InferenceServiceTemplate
+metadata:
+  name: vllm-v0.15.1-nvidia-nvidia-nemotron-3-nano-30b-a3b-bf16-nvidia-h200-sxm-1
+  namespace: {{ include "common.names.namespace" . }}
+  labels:
+    {{- include "mif.preset.labels" . | nindent 4 }}
+    mif.moreh.io/model.org: nvidia
+    mif.moreh.io/model.name: nvidia-nemotron-3-nano-30b-a3b-bf16
+    mif.moreh.io/role: e2e
+    mif.moreh.io/accelerator.vendor: nvidia
+    mif.moreh.io/accelerator.model: h200-sxm
+    mif.moreh.io/parallelism: "1"
+spec:
+  framework: vllm
+  model:
+    name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16
+  template:
+    spec:
+      containers:
+        - name: main
+          image: vllm/vllm-openai:v0.15.1
+          env:
+            - name: ISVC_USE_KV_EVENTS
+              value: "true"
+            - name: ISVC_EXTRA_ARGS
+              value: >-
+                --trust-remote-code
+                --max-model-len 131072
+                --max-num-seqs 128
+                --gpu-memory-utilization 0.90
+                --kv-cache-dtype fp8
+                --disable-uvicorn-access-log
+                --no-enable-log-requests
+          resources:
+            requests:
+              nvidia.com/gpu: "1"
+            limits:
+              nvidia.com/gpu: "1"
+      nodeSelector:
+        moai.moreh.io/accelerator.vendor: nvidia
+        moai.moreh.io/accelerator.model: h200-sxm
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-fp8-nvidia-h200-sxm-1.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-fp8-nvidia-h200-sxm-1.helm.yaml
new file mode 100644
index 0000000..16a8ebe
--- /dev/null
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-fp8-nvidia-h200-sxm-1.helm.yaml
@@ -0,0 +1,46 @@
+apiVersion: odin.moreh.io/v1alpha1
+kind: InferenceServiceTemplate
+metadata:
+  name: vllm-v0.15.1-nvidia-nvidia-nemotron-3-nano-30b-a3b-fp8-nvidia-h200-sxm-1
+  namespace: {{ include "common.names.namespace" . }}
+  labels:
+    {{- include "mif.preset.labels" . | nindent 4 }}
+    mif.moreh.io/model.org: nvidia
+    mif.moreh.io/model.name: nvidia-nemotron-3-nano-30b-a3b-fp8
+    mif.moreh.io/role: e2e
+    mif.moreh.io/accelerator.vendor: nvidia
+    mif.moreh.io/accelerator.model: h200-sxm
+    mif.moreh.io/parallelism: "1"
+spec:
+  framework: vllm
+  model:
+    name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8
+  template:
+    spec:
+      containers:
+        - name: main
+          image: vllm/vllm-openai:v0.15.1
+          env:
+            - name: ISVC_USE_KV_EVENTS
+              value: "true"
+            - name: ISVC_EXTRA_ARGS
+              value: >-
+                --trust-remote-code
+                --max-model-len 131072
+                --max-num-seqs 128
+                --gpu-memory-utilization 0.90
+                --kv-cache-dtype fp8
+                --disable-uvicorn-access-log
+                --no-enable-log-requests
+          resources:
+            requests:
+              nvidia.com/gpu: "1"
+            limits:
+              nvidia.com/gpu: "1"
+      nodeSelector:
+        moai.moreh.io/accelerator.vendor: nvidia
+        moai.moreh.io/accelerator.model: h200-sxm
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-bf16-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-bf16-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml
new file mode 100644
index 0000000..639c7f5
--- /dev/null
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-bf16-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml
@@ -0,0 +1,48 @@
+apiVersion: odin.moreh.io/v1alpha1
+kind: InferenceServiceTemplate
+metadata:
+  name: vllm-v0.15.1-nvidia-nvidia-nemotron-3-super-120b-a12b-bf16-nvidia-h200-sxm-tp2-moe-tp2
+  namespace: {{ include "common.names.namespace" . }}
+  labels:
+    {{- include "mif.preset.labels" . | nindent 4 }}
+    mif.moreh.io/model.org: nvidia
+    mif.moreh.io/model.name: nvidia-nemotron-3-super-120b-a12b-bf16
+    mif.moreh.io/role: e2e
+    mif.moreh.io/accelerator.vendor: nvidia
+    mif.moreh.io/accelerator.model: h200-sxm
+    mif.moreh.io/parallelism: "tp2-moe-tp2"
+spec:
+  framework: vllm
+  model:
+    name: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16
+  parallelism:
+    tensor: 2
+  template:
+    spec:
+      containers:
+        - name: main
+          image: vllm/vllm-openai:v0.15.1
+          env:
+            - name: ISVC_USE_KV_EVENTS
+              value: "true"
+            - name: ISVC_EXTRA_ARGS
+              value: >-
+                --trust-remote-code
+                --max-model-len 131072
+                --max-num-seqs 128
+                --gpu-memory-utilization 0.90
+                --kv-cache-dtype fp8
+                --disable-uvicorn-access-log
+                --no-enable-log-requests
+          resources:
+            requests:
+              nvidia.com/gpu: "2"
+            limits:
+              nvidia.com/gpu: "2"
+      nodeSelector:
+        moai.moreh.io/accelerator.vendor: nvidia
+        moai.moreh.io/accelerator.model: h200-sxm
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml
new file mode 100644
index 0000000..0706ffb
--- /dev/null
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml
@@ -0,0 +1,48 @@
+apiVersion: odin.moreh.io/v1alpha1
+kind: InferenceServiceTemplate
+metadata:
+  name: vllm-v0.15.1-nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2
+  namespace: {{ include "common.names.namespace" . }}
+  labels:
+    {{- include "mif.preset.labels" . | nindent 4 }}
+    mif.moreh.io/model.org: nvidia
+    mif.moreh.io/model.name: nvidia-nemotron-3-super-120b-a12b-fp8
+    mif.moreh.io/role: e2e
+    mif.moreh.io/accelerator.vendor: nvidia
+    mif.moreh.io/accelerator.model: h200-sxm
+    mif.moreh.io/parallelism: "tp2-moe-tp2"
+spec:
+  framework: vllm
+  model:
+    name: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8
+  parallelism:
+    tensor: 2
+  template:
+    spec:
+      containers:
+        - name: main
+          image: vllm/vllm-openai:v0.15.1
+          env:
+            - name: ISVC_USE_KV_EVENTS
+              value: "true"
+            - name: ISVC_EXTRA_ARGS
+              value: >-
+                --trust-remote-code
+                --max-model-len 131072
+                --max-num-seqs 128
+                --gpu-memory-utilization 0.90
+                --kv-cache-dtype fp8
+                --disable-uvicorn-access-log
+                --no-enable-log-requests
+          resources:
+            requests:
+              nvidia.com/gpu: "2"
+            limits:
+              nvidia.com/gpu: "2"
+      nodeSelector:
+        moai.moreh.io/accelerator.vendor: nvidia
+        moai.moreh.io/accelerator.model: h200-sxm
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-fp8-nvidia-h200-sxm-1.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-fp8-nvidia-h200-sxm-1.helm.yaml
new file mode 100644
index 0000000..f6e20b3
--- /dev/null
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-fp8-nvidia-h200-sxm-1.helm.yaml
@@ -0,0 +1,46 @@
+apiVersion: odin.moreh.io/v1alpha1
+kind: InferenceServiceTemplate
+metadata:
+  name: vllm-v0.15.1-qwen-qwen3.5-27b-fp8-nvidia-h200-sxm-1
+  namespace: {{ include "common.names.namespace" . }}
+  labels:
+    {{- include "mif.preset.labels" . | nindent 4 }}
+    mif.moreh.io/model.org: qwen
+    mif.moreh.io/model.name: qwen3.5-27b-fp8
+    mif.moreh.io/role: e2e
+    mif.moreh.io/accelerator.vendor: nvidia
+    mif.moreh.io/accelerator.model: h200-sxm
+    mif.moreh.io/parallelism: "1"
+spec:
+  framework: vllm
+  model:
+    name: Qwen/Qwen3.5-27B-FP8
+  template:
+    spec:
+      containers:
+        - name: main
+          image: vllm/vllm-openai:v0.15.1
+          env:
+            - name: ISVC_USE_KV_EVENTS
+              value: "true"
+            - name: ISVC_EXTRA_ARGS
+              value: >-
+                --trust-remote-code
+                --max-model-len 131072
+                --max-num-seqs 128
+                --gpu-memory-utilization 0.90
+                --kv-cache-dtype fp8
+                --disable-uvicorn-access-log
+                --no-enable-log-requests
+          resources:
+            requests:
+              nvidia.com/gpu: "1"
+            limits:
+              nvidia.com/gpu: "1"
+      nodeSelector:
+        moai.moreh.io/accelerator.vendor: nvidia
+        moai.moreh.io/accelerator.model: h200-sxm
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-nvidia-h200-sxm-1.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-nvidia-h200-sxm-1.helm.yaml
new file mode 100644
index 0000000..f5f9e41
--- /dev/null
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-nvidia-h200-sxm-1.helm.yaml
@@ -0,0 +1,46 @@
+apiVersion: odin.moreh.io/v1alpha1
+kind: InferenceServiceTemplate
+metadata:
+  name: vllm-v0.15.1-qwen-qwen3.5-27b-nvidia-h200-sxm-1
+  namespace: {{ include "common.names.namespace" . }}
+  labels:
+    {{- include "mif.preset.labels" . | nindent 4 }}
+    mif.moreh.io/model.org: qwen
+    mif.moreh.io/model.name: qwen3.5-27b
+    mif.moreh.io/role: e2e
+    mif.moreh.io/accelerator.vendor: nvidia
+    mif.moreh.io/accelerator.model: h200-sxm
+    mif.moreh.io/parallelism: "1"
+spec:
+  framework: vllm
+  model:
+    name: Qwen/Qwen3.5-27B
+  template:
+    spec:
+      containers:
+        - name: main
+          image: vllm/vllm-openai:v0.15.1
+          env:
+            - name: ISVC_USE_KV_EVENTS
+              value: "true"
+            - name: ISVC_EXTRA_ARGS
+              value: >-
+                --trust-remote-code
+                --max-model-len 131072
+                --max-num-seqs 128
+                --gpu-memory-utilization 0.90
+                --kv-cache-dtype fp8
+                --disable-uvicorn-access-log
+                --no-enable-log-requests
+          resources:
+            requests:
+              nvidia.com/gpu: "1"
+            limits:
+              nvidia.com/gpu: "1"
+      nodeSelector:
+        moai.moreh.io/accelerator.vendor: nvidia
+        moai.moreh.io/accelerator.model: h200-sxm
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-397b-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-397b-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml
new file mode 100644
index 0000000..321723e
--- /dev/null
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-397b-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml
@@ -0,0 +1,50 @@
+apiVersion: odin.moreh.io/v1alpha1
+kind: InferenceServiceTemplate
+metadata:
+  name: vllm-v0.15.1-qwen-qwen3.5-397b-nvidia-h200-sxm-tp8-moe-ep8
+  namespace: {{ include "common.names.namespace" . }}
+  labels:
+    {{- include "mif.preset.labels" . | nindent 4 }}
+    mif.moreh.io/model.org: qwen
+    mif.moreh.io/model.name: qwen3.5-397b
+    mif.moreh.io/role: e2e
+    mif.moreh.io/accelerator.vendor: nvidia
+    mif.moreh.io/accelerator.model: h200-sxm
+    mif.moreh.io/parallelism: "tp8-moe-ep8"
+spec:
+  framework: vllm
+  model:
+    name: Qwen/Qwen3.5-397B
+  parallelism:
+    tensor: 8
+    expert: true
+  template:
+    spec:
+      containers:
+        - name: main
+          image: vllm/vllm-openai:v0.15.1
+          env:
+            - name: ISVC_USE_KV_EVENTS
+              value: "true"
+            - name: ISVC_EXTRA_ARGS
+              value: >-
+                --trust-remote-code
+                --max-model-len 65536
+                --max-num-seqs 64
+                --gpu-memory-utilization 0.90
+                --kv-cache-dtype fp8
+                --enable-chunked-prefill
+                --disable-uvicorn-access-log
+                --no-enable-log-requests
+          resources:
+            requests:
+              nvidia.com/gpu: "8"
+            limits:
+              nvidia.com/gpu: "8"
+      nodeSelector:
+        moai.moreh.io/accelerator.vendor: nvidia
+        moai.moreh.io/accelerator.model: h200-sxm
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-9b-nvidia-h200-sxm-1.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-9b-nvidia-h200-sxm-1.helm.yaml
new file mode 100644
index 0000000..75f6d14
--- /dev/null
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-9b-nvidia-h200-sxm-1.helm.yaml
@@ -0,0 +1,46 @@
+apiVersion: odin.moreh.io/v1alpha1
+kind: InferenceServiceTemplate
+metadata:
+  name: vllm-v0.15.1-qwen-qwen3.5-9b-nvidia-h200-sxm-1
+  namespace: {{ include "common.names.namespace" . }}
+  labels:
+    {{- include "mif.preset.labels" . | nindent 4 }}
+    mif.moreh.io/model.org: qwen
+    mif.moreh.io/model.name: qwen3.5-9b
+    mif.moreh.io/role: e2e
+    mif.moreh.io/accelerator.vendor: nvidia
+    mif.moreh.io/accelerator.model: h200-sxm
+    mif.moreh.io/parallelism: "1"
+spec:
+  framework: vllm
+  model:
+    name: Qwen/Qwen3.5-9B
+  template:
+    spec:
+      containers:
+        - name: main
+          image: vllm/vllm-openai:v0.15.1
+          env:
+            - name: ISVC_USE_KV_EVENTS
+              value: "true"
+            - name: ISVC_EXTRA_ARGS
+              value: >-
+                --trust-remote-code
+                --max-model-len 131072
+                --max-num-seqs 128
+                --gpu-memory-utilization 0.90
+                --kv-cache-dtype fp8
+                --disable-uvicorn-access-log
+                --no-enable-log-requests
+          resources:
+            requests:
+              nvidia.com/gpu: "1"
+            limits:
+              nvidia.com/gpu: "1"
+      nodeSelector:
+        moai.moreh.io/accelerator.vendor: nvidia
+        moai.moreh.io/accelerator.model: h200-sxm
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml
new file mode 100644
index 0000000..1f5d87e
--- /dev/null
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml
@@ -0,0 +1,53 @@
+apiVersion: odin.moreh.io/v1alpha1
+kind: InferenceServiceTemplate
+metadata:
+  name: vllm-v0.15.1-zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-ep8
+  namespace: {{ include "common.names.namespace" . }}
+  labels:
+    {{- include "mif.preset.labels" . | nindent 4 }}
+    mif.moreh.io/model.org: zai-org
+    mif.moreh.io/model.name: glm-5-fp8
+    mif.moreh.io/role: e2e
+    mif.moreh.io/accelerator.vendor: nvidia
+    mif.moreh.io/accelerator.model: h200-sxm
+    mif.moreh.io/parallelism: "tp8-moe-ep8"
+spec:
+  framework: vllm
+  model:
+    name: zai-org/GLM-5-FP8
+  parallelism:
+    tensor: 8
+    expert: true
+  template:
+    spec:
+      containers:
+        - name: main
+          image: vllm/vllm-openai:v0.15.1
+          env:
+            - name: ISVC_USE_KV_EVENTS
+              value: "true"
+            - name: ISVC_EXTRA_ARGS
+              value: >-
+                --trust-remote-code
+                --tool-call-parser glm47
+                --reasoning-parser glm45
+                --enable-auto-tool-choice
+                --max-model-len 131072
+                --max-num-seqs 64
+                --gpu-memory-utilization 0.85
+                --kv-cache-dtype fp8
+                --enable-chunked-prefill
+                --disable-uvicorn-access-log
+                --no-enable-log-requests
+          resources:
+            requests:
+              nvidia.com/gpu: "8"
+            limits:
+              nvidia.com/gpu: "8"
+      nodeSelector:
+        moai.moreh.io/accelerator.vendor: nvidia
+        moai.moreh.io/accelerator.model: h200-sxm
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml
new file mode 100644
index 0000000..39b957b
--- /dev/null
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml
@@ -0,0 +1,53 @@
+apiVersion: odin.moreh.io/v1alpha1
+kind: InferenceServiceTemplate
+metadata:
+  name: vllm-v0.15.1-zai-org-glm-5-nvidia-h200-sxm-tp8-moe-ep8
+  namespace: {{ include "common.names.namespace" . }}
+  labels:
+    {{- include "mif.preset.labels" . | nindent 4 }}
+    mif.moreh.io/model.org: zai-org
+    mif.moreh.io/model.name: glm-5
+    mif.moreh.io/role: e2e
+    mif.moreh.io/accelerator.vendor: nvidia
+    mif.moreh.io/accelerator.model: h200-sxm
+    mif.moreh.io/parallelism: "tp8-moe-ep8"
+spec:
+  framework: vllm
+  model:
+    name: zai-org/GLM-5
+  parallelism:
+    tensor: 8
+    expert: true
+  template:
+    spec:
+      containers:
+        - name: main
+          image: vllm/vllm-openai:v0.15.1
+          env:
+            - name: ISVC_USE_KV_EVENTS
+              value: "true"
+            - name: ISVC_EXTRA_ARGS
+              value: >-
+                --trust-remote-code
+                --tool-call-parser glm47
+                --reasoning-parser glm45
+                --enable-auto-tool-choice
+                --max-model-len 131072
+                --max-num-seqs 64
+                --gpu-memory-utilization 0.85
+                --kv-cache-dtype auto
+                --enable-chunked-prefill
+                --disable-uvicorn-access-log
+                --no-enable-log-requests
+          resources:
+            requests:
+              nvidia.com/gpu: "8"
+            limits:
+              nvidia.com/gpu: "8"
+      nodeSelector:
+        moai.moreh.io/accelerator.vendor: nvidia
+        moai.moreh.io/accelerator.model: h200-sxm
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule

From c2f7fc187bd0e25f443d7e0f51dcc79679d795cc Mon Sep 17 00:00:00 2001
From: bongwoobak <bongwoobak@gmail.com>
Date: Thu, 2 Apr 2026 01:28:48 +0900
Subject: [PATCH 2/4] MAF-19524: feat(preset): replace v0.15.1 presets with
 tested v0.17.0 presets

Replace untested v0.15.1 H200 presets with v0.17.0 presets validated on
the aiand-rke2 cluster. Serving and BBR routing were confirmed for all 7
models.

Removed: 11 v0.15.1 preset files (incompatible with CUDA driver 580)

Added 7 tested presets:
- Qwen3.5-9B (L40S tp1, DEEP_GEMM=0, reasoning-parser qwen3)
- Qwen3.5-27B-FP8 (L40S tp1, enforce-eager, DEEP_GEMM=0)
- GPT-OSS-120B (H100-NVL tp2, max-num-seqs 128)
- Nemotron Super 120B FP8 (H200 tp2, mamba-ssm-cache-dtype float16)
- DeepSeek V3.2 (H200 dp8-moe-ep8, tokenizer-mode deepseek_v32)
- Qwen3.5-397B-A17B-FP8 (H200 dp8-moe-ep8, DEEP_GEMM=0)
- GLM-5-FP8 (H200 tp8, dev.local/vllm-glm5:final, max-model-len -1,
  ISVC_MODEL_PATH=/models/glm5-fp8)
---
 ...fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml} | 18 +++----
 ...o-30b-a3b-bf16-nvidia-h200-sxm-1.helm.yaml | 46 ----------------
 ...no-30b-a3b-fp8-nvidia-h200-sxm-1.helm.yaml | 46 ----------------
 ...en-qwen3.5-27b-nvidia-h200-sxm-1.helm.yaml | 46 ----------------
 ...lm-5-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml | 53 -------------------
 ...3.2-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml} | 25 ++++-----
 ...-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml | 12 ++---
 ...20b-nvidia-h100-nvl-tp2-moe-tp2.helm.yaml} | 21 +++-----
 ...n-qwen3.5-27b-fp8-nvidia-l40s-1.helm.yaml} | 19 +++----
 ...fp8-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml} | 25 ++++-----
 .../qwen-qwen3.5-9b-nvidia-l40s-1.helm.yaml}  | 18 +++----
 11 files changed, 52 insertions(+), 277 deletions(-)
 rename deploy/helm/moai-inference-preset/templates/presets/vllm/{v0.15.1/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml => glm5/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml} (71%)
 delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-bf16-nvidia-h200-sxm-1.helm.yaml
 delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-fp8-nvidia-h200-sxm-1.helm.yaml
 delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-nvidia-h200-sxm-1.helm.yaml
 delete mode 100644 deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml
 rename deploy/helm/moai-inference-preset/templates/presets/vllm/{v0.15.1/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml => v0.17.0/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml} (66%)
 rename deploy/helm/moai-inference-preset/templates/presets/vllm/{v0.15.1 => v0.17.0}/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml (75%)
 rename deploy/helm/moai-inference-preset/templates/presets/vllm/{v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-bf16-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml => v0.17.0/openai-gpt-oss-120b-nvidia-h100-nvl-tp2-moe-tp2.helm.yaml} (59%)
 rename deploy/helm/moai-inference-preset/templates/presets/vllm/{v0.15.1/qwen-qwen3.5-27b-fp8-nvidia-h200-sxm-1.helm.yaml => v0.17.0/qwen-qwen3.5-27b-fp8-nvidia-l40s-1.helm.yaml} (67%)
 rename deploy/helm/moai-inference-preset/templates/presets/vllm/{v0.15.1/qwen-qwen3.5-397b-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml => v0.17.0/qwen-qwen3.5-397b-a17b-fp8-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml} (63%)
 rename deploy/helm/moai-inference-preset/templates/presets/vllm/{v0.15.1/qwen-qwen3.5-9b-nvidia-h200-sxm-1.helm.yaml => v0.17.0/qwen-qwen3.5-9b-nvidia-l40s-1.helm.yaml} (67%)

diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml
similarity index 71%
rename from deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml
rename to deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml
index 1f5d87e..a6a0c73 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml
@@ -1,7 +1,7 @@
 apiVersion: odin.moreh.io/v1alpha1
 kind: InferenceServiceTemplate
 metadata:
-  name: vllm-v0.15.1-zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-ep8
+  name: vllm-glm5-zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8
   namespace: {{ include "common.names.namespace" . }}
   labels:
     {{- include "mif.preset.labels" . | nindent 4 }}
@@ -10,35 +10,29 @@ metadata:
     mif.moreh.io/role: e2e
     mif.moreh.io/accelerator.vendor: nvidia
     mif.moreh.io/accelerator.model: h200-sxm
-    mif.moreh.io/parallelism: "tp8-moe-ep8"
+    mif.moreh.io/parallelism: "tp8-moe-tp8"
 spec:
   framework: vllm
   model:
     name: zai-org/GLM-5-FP8
   parallelism:
     tensor: 8
-    expert: true
   template:
     spec:
       containers:
         - name: main
-          image: vllm/vllm-openai:v0.15.1
+          image: dev.local/vllm-glm5:final
           env:
-            - name: ISVC_USE_KV_EVENTS
-              value: "true"
+            - name: ISVC_MODEL_PATH
+              value: /models/glm5-fp8
             - name: ISVC_EXTRA_ARGS
               value: >-
                 --trust-remote-code
                 --tool-call-parser glm47
                 --reasoning-parser glm45
                 --enable-auto-tool-choice
-                --max-model-len 131072
-                --max-num-seqs 64
-                --gpu-memory-utilization 0.85
-                --kv-cache-dtype fp8
-                --enable-chunked-prefill
+                --max-model-len -1
                 --disable-uvicorn-access-log
-                --no-enable-log-requests
           resources:
             requests:
               nvidia.com/gpu: "8"
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-bf16-nvidia-h200-sxm-1.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-bf16-nvidia-h200-sxm-1.helm.yaml
deleted file mode 100644
index 1c37fd9..0000000
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-bf16-nvidia-h200-sxm-1.helm.yaml
+++ /dev/null
@@ -1,46 +0,0 @@
-apiVersion: odin.moreh.io/v1alpha1
-kind: InferenceServiceTemplate
-metadata:
-  name: vllm-v0.15.1-nvidia-nvidia-nemotron-3-nano-30b-a3b-bf16-nvidia-h200-sxm-1
-  namespace: {{ include "common.names.namespace" . }}
-  labels:
-    {{- include "mif.preset.labels" . | nindent 4 }}
-    mif.moreh.io/model.org: nvidia
-    mif.moreh.io/model.name: nvidia-nemotron-3-nano-30b-a3b-bf16
-    mif.moreh.io/role: e2e
-    mif.moreh.io/accelerator.vendor: nvidia
-    mif.moreh.io/accelerator.model: h200-sxm
-    mif.moreh.io/parallelism: "1"
-spec:
-  framework: vllm
-  model:
-    name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16
-  template:
-    spec:
-      containers:
-        - name: main
-          image: vllm/vllm-openai:v0.15.1
-          env:
-            - name: ISVC_USE_KV_EVENTS
-              value: "true"
-            - name: ISVC_EXTRA_ARGS
-              value: >-
-                --trust-remote-code
-                --max-model-len 131072
-                --max-num-seqs 128
-                --gpu-memory-utilization 0.90
-                --kv-cache-dtype fp8
-                --disable-uvicorn-access-log
-                --no-enable-log-requests
-          resources:
-            requests:
-              nvidia.com/gpu: "1"
-            limits:
-              nvidia.com/gpu: "1"
-      nodeSelector:
-        moai.moreh.io/accelerator.vendor: nvidia
-        moai.moreh.io/accelerator.model: h200-sxm
-      tolerations:
-        - key: nvidia.com/gpu
-          operator: Exists
-          effect: NoSchedule
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-fp8-nvidia-h200-sxm-1.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-fp8-nvidia-h200-sxm-1.helm.yaml
deleted file mode 100644
index 16a8ebe..0000000
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-nano-30b-a3b-fp8-nvidia-h200-sxm-1.helm.yaml
+++ /dev/null
@@ -1,46 +0,0 @@
-apiVersion: odin.moreh.io/v1alpha1
-kind: InferenceServiceTemplate
-metadata:
-  name: vllm-v0.15.1-nvidia-nvidia-nemotron-3-nano-30b-a3b-fp8-nvidia-h200-sxm-1
-  namespace: {{ include "common.names.namespace" . }}
-  labels:
-    {{- include "mif.preset.labels" . | nindent 4 }}
-    mif.moreh.io/model.org: nvidia
-    mif.moreh.io/model.name: nvidia-nemotron-3-nano-30b-a3b-fp8
-    mif.moreh.io/role: e2e
-    mif.moreh.io/accelerator.vendor: nvidia
-    mif.moreh.io/accelerator.model: h200-sxm
-    mif.moreh.io/parallelism: "1"
-spec:
-  framework: vllm
-  model:
-    name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8
-  template:
-    spec:
-      containers:
-        - name: main
-          image: vllm/vllm-openai:v0.15.1
-          env:
-            - name: ISVC_USE_KV_EVENTS
-              value: "true"
-            - name: ISVC_EXTRA_ARGS
-              value: >-
-                --trust-remote-code
-                --max-model-len 131072
-                --max-num-seqs 128
-                --gpu-memory-utilization 0.90
-                --kv-cache-dtype fp8
-                --disable-uvicorn-access-log
-                --no-enable-log-requests
-          resources:
-            requests:
-              nvidia.com/gpu: "1"
-            limits:
-              nvidia.com/gpu: "1"
-      nodeSelector:
-        moai.moreh.io/accelerator.vendor: nvidia
-        moai.moreh.io/accelerator.model: h200-sxm
-      tolerations:
-        - key: nvidia.com/gpu
-          operator: Exists
-          effect: NoSchedule
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-nvidia-h200-sxm-1.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-nvidia-h200-sxm-1.helm.yaml
deleted file mode 100644
index f5f9e41..0000000
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-nvidia-h200-sxm-1.helm.yaml
+++ /dev/null
@@ -1,46 +0,0 @@
-apiVersion: odin.moreh.io/v1alpha1
-kind: InferenceServiceTemplate
-metadata:
-  name: vllm-v0.15.1-qwen-qwen3.5-27b-nvidia-h200-sxm-1
-  namespace: {{ include "common.names.namespace" . }}
-  labels:
-    {{- include "mif.preset.labels" . | nindent 4 }}
-    mif.moreh.io/model.org: qwen
-    mif.moreh.io/model.name: qwen3.5-27b
-    mif.moreh.io/role: e2e
-    mif.moreh.io/accelerator.vendor: nvidia
-    mif.moreh.io/accelerator.model: h200-sxm
-    mif.moreh.io/parallelism: "1"
-spec:
-  framework: vllm
-  model:
-    name: Qwen/Qwen3.5-27B
-  template:
-    spec:
-      containers:
-        - name: main
-          image: vllm/vllm-openai:v0.15.1
-          env:
-            - name: ISVC_USE_KV_EVENTS
-              value: "true"
-            - name: ISVC_EXTRA_ARGS
-              value: >-
-                --trust-remote-code
-                --max-model-len 131072
-                --max-num-seqs 128
-                --gpu-memory-utilization 0.90
-                --kv-cache-dtype fp8
-                --disable-uvicorn-access-log
-                --no-enable-log-requests
-          resources:
-            requests:
-              nvidia.com/gpu: "1"
-            limits:
-              nvidia.com/gpu: "1"
-      nodeSelector:
-        moai.moreh.io/accelerator.vendor: nvidia
-        moai.moreh.io/accelerator.model: h200-sxm
-      tolerations:
-        - key: nvidia.com/gpu
-          operator: Exists
-          effect: NoSchedule
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml
deleted file mode 100644
index 39b957b..0000000
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/zai-org-glm-5-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml
+++ /dev/null
@@ -1,53 +0,0 @@
-apiVersion: odin.moreh.io/v1alpha1
-kind: InferenceServiceTemplate
-metadata:
-  name: vllm-v0.15.1-zai-org-glm-5-nvidia-h200-sxm-tp8-moe-ep8
-  namespace: {{ include "common.names.namespace" . }}
-  labels:
-    {{- include "mif.preset.labels" . | nindent 4 }}
-    mif.moreh.io/model.org: zai-org
-    mif.moreh.io/model.name: glm-5
-    mif.moreh.io/role: e2e
-    mif.moreh.io/accelerator.vendor: nvidia
-    mif.moreh.io/accelerator.model: h200-sxm
-    mif.moreh.io/parallelism: "tp8-moe-ep8"
-spec:
-  framework: vllm
-  model:
-    name: zai-org/GLM-5
-  parallelism:
-    tensor: 8
-    expert: true
-  template:
-    spec:
-      containers:
-        - name: main
-          image: vllm/vllm-openai:v0.15.1
-          env:
-            - name: ISVC_USE_KV_EVENTS
-              value: "true"
-            - name: ISVC_EXTRA_ARGS
-              value: >-
-                --trust-remote-code
-                --tool-call-parser glm47
-                --reasoning-parser glm45
-                --enable-auto-tool-choice
-                --max-model-len 131072
-                --max-num-seqs 64
-                --gpu-memory-utilization 0.85
-                --kv-cache-dtype auto
-                --enable-chunked-prefill
-                --disable-uvicorn-access-log
-                --no-enable-log-requests
-          resources:
-            requests:
-              nvidia.com/gpu: "8"
-            limits:
-              nvidia.com/gpu: "8"
-      nodeSelector:
-        moai.moreh.io/accelerator.vendor: nvidia
-        moai.moreh.io/accelerator.model: h200-sxm
-      tolerations:
-        - key: nvidia.com/gpu
-          operator: Exists
-          effect: NoSchedule
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml
similarity index 66%
rename from deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml
rename to deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml
index 1103b70..9182c1d 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml
@@ -1,7 +1,7 @@
 apiVersion: odin.moreh.io/v1alpha1
 kind: InferenceServiceTemplate
 metadata:
-  name: vllm-v0.15.1-deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-tp8-moe-ep8
+  name: vllm-v0.17.0-deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-dp8-moe-ep8
   namespace: {{ include "common.names.namespace" . }}
   labels:
     {{- include "mif.preset.labels" . | nindent 4 }}
@@ -10,34 +10,29 @@ metadata:
     mif.moreh.io/role: e2e
     mif.moreh.io/accelerator.vendor: nvidia
     mif.moreh.io/accelerator.model: h200-sxm
-    mif.moreh.io/parallelism: "tp8-moe-ep8"
+    mif.moreh.io/parallelism: "dp8-moe-ep8"
 spec:
   framework: vllm
   model:
     name: deepseek-ai/DeepSeek-V3.2
   parallelism:
-    tensor: 8
+    data: 8
     expert: true
-  template:
+  workerTemplate:
     spec:
       containers:
         - name: main
-          image: vllm/vllm-openai:v0.15.1
+          image: vllm/vllm-openai:v0.17.0
           env:
-            - name: VLLM_USE_DEEP_GEMM
-              value: "1"
-            - name: ISVC_USE_KV_EVENTS
-              value: "true"
             - name: ISVC_EXTRA_ARGS
               value: >-
                 --trust-remote-code
-                --max-model-len 65536
-                --max-num-seqs 64
-                --gpu-memory-utilization 0.90
-                --kv-cache-dtype fp8
-                --enable-chunked-prefill
+                --tokenizer-mode deepseek_v32
+                --tool-call-parser deepseek_v32
+                --reasoning-parser deepseek_v3
+                --enable-auto-tool-choice
+                --max-model-len -1
                 --disable-uvicorn-access-log
-                --no-enable-log-requests
           resources:
             requests:
               nvidia.com/gpu: "8"
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml
similarity index 75%
rename from deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml
rename to deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml
index 0706ffb..6470026 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml
@@ -1,7 +1,7 @@
 apiVersion: odin.moreh.io/v1alpha1
 kind: InferenceServiceTemplate
 metadata:
-  name: vllm-v0.15.1-nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2
+  name: vllm-v0.17.0-nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2
   namespace: {{ include "common.names.namespace" . }}
   labels:
     {{- include "mif.preset.labels" . | nindent 4 }}
@@ -21,19 +21,13 @@ spec:
     spec:
       containers:
         - name: main
-          image: vllm/vllm-openai:v0.15.1
+          image: vllm/vllm-openai:v0.17.0
           env:
-            - name: ISVC_USE_KV_EVENTS
-              value: "true"
             - name: ISVC_EXTRA_ARGS
               value: >-
                 --trust-remote-code
-                --max-model-len 131072
-                --max-num-seqs 128
-                --gpu-memory-utilization 0.90
-                --kv-cache-dtype fp8
+                --max-model-len -1
                 --disable-uvicorn-access-log
-                --no-enable-log-requests
           resources:
             requests:
               nvidia.com/gpu: "2"
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-bf16-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/openai-gpt-oss-120b-nvidia-h100-nvl-tp2-moe-tp2.helm.yaml
similarity index 59%
rename from deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-bf16-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml
rename to deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/openai-gpt-oss-120b-nvidia-h100-nvl-tp2-moe-tp2.helm.yaml
index 639c7f5..8791331 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/nvidia-nvidia-nemotron-3-super-120b-a12b-bf16-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/openai-gpt-oss-120b-nvidia-h100-nvl-tp2-moe-tp2.helm.yaml
@@ -1,39 +1,34 @@
 apiVersion: odin.moreh.io/v1alpha1
 kind: InferenceServiceTemplate
 metadata:
-  name: vllm-v0.15.1-nvidia-nvidia-nemotron-3-super-120b-a12b-bf16-nvidia-h200-sxm-tp2-moe-tp2
+  name: vllm-v0.17.0-openai-gpt-oss-120b-nvidia-h100-nvl-tp2-moe-tp2
   namespace: {{ include "common.names.namespace" . }}
   labels:
     {{- include "mif.preset.labels" . | nindent 4 }}
-    mif.moreh.io/model.org: nvidia
-    mif.moreh.io/model.name: nvidia-nemotron-3-super-120b-a12b-bf16
+    mif.moreh.io/model.org: openai
+    mif.moreh.io/model.name: gpt-oss-120b
     mif.moreh.io/role: e2e
     mif.moreh.io/accelerator.vendor: nvidia
-    mif.moreh.io/accelerator.model: h200-sxm
+    mif.moreh.io/accelerator.model: h100-nvl
     mif.moreh.io/parallelism: "tp2-moe-tp2"
 spec:
   framework: vllm
   model:
-    name: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16
+    name: openai/gpt-oss-120b
   parallelism:
     tensor: 2
   template:
     spec:
       containers:
         - name: main
-          image: vllm/vllm-openai:v0.15.1
+          image: vllm/vllm-openai:v0.17.0
           env:
-            - name: ISVC_USE_KV_EVENTS
-              value: "true"
             - name: ISVC_EXTRA_ARGS
               value: >-
                 --trust-remote-code
-                --max-model-len 131072
+                --max-model-len -1
                 --max-num-seqs 128
-                --gpu-memory-utilization 0.90
-                --kv-cache-dtype fp8
                 --disable-uvicorn-access-log
-                --no-enable-log-requests
           resources:
             requests:
               nvidia.com/gpu: "2"
@@ -41,7 +36,7 @@ spec:
               nvidia.com/gpu: "2"
       nodeSelector:
         moai.moreh.io/accelerator.vendor: nvidia
-        moai.moreh.io/accelerator.model: h200-sxm
+        moai.moreh.io/accelerator.model: h100-nvl
       tolerations:
         - key: nvidia.com/gpu
           operator: Exists
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-fp8-nvidia-h200-sxm-1.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-27b-fp8-nvidia-l40s-1.helm.yaml
similarity index 67%
rename from deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-fp8-nvidia-h200-sxm-1.helm.yaml
rename to deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-27b-fp8-nvidia-l40s-1.helm.yaml
index f6e20b3..ce17c68 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-27b-fp8-nvidia-h200-sxm-1.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-27b-fp8-nvidia-l40s-1.helm.yaml
@@ -1,7 +1,7 @@
 apiVersion: odin.moreh.io/v1alpha1
 kind: InferenceServiceTemplate
 metadata:
-  name: vllm-v0.15.1-qwen-qwen3.5-27b-fp8-nvidia-h200-sxm-1
+  name: vllm-v0.17.0-qwen-qwen3.5-27b-fp8-nvidia-l40s-1
   namespace: {{ include "common.names.namespace" . }}
   labels:
     {{- include "mif.preset.labels" . | nindent 4 }}
@@ -9,7 +9,7 @@ metadata:
     mif.moreh.io/model.name: qwen3.5-27b-fp8
     mif.moreh.io/role: e2e
     mif.moreh.io/accelerator.vendor: nvidia
-    mif.moreh.io/accelerator.model: h200-sxm
+    mif.moreh.io/accelerator.model: l40s
     mif.moreh.io/parallelism: "1"
 spec:
   framework: vllm
@@ -19,19 +19,16 @@ spec:
     spec:
       containers:
         - name: main
-          image: vllm/vllm-openai:v0.15.1
+          image: vllm/vllm-openai:v0.17.0
           env:
-            - name: ISVC_USE_KV_EVENTS
-              value: "true"
+            - name: VLLM_USE_DEEP_GEMM
+              value: "0"
             - name: ISVC_EXTRA_ARGS
               value: >-
                 --trust-remote-code
-                --max-model-len 131072
-                --max-num-seqs 128
-                --gpu-memory-utilization 0.90
-                --kv-cache-dtype fp8
+                --max-model-len -1
+                --enforce-eager
                 --disable-uvicorn-access-log
-                --no-enable-log-requests
           resources:
             requests:
               nvidia.com/gpu: "1"
@@ -39,7 +36,7 @@ spec:
               nvidia.com/gpu: "1"
       nodeSelector:
         moai.moreh.io/accelerator.vendor: nvidia
-        moai.moreh.io/accelerator.model: h200-sxm
+        moai.moreh.io/accelerator.model: l40s
       tolerations:
         - key: nvidia.com/gpu
           operator: Exists
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-397b-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-397b-a17b-fp8-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml
similarity index 63%
rename from deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-397b-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml
rename to deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-397b-a17b-fp8-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml
index 321723e..c548d1d 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-397b-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-397b-a17b-fp8-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml
@@ -1,41 +1,36 @@
 apiVersion: odin.moreh.io/v1alpha1
 kind: InferenceServiceTemplate
 metadata:
-  name: vllm-v0.15.1-qwen-qwen3.5-397b-nvidia-h200-sxm-tp8-moe-ep8
+  name: vllm-v0.17.0-qwen-qwen3.5-397b-a17b-fp8-nvidia-h200-sxm-dp8-moe-ep8
   namespace: {{ include "common.names.namespace" . }}
   labels:
     {{- include "mif.preset.labels" . | nindent 4 }}
     mif.moreh.io/model.org: qwen
-    mif.moreh.io/model.name: qwen3.5-397b
+    mif.moreh.io/model.name: qwen3.5-397b-a17b-fp8
     mif.moreh.io/role: e2e
     mif.moreh.io/accelerator.vendor: nvidia
     mif.moreh.io/accelerator.model: h200-sxm
-    mif.moreh.io/parallelism: "tp8-moe-ep8"
+    mif.moreh.io/parallelism: "dp8-moe-ep8"
 spec:
   framework: vllm
   model:
-    name: Qwen/Qwen3.5-397B
+    name: Qwen/Qwen3.5-397B-A17B-FP8
   parallelism:
-    tensor: 8
+    data: 8
     expert: true
-  template:
+  workerTemplate:
     spec:
       containers:
         - name: main
-          image: vllm/vllm-openai:v0.15.1
+          image: vllm/vllm-openai:v0.17.0
           env:
-            - name: ISVC_USE_KV_EVENTS
-              value: "true"
+            - name: VLLM_USE_DEEP_GEMM
+              value: "0"
             - name: ISVC_EXTRA_ARGS
               value: >-
                 --trust-remote-code
-                --max-model-len 65536
-                --max-num-seqs 64
-                --gpu-memory-utilization 0.90
-                --kv-cache-dtype fp8
-                --enable-chunked-prefill
+                --max-model-len -1
                 --disable-uvicorn-access-log
-                --no-enable-log-requests
           resources:
             requests:
               nvidia.com/gpu: "8"
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-9b-nvidia-h200-sxm-1.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-9b-nvidia-l40s-1.helm.yaml
similarity index 67%
rename from deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-9b-nvidia-h200-sxm-1.helm.yaml
rename to deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-9b-nvidia-l40s-1.helm.yaml
index 75f6d14..8856873 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/qwen-qwen3.5-9b-nvidia-h200-sxm-1.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-9b-nvidia-l40s-1.helm.yaml
@@ -1,7 +1,7 @@
 apiVersion: odin.moreh.io/v1alpha1
 kind: InferenceServiceTemplate
 metadata:
-  name: vllm-v0.15.1-qwen-qwen3.5-9b-nvidia-h200-sxm-1
+  name: vllm-v0.17.0-qwen-qwen3.5-9b-nvidia-l40s-1
   namespace: {{ include "common.names.namespace" . }}
   labels:
     {{- include "mif.preset.labels" . | nindent 4 }}
@@ -9,7 +9,7 @@ metadata:
     mif.moreh.io/model.name: qwen3.5-9b
     mif.moreh.io/role: e2e
     mif.moreh.io/accelerator.vendor: nvidia
-    mif.moreh.io/accelerator.model: h200-sxm
+    mif.moreh.io/accelerator.model: l40s
     mif.moreh.io/parallelism: "1"
 spec:
   framework: vllm
@@ -19,19 +19,15 @@ spec:
     spec:
       containers:
         - name: main
-          image: vllm/vllm-openai:v0.15.1
+          image: vllm/vllm-openai:v0.17.0
           env:
-            - name: ISVC_USE_KV_EVENTS
-              value: "true"
+            - name: VLLM_USE_DEEP_GEMM
+              value: "0"
             - name: ISVC_EXTRA_ARGS
               value: >-
                 --trust-remote-code
-                --max-model-len 131072
-                --max-num-seqs 128
-                --gpu-memory-utilization 0.90
-                --kv-cache-dtype fp8
+                --max-model-len -1
                 --disable-uvicorn-access-log
-                --no-enable-log-requests
           resources:
             requests:
               nvidia.com/gpu: "1"
@@ -39,7 +35,7 @@ spec:
               nvidia.com/gpu: "1"
       nodeSelector:
         moai.moreh.io/accelerator.vendor: nvidia
-        moai.moreh.io/accelerator.model: h200-sxm
+        moai.moreh.io/accelerator.model: l40s
       tolerations:
         - key: nvidia.com/gpu
           operator: Exists

From 93ae01470d173de616ffa461a5e9a35f4fdea097 Mon Sep 17 00:00:00 2001
From: bongwoobak <bongwoobak@gmail.com>
Date: Thu, 2 Apr 2026 02:43:39 +0900
Subject: [PATCH 3/4] =?UTF-8?q?fix(preset):=20address=20Copilot=20review?=
 =?UTF-8?q?=20=E2=80=94=20add=20missing=20args=20to=20all=20presets?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add --no-enable-log-requests to all 7 presets (overriding the
  runtime-base default)
- Add --reasoning-parser qwen3 to the Qwen3.5 9B, 27B-FP8 and
  397B-A17B-FP8 presets
- Add --mamba-ssm-cache-dtype float16 and --enable-chunked-prefill to the
  Nemotron 3 Super 120B FP8 preset
- Add --gpu-memory-utilization 0.85 to the GLM-5 FP8 preset
---
 .../zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml    | 2 ++
 ...seek-ai-deepseek-v3.2-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml | 1 +
 ...3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml | 3 +++
 .../openai-gpt-oss-120b-nvidia-h100-nvl-tp2-moe-tp2.helm.yaml  | 1 +
 .../vllm/v0.17.0/qwen-qwen3.5-27b-fp8-nvidia-l40s-1.helm.yaml  | 2 ++
 ...qwen3.5-397b-a17b-fp8-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml | 2 ++
 .../vllm/v0.17.0/qwen-qwen3.5-9b-nvidia-l40s-1.helm.yaml       | 2 ++
 7 files changed, 13 insertions(+)

diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml
index a6a0c73..c90613a 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml
@@ -32,7 +32,9 @@ spec:
                 --reasoning-parser glm45
                 --enable-auto-tool-choice
                 --max-model-len -1
+                --gpu-memory-utilization 0.85
                 --disable-uvicorn-access-log
+                --no-enable-log-requests
           resources:
             requests:
               nvidia.com/gpu: "8"
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml
index 9182c1d..197c7a1 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml
@@ -33,6 +33,7 @@ spec:
                 --enable-auto-tool-choice
                 --max-model-len -1
                 --disable-uvicorn-access-log
+                --no-enable-log-requests
           resources:
             requests:
               nvidia.com/gpu: "8"
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml
index 6470026..b24d246 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml
@@ -27,7 +27,10 @@ spec:
               value: >-
                 --trust-remote-code
                 --max-model-len -1
+                --mamba-ssm-cache-dtype float16
+                --enable-chunked-prefill
                 --disable-uvicorn-access-log
+                --no-enable-log-requests
           resources:
             requests:
               nvidia.com/gpu: "2"
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/openai-gpt-oss-120b-nvidia-h100-nvl-tp2-moe-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/openai-gpt-oss-120b-nvidia-h100-nvl-tp2-moe-tp2.helm.yaml
index 8791331..cab9930 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/openai-gpt-oss-120b-nvidia-h100-nvl-tp2-moe-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/openai-gpt-oss-120b-nvidia-h100-nvl-tp2-moe-tp2.helm.yaml
@@ -29,6 +29,7 @@ spec:
                 --max-model-len -1
                 --max-num-seqs 128
                 --disable-uvicorn-access-log
+                --no-enable-log-requests
           resources:
             requests:
               nvidia.com/gpu: "2"
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-27b-fp8-nvidia-l40s-1.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-27b-fp8-nvidia-l40s-1.helm.yaml
index ce17c68..cd36dd8 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-27b-fp8-nvidia-l40s-1.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-27b-fp8-nvidia-l40s-1.helm.yaml
@@ -28,7 +28,9 @@ spec:
                 --trust-remote-code
                 --max-model-len -1
                 --enforce-eager
+                --reasoning-parser qwen3
                 --disable-uvicorn-access-log
+                --no-enable-log-requests
           resources:
             requests:
               nvidia.com/gpu: "1"
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-397b-a17b-fp8-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-397b-a17b-fp8-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml
index c548d1d..ad3ef7f 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-397b-a17b-fp8-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-397b-a17b-fp8-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml
@@ -30,7 +30,9 @@ spec:
               value: >-
                 --trust-remote-code
                 --max-model-len -1
+                --reasoning-parser qwen3
                 --disable-uvicorn-access-log
+                --no-enable-log-requests
           resources:
             requests:
               nvidia.com/gpu: "8"
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-9b-nvidia-l40s-1.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-9b-nvidia-l40s-1.helm.yaml
index 8856873..7e829f4 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-9b-nvidia-l40s-1.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-9b-nvidia-l40s-1.helm.yaml
@@ -27,7 +27,9 @@ spec:
               value: >-
                 --trust-remote-code
                 --max-model-len -1
+                --reasoning-parser qwen3
                 --disable-uvicorn-access-log
+                --no-enable-log-requests
           resources:
             requests:
               nvidia.com/gpu: "1"

From a3c1ee732c61ba18a5ec6286549d410ee9cc950c Mon Sep 17 00:00:00 2001
From: bongwoobak <bongwoobak@gmail.com>
Date: Fri, 3 Apr 2026 03:56:35 +0900
Subject: [PATCH 4/4] fix(preset): remove ISVC_MODEL_PATH from GLM-5 preset

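ISVC_MODEL_PATH should be overridden in InferenceService, not in the
preset template, to keep the preset general-purpose.

Also replace the cluster-local image dev.local/vllm-glm5:final with
vllm/vllm-openai:glm5, and document in the preset that clusters which
need the custom build should override the image in InferenceService.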
---
 ...zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml
index c90613a..4c26611 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml
@@ -21,10 +21,11 @@ spec:
     spec:
       containers:
         - name: main
-          image: dev.local/vllm-glm5:final
+          # glm_moe_dsa arch is not supported in official vLLM.
+          # CUDA driver 580+ clusters may need a custom image override
+          # in InferenceService (e.g. dev.local/vllm-glm5:final).
+          image: vllm/vllm-openai:glm5
           env:
-            - name: ISVC_MODEL_PATH
-              value: /models/glm5-fp8
             - name: ISVC_EXTRA_ARGS
               value: >-
                 --trust-remote-code