diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml
new file mode 100644
index 0000000..4c26611
--- /dev/null
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml
@@ -0,0 +1,50 @@
+apiVersion: odin.moreh.io/v1alpha1
+kind: InferenceServiceTemplate
+metadata:
+  name: vllm-glm5-zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8
+  namespace: {{ include "common.names.namespace" . }}
+  labels:
+    {{- include "mif.preset.labels" . | nindent 4 }}
+    mif.moreh.io/model.org: zai-org
+    mif.moreh.io/model.name: glm-5-fp8
+    mif.moreh.io/role: e2e
+    mif.moreh.io/accelerator.vendor: nvidia
+    mif.moreh.io/accelerator.model: h200-sxm
+    mif.moreh.io/parallelism: "tp8-moe-tp8"
+spec:
+  framework: vllm
+  model:
+    name: zai-org/GLM-5-FP8
+  parallelism:
+    tensor: 8
+  template:
+    spec:
+      containers:
+        - name: main
+          # glm_moe_dsa arch is not supported in official vLLM.
+          # CUDA driver 580+ clusters may need a custom image override
+          # in InferenceService (e.g. dev.local/vllm-glm5:final).
+          image: vllm/vllm-openai:glm5
+          env:
+            - name: ISVC_EXTRA_ARGS
+              value: >-
+                --trust-remote-code
+                --tool-call-parser glm47
+                --reasoning-parser glm45
+                --enable-auto-tool-choice
+                --max-model-len -1
+                --gpu-memory-utilization 0.85
+                --disable-uvicorn-access-log
+                --no-enable-log-requests
+          resources:
+            requests:
+              nvidia.com/gpu: "8"
+            limits:
+              nvidia.com/gpu: "8"
+      nodeSelector:
+        moai.moreh.io/accelerator.vendor: nvidia
+        moai.moreh.io/accelerator.model: h200-sxm
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml
new file mode 100644
index 0000000..197c7a1
--- /dev/null
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml
@@ -0,0 +1,48 @@
+apiVersion: odin.moreh.io/v1alpha1
+kind: InferenceServiceTemplate
+metadata:
+  name: vllm-v0.17.0-deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-dp8-moe-ep8
+  namespace: {{ include "common.names.namespace" . }}
+  labels:
+    {{- include "mif.preset.labels" . | nindent 4 }}
+    mif.moreh.io/model.org: deepseek-ai
+    mif.moreh.io/model.name: deepseek-v3.2
+    mif.moreh.io/role: e2e
+    mif.moreh.io/accelerator.vendor: nvidia
+    mif.moreh.io/accelerator.model: h200-sxm
+    mif.moreh.io/parallelism: "dp8-moe-ep8"
+spec:
+  framework: vllm
+  model:
+    name: deepseek-ai/DeepSeek-V3.2
+  parallelism:
+    data: 8
+    expert: true
+  workerTemplate:
+    spec:
+      containers:
+        - name: main
+          image: vllm/vllm-openai:v0.17.0
+          env:
+            - name: ISVC_EXTRA_ARGS
+              value: >-
+                --trust-remote-code
+                --tokenizer-mode deepseek_v32
+                --tool-call-parser deepseek_v32
+                --reasoning-parser deepseek_v3
+                --enable-auto-tool-choice
+                --max-model-len -1
+                --disable-uvicorn-access-log
+                --no-enable-log-requests
+          resources:
+            requests:
+              nvidia.com/gpu: "8"
+            limits:
+              nvidia.com/gpu: "8"
+      nodeSelector:
+        moai.moreh.io/accelerator.vendor: nvidia
+        moai.moreh.io/accelerator.model: h200-sxm
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml
new file mode 100644
index 0000000..b24d246
--- /dev/null
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml
@@ -0,0 +1,45 @@
+apiVersion: odin.moreh.io/v1alpha1
+kind: InferenceServiceTemplate
+metadata:
+  name: vllm-v0.17.0-nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2
+  namespace: {{ include "common.names.namespace" . }}
+  labels:
+    {{- include "mif.preset.labels" . | nindent 4 }}
+    mif.moreh.io/model.org: nvidia
+    mif.moreh.io/model.name: nvidia-nemotron-3-super-120b-a12b-fp8
+    mif.moreh.io/role: e2e
+    mif.moreh.io/accelerator.vendor: nvidia
+    mif.moreh.io/accelerator.model: h200-sxm
+    mif.moreh.io/parallelism: "tp2-moe-tp2"
+spec:
+  framework: vllm
+  model:
+    name: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8
+  parallelism:
+    tensor: 2
+  template:
+    spec:
+      containers:
+        - name: main
+          image: vllm/vllm-openai:v0.17.0
+          env:
+            - name: ISVC_EXTRA_ARGS
+              value: >-
+                --trust-remote-code
+                --max-model-len -1
+                --mamba-ssm-cache-dtype float16
+                --enable-chunked-prefill
+                --disable-uvicorn-access-log
+                --no-enable-log-requests
+          resources:
+            requests:
+              nvidia.com/gpu: "2"
+            limits:
+              nvidia.com/gpu: "2"
+      nodeSelector:
+        moai.moreh.io/accelerator.vendor: nvidia
+        moai.moreh.io/accelerator.model: h200-sxm
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/openai-gpt-oss-120b-nvidia-h100-nvl-tp2-moe-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/openai-gpt-oss-120b-nvidia-h100-nvl-tp2-moe-tp2.helm.yaml
new file mode 100644
index 0000000..cab9930
--- /dev/null
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/openai-gpt-oss-120b-nvidia-h100-nvl-tp2-moe-tp2.helm.yaml
@@ -0,0 +1,44 @@
+apiVersion: odin.moreh.io/v1alpha1
+kind: InferenceServiceTemplate
+metadata:
+  name: vllm-v0.17.0-openai-gpt-oss-120b-nvidia-h100-nvl-tp2-moe-tp2
+  namespace: {{ include "common.names.namespace" . }}
+  labels:
+    {{- include "mif.preset.labels" . | nindent 4 }}
+    mif.moreh.io/model.org: openai
+    mif.moreh.io/model.name: gpt-oss-120b
+    mif.moreh.io/role: e2e
+    mif.moreh.io/accelerator.vendor: nvidia
+    mif.moreh.io/accelerator.model: h100-nvl
+    mif.moreh.io/parallelism: "tp2-moe-tp2"
+spec:
+  framework: vllm
+  model:
+    name: openai/gpt-oss-120b
+  parallelism:
+    tensor: 2
+  template:
+    spec:
+      containers:
+        - name: main
+          image: vllm/vllm-openai:v0.17.0
+          env:
+            - name: ISVC_EXTRA_ARGS
+              value: >-
+                --trust-remote-code
+                --max-model-len -1
+                --max-num-seqs 128
+                --disable-uvicorn-access-log
+                --no-enable-log-requests
+          resources:
+            requests:
+              nvidia.com/gpu: "2"
+            limits:
+              nvidia.com/gpu: "2"
+      nodeSelector:
+        moai.moreh.io/accelerator.vendor: nvidia
+        moai.moreh.io/accelerator.model: h100-nvl
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-27b-fp8-nvidia-l40s-1.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-27b-fp8-nvidia-l40s-1.helm.yaml
new file mode 100644
index 0000000..cd36dd8
--- /dev/null
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-27b-fp8-nvidia-l40s-1.helm.yaml
@@ -0,0 +1,45 @@
+apiVersion: odin.moreh.io/v1alpha1
+kind: InferenceServiceTemplate
+metadata:
+  name: vllm-v0.17.0-qwen-qwen3.5-27b-fp8-nvidia-l40s-1
+  namespace: {{ include "common.names.namespace" . }}
+  labels:
+    {{- include "mif.preset.labels" . | nindent 4 }}
+    mif.moreh.io/model.org: qwen
+    mif.moreh.io/model.name: qwen3.5-27b-fp8
+    mif.moreh.io/role: e2e
+    mif.moreh.io/accelerator.vendor: nvidia
+    mif.moreh.io/accelerator.model: l40s
+    mif.moreh.io/parallelism: "1"
+spec:
+  framework: vllm
+  model:
+    name: Qwen/Qwen3.5-27B-FP8
+  template:
+    spec:
+      containers:
+        - name: main
+          image: vllm/vllm-openai:v0.17.0
+          env:
+            - name: VLLM_USE_DEEP_GEMM
+              value: "0"
+            - name: ISVC_EXTRA_ARGS
+              value: >-
+                --trust-remote-code
+                --max-model-len -1
+                --enforce-eager
+                --reasoning-parser qwen3
+                --disable-uvicorn-access-log
+                --no-enable-log-requests
+          resources:
+            requests:
+              nvidia.com/gpu: "1"
+            limits:
+              nvidia.com/gpu: "1"
+      nodeSelector:
+        moai.moreh.io/accelerator.vendor: nvidia
+        moai.moreh.io/accelerator.model: l40s
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-397b-a17b-fp8-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-397b-a17b-fp8-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml
new file mode 100644
index 0000000..ad3ef7f
--- /dev/null
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-397b-a17b-fp8-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml
@@ -0,0 +1,47 @@
+apiVersion: odin.moreh.io/v1alpha1
+kind: InferenceServiceTemplate
+metadata:
+  name: vllm-v0.17.0-qwen-qwen3.5-397b-a17b-fp8-nvidia-h200-sxm-dp8-moe-ep8
+  namespace: {{ include "common.names.namespace" . }}
+  labels:
+    {{- include "mif.preset.labels" . | nindent 4 }}
+    mif.moreh.io/model.org: qwen
+    mif.moreh.io/model.name: qwen3.5-397b-a17b-fp8
+    mif.moreh.io/role: e2e
+    mif.moreh.io/accelerator.vendor: nvidia
+    mif.moreh.io/accelerator.model: h200-sxm
+    mif.moreh.io/parallelism: "dp8-moe-ep8"
+spec:
+  framework: vllm
+  model:
+    name: Qwen/Qwen3.5-397B-A17B-FP8
+  parallelism:
+    data: 8
+    expert: true
+  workerTemplate:
+    spec:
+      containers:
+        - name: main
+          image: vllm/vllm-openai:v0.17.0
+          env:
+            - name: VLLM_USE_DEEP_GEMM
+              value: "0"
+            - name: ISVC_EXTRA_ARGS
+              value: >-
+                --trust-remote-code
+                --max-model-len -1
+                --reasoning-parser qwen3
+                --disable-uvicorn-access-log
+                --no-enable-log-requests
+          resources:
+            requests:
+              nvidia.com/gpu: "8"
+            limits:
+              nvidia.com/gpu: "8"
+      nodeSelector:
+        moai.moreh.io/accelerator.vendor: nvidia
+        moai.moreh.io/accelerator.model: h200-sxm
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-9b-nvidia-l40s-1.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-9b-nvidia-l40s-1.helm.yaml
new file mode 100644
index 0000000..7e829f4
--- /dev/null
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.17.0/qwen-qwen3.5-9b-nvidia-l40s-1.helm.yaml
@@ -0,0 +1,44 @@
+apiVersion: odin.moreh.io/v1alpha1
+kind: InferenceServiceTemplate
+metadata:
+  name: vllm-v0.17.0-qwen-qwen3.5-9b-nvidia-l40s-1
+  namespace: {{ include "common.names.namespace" . }}
+  labels:
+    {{- include "mif.preset.labels" . | nindent 4 }}
+    mif.moreh.io/model.org: qwen
+    mif.moreh.io/model.name: qwen3.5-9b
+    mif.moreh.io/role: e2e
+    mif.moreh.io/accelerator.vendor: nvidia
+    mif.moreh.io/accelerator.model: l40s
+    mif.moreh.io/parallelism: "1"
+spec:
+  framework: vllm
+  model:
+    name: Qwen/Qwen3.5-9B
+  template:
+    spec:
+      containers:
+        - name: main
+          image: vllm/vllm-openai:v0.17.0
+          env:
+            - name: VLLM_USE_DEEP_GEMM
+              value: "0"
+            - name: ISVC_EXTRA_ARGS
+              value: >-
+                --trust-remote-code
+                --max-model-len -1
+                --reasoning-parser qwen3
+                --disable-uvicorn-access-log
+                --no-enable-log-requests
+          resources:
+            requests:
+              nvidia.com/gpu: "1"
+            limits:
+              nvidia.com/gpu: "1"
+      nodeSelector:
+        moai.moreh.io/accelerator.vendor: nvidia
+        moai.moreh.io/accelerator.model: l40s
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule