Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Preset: zai-org/GLM-5-FP8 served by vLLM on 8x NVIDIA H200 SXM,
# tensor-parallel 8 (MoE layers sharded with the same TP degree).
apiVersion: odin.moreh.io/v1alpha1
kind: InferenceServiceTemplate
metadata:
  name: vllm-glm5-zai-org-glm-5-fp8-nvidia-h200-sxm-tp8-moe-tp8
  namespace: {{ include "common.names.namespace" . }}
  labels:
    {{- include "mif.preset.labels" . | nindent 4 }}
    mif.moreh.io/model.org: zai-org
    mif.moreh.io/model.name: glm-5-fp8
    mif.moreh.io/role: e2e
    mif.moreh.io/accelerator.vendor: nvidia
    mif.moreh.io/accelerator.model: h200-sxm
    mif.moreh.io/parallelism: "tp8-moe-tp8"
spec:
  framework: vllm
  model:
    name: zai-org/GLM-5-FP8
  parallelism:
    tensor: 8
  template:
    spec:
      containers:
        - name: main
          # glm_moe_dsa arch is not supported in official vLLM.
          # CUDA driver 580+ clusters may need a custom image override
          # in InferenceService (e.g. dev.local/vllm-glm5:final).
          image: vllm/vllm-openai:glm5
          env:
            - name: ISVC_EXTRA_ARGS
              # Extra CLI flags appended to the vLLM server invocation.
              # NOTE(review): --max-model-len -1 presumably means
              # "derive from model config" — confirm the wrapper supports it.
              value: >-
                --trust-remote-code
                --tool-call-parser glm47
                --reasoning-parser glm45
                --enable-auto-tool-choice
                --max-model-len -1
                --gpu-memory-utilization 0.85
                --disable-uvicorn-access-log
                --no-enable-log-requests
          resources:
            requests:
              nvidia.com/gpu: "8"
            limits:
              nvidia.com/gpu: "8"
      nodeSelector:
        moai.moreh.io/accelerator.vendor: nvidia
        moai.moreh.io/accelerator.model: h200-sxm
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Preset: deepseek-ai/DeepSeek-V3.2 served by vLLM v0.17.0 on 8x NVIDIA
# H200 SXM, data-parallel 8 with expert parallelism for MoE layers.
apiVersion: odin.moreh.io/v1alpha1
kind: InferenceServiceTemplate
metadata:
  name: vllm-v0.17.0-deepseek-ai-deepseek-v3.2-nvidia-h200-sxm-dp8-moe-ep8
  namespace: {{ include "common.names.namespace" . }}
  labels:
    {{- include "mif.preset.labels" . | nindent 4 }}
    mif.moreh.io/model.org: deepseek-ai
    mif.moreh.io/model.name: deepseek-v3.2
    mif.moreh.io/role: e2e
    mif.moreh.io/accelerator.vendor: nvidia
    mif.moreh.io/accelerator.model: h200-sxm
    mif.moreh.io/parallelism: "dp8-moe-ep8"
spec:
  framework: vllm
  model:
    name: deepseek-ai/DeepSeek-V3.2
  parallelism:
    data: 8
    expert: true
  # dp/ep deployments use workerTemplate (cf. the tp-only presets,
  # which use template).
  workerTemplate:
    spec:
      containers:
        - name: main
          image: vllm/vllm-openai:v0.17.0
          env:
            - name: ISVC_EXTRA_ARGS
              # Extra CLI flags appended to the vLLM server invocation.
              value: >-
                --trust-remote-code
                --tokenizer-mode deepseek_v32
                --tool-call-parser deepseek_v32
                --reasoning-parser deepseek_v3
                --enable-auto-tool-choice
                --max-model-len -1
                --disable-uvicorn-access-log
                --no-enable-log-requests
          resources:
            requests:
              nvidia.com/gpu: "8"
            limits:
              nvidia.com/gpu: "8"
      nodeSelector:
        moai.moreh.io/accelerator.vendor: nvidia
        moai.moreh.io/accelerator.model: h200-sxm
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Preset: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8 served by vLLM
# v0.17.0 on 2x NVIDIA H200 SXM, tensor-parallel 2.
apiVersion: odin.moreh.io/v1alpha1
kind: InferenceServiceTemplate
metadata:
  name: vllm-v0.17.0-nvidia-nvidia-nemotron-3-super-120b-a12b-fp8-nvidia-h200-sxm-tp2-moe-tp2
  namespace: {{ include "common.names.namespace" . }}
  labels:
    {{- include "mif.preset.labels" . | nindent 4 }}
    mif.moreh.io/model.org: nvidia
    mif.moreh.io/model.name: nvidia-nemotron-3-super-120b-a12b-fp8
    mif.moreh.io/role: e2e
    mif.moreh.io/accelerator.vendor: nvidia
    mif.moreh.io/accelerator.model: h200-sxm
    mif.moreh.io/parallelism: "tp2-moe-tp2"
spec:
  framework: vllm
  model:
    name: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8
  parallelism:
    tensor: 2
  template:
    spec:
      containers:
        - name: main
          image: vllm/vllm-openai:v0.17.0
          env:
            - name: ISVC_EXTRA_ARGS
              # Extra CLI flags appended to the vLLM server invocation.
              # --mamba-ssm-cache-dtype targets this model's Mamba/SSM
              # hybrid layers.
              value: >-
                --trust-remote-code
                --max-model-len -1
                --mamba-ssm-cache-dtype float16
                --enable-chunked-prefill
                --disable-uvicorn-access-log
                --no-enable-log-requests
          resources:
            requests:
              nvidia.com/gpu: "2"
            limits:
              nvidia.com/gpu: "2"
      nodeSelector:
        moai.moreh.io/accelerator.vendor: nvidia
        moai.moreh.io/accelerator.model: h200-sxm
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Preset: openai/gpt-oss-120b served by vLLM v0.17.0 on 2x NVIDIA
# H100 NVL, tensor-parallel 2.
apiVersion: odin.moreh.io/v1alpha1
kind: InferenceServiceTemplate
metadata:
  name: vllm-v0.17.0-openai-gpt-oss-120b-nvidia-h100-nvl-tp2-moe-tp2
  namespace: {{ include "common.names.namespace" . }}
  labels:
    {{- include "mif.preset.labels" . | nindent 4 }}
    mif.moreh.io/model.org: openai
    mif.moreh.io/model.name: gpt-oss-120b
    mif.moreh.io/role: e2e
    mif.moreh.io/accelerator.vendor: nvidia
    mif.moreh.io/accelerator.model: h100-nvl
    mif.moreh.io/parallelism: "tp2-moe-tp2"
spec:
  framework: vllm
  model:
    name: openai/gpt-oss-120b
  parallelism:
    tensor: 2
  template:
    spec:
      containers:
        - name: main
          image: vllm/vllm-openai:v0.17.0
          env:
            - name: ISVC_EXTRA_ARGS
              # Extra CLI flags appended to the vLLM server invocation.
              value: >-
                --trust-remote-code
                --max-model-len -1
                --max-num-seqs 128
                --disable-uvicorn-access-log
                --no-enable-log-requests
          resources:
            requests:
              nvidia.com/gpu: "2"
            limits:
              nvidia.com/gpu: "2"
      nodeSelector:
        moai.moreh.io/accelerator.vendor: nvidia
        moai.moreh.io/accelerator.model: h100-nvl
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Preset: Qwen/Qwen3.5-27B-FP8 served by vLLM v0.17.0 on a single
# NVIDIA L40S (no parallelism section — one-GPU deployment).
apiVersion: odin.moreh.io/v1alpha1
kind: InferenceServiceTemplate
metadata:
  name: vllm-v0.17.0-qwen-qwen3.5-27b-fp8-nvidia-l40s-1
  namespace: {{ include "common.names.namespace" . }}
  labels:
    {{- include "mif.preset.labels" . | nindent 4 }}
    mif.moreh.io/model.org: qwen
    mif.moreh.io/model.name: qwen3.5-27b-fp8
    mif.moreh.io/role: e2e
    mif.moreh.io/accelerator.vendor: nvidia
    mif.moreh.io/accelerator.model: l40s
    mif.moreh.io/parallelism: "1"
spec:
  framework: vllm
  model:
    name: Qwen/Qwen3.5-27B-FP8
  template:
    spec:
      containers:
        - name: main
          image: vllm/vllm-openai:v0.17.0
          env:
            # Disable the DeepGEMM kernels; value is the string "0"
            # as required for env vars.
            - name: VLLM_USE_DEEP_GEMM
              value: "0"
            - name: ISVC_EXTRA_ARGS
              # Extra CLI flags appended to the vLLM server invocation.
              value: >-
                --trust-remote-code
                --max-model-len -1
                --enforce-eager
                --reasoning-parser qwen3
                --disable-uvicorn-access-log
                --no-enable-log-requests
          resources:
            requests:
              nvidia.com/gpu: "1"
            limits:
              nvidia.com/gpu: "1"
      nodeSelector:
        moai.moreh.io/accelerator.vendor: nvidia
        moai.moreh.io/accelerator.model: l40s
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Preset: Qwen/Qwen3.5-397B-A17B-FP8 served by vLLM v0.17.0 on 8x NVIDIA
# H200 SXM, data-parallel 8 with expert parallelism for MoE layers.
apiVersion: odin.moreh.io/v1alpha1
kind: InferenceServiceTemplate
metadata:
  name: vllm-v0.17.0-qwen-qwen3.5-397b-a17b-fp8-nvidia-h200-sxm-dp8-moe-ep8
  namespace: {{ include "common.names.namespace" . }}
  labels:
    {{- include "mif.preset.labels" . | nindent 4 }}
    mif.moreh.io/model.org: qwen
    mif.moreh.io/model.name: qwen3.5-397b-a17b-fp8
    mif.moreh.io/role: e2e
    mif.moreh.io/accelerator.vendor: nvidia
    mif.moreh.io/accelerator.model: h200-sxm
    mif.moreh.io/parallelism: "dp8-moe-ep8"
spec:
  framework: vllm
  model:
    name: Qwen/Qwen3.5-397B-A17B-FP8
  parallelism:
    data: 8
    expert: true
  # dp/ep deployments use workerTemplate (cf. the tp-only presets,
  # which use template).
  workerTemplate:
    spec:
      containers:
        - name: main
          image: vllm/vllm-openai:v0.17.0
          env:
            # Disable the DeepGEMM kernels; value is the string "0"
            # as required for env vars.
            - name: VLLM_USE_DEEP_GEMM
              value: "0"
            - name: ISVC_EXTRA_ARGS
              # Extra CLI flags appended to the vLLM server invocation.
              value: >-
                --trust-remote-code
                --max-model-len -1
                --reasoning-parser qwen3
                --disable-uvicorn-access-log
                --no-enable-log-requests
          resources:
            requests:
              nvidia.com/gpu: "8"
            limits:
              nvidia.com/gpu: "8"
      nodeSelector:
        moai.moreh.io/accelerator.vendor: nvidia
        moai.moreh.io/accelerator.model: h200-sxm
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Preset: Qwen/Qwen3.5-9B served by vLLM v0.17.0 on a single NVIDIA
# L40S (no parallelism section — one-GPU deployment).
apiVersion: odin.moreh.io/v1alpha1
kind: InferenceServiceTemplate
metadata:
  name: vllm-v0.17.0-qwen-qwen3.5-9b-nvidia-l40s-1
  namespace: {{ include "common.names.namespace" . }}
  labels:
    {{- include "mif.preset.labels" . | nindent 4 }}
    mif.moreh.io/model.org: qwen
    mif.moreh.io/model.name: qwen3.5-9b
    mif.moreh.io/role: e2e
    mif.moreh.io/accelerator.vendor: nvidia
    mif.moreh.io/accelerator.model: l40s
    mif.moreh.io/parallelism: "1"
spec:
  framework: vllm
  model:
    name: Qwen/Qwen3.5-9B
  template:
    spec:
      containers:
        - name: main
          image: vllm/vllm-openai:v0.17.0
          env:
            # Disable the DeepGEMM kernels; value is the string "0"
            # as required for env vars.
            - name: VLLM_USE_DEEP_GEMM
              value: "0"
            - name: ISVC_EXTRA_ARGS
              # Extra CLI flags appended to the vLLM server invocation.
              value: >-
                --trust-remote-code
                --max-model-len -1
                --reasoning-parser qwen3
                --disable-uvicorn-access-log
                --no-enable-log-requests
          resources:
            requests:
              nvidia.com/gpu: "1"
            limits:
              nvidia.com/gpu: "1"
      nodeSelector:
        moai.moreh.io/accelerator.vendor: nvidia
        moai.moreh.io/accelerator.model: l40s
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
Loading